radeonsi: initial WIP SI code
author    Tom Stellard <thomas.stellard@amd.com>
          Fri, 6 Jan 2012 22:38:37 +0000 (17:38 -0500)
committer Tom Stellard <thomas.stellard@amd.com>
          Fri, 13 Apr 2012 14:32:06 +0000 (10:32 -0400)
This commit adds initial support for acceleration
on SI chips.  egltri is starting to work.

The SI/R600 LLVM backend is currently included in Mesa,
but that may change in the future.

The plan is to write a single gallium driver and
use gallium to support X acceleration.

This commit contains patches from:
Tom Stellard <thomas.stellard@amd.com>
Michel Dänzer <michel.daenzer@amd.com>
Alex Deucher <alexander.deucher@amd.com>
Vadim Girlin <vadimgirlin@gmail.com>

Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
The following commits were squashed in:

======================================================================

radeonsi: Remove unused winsys pointer

This was removed from r600g in commit:

commit 96d882939d612fcc8332f107befec470ed4359de
Author: Marek Olšák <maraeo@gmail.com>
Date:   Fri Feb 17 01:49:49 2012 +0100

    gallium: remove unused winsys pointers in pipe_screen and pipe_context

    A winsys is already a private object of a driver.

======================================================================

radeonsi: Copy color clamping CAPs from r600

Not sure if the values of these CAPs are correct for radeonsi, but the
same changes were made to r600g in commit:

commit bc1c8369384b5e16547c5bf9728aa78f8dfd66cc
Author: Marek Olšák <maraeo@gmail.com>
Date:   Mon Jan 23 03:11:17 2012 +0100

    st/mesa: do vertex and fragment color clamping in shaders

    For ARB_color_buffer_float. Most hardware can't do it and st/mesa is
    the perfect place for a fallback.
    The exceptions are:
    - r500 (vertex clamp only)
    - nv50 (both)
    - nvc0 (both)
    - softpipe (both)

    We also have to take into account that r300 can do CLAMPED vertex colors only,
    while r600 can do UNCLAMPED vertex colors only. The difference can be expressed
    with the two new CAPs.

======================================================================

radeonsi: Remove PIPE_CAP_OUTPUT_READ

This CAP was dropped in commit:

commit 04e324008759282728a95a1394bac2c4c2a1a3f9
Author: Marek Olšák <maraeo@gmail.com>
Date:   Thu Feb 23 23:44:36 2012 +0100

    gallium: remove PIPE_SHADER_CAP_OUTPUT_READ

    r600g is the only driver which has made use of it. The reason the CAP was
    added was to fix some piglit tests when the GLSL pass lower_output_reads
    didn't exist.

    However, not removing output reads breaks the fallback for glClampColorARB,
    which assumes outputs are not readable. The fix would be non-trivial
    and my personal preference is to remove the CAP, considering that reading
    outputs is uncommon and that we can now use lower_output_reads to fix
    the issue that the CAP was supposed to workaround in the first place.

======================================================================

radeonsi: Add missing parameters to rws->buffer_get_tiling() call

This was changed in commit:

commit c0c979eebc076b95cc8d18a013ce2968fe6311ad
Author: Jerome Glisse <jglisse@redhat.com>
Date:   Mon Jan 30 17:22:13 2012 -0500

    r600g: add support for common surface allocator for tiling v13

    Tiled surfaces have all kinds of alignment constraints that need to
    be met. Instead of duplicating all this code between the ddx and
    mesa, use common code in libdrm_radeon; this also ensures that both
    the ddx and mesa compute those alignments in the same way.

    v2 fix evergreen
    v3 fix compressed texture and workaround cube texture issue by
       disabling 2D array mode for cubemap (need to check if r7xx and
       newer are also affected by the issue)
    v4 fix texture array
    v5 fix evergreen and newer, split surface values computation from
       mipmap tree generation so that we can get them directly from the
       ddx
    v6 final fix to evergreen tile split value
    v7 fix mipmap offset to avoid to use random value, use color view
       depth view to address different layer as hardware is doing some
       magic rotation depending on the layer
    v8 fix COLOR_VIEW on r6xx for linear array mode, use COLOR_VIEW on
       evergreen, align bytes per pixel to a multiple of a dword
    v9 fix handling of stencil on evergreen, half fix for compressed
       texture
    v10 fix evergreen compressed texture proper support for stencil
        tile split. Fix stencil issue when array mode was clear by
        the kernel, always program stencil bo. On evergreen depth
        buffer bo need to be big enough to hold depth buffer + stencil
        buffer as even with stencil disabled things get written there.
    v11 rebase on top of mesa, fix pitch issue with 1d surface on evergreen,
        old ddx overestimate those. Fix linear case when pitch*height < 64.
        Fix r300g.
    v12 Fix linear case when pitch*height < 64 for old path, adapt to
        libdrm API change
    v13 add libdrm check

Signed-off-by: Jerome Glisse <jglisse@redhat.com>
======================================================================

radeonsi: Remove PIPE_TRANSFER_MAP_PERMANENTLY

This was removed in commit:

commit 62f44f670bb0162e89fd4786af877f8da9ff607c
Author: Marek Olšák <maraeo@gmail.com>
Date:   Mon Mar 5 13:45:00 2012 +0100

    Revert "gallium: add flag PIPE_TRANSFER_MAP_PERMANENTLY"

    This reverts commit 0950086376b1c8b7fb89eda81ed7f2f06dee58bc.

    It was decided to refactor the transfer API instead of adding workarounds
    to address the performance issues.

======================================================================

radeonsi: Handle PIPE_VIDEO_CAP_PREFERED_FORMAT.

Reintroduced in commit 9d9afcb5bac2931d4b8e6d1aa571e941c5110c90.

======================================================================

radeonsi: nuke the fallback for vertex and fragment color clamping

Ported from r600g commit c2b800cf38b299c1ab1c53dc0e4ea00c7acef853.

======================================================================

radeonsi: don't expose transform_feedback2 without kernel support

Ported from r600g commit 15146fd1bcbb08e44a1cbb984440ee1a5de63d48.

======================================================================

radeonsi: Handle PIPE_CAP_GLSL_FEATURE_LEVEL.

Ported from r600g part of commit 171be755223d99f8cc5cc1bdaf8bd7b4caa04b4f.

======================================================================

radeonsi: set minimum point size to 1.0 for non-sprite non-aa points.

Ported from r600g commit f183cc9ce3ad1d043bdf8b38fd519e8f437714fc.

======================================================================

radeonsi: rework and consolidate stencilref state setting.

Ported from r600g commit a2361946e782b57f0c63587841ca41c0ea707070.

======================================================================

radeonsi: cleanup setting DB_SHADER_CONTROL.

Ported from r600g commit 3d061caaed13b646ff40754f8ebe73f3d4983c5b.

======================================================================

radeonsi: Get rid of register masks.

Ported from r600g commits
3d061caaed13b646ff40754f8ebe73f3d4983c5b..9344ab382a1765c1a7c2560e771485edf4954fe2.

======================================================================

radeonsi: get rid of r600_context_reg.

Ported from r600g commits
9344ab382a1765c1a7c2560e771485edf4954fe2..bed20f02a771f43e1c5092254705701c228cfa7f.

======================================================================

radeonsi: Fix regression from 'Get rid of register masks'.

======================================================================

radeonsi: optimize r600_resource_va.

Ported from r600g commit 669d8766ff3403938794eb80d7769347b6e52174.

======================================================================

radeonsi: remove u8,u16,u32,u64 types.

Ported from r600g commit 78293b99b23268e6698f1267aaf40647c17d95a5.

======================================================================

radeonsi: merge r600_context with r600_pipe_context.

Ported from r600g commit e4340c1908a6a3b09e1a15d5195f6da7d00494d0.

======================================================================

radeonsi: Miscellaneous context cleanups.

Ported from r600g commits
e4340c1908a6a3b09e1a15d5195f6da7d00494d0..621e0db71c5ddcb379171064a4f720c9cf01e888.

======================================================================

radeonsi: add a new simple API for state emission.

Ported from r600g commits
621e0db71c5ddcb379171064a4f720c9cf01e888..f661405637bba32c2cfbeecf6e2e56e414e9521e.

======================================================================

radeonsi: Also remove sbu_flags member of struct r600_reg.

Requires using sid.h instead of r600d.h for the new CP_COHER_CNTL definitions,
so some code needs to be disabled for now.

======================================================================

radeonsi: Miscellaneous simplifications.

Ported from r600g commits 38bf2763482b4f1b6d95cd51aecec75601d8b90f and
b0337b679ad4c2feae59215104cfa60b58a619d5.

======================================================================

radeonsi: Handle PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION.

Ported from commit 8b4f7b0672d663273310fffa9490ad996f5b914a.

======================================================================

radeonsi: Use a fake reloc to sleep for fences.

Ported from r600g commit 8cd03b933cf868ff867e2db4a0937005a02fd0e4.

======================================================================

radeonsi: adapt to get_query_result interface change.

Ported from r600g commit 4445e170bee23a3607ece0e010adef7058ac6a11.

200 files changed:
Android.mk
configs/autoconf.in
configure.ac
include/pci_ids/pci_id_driver_map.h
include/pci_ids/radeonsi_pci_ids.h [new file with mode: 0644]
src/egl/main/Android.mk
src/gallium/Android.mk
src/gallium/SConscript
src/gallium/drivers/Makefile.am
src/gallium/drivers/radeon/AMDGPU.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUConstants.pm [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUISelLowering.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUISelLowering.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUInstrInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUInstructions.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUIntrinsics.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPURegisterInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPURegisterInfo.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUTargetMachine.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUUtil.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDGPUUtil.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL7XXDevice.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL7XXDevice.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILAlgorithms.tpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILAsmBackend.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILAsmBackend.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILBarrierDetect.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILBase.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILCallingConv.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILCodeEmitter.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILCompilerErrors.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILCompilerWarnings.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILConversions.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILDevice.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILDevice.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILDeviceInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILDeviceInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILDevices.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILELFWriterInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILEnumeratedTypes.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILEvergreenDevice.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILFormats.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILFrameLowering.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILFrameLowering.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILGlobalManager.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILGlobalManager.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILIOExpansion.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILIOExpansion.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILISelLowering.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILISelLowering.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILImageExpansion.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInliner.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInstrInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInstrInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInstrInfo.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInstrPatterns.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILInstructions.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILIntrinsicInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILIntrinsics.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILKernel.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILKernelManager.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILKernelManager.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILLiteralManager.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILMachinePeephole.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILModuleInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILModuleInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILMultiClass.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILNIDevice.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILNIDevice.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILNodes.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILOperands.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILPatterns.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILPointerManager.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILPointerManager.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILPrintfConvert.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILProfiles.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILRegisterInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILRegisterInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILRegisterInfo.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILSIDevice.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILSIDevice.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILSubtarget.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILSubtarget.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILTargetMachine.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILTargetMachine.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILTokenDesc.td [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILUtilityFunctions.h [new file with mode: 0644]
src/gallium/drivers/radeon/AMDILVersion.td [new file with mode: 0644]
src/gallium/drivers/radeon/LICENSE.TXT [new file with mode: 0644]
src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h [new file with mode: 0644]
src/gallium/drivers/radeon/Makefile [new file with mode: 0644]
src/gallium/drivers/radeon/Makefile.sources [new file with mode: 0644]
src/gallium/drivers/radeon/Processors.td [new file with mode: 0644]
src/gallium/drivers/radeon/R600CodeEmitter.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600GenRegisterInfo.pl [new file with mode: 0644]
src/gallium/drivers/radeon/R600ISelLowering.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600ISelLowering.h [new file with mode: 0644]
src/gallium/drivers/radeon/R600InstrFormats.td [new file with mode: 0644]
src/gallium/drivers/radeon/R600InstrInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600InstrInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/R600Instructions.td [new file with mode: 0644]
src/gallium/drivers/radeon/R600Intrinsics.td [new file with mode: 0644]
src/gallium/drivers/radeon/R600KernelParameters.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600KernelParameters.h [new file with mode: 0644]
src/gallium/drivers/radeon/R600LowerInstructions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600OpenCLUtils.h [new file with mode: 0644]
src/gallium/drivers/radeon/R600RegisterInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/R600RegisterInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/R600Schedule.td [new file with mode: 0644]
src/gallium/drivers/radeon/SIAssignInterpRegs.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SICodeEmitter.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIConvertToISA.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIGenRegisterInfo.pl [new file with mode: 0644]
src/gallium/drivers/radeon/SIISelLowering.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIISelLowering.h [new file with mode: 0644]
src/gallium/drivers/radeon/SIInstrFormats.td [new file with mode: 0644]
src/gallium/drivers/radeon/SIInstrInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIInstrInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/SIInstrInfo.td [new file with mode: 0644]
src/gallium/drivers/radeon/SIInstructions.td [new file with mode: 0644]
src/gallium/drivers/radeon/SIIntrinsics.td [new file with mode: 0644]
src/gallium/drivers/radeon/SILowerShaderInstructions.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIMachineFunctionInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/SIPropagateImmReads.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIRegisterInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/SIRegisterInfo.h [new file with mode: 0644]
src/gallium/drivers/radeon/SISchedule.td [new file with mode: 0644]
src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/loader.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/radeon_llvm.h [new file with mode: 0644]
src/gallium/drivers/radeon/radeon_llvm_emit.cpp [new file with mode: 0644]
src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/Android.mk [new file with mode: 0644]
src/gallium/drivers/radeonsi/Makefile [new file with mode: 0644]
src/gallium/drivers/radeonsi/Makefile.sources [new file with mode: 0644]
src/gallium/drivers/radeonsi/SConscript [new file with mode: 0644]
src/gallium/drivers/radeonsi/evergreen_hw_context.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/evergreen_state.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_blit.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_buffer.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_hw_context.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_hw_context_priv.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_query.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_resource.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_resource.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_state_common.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_texture.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/r600_translate.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/radeonsi_pipe.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/radeonsi_pipe.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/radeonsi_public.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/radeonsi_shader.c [new file with mode: 0644]
src/gallium/drivers/radeonsi/radeonsi_shader.h [new file with mode: 0644]
src/gallium/drivers/radeonsi/sid.h [new file with mode: 0644]
src/gallium/targets/dri-radeonsi/Makefile [new file with mode: 0644]
src/gallium/targets/dri-radeonsi/SConscript [new file with mode: 0644]
src/gallium/targets/dri-radeonsi/target.c [new file with mode: 0644]
src/gallium/targets/egl-static/Android.mk
src/gallium/targets/egl-static/Makefile
src/gallium/targets/egl-static/SConscript
src/gallium/targets/egl-static/egl_pipe.c
src/gallium/targets/gbm/Makefile
src/gallium/targets/gbm/pipe_radeonsi.c [new file with mode: 0644]
src/gallium/targets/xorg-radeonsi/Makefile [new file with mode: 0644]
src/gallium/targets/xorg-radeonsi/target.c [new file with mode: 0644]
src/gallium/targets/xorg-radeonsi/xorg.c [new file with mode: 0644]
src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
src/gallium/winsys/radeon/drm/radeon_drm_winsys.h

index 0d5917ce5f277162c3d19048526ec7631fe10f88..6a3014c81bb860fbfc3d4e7f25ccfb06c70f1f6f 100644 (file)
@@ -24,7 +24,7 @@
 # BOARD_GPU_DRIVERS should be defined.  The valid values are
 #
 #   classic drivers: i915 i965
-#   gallium drivers: swrast i915g nouveau r300g r600g vmwgfx
+#   gallium drivers: swrast i915g nouveau r300g r600g radeonsi vmwgfx
 #
 # The main target is libGLES_mesa.  For each classic driver enabled, a DRI
 # module will also be built.  DRI modules will be loaded by libGLES_mesa.
@@ -37,7 +37,7 @@ DRM_TOP := external/drm
 DRM_GRALLOC_TOP := hardware/drm_gralloc
 
 classic_drivers := i915 i965
-gallium_drivers := swrast i915g nouveau r300g r600g vmwgfx
+gallium_drivers := swrast i915g nouveau r300g r600g radeonsi vmwgfx
 
 MESA_GPU_DRIVERS := $(strip $(BOARD_GPU_DRIVERS))
 
index 95cca6f239e8957a1db0521311f2fa1abdb69957..ec3f3194e2d01c0db8edbdfd9db18ff7ba3da7de 100644 (file)
@@ -32,9 +32,12 @@ INTEL_LIBS = @INTEL_LIBS@
 INTEL_CFLAGS = @INTEL_CFLAGS@
 X11_LIBS = @X11_LIBS@
 X11_CFLAGS = @X11_CFLAGS@
+LLVM_BINDIR = @LLVM_BINDIR@
 LLVM_CFLAGS = @LLVM_CFLAGS@
+LLVM_CXXFLAGS = @LLVM_CXXFLAGS@
 LLVM_LDFLAGS = @LLVM_LDFLAGS@
 LLVM_LIBS = @LLVM_LIBS@
+LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@
 GLW_CFLAGS = @GLW_CFLAGS@
 GLX_TLS = @GLX_TLS@
 DRI_CFLAGS = @DRI_CFLAGS@
@@ -58,6 +61,9 @@ AWK = @AWK@
 GREP = @GREP@
 NM = @NM@
 
+# Perl
+PERL = @PERL@
+
 # Python and flags (generally only needed by the developers)
 PYTHON2 = @PYTHON2@
 PYTHON_FLAGS = -t -O -O
index 65d358e0a8d6b9a6c5c2c77ca5721b958d2e596d..17564f12885190ed6424fec9dcca937511cd7a3e 100644 (file)
@@ -67,6 +67,8 @@ if test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.y"; then
 fi
 AC_PROG_LEX
 
+AC_PATH_PROG([PERL], [perl])
+
 dnl Our fallback install-sh is a symlink to minstall. Use the existing
 dnl configuration in that case.
 AC_PROG_INSTALL
@@ -1647,9 +1649,12 @@ if test "x$with_gallium_drivers" != x; then
     SRC_DIRS="$SRC_DIRS gallium gallium/winsys gallium/targets"
 fi
 
+AC_SUBST([LLVM_BINDIR])
 AC_SUBST([LLVM_CFLAGS])
+AC_SUBST([LLVM_CXXFLAGS])
 AC_SUBST([LLVM_LIBS])
 AC_SUBST([LLVM_LDFLAGS])
+AC_SUBST([LLVM_INCLUDEDIR])
 AC_SUBST([LLVM_VERSION])
 
 case "x$enable_opengl$enable_gles1$enable_gles2" in
@@ -1795,6 +1800,9 @@ if test "x$enable_gallium_llvm" = xyes; then
            LLVM_LIBS="`$LLVM_CONFIG --libs engine bitwriter`"
        fi
        LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags`
+       LLVM_BINDIR=`$LLVM_CONFIG --bindir`
+       LLVM_CXXFLAGS=`$LLVM_CONFIG --cxxflags`
+       LLVM_INCLUDEDIR=`$LLVM_CONFIG --includedir`
        DEFINES="$DEFINES -D__STDC_CONSTANT_MACROS"
        MESA_LLVM=1
     else
@@ -1898,6 +1906,14 @@ if test "x$with_gallium_drivers" != x; then
             GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600"
             gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600"
             ;;
+        xradeonsi)
+            GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS radeonsi"
+            if test "x$LLVM_VERSION" != "x3.1"; then
+                AC_MSG_ERROR([LLVM 3.1 is required to build the radeonsi driver.])
+            fi
+           NEED_RADEON_GALLIUM=yes;
+            gallium_check_st "radeon/drm" "dri-radeonsi" "xorg-radeonsi"
+            ;;
         xnouveau)
             PKG_CHECK_MODULES([NOUVEAU], [libdrm_nouveau >= $LIBDRM_NOUVEAU_REQUIRED])
             GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS nouveau nvfx nv50 nvc0"
@@ -1957,6 +1973,7 @@ done
 AM_CONDITIONAL(HAVE_GALAHAD_GALLIUM, test x$HAVE_GALAHAD_GALLIUM = xyes)
 AM_CONDITIONAL(HAVE_IDENTITY_GALLIUM, test x$HAVE_IDENTITY_GALLIUM = xyes)
 AM_CONDITIONAL(HAVE_NOOP_GALLIUM, test x$HAVE_NOOP_GALLIUM = xyes)
+AM_CONDITIONAL(NEED_RADEON_GALLIUM, test x$NEED_RADEON_GALLIUM = xyes)
 AC_SUBST([GALLIUM_MAKE_DIRS])
 
 dnl prepend CORE_DIRS to SRC_DIRS
index 232359f6e0d03688c98531b88ae879d3c9836edb..fce38af0fe04d4f512f0cf49dbf3791bbdee6729 100644 (file)
@@ -45,6 +45,12 @@ static const int r600_chip_ids[] = {
 #undef CHIPSET
 };
 
+static const int radeonsi_chip_ids[] = {
+#define CHIPSET(chip, name, family) chip,
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+};
+
 static const int vmwgfx_chip_ids[] = {
 #define CHIPSET(chip, name, family) chip,
 #include "pci_ids/vmwgfx_pci_ids.h"
@@ -65,6 +71,7 @@ static const struct {
 #endif
    { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) },
    { 0x1002, "r600", r600_chip_ids, ARRAY_SIZE(r600_chip_ids) },
+   { 0x1002, "radeonsi", radeonsi_chip_ids, ARRAY_SIZE(radeonsi_chip_ids) },
    { 0x10de, "nouveau", NULL, -1 },
    { 0x15ad, "vmwgfx", vmwgfx_chip_ids, ARRAY_SIZE(vmwgfx_chip_ids) },
    { 0x0000, NULL, NULL, 0 },
diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
new file mode 100644 (file)
index 0000000..55ade12
--- /dev/null
@@ -0,0 +1,40 @@
+CHIPSET(0x6780, TAHITI_6780, TAHITI)
+CHIPSET(0x6784, TAHITI_6784, TAHITI)
+CHIPSET(0x6788, TAHITI_678A, TAHITI)
+CHIPSET(0x678A, TAHITI_678A, TAHITI)
+CHIPSET(0x6790, TAHITI_6790, TAHITI)
+CHIPSET(0x6798, TAHITI_6798, TAHITI)
+CHIPSET(0x6799, TAHITI_6799, TAHITI)
+CHIPSET(0x679A, TAHITI_679E, TAHITI)
+CHIPSET(0x679E, TAHITI_679E, TAHITI)
+CHIPSET(0x679F, TAHITI_679F, TAHITI)
+
+CHIPSET(0x6800, PITCAIRN_6800, PITCAIRN)
+CHIPSET(0x6801, PITCAIRN_6801, PITCAIRN)
+CHIPSET(0x6802, PITCAIRN_6802, PITCAIRN)
+CHIPSET(0x6808, PITCAIRN_6808, PITCAIRN)
+CHIPSET(0x6809, PITCAIRN_6809, PITCAIRN)
+CHIPSET(0x6810, PITCAIRN_6810, PITCAIRN)
+CHIPSET(0x6818, PITCAIRN_6818, PITCAIRN)
+CHIPSET(0x6819, PITCAIRN_6819, PITCAIRN)
+CHIPSET(0x684C, PITCAIRN_684C, PITCAIRN)
+
+CHIPSET(0x6820, VERDE_6820, VERDE)
+CHIPSET(0x6821, VERDE_6821, VERDE)
+CHIPSET(0x6823, VERDE_6824, VERDE)
+CHIPSET(0x6824, VERDE_6824, VERDE)
+CHIPSET(0x6825, VERDE_6825, VERDE)
+CHIPSET(0x6826, VERDE_6825, VERDE)
+CHIPSET(0x6827, VERDE_6827, VERDE)
+CHIPSET(0x6828, VERDE_6828, VERDE)
+CHIPSET(0x6829, VERDE_6829, VERDE)
+CHIPSET(0x682D, VERDE_682D, VERDE)
+CHIPSET(0x682F, VERDE_682F, VERDE)
+CHIPSET(0x6830, VERDE_6830, VERDE)
+CHIPSET(0x6831, VERDE_6831, VERDE)
+CHIPSET(0x6837, VERDE_6831, VERDE)
+CHIPSET(0x6838, VERDE_6838, VERDE)
+CHIPSET(0x6839, VERDE_6839, VERDE)
+CHIPSET(0x683B, VERDE_683B, VERDE)
+CHIPSET(0x683D, VERDE_683D, VERDE)
+CHIPSET(0x683F, VERDE_683F, VERDE)
index d96da228aa7a109f2106d50dff056e894303e037..a4a00f3bb351d200f02b6da5280dfe92316c5f74 100644 (file)
@@ -107,8 +107,8 @@ gallium_DRIVERS += \
 LOCAL_SHARED_LIBRARIES += libdrm_nouveau
 endif
 
-# r300g/r600g
-ifneq ($(filter r300g r600g, $(MESA_GPU_DRIVERS)),)
+# r300g/r600g/radeonsi
+ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_winsys_radeon
 ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_pipe_r300
@@ -116,6 +116,9 @@ endif
 ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
 gallium_DRIVERS += libmesa_pipe_r600
 endif
+ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_pipe_radeonsi
+endif
 endif
 
 # vmwgfx
index 41c59b13c6ff840b2df1f075d6aa4ec57a446e3e..1d002d05374a7984ac4e5cd132f62f1c6a26a54a 100644 (file)
@@ -49,8 +49,8 @@ SUBDIRS += \
        drivers/nvc0
 endif
 
-# r300g/r600g
-ifneq ($(filter r300g r600g, $(MESA_GPU_DRIVERS)),)
+# r300g/r600g/radeonsi
+ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/radeon/drm
 ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/r300
@@ -58,6 +58,9 @@ endif
 ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += drivers/r600
 endif
+ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
+SUBDIRS += drivers/radeonsi
+endif
 endif
 
 # vmwgfx
index 4413bc8742b36e2c52dd89341a2b7fe3327ddfbc..da2e4dd5dedf5bedd11e1e13a6c1d76e80a74e9c 100644 (file)
@@ -33,6 +33,7 @@ if env['drm']:
         SConscript([
             'drivers/r300/SConscript',
             'drivers/r600/SConscript',
+            'drivers/radeonsi/SConscript',
         ])
     # XXX: nouveau drivers have a tight dependency on libdrm, so to enable
     # we need some version logic before we enable them. Also, ATM there is
@@ -152,6 +153,7 @@ if not env['embedded']:
             SConscript([
                 'targets/dri-r300/SConscript',
                 'targets/dri-r600/SConscript',
+                'targets/dri-radeonsi/SConscript',
             ])
 
     if env['xorg'] and env['drm']:
index 0aa2653a0f199075822adf0ff3013ce4c6ba0f7e..97c5695fa15f9c5c551d5fbcd6574ec0a6202219 100644 (file)
@@ -10,6 +10,8 @@ AM_CPPFLAGS = \
 
 noinst_LIBRARIES =
 
+SUBDIRS =
+
 ################################################################################
 
 if HAVE_GALAHAD_GALLIUM
@@ -52,7 +54,16 @@ noop_libnoop_a_SOURCES = \
 endif
 
 ################################################################################
-SUBDIRS = $(GALLIUM_MAKE_DIRS)
+
+if NEED_RADEON_GALLIUM
+
+SUBDIRS += radeon
+
+endif
+
+################################################################################
+
+SUBDIRS += $(GALLIUM_MAKE_DIRS)
 
 # FIXME: Remove when the rest of Gallium is converted to automake.
 default: all
diff --git a/src/gallium/drivers/radeon/AMDGPU.h b/src/gallium/drivers/radeon/AMDGPU.h
new file mode 100644 (file)
index 0000000..5613dab
--- /dev/null
@@ -0,0 +1,47 @@
+//===-- AMDGPU.h - AMDGPU backend pass declarations -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the factory functions for the R600 and SI code generation passes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_H
+#define AMDGPU_H
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+    class FunctionPass;
+    class AMDGPUTargetMachine;
+
+    FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
+    FunctionPass *createR600LowerShaderInstructionsPass(TargetMachine &tm);
+    FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm);
+
+    FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
+    FunctionPass *createSIConvertToISAPass(TargetMachine &tm);
+    FunctionPass *createSIInitMachineFunctionInfoPass(TargetMachine &tm);
+    FunctionPass *createSILowerShaderInstructionsPass(TargetMachine &tm);
+    FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm);
+    FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
+
+    FunctionPass *createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm);
+
+    FunctionPass *createAMDGPULowerShaderInstructionsPass(TargetMachine &tm);
+
+    FunctionPass *createAMDGPUDelimitInstGroupsPass(TargetMachine &tm);
+
+    FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
+
+    FunctionPass *createAMDGPUFixRegClassesPass(TargetMachine &tm);
+
+} /* End namespace llvm */
+#endif /* AMDGPU_H */
diff --git a/src/gallium/drivers/radeon/AMDGPUConstants.pm b/src/gallium/drivers/radeon/AMDGPUConstants.pm
new file mode 100644 (file)
index 0000000..b64ff49
--- /dev/null
@@ -0,0 +1,44 @@
+#===-- AMDGPUConstants.pm - Shared register constants and helpers -------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===#
+#
+# Register-count constants and helpers that map a flat channel index to a
+# hardware register index and channel name, shared by the Perl generators.
+#
+#===----------------------------------------------------------------------===#
+
+package AMDGPUConstants;
+
+use base 'Exporter';
+
+use constant CONST_REG_COUNT => 256;
+use constant TEMP_REG_COUNT => 128;
+
+our @EXPORT = ('TEMP_REG_COUNT', 'CONST_REG_COUNT', 'get_hw_index', 'get_chan_str');
+
+sub get_hw_index {
+  my ($index) = @_;
+  return int($index / 4);
+}
+
+sub get_chan_str {
+  my ($index) = @_;
+  my $chan = $index % 4;
+  if ($chan == 0) {
+    return 'X';
+  } elsif ($chan == 1) {
+    return 'Y';
+  } elsif ($chan == 2) {
+    return 'Z';
+  } elsif ($chan == 3) {
+    return 'W';
+  } else {
+    die("Unknown chan value: $chan");
+  }
+}
+
+1;
diff --git a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
new file mode 100644 (file)
index 0000000..ce947f8
--- /dev/null
@@ -0,0 +1,65 @@
+//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers AMDIL machine instructions to the appropriate hardware
+// instructions. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class AMDGPUConvertToISAPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+    void lowerFLT(MachineInstr &MI);
+
+  public:
+    AMDGPUConvertToISAPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char AMDGPUConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) {
+  return new AMDGPUConvertToISAPass(tm);
+}
+
+bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const AMDGPUInstrInfo * TII =
+                      static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+      /* Advance past the current instruction before the list is mutated:
+       * the Next = llvm::next(I) form incremented the end() iterator. */
+      MachineBasicBlock::iterator Cur = I++;
+      MachineInstr &MI = *Cur;
+      MachineInstr * newInstr = TII->convertToISA(MI, MF, MBB.findDebugLoc(Cur));
+      if (!newInstr) {
+        continue;
+      }
+      MBB.insert(Cur, newInstr);
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
diff --git a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
new file mode 100644 (file)
index 0000000..1fd4fb0
--- /dev/null
@@ -0,0 +1,126 @@
+#===-- AMDGPUGenInstrEnums.pl - Generate instruction and generation enums -------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===#
+#
+# Parses AMDILInstructions.td and emits matching instruction and GPU
+# generation enums as TableGen ('td'), C header ('h'), or '.inc' output.
+#
+#===----------------------------------------------------------------------===#
+
+use warnings;
+use strict;
+
+my @F32_MULTICLASSES = qw {
+  UnaryIntrinsicFloat
+  UnaryIntrinsicFloatScalar
+  BinaryIntrinsicFloat
+  TernaryIntrinsicFloat
+  BinaryOpMCFloat
+};
+
+my @I32_MULTICLASSES = qw {
+  BinaryOpMCInt
+  BinaryOpMCi32
+  BinaryOpMCi32Const
+};
+
+my @GENERATION_ENUM = qw {
+  R600_CAYMAN
+  R600
+  EG
+  EG_CAYMAN
+  CAYMAN
+  SI
+};
+
+my $FILE_TYPE = $ARGV[0];
+
+open AMDIL, '<', 'AMDILInstructions.td' or die "Cannot open AMDILInstructions.td: $!";
+
+my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ');
+
+while (<AMDIL>) {
+  if ($_ =~ /defm\s+([A-Z_]+)\s+:\s+([A-Za-z0-9]+)</) {
+    if (grep {$_ eq $2} @F32_MULTICLASSES) {
+      push @INST_ENUMS, "$1\_f32";
+
+    } elsif (grep {$_ eq $2} @I32_MULTICLASSES) {
+      push @INST_ENUMS, "$1\_i32";
+    }
+  } elsif ($_ =~ /def\s+([A-Z_]+)(_[fi]32)/) {
+    push @INST_ENUMS, "$1$2";
+  }
+}
+
+if ($FILE_TYPE eq 'td') {
+
+  print_td_enum('AMDILInst', 'AMDILInstEnums', 'field bits<16>', @INST_ENUMS);
+
+  print_td_enum('AMDGPUGen', 'AMDGPUGenEnums', 'field bits<3>', @GENERATION_ENUM);
+
+  my %constants = (
+    'PI'         => '0x40490fdb',
+    'TWO_PI'     => '0x40c90fdb',
+    'TWO_PI_INV' => '0x3e22f983'
+  );
+
+  print "class Constants {\n";
+  foreach (keys(%constants)) {
+    print "int $_ = $constants{$_};\n";
+  }
+  print "}\n";
+  print "def CONST : Constants;\n";
+
+} elsif ($FILE_TYPE eq 'h') {
+
+  print "unsigned GetRealAMDILOpcode(unsigned internalOpcode) const;\n";
+
+  print_h_enum('AMDILTblgenOpcode', @INST_ENUMS);
+
+  print_h_enum('AMDGPUGen', @GENERATION_ENUM);
+
+} elsif ($FILE_TYPE eq 'inc') {
+  print "unsigned AMDGPUInstrInfo::GetRealAMDILOpcode(unsigned internalOpcode) const\n{\n";
+  print "  switch(internalOpcode) {\n";
+  #Start at 1 so we skip NONE
+  for (my $i = 1; $i < scalar(@INST_ENUMS); $i++) {
+    my $inst = $INST_ENUMS[$i];
+    print "  case AMDGPUInstrInfo::$inst: return AMDIL::$inst;\n";
+  }
+  print "  default: abort();\n";
+  print "  }\n}\n";
+}
+
+
+sub print_td_enum {
+  my ($instance, $class, $field, @values) = @_;
+
+  print "class $class {\n";
+
+  for (my $i = 0; $i < scalar(@values); $i++) {
+    print "  $field $values[$i] = $i;\n";
+  }
+  print "}\n";
+
+  print "def $instance : $class;\n";
+}
+
+sub print_h_enum {
+
+  my ($enum, @list) = @_;
+  print "enum $enum {\n";
+
+  for (my $i = 0; $i < scalar(@list); $i++) {
+    print "  $list[$i] = $i";
+    if ($i != $#list) {
+      print ',';
+    }
+    print "\n";
+  }
+  print "};\n";
+}
+
diff --git a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl b/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl
new file mode 100644 (file)
index 0000000..60523a7
--- /dev/null
@@ -0,0 +1,30 @@
+#===-- AMDGPUGenShaderPatterns.pl - Generate constant-load patterns -------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===#
+#
+# Emits a TableGen selection pattern for every constant register channel,
+# mapping int_AMDGPU_load_const to a MOV from the corresponding register.
+#
+#===----------------------------------------------------------------------===#
+
+use strict;
+use warnings;
+
+use AMDGPUConstants;
+
+my $reg_prefix = $ARGV[0];
+
+for (my $i = 0; $i < CONST_REG_COUNT * 4; $i++) {
+  my $index = get_hw_index($i);
+  my $chan = get_chan_str($i);
+print <<STRING;
+def : Pat <
+  (int_AMDGPU_load_const $i),
+  (f32 (MOV (f32 $reg_prefix$index\_$chan)))
+>;
+STRING
+}
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
new file mode 100644 (file)
index 0000000..2c1052f
--- /dev/null
@@ -0,0 +1,31 @@
+//===-- AMDGPUISelLowering.cpp - Common AMDGPU DAG lowering -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the TargetLowering functionality shared by the R600 and SI
+// backends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUISelLowering.h"
+#include "AMDGPUUtil.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
+  AMDILTargetLowering(TM)
+{
+}
+
+void AMDGPUTargetLowering::addLiveIn(MachineInstr * MI,
+    MachineFunction * MF, MachineRegisterInfo & MRI,
+    const struct TargetInstrInfo * TII, unsigned reg) const
+{
+  AMDGPU::utilAddLiveIn(MF, MRI, TII, reg, MI->getOperand(0).getReg()); 
+}
+
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.h b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
new file mode 100644 (file)
index 0000000..3c5beb1
--- /dev/null
@@ -0,0 +1,35 @@
+//===-- AMDGPUISelLowering.h - Common AMDGPU DAG lowering interface -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the TargetLowering base class shared by the R600 and SI backends.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUISELLOWERING_H
+#define AMDGPUISELLOWERING_H
+
+#include "AMDILISelLowering.h"
+
+namespace llvm {
+
+class AMDGPUTargetLowering : public AMDILTargetLowering
+{
+protected:
+  void addLiveIn(MachineInstr * MI, MachineFunction * MF,
+                 MachineRegisterInfo & MRI, const struct TargetInstrInfo * TII,
+                unsigned reg) const;
+
+public:
+  AMDGPUTargetLowering(TargetMachine &TM);
+
+};
+
+} /* End namespace llvm */
+
+#endif /* AMDGPUISELLOWERING_H */
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
new file mode 100644 (file)
index 0000000..4742283
--- /dev/null
@@ -0,0 +1,116 @@
+//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the TargetInstrInfo class that is
+// common to all AMD GPUs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDGPUInstrInfo::AMDGPUInstrInfo(AMDGPUTargetMachine &tm)
+  : AMDILInstrInfo(tm), TM(tm)
+{
+  const AMDILDevice * dev = TM.getSubtarget<AMDILSubtarget>().device();
+  for (unsigned i = 0; i < AMDIL::INSTRUCTION_LIST_END; i++) {
+    const MCInstrDesc & instDesc = get(i);
+    uint32_t instGen = (instDesc.TSFlags >> 40) & 0x7;
+    uint32_t inst = (instDesc.TSFlags >> 48) & 0xffff;
+    if (inst == 0) {
+      continue;
+    }
+    switch (instGen) {
+    case AMDGPUInstrInfo::R600_CAYMAN:
+      if (dev->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        continue;
+      }
+      break;
+    case AMDGPUInstrInfo::R600:
+      if (dev->getGeneration() != AMDILDeviceInfo::HD4XXX) {
+        continue;
+      }
+      break;
+    case AMDGPUInstrInfo::EG_CAYMAN:
+      if (dev->getGeneration() < AMDILDeviceInfo::HD5XXX
+          || dev->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        continue;
+      }
+      break;
+    case AMDGPUInstrInfo::CAYMAN:
+      if (dev->getDeviceFlag() != OCL_DEVICE_CAYMAN) {
+        continue;
+      }
+      break;
+    case AMDGPUInstrInfo::SI:
+      if (dev->getGeneration() != AMDILDeviceInfo::HD7XXX) {
+        continue;
+      }
+      break;
+    default:
+      abort();
+      break;
+    }
+
+    unsigned amdilOpcode = GetRealAMDILOpcode(inst);
+    amdilToISA[amdilOpcode] = instDesc.Opcode;
+  }
+}
+
+MachineInstr * AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const
+{
+  MachineInstrBuilder newInstr;
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const AMDGPURegisterInfo & RI = getRegisterInfo();
+  unsigned ISAOpcode = getISAOpcode(MI.getOpcode());
+
+  /* Create the new instruction */
+  newInstr = BuildMI(MF, DL, TM.getInstrInfo()->get(ISAOpcode));
+
+  for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+    MachineOperand &MO = MI.getOperand(i);
+    /* Convert dst regclass to one that is supported by the ISA */
+    if (MO.isReg() && MO.isDef()) {
+      if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg());
+        const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass);
+
+        assert(newRegClass);
+
+        MRI.setRegClass(MO.getReg(), newRegClass);
+      }
+    }
+    /* Add the operand to the new instruction */
+    newInstr.addOperand(MO);
+  }
+
+  return newInstr;
+}
+
+unsigned AMDGPUInstrInfo::getISAOpcode(unsigned opcode) const
+{
+  if (amdilToISA.count(opcode) == 0) {
+    return opcode;
+  } else {
+    return amdilToISA.find(opcode)->second;
+  }
+}
+
+bool AMDGPUInstrInfo::isRegPreload(const MachineInstr &MI) const
+{
+  return (get(MI.getOpcode()).TSFlags >> AMDGPU_TFLAG_SHIFTS::PRELOAD_REG) & 0x1;
+}
+
+#include "AMDGPUInstrEnums.include"
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
new file mode 100644 (file)
index 0000000..fa009bc
--- /dev/null
@@ -0,0 +1,59 @@
+//===-- AMDGPUInstrInfo.h - Common AMDGPU instruction info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the TargetInstrInfo base class common to all AMD GPUs, including
+// the AMDIL-to-ISA opcode mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTRUCTIONINFO_H_
+#define AMDGPUINSTRUCTIONINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDILInstrInfo.h"
+
+#include <map>
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class AMDGPUInstrInfo : public AMDILInstrInfo {
+  private:
+  AMDGPUTargetMachine & TM;
+  std::map<unsigned, unsigned> amdilToISA;
+
+  public:
+  explicit AMDGPUInstrInfo(AMDGPUTargetMachine &tm);
+
+  virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
+
+  virtual unsigned getISAOpcode(unsigned AMDILopcode) const;
+
+  virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const;
+
+  bool isRegPreload(const MachineInstr &MI) const;
+
+  #include "AMDGPUInstrEnums.h.include"
+  };
+
+} // End llvm namespace
+
+/* AMDGPU target flags are stored in bits 32 and above of TSFlags; see the
+ * TSFlags assignments in AMDGPUInstructions.td for the full layout. */
+namespace AMDGPU_TFLAG_SHIFTS {
+  enum TFLAGS {
+    PRELOAD_REG = 32
+  };
+}
+
+
+#endif // AMDGPUINSTRUCTIONINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDGPUInstructions.td b/src/gallium/drivers/radeon/AMDGPUInstructions.td
new file mode 100644 (file)
index 0000000..10eceb6
--- /dev/null
@@ -0,0 +1,90 @@
+//===-- AMDGPUInstructions.td - Common AMDGPU instruction defs -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Instruction classes and codegen-only pseudo instructions shared by the
+// R600 and SI instruction definitions.
+//
+//===----------------------------------------------------------------------===//
+
+include "AMDGPUInstrEnums.td"
+
+class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
+  field bits<16> AMDILOp = 0;
+  field bits<3> Gen = 0;
+  field bit PreloadReg = 0;
+
+  let Namespace = "AMDIL";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let TSFlags{32} = PreloadReg;
+  let TSFlags{42-40} = Gen;
+  let TSFlags{63-48} = AMDILOp;
+}
+
+class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
+    : AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<32> Inst = 0xffffffff;
+
+}
+
+let isCodeGenOnly = 1 in {
+
+  def EXPORT_REG : AMDGPUShaderInst <
+    (outs),
+    (ins GPRF32:$src),
+    "EXPORT_REG $src",
+    [(int_AMDGPU_export_reg GPRF32:$src)]
+  >;
+
+  def LOAD_INPUT : AMDGPUShaderInst <
+    (outs GPRF32:$dst),
+    (ins i32imm:$src),
+    "LOAD_INPUT $dst, $src",
+    []> {
+    let PreloadReg = 1;
+  }
+
+  def MASK_WRITE : AMDGPUShaderInst <
+    (outs),
+    (ins GPRF32:$src),
+    "MASK_WRITE $src",
+    []
+  >;
+
+  def RESERVE_REG : AMDGPUShaderInst <
+    (outs GPRF32:$dst),
+    (ins i32imm:$src),
+    "RESERVE_REG $dst, $src",
+    [(set GPRF32:$dst, (int_AMDGPU_reserve_reg imm:$src))]> {
+    let PreloadReg = 1;
+  }
+
+  def STORE_OUTPUT : AMDGPUShaderInst <
+    (outs GPRF32:$dst),
+    (ins GPRF32:$src0, i32imm:$src1),
+    "STORE_OUTPUT $dst, $src0, $src1",
+    [(set GPRF32:$dst, (int_AMDGPU_store_output GPRF32:$src0, imm:$src1))]
+  >;
+}
+
+/* Generic helper patterns for intrinsics */
+/* -------------------------------------- */
+
+class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul,
+                  RegisterClass rc> : Pat <
+  (int_AMDGPU_pow rc:$src0, rc:$src1),
+  (exp_ieee (mul rc:$src1, (log_ieee rc:$src0)))
+>;
+
+include "R600Instructions.td"
+
+include "SIInstrInfo.td"
+
diff --git a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
new file mode 100644 (file)
index 0000000..d2cda0d
--- /dev/null
@@ -0,0 +1,56 @@
+//===-- AMDGPUIntrinsics.td - Common AMDGPU intrinsics -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the intrinsics common to all AMD GPU targets, plus TGSI helpers.
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "AMDGPU", isTarget = 1 in {
+
+  def int_AMDGPU_export_reg : Intrinsic<[], [llvm_float_ty], []>;
+  def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
+  def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], []>;
+  def int_AMDGPU_reserve_reg : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
+  def int_AMDGPU_store_output : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], []>;
+
+  def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_cos : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], []>;
+  def int_AMDGPU_floor : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_kill : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_kilp : Intrinsic<[], [], []>;
+  def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_sge : BinaryIntFloat;
+  def int_AMDGPU_sin : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_ssg : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+}
+
+let TargetPrefix = "TGSI", isTarget = 1 in {
+
+  def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[]>;
+}
+
+include "SIIntrinsics.td"
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp
new file mode 100644 (file)
index 0000000..d33055c
--- /dev/null
@@ -0,0 +1,38 @@
+//===-- AMDGPULowerShaderInstructions.cpp - Shader lowering helpers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the register preloading helper shared by the R600 and SI
+// shader instruction lowering passes.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPULowerShaderInstructions.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+void AMDGPULowerShaderInstructionsPass::preloadRegister(MachineFunction * MF,
+    const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) const
+{
+  if (!MRI->isLiveIn(physReg)) {
+    MRI->addLiveIn(physReg, virtReg);
+    MachineBasicBlock &EntryMBB = MF->front();
+    BuildMI(MF->front(), EntryMBB.begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
+            virtReg)
+            .addReg(physReg);
+  } else {
+    /* We can't mark the same register as preloaded twice, but we still must
+     * associate virtReg with the correct preloaded register. */
+    unsigned newReg = MRI->getLiveInVirtReg(physReg);
+    MRI->replaceRegWith(virtReg, newReg);
+  }
+}
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h
new file mode 100644 (file)
index 0000000..5ee77fa
--- /dev/null
@@ -0,0 +1,40 @@
+//===-- AMDGPULowerShaderInstructions.h - Shader lowering helpers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the base class shared by the R600 and SI shader instruction
+// lowering passes.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef AMDGPU_LOWER_SHADER_INSTRUCTIONS
+#define AMDGPU_LOWER_SHADER_INSTRUCTIONS
+
+namespace llvm {
+
+class MachineFunction;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+class AMDGPULowerShaderInstructionsPass {
+
+  protected:
+    MachineRegisterInfo * MRI;
+    /**
+     * @param physReg The physical register that will be preloaded.
+     * @param virtReg The virtual register that currently holds the
+     *                preloaded value.
+     */
+    void preloadRegister(MachineFunction * MF, const TargetInstrInfo * TII,
+                         unsigned physReg, unsigned virtReg) const;
+};
+
+} // end namespace llvm
+
+
+#endif // AMDGPU_LOWER_SHADER_INSTRUCTIONS
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp b/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
new file mode 100644 (file)
index 0000000..162a491
--- /dev/null
@@ -0,0 +1,24 @@
+//===-- AMDGPURegisterInfo.cpp - Common AMDGPU register info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the register info base class common to R600 and SI.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+AMDGPURegisterInfo::AMDGPURegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDILRegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.h b/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
new file mode 100644 (file)
index 0000000..f4492e9
--- /dev/null
@@ -0,0 +1,38 @@
+//===-- AMDGPURegisterInfo.h - Common AMDGPU register info interface -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the register info base class common to R600 and SI.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUREGISTERINFO_H_
+#define AMDGPUREGISTERINFO_H_
+
+#include "AMDILRegisterInfo.h"
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class TargetInstrInfo;
+
+  struct AMDGPURegisterInfo : public AMDILRegisterInfo
+  {
+    AMDGPUTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    AMDGPURegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+    virtual BitVector getReservedRegs(const MachineFunction &MF) const = 0;
+
+    virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass * rc) const = 0;
+  };
+} // End namespace llvm
+
+#endif // AMDGPUREGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.td b/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
new file mode 100644 (file)
index 0000000..173d662
--- /dev/null
@@ -0,0 +1,22 @@
+//===-- AMDGPURegisterInfo.td - Common AMDGPU register defs -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Channel sub-register indices shared by the R600 and SI register
+// definitions.
+//
+//===----------------------------------------------------------------------===//
+
+let Namespace = "AMDIL" in {
+  def sel_x : SubRegIndex;
+  def sel_y : SubRegIndex;
+  def sel_z : SubRegIndex;
+  def sel_w : SubRegIndex;
+}
+
+include "R600RegisterInfo.td"
+include "SIRegisterInfo.td"
diff --git a/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp b/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp
new file mode 100644 (file)
index 0000000..c923f19
--- /dev/null
@@ -0,0 +1,66 @@
+//===-- AMDGPUReorderPreloadInstructions.cpp - Reorder preloads -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Moves instructions that represent preloaded registers to the start of
+// the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Function.h"
+
+using namespace llvm;
+
+namespace {
+  class AMDGPUReorderPreloadInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    AMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "AMDGPU Reorder Preload Instructions"; }
+  };
+} /* End anonymous namespace */
+
+char AMDGPUReorderPreloadInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) {
+    return new AMDGPUReorderPreloadInstructionsPass(tm);
+}
+
+/* This pass moves instructions that represent preloaded registers to the
+ * start of the program. */
+bool AMDGPUReorderPreloadInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  const AMDGPUInstrInfo * TII =
+                        static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    // Advance the iterator before the instruction is possibly unlinked below.
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+      MachineInstr &MI = *I++;
+      if (TII->isRegPreload(MI)) {
+         MF.front().insert(MF.front().begin(), MI.removeFromParent());
+      }
+    }
+  }
+  return false;
+}
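The pass above walks every block and moves each flagged instruction to the start of the entry block. Because each one is inserted at `begin()`, preloads found later end up *before* earlier ones. A minimal standalone sketch of that behavior (the `Inst`/`reorderPreloads` names are illustrative stand-ins for `MachineInstr`/`isRegPreload`, not part of the driver):

```cpp
#include <cassert>
#include <iterator>
#include <list>
#include <string>

// Simplified model of the reorder pass: walk the instruction stream and
// move every instruction flagged as a preload to the front of the list,
// mirroring MF.front().insert(MF.front().begin(), MI.removeFromParent()).
struct Inst {
  std::string name;
  bool isPreload;
};

void reorderPreloads(std::list<Inst> &block) {
  for (auto it = block.begin(); it != block.end();) {
    auto next = std::next(it);                 // capture successor before unlinking
    if (it->isPreload)
      block.splice(block.begin(), block, it);  // move the node to the block start
    it = next;
  }
}
```

Note that repeatedly splicing to `begin()` reverses the relative order of the preloads themselves; if their original order mattered, the insertion point would have to advance instead.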
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
new file mode 100644 (file)
index 0000000..4d6a1bd
--- /dev/null
@@ -0,0 +1,180 @@
+//===-- AMDGPUTargetMachine.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPU.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILTargetMachine.h"
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "R600KernelParameters.h"
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
+    StringRef CPU, StringRef FS,
+  TargetOptions Options,
+  Reloc::Model RM, CodeModel::Model CM,
+  CodeGenOpt::Level OptLevel
+)
+:
+  AMDILTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel),
+  Subtarget(TT, CPU, FS),
+  mGM(new AMDILGlobalManager(0 /* Debug mode */)),
+  mKM(new AMDILKernelManager(this, mGM)),
+  mDump(false)
+
+{
+  /* XXX: Add these two initializations to fix a segfault, not sure if this
+   * is correct.  These are normally initialized in the AsmPrinter, but AMDGPU
+   * does not use the asm printer */
+  Subtarget.setGlobalManager(mGM);
+  Subtarget.setKernelManager(mKM);
+  /* TLInfo uses InstrInfo so it must be initialized after. */
+  if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    InstrInfo = new R600InstrInfo(*this);
+    TLInfo = new R600TargetLowering(*this);
+  } else {
+    InstrInfo = new SIInstrInfo(*this);
+    TLInfo = new SITargetLowering(*this);
+  }
+}
+
+AMDGPUTargetMachine::~AMDGPUTargetMachine()
+{
+    delete mGM;
+    delete mKM;
+}
+
+bool AMDGPUTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
+                                              formatted_raw_ostream &Out,
+                                              CodeGenFileType FileType,
+                                              bool DisableVerify) {
+  /* XXX: Hack: addPassesToEmitFile will fail, but this is OK since we are
+   * only using it to access addPassesToGenerateCode() */
+  bool fail = LLVMTargetMachine::addPassesToEmitFile(PM, Out, FileType,
+                                                     DisableVerify);
+  assert(fail);
+
+  const AMDILSubtarget &STM = getSubtarget<AMDILSubtarget>();
+  std::string gpu = STM.getDeviceName();
+  if (gpu == "SI") {
+    PM.add(createSICodeEmitterPass(Out));
+  } else if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    PM.add(createR600CodeEmitterPass(Out));
+  } else {
+    abort();
+    return true;
+  }
+  PM.add(createGCInfoDeleter());
+
+  return false;
+}
+
+namespace {
+class AMDGPUPassConfig : public TargetPassConfig {
+public:
+  AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
+    return getTM<AMDGPUTargetMachine>();
+  }
+
+  virtual bool addPreISel();
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreSched2();
+  virtual bool addPreEmitPass();
+};
+} // End of anonymous namespace
+
+TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AMDGPUPassConfig(this, PM);
+}
+
+bool
+AMDGPUPassConfig::addPreISel()
+{
+  const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
+  if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    PM.add(createR600KernelParametersPass(
+                     getAMDGPUTargetMachine().getTargetData()));
+  }
+  return false;
+}
+
+bool AMDGPUPassConfig::addInstSelector() {
+  PM.add(createAMDILBarrierDetect(*TM));
+  PM.add(createAMDILPrintfConvert(*TM));
+  PM.add(createAMDILInlinePass(*TM));
+  PM.add(createAMDILPeepholeOpt(*TM));
+  PM.add(createAMDILISelDag(getAMDGPUTargetMachine()));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreRegAlloc() {
+  const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
+
+  if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) {
+    PM.add(createSIInitMachineFunctionInfoPass(*TM));
+  }
+
+  PM.add(createAMDGPUReorderPreloadInstructionsPass(*TM));
+  if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    PM.add(createR600LowerShaderInstructionsPass(*TM));
+    PM.add(createR600LowerInstructionsPass(*TM));
+  } else {
+    PM.add(createSILowerShaderInstructionsPass(*TM));
+    PM.add(createSIAssignInterpRegsPass(*TM));
+    PM.add(createSIConvertToISAPass(*TM));
+  }
+  PM.add(createAMDGPUConvertToISAPass(*TM));
+  return false;
+}
+
+bool AMDGPUPassConfig::addPostRegAlloc() {
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreSched2() {
+  return false;
+}
+
+bool AMDGPUPassConfig::addPreEmitPass() {
+  const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
+  PM.add(createAMDILCFGPreparationPass(*TM));
+  PM.add(createAMDILCFGStructurizerPass(*TM));
+  if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) {
+    PM.add(createSIPropagateImmReadsPass(*TM));
+  }
+
+  PM.add(createAMDILIOExpansion(*TM));
+  return false;
+}
+
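`addPassesToEmitFile` above picks the code emitter from the device name and generation: "SI" parts get the SI emitter, pre-SI (`<= HD6XXX`) parts get the R600 emitter, and anything else aborts. A hedged sketch of that dispatch in isolation (the enum values and `pickCodeEmitter` helper are illustrative, not the real `AMDILDeviceInfo` API):

```cpp
#include <cassert>
#include <string>

// Illustrative generation enum; the real values live in AMDILDeviceInfo.
enum Generation { HD4XXX = 4, HD5XXX, HD6XXX, HD7XXX };

// Mirrors the emitter selection in AMDGPUTargetMachine::addPassesToEmitFile:
// the SI device name wins first, then the generation check.
std::string pickCodeEmitter(const std::string &gpu, Generation gen) {
  if (gpu == "SI")
    return "SICodeEmitter";
  if (gen <= HD6XXX)
    return "R600CodeEmitter";
  return "unsupported"; // the driver abort()s on this path
}
```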
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
new file mode 100644 (file)
index 0000000..d4165b0
--- /dev/null
@@ -0,0 +1,62 @@
+//===-- AMDGPUTargetMachine.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_TARGET_MACHINE_H
+#define AMDGPU_TARGET_MACHINE_H
+
+#include "AMDGPUInstrInfo.h"
+#include "AMDILTargetMachine.h"
+#include "R600ISelLowering.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
+
+class AMDGPUTargetMachine : public AMDILTargetMachine {
+  AMDILSubtarget Subtarget;
+  const AMDGPUInstrInfo * InstrInfo;
+  AMDGPUTargetLowering * TLInfo;
+  AMDILGlobalManager *mGM;
+  AMDILKernelManager *mKM;
+  bool mDump;
+
+public:
+   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS,
+                       TargetOptions Options,
+                       Reloc::Model RM, CodeModel::Model CM,
+                       CodeGenOpt::Level OL);
+   ~AMDGPUTargetMachine();
+   virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
+   virtual const AMDILSubtarget *getSubtargetImpl() const {return &Subtarget; }
+   virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+      return &InstrInfo->getRegisterInfo();
+   }
+   virtual AMDGPUTargetLowering * getTargetLowering() const {
+      return TLInfo;
+   }
+   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+   virtual bool addPassesToEmitFile(PassManagerBase &PM,
+                                              formatted_raw_ostream &Out,
+                                              CodeGenFileType FileType,
+                                              bool DisableVerify);
+public:
+   void dumpCode() { mDump = true; }
+   bool shouldDumpCode() const { return mDump; }
+};
+
+} /* End namespace llvm */
+
+#endif /* AMDGPU_TARGET_MACHINE_H */
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.cpp b/src/gallium/drivers/radeon/AMDGPUUtil.cpp
new file mode 100644 (file)
index 0000000..d24b980
--- /dev/null
@@ -0,0 +1,127 @@
+//===-- AMDGPUUtil.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUUtil.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDIL.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+/* Some instructions act as placeholders to emulate operations that the GPU
+ * hardware does automatically. This function can be used to check if
+ * an opcode falls into this category. */
+bool llvm::isPlaceHolderOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  default: return false;
+  case AMDIL::EXPORT_REG:
+  case AMDIL::RETURN:
+  case AMDIL::LOAD_INPUT:
+  case AMDIL::LAST:
+  case AMDIL::RESERVE_REG:
+    return true;
+  }
+}
+
+bool llvm::isTransOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+
+    case AMDIL::COS_f32:
+    case AMDIL::COS_r600:
+    case AMDIL::COS_eg:
+    case AMDIL::RSQ_f32:
+    case AMDIL::FTOI:
+    case AMDIL::ITOF:
+    case AMDIL::MULLIT:
+    case AMDIL::MUL_LIT_r600:
+    case AMDIL::MUL_LIT_eg:
+    case AMDIL::SHR_i32:
+    case AMDIL::SIN_f32:
+    case AMDIL::EXP_f32:
+    case AMDIL::EXP_IEEE_r600:
+    case AMDIL::EXP_IEEE_eg:
+    case AMDIL::LOG_CLAMPED_r600:
+    case AMDIL::LOG_IEEE_r600:
+    case AMDIL::LOG_CLAMPED_eg:
+    case AMDIL::LOG_IEEE_eg:
+    case AMDIL::LOG_f32:
+      return true;
+  }
+}
+
+bool llvm::isTexOp(unsigned opcode)
+{
+  switch(opcode) {
+  default: return false;
+  case AMDIL::TEX_SAMPLE:
+  case AMDIL::TEX_SAMPLE_C:
+  case AMDIL::TEX_SAMPLE_L:
+  case AMDIL::TEX_SAMPLE_C_L:
+  case AMDIL::TEX_SAMPLE_LB:
+  case AMDIL::TEX_SAMPLE_C_LB:
+  case AMDIL::TEX_SAMPLE_G:
+  case AMDIL::TEX_SAMPLE_C_G:
+    return true;
+  }
+}
+
+bool llvm::isReductionOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+    case AMDIL::DOT4_r600:
+    case AMDIL::DOT4_eg:
+      return true;
+  }
+}
+
+bool llvm::isFCOp(unsigned opcode)
+{
+  switch(opcode) {
+  default: return false;
+  case AMDIL::BREAK_LOGICALZ_f32:
+  case AMDIL::BREAK_LOGICALNZ_i32:
+  case AMDIL::BREAK_LOGICALZ_i32:
+  case AMDIL::CONTINUE_LOGICALNZ_f32:
+  case AMDIL::IF_LOGICALNZ_i32:
+  case AMDIL::IF_LOGICALZ_f32:
+       case AMDIL::ELSE:
+  case AMDIL::ENDIF:
+  case AMDIL::ENDLOOP:
+  case AMDIL::IF_LOGICALNZ_f32:
+  case AMDIL::WHILELOOP:
+    return true;
+  }
+}
+
+void AMDGPU::utilAddLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
+    const struct TargetInstrInfo * TII, unsigned physReg, unsigned virtReg)
+{
+    if (!MRI.isLiveIn(physReg)) {
+      MRI.addLiveIn(physReg, virtReg);
+      BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
+                           TII->get(TargetOpcode::COPY), virtReg)
+            .addReg(physReg);
+    } else {
+      MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
+    }
+}
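`utilAddLiveIn` above binds a physical register to a virtual register at most once: the first call records the live-in and emits a COPY at function entry, and later calls for the same physical register reuse the existing virtual register instead. A small standalone model of that bookkeeping (the `std::map` stands in for `MachineRegisterInfo`'s live-in table; names are hypothetical):

```cpp
#include <cassert>
#include <map>

// Returns the virtual register actually bound to physReg. On the first call
// for a physical register the requested vreg is recorded (conceptually, a
// COPY is emitted); afterwards the recorded vreg is reused, matching the
// MRI.getLiveInVirtReg(physReg) branch in utilAddLiveIn.
unsigned addLiveIn(std::map<unsigned, unsigned> &liveIns,
                   unsigned physReg, unsigned virtReg) {
  auto it = liveIns.find(physReg);
  if (it == liveIns.end()) {
    liveIns[physReg] = virtReg; // first use: record the binding
    return virtReg;
  }
  return it->second;            // already live-in: reuse the prior vreg
}
```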
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.h b/src/gallium/drivers/radeon/AMDGPUUtil.h
new file mode 100644 (file)
index 0000000..299146e
--- /dev/null
@@ -0,0 +1,49 @@
+//===-- AMDGPUUtil.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_UTIL_H
+#define AMDGPU_UTIL_H
+
+#include "AMDGPURegisterInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AMDILMachineFunctionInfo;
+
+class TargetMachine;
+class TargetRegisterInfo;
+
+bool isPlaceHolderOpcode(unsigned opcode);
+
+bool isTransOp(unsigned opcode);
+bool isTexOp(unsigned opcode);
+bool isReductionOp(unsigned opcode);
+bool isFCOp(unsigned opcode);
+
+/* XXX: Move these to AMDGPUInstrInfo.h */
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG   (1 << 1)
+#define MO_FLAG_ABS   (1 << 2)
+#define MO_FLAG_MASK  (1 << 3)
+
+} /* End namespace llvm */
+
+namespace AMDGPU {
+
+void utilAddLiveIn(llvm::MachineFunction * MF, llvm::MachineRegisterInfo & MRI,
+    const struct llvm::TargetInstrInfo * TII, unsigned physReg, unsigned virtReg);
+
+} // End namespace AMDGPU
+
+#endif /* AMDGPU_UTIL_H */
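The `MO_FLAG_*` macros in the header above are one-hot bits OR'd into a machine-operand flag word. A minimal sketch of setting and testing them (the `hasFlag` helper is illustrative, not part of the driver):

```cpp
#include <cassert>

// Same one-hot layout as the MO_FLAG_* macros in AMDGPUUtil.h.
constexpr unsigned MO_FLAG_CLAMP = 1u << 0;
constexpr unsigned MO_FLAG_NEG   = 1u << 1;
constexpr unsigned MO_FLAG_ABS   = 1u << 2;
constexpr unsigned MO_FLAG_MASK  = 1u << 3;

// Flags compose with |, and membership is a simple mask test.
inline bool hasFlag(unsigned flags, unsigned f) { return (flags & f) != 0; }
```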
diff --git a/src/gallium/drivers/radeon/AMDIL.h b/src/gallium/drivers/radeon/AMDIL.h
new file mode 100644 (file)
index 0000000..cc6590c
--- /dev/null
@@ -0,0 +1,292 @@
+//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// AMDIL back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDIL_H_
+#define AMDIL_H_
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define AMDIL_MAJOR_VERSION 2
+#define AMDIL_MINOR_VERSION 0
+#define AMDIL_REVISION_NUMBER 74
+#define ARENA_SEGMENT_RESERVED_UAVS 12
+#define DEFAULT_ARENA_UAV_ID 8
+#define DEFAULT_RAW_UAV_ID 7
+#define GLOBAL_RETURN_RAW_UAV_ID 11
+#define HW_MAX_NUM_CB 8
+#define MAX_NUM_UNIQUE_UAVS 8
+#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
+#define OPENCL_MAX_READ_IMAGES 128
+#define OPENCL_MAX_WRITE_IMAGES 8
+#define OPENCL_MAX_SAMPLERS 16
+
+// These IDs can never be zero, as zero is the ID that is
+// used to assert against.
+#define DEFAULT_LDS_ID     1
+#define DEFAULT_GDS_ID     1
+#define DEFAULT_SCRATCH_ID 1
+#define DEFAULT_VEC_SLOTS  8
+
+// SC->CAL version matchings.
+#define CAL_VERSION_SC_150               1700
+#define CAL_VERSION_SC_149               1700
+#define CAL_VERSION_SC_148               1525
+#define CAL_VERSION_SC_147               1525
+#define CAL_VERSION_SC_146               1525
+#define CAL_VERSION_SC_145               1451
+#define CAL_VERSION_SC_144               1451
+#define CAL_VERSION_SC_143               1441
+#define CAL_VERSION_SC_142               1441
+#define CAL_VERSION_SC_141               1420
+#define CAL_VERSION_SC_140               1400
+#define CAL_VERSION_SC_139               1387
+#define CAL_VERSION_SC_138               1387
+#define CAL_APPEND_BUFFER_SUPPORT        1340
+#define CAL_VERSION_SC_137               1331
+#define CAL_VERSION_SC_136                982
+#define CAL_VERSION_SC_135                950
+#define CAL_VERSION_GLOBAL_RETURN_BUFFER  990
+
+#define OCL_DEVICE_RV710        0x0001
+#define OCL_DEVICE_RV730        0x0002
+#define OCL_DEVICE_RV770        0x0004
+#define OCL_DEVICE_CEDAR        0x0008
+#define OCL_DEVICE_REDWOOD      0x0010
+#define OCL_DEVICE_JUNIPER      0x0020
+#define OCL_DEVICE_CYPRESS      0x0040
+#define OCL_DEVICE_CAICOS       0x0080
+#define OCL_DEVICE_TURKS        0x0100
+#define OCL_DEVICE_BARTS        0x0200
+#define OCL_DEVICE_CAYMAN       0x0400
+#define OCL_DEVICE_ALL          0x3FFF
+
+/// The number of function IDs that are reserved for
+/// internal compiler usage.
+const unsigned int RESERVED_FUNCS = 1024;
+
+#define AMDIL_OPT_LEVEL_DECL
+#define  AMDIL_OPT_LEVEL_VAR
+#define AMDIL_OPT_LEVEL_VAR_NO_COMMA
+
+namespace llvm {
+class AMDILInstrPrinter;
+class AMDILTargetMachine;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
+
+/// Instruction selection passes.
+FunctionPass*
+  createAMDILISelDag(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILPrintfConvert(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILInlinePass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILPeepholeOpt(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+
+/// Pre regalloc passes.
+FunctionPass*
+  createAMDILPointerManager(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILMachinePeephole(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+
+/// Pre emit passes.
+FunctionPass*
+  createAMDILCFGPreparationPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILCFGStructurizerPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILLiteralManager(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+FunctionPass*
+  createAMDILIOExpansion(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+
+extern Target TheAMDILTarget;
+extern Target TheAMDGPUTarget;
+} // end namespace llvm
+
+#define GET_REGINFO_ENUM
+#include "AMDILGenRegisterInfo.inc"
+#define GET_INSTRINFO_ENUM
+#include "AMDILGenInstrInfo.inc"
+
+/// Include device information enumerations
+#include "AMDILDeviceInfo.h"
+
+namespace llvm {
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a separate piece of memory that is unique from other
+/// memory locations.
+namespace AMDILAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, // Address space for private memory.
+  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, // Address space for constant memory.
+  LOCAL_ADDRESS    = 3, // Address space for local memory.
+  REGION_ADDRESS   = 4, // Address space for region memory.
+  ADDRESS_NONE     = 5, // Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, // Address space for directly addressable parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, // Address space for indirectly addressable parameter memory (VTX1)
+  LAST_ADDRESS     = 8
+};
+
+// We are piggybacking on the CommentFlag enum in MachineInstr.h to
+// set bits in AsmPrinterFlags of the MachineInstruction. We will
+// start at bit 16 and allocate down while LLVM will start at bit
+// 1 and allocate up.
+
+// This union/struct combination is an easy way to read out the
+// exact bits that are needed.
+typedef union ResourceRec {
+  struct {
+#ifdef __BIG_ENDIAN__
+    unsigned short isImage       : 1;  // Reserved for future use/llvm.
+    unsigned short ResourceID    : 10; // Flag to specify the resource ID for
+                                       // the op.
+    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
+                                       // is a hardware instruction.
+    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has a
+                                       // conflict.
+    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
+                                       // store op.
+    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
+                                       // pointer path.
+    unsigned short CacheableRead : 1;  // Flag to specify if the read is
+                                       // cacheable.
+#else
+    unsigned short CacheableRead : 1;  // Flag to specify if the read is
+                                       // cacheable.
+    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
+                                       // pointer path.
+    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
+                                       // store op.
+    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has
+                                       // a conflict.
+    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
+                                       // is a hardware instruction.
+    unsigned short ResourceID    : 10; // Flag to specify the resource ID for
+                                       // the op.
+    unsigned short isImage       : 1;  // Reserved for future use.
+#endif
+  } bits;
+  unsigned short u16all;
+} InstrResEnc;
+
+} // namespace AMDILAS
+
+// The OpSwizzle encodes a subset of all possible
+// swizzle combinations into a number of bits using
+// only the combinations utilized by the backend.
+// The lower 128 are for source swizzles and the
+// upper 128 are for destination swizzles.
+// The valid mappings can be found in the
+// getSrcSwizzle and getDstSwizzle functions of
+// AMDILUtilityFunctions.cpp.
+typedef union SwizzleRec {
+  struct {
+#ifdef __BIG_ENDIAN__
+    unsigned char dst : 1;
+    unsigned char swizzle : 7;
+#else
+    unsigned char swizzle : 7;
+    unsigned char dst : 1;
+#endif
+  } bits;
+  unsigned char u8all;
+} OpSwizzle;
+// Enums corresponding to AMDIL condition codes for IL.  These
+// values must be kept in sync with the ones in the .td file.
+namespace AMDILCC {
+enum CondCodes {
+  // AMDIL specific condition codes. These correspond to the IL_CC_*
+  // in AMDILInstrInfo.td and must be kept in the same order.
+  IL_CC_D_EQ  =  0,   // DEQ instruction.
+  IL_CC_D_GE  =  1,   // DGE instruction.
+  IL_CC_D_LT  =  2,   // DLT instruction.
+  IL_CC_D_NE  =  3,   // DNE instruction.
+  IL_CC_F_EQ  =  4,   //  EQ instruction.
+  IL_CC_F_GE  =  5,   //  GE instruction.
+  IL_CC_F_LT  =  6,   //  LT instruction.
+  IL_CC_F_NE  =  7,   //  NE instruction.
+  IL_CC_I_EQ  =  8,   // IEQ instruction.
+  IL_CC_I_GE  =  9,   // IGE instruction.
+  IL_CC_I_LT  = 10,   // ILT instruction.
+  IL_CC_I_NE  = 11,   // INE instruction.
+  IL_CC_U_GE  = 12,   // UGE instruction.
+  IL_CC_U_LT  = 13,   // ULT instruction.
+  // Pseudo IL Comparison instructions here.
+  IL_CC_F_GT  = 14,   //  GT instruction.
+  IL_CC_U_GT  = 15,
+  IL_CC_I_GT  = 16,
+  IL_CC_D_GT  = 17,
+  IL_CC_F_LE  = 18,   //  LE instruction
+  IL_CC_U_LE  = 19,
+  IL_CC_I_LE  = 20,
+  IL_CC_D_LE  = 21,
+  IL_CC_F_UNE = 22,
+  IL_CC_F_UEQ = 23,
+  IL_CC_F_ULT = 24,
+  IL_CC_F_UGT = 25,
+  IL_CC_F_ULE = 26,
+  IL_CC_F_UGE = 27,
+  IL_CC_F_ONE = 28,
+  IL_CC_F_OEQ = 29,
+  IL_CC_F_OLT = 30,
+  IL_CC_F_OGT = 31,
+  IL_CC_F_OLE = 32,
+  IL_CC_F_OGE = 33,
+  IL_CC_D_UNE = 34,
+  IL_CC_D_UEQ = 35,
+  IL_CC_D_ULT = 36,
+  IL_CC_D_UGT = 37,
+  IL_CC_D_ULE = 38,
+  IL_CC_D_UGE = 39,
+  IL_CC_D_ONE = 40,
+  IL_CC_D_OEQ = 41,
+  IL_CC_D_OLT = 42,
+  IL_CC_D_OGT = 43,
+  IL_CC_D_OLE = 44,
+  IL_CC_D_OGE = 45,
+  IL_CC_U_EQ  = 46,
+  IL_CC_U_NE  = 47,
+  IL_CC_F_O   = 48,
+  IL_CC_D_O   = 49,
+  IL_CC_F_UO  = 50,
+  IL_CC_D_UO  = 51,
+  IL_CC_L_LE  = 52,
+  IL_CC_L_GE  = 53,
+  IL_CC_L_EQ  = 54,
+  IL_CC_L_NE  = 55,
+  IL_CC_L_LT  = 56,
+  IL_CC_L_GT  = 57,
+  IL_CC_UL_LE = 58,
+  IL_CC_UL_GE = 59,
+  IL_CC_UL_EQ = 60,
+  IL_CC_UL_NE = 61,
+  IL_CC_UL_LT = 62,
+  IL_CC_UL_GT = 63,
+  COND_ERROR  = 64
+};
+
+} // end namespace AMDILCC
+} // end namespace llvm
+#endif // AMDIL_H_
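The `SwizzleRec`/`ResourceRec` unions in AMDIL.h pack bitfields and read them back through a single integer (`u8all`/`u16all`). A minimal little-endian sketch of that round-trip (assuming GCC/Clang bitfield layout, where the first member occupies the low bits; type punning through a union is a compiler-specific extension in C++ that these compilers support):

```cpp
#include <cassert>

// Same shape as the little-endian branch of SwizzleRec in AMDIL.h:
// 7 swizzle bits and a 1-bit src/dst selector share one byte.
typedef union SwizzleRec {
  struct {
    unsigned char swizzle : 7; // low 7 bits on little-endian GCC/Clang
    unsigned char dst : 1;     // bit 7
  } bits;
  unsigned char u8all;         // whole byte view of the same storage
} OpSwizzle;
```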
diff --git a/src/gallium/drivers/radeon/AMDIL.td b/src/gallium/drivers/radeon/AMDIL.td
new file mode 100644 (file)
index 0000000..9bcccac
--- /dev/null
@@ -0,0 +1,19 @@
+//===-- AMDIL.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// This file specifies where the base TD file exists
+// and where the version specific TD file exists.
+include "AMDILBase.td"
+include "AMDILVersion.td"
+
+include "R600Schedule.td"
+include "SISchedule.td"
+include "Processors.td"
+include "AMDGPUIntrinsics.td"
+include "AMDGPURegisterInfo.td"
+include "AMDGPUInstructions.td"
diff --git a/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp b/src/gallium/drivers/radeon/AMDIL789IOExpansion.cpp
new file mode 100644 (file)
index 0000000..cf5afb9
--- /dev/null
@@ -0,0 +1,723 @@
+//===-- AMDIL789IOExpansion.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// @file AMDIL789IOExpansion.cpp
+// @details Implementation of the IO expansion class for 789 devices.
+//
+#include "AMDILCompilerErrors.h"
+#include "AMDILCompilerWarnings.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILIOExpansion.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+AMDIL789IOExpansion::AMDIL789IOExpansion(TargetMachine &tm
+    AMDIL_OPT_LEVEL_DECL) 
+: AMDILIOExpansion(tm  AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+AMDIL789IOExpansion::~AMDIL789IOExpansion() {
+}
+
+const char *AMDIL789IOExpansion::getPassName() const
+{
+  return "AMDIL 789 IO Expansion Pass";
+}
+// This code produces the following pseudo-IL:
+// mov r1007, $src.y000
+// cmov_logical r1007.x___, $flag.yyyy, r1007.xxxx, $src.xxxx
+// mov r1006, $src.z000
+// cmov_logical r1007.x___, $flag.zzzz, r1006.xxxx, r1007.xxxx
+// mov r1006, $src.w000
+// cmov_logical $dst.x___, $flag.wwww, r1006.xxxx, r1007.xxxx
+void
+AMDIL789IOExpansion::emitComponentExtract(MachineInstr *MI, 
+    unsigned flag, unsigned src, unsigned dst, bool before)
+{
+  MachineBasicBlock::iterator I = *MI;
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+    .addReg(src)
+    .addImm(2);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
+    .addReg(flag)
+    .addReg(AMDIL::R1007)
+    .addReg(src);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
+    .addReg(src)
+    .addImm(3);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1007)
+    .addReg(flag)
+    .addReg(AMDIL::R1006)
+    .addReg(AMDIL::R1007);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
+    .addReg(src)
+    .addImm(4);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_W_i32), dst)
+    .addReg(flag)
+    .addReg(AMDIL::R1006)
+    .addReg(AMDIL::R1007);
+
+}
+// We have a 128-bit load but an 8/16/32-bit value, so we need to
+// select the correct component and make sure that the correct
+// bits are selected. For the 8 and 16 bit cases we need to
+// extract from the component the correct bits and for 32 bits
+// we just need to select the correct component.
+  void
+AMDIL789IOExpansion::emitDataLoadSelect(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  DebugLoc DL = MI->getDebugLoc();
+  emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false);
+  if (getMemorySize(MI) == 1) {
+    // This produces the following pseudo-IL:
+    // iand r1006.x___, r1010.xxxx, l14.xxxx
+    // mov r1006, r1006.xxxx
+    // iadd r1006, r1006, {0, -1, 2, 3}
+    // ieq r1008, r1006, 0
+    // mov r1011, r1011.xxxx
+    // ishr r1011, r1011, {0, 8, 16, 24}
+    // mov r1007, r1011.y000
+    // cmov_logical r1007.x___, r1008.yyyy, r1007.xxxx, r1011.xxxx
+    // mov r1006, r1011.z000
+    // cmov_logical r1007.x___, r1008.zzzz, r1006.xxxx, r1007.xxxx
+    // mov r1006, r1011.w000
+    // cmov_logical r1011.x___, r1008.wwww, r1006.xxxx, r1007.xxxx
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1006)
+      .addReg(AMDIL::R1010)
+      .addImm(mMFI->addi32Literal(3));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1006)
+      .addReg(AMDIL::R1006);
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1006)
+      .addReg(AMDIL::R1006)
+      .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+            (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008)
+      .addReg(AMDIL::R1006)
+      .addImm(mMFI->addi32Literal(0));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011)
+      .addReg(AMDIL::R1011);
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1011)
+      .addReg(AMDIL::R1011)
+      .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+    emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false);
+  } else if (getMemorySize(MI) == 2) {
+    // This produces the following pseudo-IL:
+    // ishr r1007.x___, r1010.xxxx, 1
+    // iand r1008.x___, r1007.xxxx, 1
+    // ishr r1007.x___, r1011.xxxx, 16
+    // cmov_logical r1011.x___, r1008.xxxx, r1007.xxxx, r1011.xxxx
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+      .addReg(AMDIL::R1010)
+      .addImm(mMFI->addi32Literal(1));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+      .addReg(AMDIL::R1007)
+      .addImm(mMFI->addi32Literal(1));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+      .addReg(AMDIL::R1011)
+      .addImm(mMFI->addi32Literal(16));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1011)
+      .addReg(AMDIL::R1008)
+      .addReg(AMDIL::R1007)
+      .addReg(AMDIL::R1011);
+  }
+}
+// This function modifies the address calculation so the value is loaded from
+// a vector register type instead of through a dword-addressed load.
+  void 
+AMDIL789IOExpansion::emitVectorAddressCalc(MachineInstr *MI, bool is32bit, bool needsSelect)
+{
+  MachineBasicBlock::iterator I = *MI;
+  DebugLoc DL = MI->getDebugLoc();
+  // This produces the following pseudo-IL:
+  // ishr r1007.x___, r1010.xxxx, (is32bit) ? 2 : 3
+  // iand r1008.x___, r1007.xxxx, (is32bit) ? 3 : 1
+  // ishr r1007.x___, r1007.xxxx, (is32bit) ? 2 : 1
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+    .addReg(AMDIL::R1010)
+    .addImm(mMFI->addi32Literal((is32bit) ? 2 : 3));
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+    .addReg(AMDIL::R1007)
+    .addImm(mMFI->addi32Literal((is32bit) ? 3 : 1));
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007)
+    .addReg(AMDIL::R1007)
+    .addImm(mMFI->addi32Literal((is32bit) ? 2 : 1));
+  if (needsSelect) {
+    // If component selection is required, the following
+    // pseudo-IL is produced:
+    // mov r1008, r1008.xxxx
+    // iadd r1008, r1008, (is32bit) ? {0, -1, -2, -3} : {0, 0, -1, -1}
+    // ieq r1008, r1008, 0
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+      .addReg(AMDIL::R1008);
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+      .addReg(AMDIL::R1008)
+      .addImm(mMFI->addi128Literal((is32bit) ? 0xFFFFFFFFULL << 32 : 0ULL,  
+            (is32bit) ? 0xFFFFFFFEULL | (0xFFFFFFFDULL << 32) :
+            -1ULL));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008)
+      .addReg(AMDIL::R1008)
+      .addImm(mMFI->addi32Literal(0));
+  }
+}
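Outside the compiler, the address split performed by emitVectorAddressCalc can be checked with plain integer arithmetic. This sketch restates the shift/mask sequence above (variable names are mine): a byte address maps to an element index, a component slot within the 128-bit register, and a row index into x1[] that works out to addr >> 4 in both modes.

```python
def vector_addr_calc(addr, is32bit):
    """ishr r1007, addr, (2 or 3); iand r1008, r1007, (3 or 1);
    ishr r1007, r1007, (2 or 1)."""
    elem = addr >> (2 if is32bit else 3)       # dword or qword element index
    component = elem & (3 if is32bit else 1)   # slot within the 128-bit register
    row = elem >> (2 if is32bit else 1)        # 128-bit row index (== addr >> 4)
    return row, component
```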
+// This function emits a switch statement and writes a 32-bit/64-bit
+// value into a 128-bit vector register type.
+  void
+AMDIL789IOExpansion::emitVectorSwitchWrite(MachineInstr *MI, bool is32bit)
+{
+  MachineBasicBlock::iterator I = *MI;
+  uint32_t xID = getPointerID(MI);
+  assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n");
+  // This section generates the following pseudo-IL:
+  // switch r1008.x
+  // default
+  //   mov x1[r1007.x].(is32bit) ? x___ : xy__, r1011.x{y}
+  // break
+  // case 1
+  //   mov x1[r1007.x].(is32bit) ? _y__ : __zw, r1011.x{yxy}
+  // break
+  // If is32bit is true, cases 2 and 3 are also emitted.
+  // case 2
+  //   mov x1[r1007.x].__z_, r1011.x
+  // break
+  // case 3
+  //   mov x1[r1007.x].___w, r1011.x
+  // break
+  // endswitch
+  DebugLoc DL;
+  BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::SWITCH))
+    .addReg(AMDIL::R1008);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::DEFAULT));
+  BuildMI(*mBB, I, DL,
+      mTII->get((is32bit) ? AMDIL::SCRATCHSTORE_X : AMDIL::SCRATCHSTORE_XY)
+      , AMDIL::R1007)
+    .addReg(AMDIL::R1011)
+    .addImm(xID);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(1);
+  BuildMI(*mBB, I, DL,
+      mTII->get((is32bit) ? AMDIL::SCRATCHSTORE_Y : AMDIL::SCRATCHSTORE_ZW), AMDIL::R1007)
+    .addReg(AMDIL::R1011)
+    .addImm(xID);
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+  if (is32bit) {
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(2);
+    BuildMI(*mBB, I, DL,
+        mTII->get(AMDIL::SCRATCHSTORE_Z), AMDIL::R1007)
+      .addReg(AMDIL::R1011)
+      .addImm(xID);
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(3);
+    BuildMI(*mBB, I, DL,
+        mTII->get(AMDIL::SCRATCHSTORE_W), AMDIL::R1007)
+      .addReg(AMDIL::R1011)
+      .addImm(xID);
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK));
+  }
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::ENDSWITCH));
+
+}
+  void
+AMDIL789IOExpansion::expandPrivateLoad(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+  if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t xID = getPointerID(MI);
+  assert(xID && "Found a scratch load that was incorrectly marked as zero ID!\n");
+  if (!xID) {
+    xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      // Since the private register is 128-bit aligned, we have to align
+      // the 32-bit-aligned source address first and then load the data.
+      // This produces the following pseudo-IL:
+      // ishr r1010.x___, r1010.xxxx, 4
+      // mov r1011, x1[r1010.x]
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(4));
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(xID);
+      break;
+    case 1:
+    case 2:
+    case 4:
+      emitVectorAddressCalc(MI, true, true);
+      // This produces the following pseudo-IL:
+      // mov r1011, x1[r1007.x]
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1007)
+        .addImm(xID);
+      // These instructions go after the current MI.
+      emitDataLoadSelect(MI);
+      break;
+    case 8:
+      emitVectorAddressCalc(MI, false, true);
+      // This produces the following pseudo-IL:
+      // mov r1011, x1[r1007.x]
+      // mov r1007, r1011.zw00
+      // cmov_logical r1011.xy__, r1008.xxxx, r1011.xy, r1007.zw
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1007)
+        .addImm(xID);
+      // These instructions go after the current MI.
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007)
+        .addReg(AMDIL::R1011)
+        .addImm(2);
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1007);
+      break;
+  }
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, I, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)),
+      MI->getOperand(0).getReg())
+    .addReg(AMDIL::R1011);
+}
+
+
+  void
+AMDIL789IOExpansion::expandConstantLoad(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  if (!isHardwareInst(MI) || MI->memoperands_empty()) {
+    return expandGlobalLoad(MI);
+  }
+  uint32_t cID = getPointerID(MI);
+  if (cID < 2) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::CONSTANT_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(4));
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(cID);
+      break;
+    case 1:
+    case 2:
+    case 4:
+      emitVectorAddressCalc(MI, true, true);
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1007)
+        .addImm(cID);
+      // These instructions go after the current MI.
+      emitDataLoadSelect(MI);
+      break;
+    case 8:
+      emitVectorAddressCalc(MI, false, true);
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::CBLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1007)
+        .addImm(cID);
+      // These instructions go after the current MI.
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007)
+        .addReg(AMDIL::R1011)
+        .addImm(2);
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1007);
+      break;
+  }
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, I, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)),
+      MI->getOperand(0).getReg())
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+
+  void
+AMDIL789IOExpansion::expandConstantPoolLoad(MachineInstr *MI)
+{
+  if (!isStaticCPLoad(MI)) {
+    return expandConstantLoad(MI);
+  } else {
+    uint32_t idx = MI->getOperand(1).getIndex();
+    const MachineConstantPool *MCP = MI->getParent()->getParent()
+      ->getConstantPool();
+    const std::vector<MachineConstantPoolEntry> &consts
+      = MCP->getConstants();
+    const Constant *C = consts[idx].Val.ConstVal;
+    emitCPInst(MI, C, mKM, 0, isExtendLoad(MI));
+  }
+}
+
+  void
+AMDIL789IOExpansion::expandPrivateStore(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+  if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) {
+    return expandGlobalStore(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t xID = getPointerID(MI);
+  assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n");
+  if (!xID) {
+    xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandStoreSetupCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      // This section generates the following pseudo-IL:
+      // ishr r1010.x___, r1010.xxxx, 4
+      // mov x1[r1010.x], r1011
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SHR_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(4));
+      BuildMI(*mBB, I, MI->getDebugLoc(),
+          mTII->get(AMDIL::SCRATCHSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(xID);
+      break;
+    case 1:
+      emitVectorAddressCalc(MI, true, true);
+      // This section generates the following pseudo-IL:
+      // mov r1002, x1[r1007.x]
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002)
+        .addReg(AMDIL::R1007)
+        .addImm(xID);
+      emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true);
+      // This section generates the following pseudo-IL:
+      // iand r1003.x, r1010.x, 3
+      // mov r1003, r1003.xxxx
+      // iadd r1000, r1003, {0, -1, -2, -3}
+      // ieq r1000, r1000, 0
+      // mov r1002, r1002.xxxx
+      // ishr r1002, r1002, {0, 8, 16, 24}
+      // mov r1011, r1011.xxxx
+      // cmov_logical r1002, r1000, r1011, r1002
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1003)
+        .addReg(AMDIL::R1003);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1001)
+        .addReg(AMDIL::R1003)
+        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+              (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1001)
+        .addReg(AMDIL::R1001)
+        .addImm(mMFI->addi32Literal(0));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1002)
+        .addReg(AMDIL::R1002);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1002)
+      .addReg(AMDIL::R1002)
+      .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_v4i32), AMDIL::R1002)
+        .addReg(AMDIL::R1001)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1002);
+      if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+        // This section generates the following pseudo-IL:
+        // iand r1002, r1002, 0xFF
+        // ishl r1002, r1002, {0, 8, 16, 24}
+        // ior r1002.xy, r1002.xy, r1002.zw
+        // ior r1011.x, r1002.x, r1002.y
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1002)
+          .addReg(AMDIL::R1002)
+          .addImm(mMFI->addi32Literal(0xFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1002)
+          .addReg(AMDIL::R1002)
+          .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32)));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1002)
+          .addReg(AMDIL::R1002).addReg(AMDIL::R1002);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1002).addReg(AMDIL::R1002);
+      } else {
+        // This section generates the following pseudo-IL:
+        // mov r1001.xy, r1002.yw
+        // mov r1002.xy, r1002.xz
+        // ubit_insert r1002.xy, 8, 8, r1001.xy, r1002.xy
+        // mov r1001.x, r1002.y
+        // ubit_insert r1011.x, 16, 16, r1002.y, r1002.x
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1001)
+          .addReg(AMDIL::R1002);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1002)
+          .addReg(AMDIL::R1002);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), AMDIL::R1002)
+          .addImm(mMFI->addi32Literal(8))
+          .addImm(mMFI->addi32Literal(8))
+          .addReg(AMDIL::R1001)
+          .addReg(AMDIL::R1002);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1001)
+          .addReg(AMDIL::R1002);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16))
+          .addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1001)
+          .addReg(AMDIL::R1002);
+      }
+      emitVectorAddressCalc(MI, true, false);
+      emitVectorSwitchWrite(MI, true);
+      break;
+    case 2:
+      emitVectorAddressCalc(MI, true, true);
+      // This section generates the following pseudo-IL:
+      // mov r1002, x1[r1007.x]
+      BuildMI(*mBB, I, DL,
+          mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002)
+        .addReg(AMDIL::R1007)
+        .addImm(xID);
+      emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true);
+      // This section generates the following pseudo-IL:
+      // ishr r1003.x, r1010.x, 1
+      // iand r1003.x, r1003.x, 1
+      // ishr r1001.x, r1002.x, 16
+      // cmov_logical r1002.x, r1003.x, r1002.x, r1011.x
+      // cmov_logical r1001.x, r1003.x, r1011.x, r1001.x
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1003)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(1));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003)
+        .addReg(AMDIL::R1003)
+        .addImm(mMFI->addi32Literal(1));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1001)
+        .addReg(AMDIL::R1002)
+        .addImm(mMFI->addi32Literal(16));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1002)
+        .addReg(AMDIL::R1003)
+        .addReg(AMDIL::R1002)
+        .addReg(AMDIL::R1011);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1001)
+        .addReg(AMDIL::R1003)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1001);
+      if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+        // This section generates the following pseudo-IL:
+        // iand r1002.x, r1002.x, 0xFFFF
+        // iand r1001.x, r1001.x, 0xFFFF
+        // ishl r1001.x, r1002.x, 16
+        // ior r1011.x, r1002.x, r1001.x
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1002)
+          .addReg(AMDIL::R1002)
+          .addImm(mMFI->addi32Literal(0xFFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1001)
+          .addReg(AMDIL::R1001)
+          .addImm(mMFI->addi32Literal(0xFFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1001)
+          .addReg(AMDIL::R1001)
+          .addImm(mMFI->addi32Literal(16));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_OR_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1002).addReg(AMDIL::R1001);
+      } else {
+        // This section generates the following pseudo-IL:
+        // ubit_insert r1011.x, 16, 16, r1001.y, r1002.x
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16))
+          .addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1001)
+          .addReg(AMDIL::R1002);
+      }
+      emitVectorAddressCalc(MI, true, false);
+      emitVectorSwitchWrite(MI, true);
+      break;
+    case 4:
+      emitVectorAddressCalc(MI, true, false);
+      emitVectorSwitchWrite(MI, true);
+      break;
+    case 8:
+      emitVectorAddressCalc(MI, false, false);
+      emitVectorSwitchWrite(MI, false);
+      break;
+  };
+}
+  void
+AMDIL789IOExpansion::expandStoreSetupCode(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  DebugLoc DL;
+  if (MI->getOperand(0).isUndef()) {
+    BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID(
+            MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011)
+      .addImm(mMFI->addi32Literal(0));
+  } else {
+    BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID(
+            MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011)
+      .addReg(MI->getOperand(0).getReg());
+  }
+  expandTruncData(MI);
+  if (MI->getOperand(2).isReg()) {
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_i32), AMDIL::R1010)
+      .addReg(MI->getOperand(1).getReg())
+      .addReg(MI->getOperand(2).getReg());
+  } else {
+    BuildMI(*mBB, I, DL, mTII->get(AMDIL::MOVE_i32), AMDIL::R1010)
+      .addReg(MI->getOperand(1).getReg());
+  }
+  expandAddressCalc(MI);
+  expandPackedData(MI);
+}
+
+
+void
+AMDIL789IOExpansion::expandPackedData(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  if (!isPackedData(MI)) {
+    return;
+  }
+  DebugLoc DL;
+  // If we have packed data, then the shift size is no longer
+  // the same as the load size and we need to adjust accordingly
+  switch(getPackedID(MI)) {
+    default:
+      break;
+    case PACK_V2I8:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi64Literal(0xFFULL | (0xFFULL << 32)));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addImm(mMFI->addi64Literal(8ULL << 32));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V4I8:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32))));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V2I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi64Literal(16ULL << 32));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V4I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi64Literal(16ULL << 32));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v4i16), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+      }
+      break;
+    case UNPACK_V2I8:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+        .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+      break;
+    case UNPACK_V4I8:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i8), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v4i8), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32))));
+      }
+      break;
+    case UNPACK_V2I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+      }
+      break;
+    case UNPACK_V4I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v2i32), AMDIL::R1012)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+      }
+      break;
+  };
+}
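The PACK_V2I16 path above masks each 32-bit lane to 16 bits, shifts the second lane left by 16, and ORs the halves together; the UNPACK path reverses it with a right shift. A hedged Python restatement (the two vector lanes are modeled as a pair of ints; the real unpack leaves the low lane unmasked and relies on later truncation):

```python
def pack_v2i16(lane0, lane1):
    """iand with 0xFFFF, shl by {0, 16}, then hilo_bitor."""
    return (lane0 & 0xFFFF) | ((lane1 & 0xFFFF) << 16)

def unpack_v2i16(word):
    """ushr by 16 plus lcreate rebuilds the two 16-bit lanes."""
    return word & 0xFFFF, (word >> 16) & 0xFFFF
```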
diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
new file mode 100644 (file)
index 0000000..df81c44
--- /dev/null
@@ -0,0 +1,157 @@
+//===-- AMDIL7XXDevice.cpp - Implementation of the 7XX device classes ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDIL7XXDevice.h"
+#ifdef UPSTREAM_LLVM
+#include "AMDIL7XXAsmPrinter.h"
+#endif
+#include "AMDILDevice.h"
+#include "AMDILIOExpansion.h"
+#include "AMDILPointerManager.h"
+
+using namespace llvm;
+
+AMDIL7XXDevice::AMDIL7XXDevice(AMDILSubtarget *ST) : AMDILDevice(ST)
+{
+  setCaps();
+  std::string name = mSTM->getDeviceName();
+  if (name == "rv710") {
+    mDeviceFlag = OCL_DEVICE_RV710;
+  } else if (name == "rv730") {
+    mDeviceFlag = OCL_DEVICE_RV730;
+  } else {
+    mDeviceFlag = OCL_DEVICE_RV770;
+  }
+}
+
+AMDIL7XXDevice::~AMDIL7XXDevice()
+{
+}
+
+void AMDIL7XXDevice::setCaps()
+{
+  mSWBits.set(AMDILDeviceInfo::LocalMem);
+}
+
+size_t AMDIL7XXDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_700;
+  }
+  return 0;
+}
+
+size_t AMDIL7XXDevice::getWavefrontSize() const
+{
+  return AMDILDevice::HalfWavefrontSize;
+}
+
+uint32_t AMDIL7XXDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD4XXX;
+}
+
+uint32_t AMDIL7XXDevice::getResourceID(uint32_t DeviceID) const
+{
+  switch (DeviceID) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case GLOBAL_ID:
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+  case ARENA_UAV_ID:
+    break;
+  case LDS_ID:
+    if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    }
+    break;
+  case SCRATCH_ID:
+    if (usesHardware(AMDILDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    }
+    break;
+  case GDS_ID:
+    assert(0 && "GDS UAV ID is not supported on this chip");
+    if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    }
+    break;
+  };
+
+  return 0;
+}
+
+uint32_t AMDIL7XXDevice::getMaxNumUAVs() const
+{
+  return 1;
+}
+
+FunctionPass* 
+AMDIL7XXDevice::getIOExpansion(
+    TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const
+{
+  return new AMDIL7XXIOExpansion(TM  AMDIL_OPT_LEVEL_VAR);
+}
+
+AsmPrinter*
+AMDIL7XXDevice::getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const
+{
+#ifdef UPSTREAM_LLVM
+  return new AMDIL7XXAsmPrinter(TM, Streamer);
+#else
+  return NULL;
+#endif
+}
+
+FunctionPass*
+AMDIL7XXDevice::getPointerManager(
+    TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const
+{
+  return new AMDILPointerManager(TM  AMDIL_OPT_LEVEL_VAR);
+}
+
+AMDIL770Device::AMDIL770Device(AMDILSubtarget *ST): AMDIL7XXDevice(ST)
+{
+  setCaps();
+}
+
+AMDIL770Device::~AMDIL770Device()
+{
+}
+
+void AMDIL770Device::setCaps()
+{
+  if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) {
+    mSWBits.set(AMDILDeviceInfo::FMA);
+    mHWBits.set(AMDILDeviceInfo::DoubleOps);
+  }
+  mSWBits.set(AMDILDeviceInfo::BarrierDetect);
+  mHWBits.reset(AMDILDeviceInfo::LongOps);
+  mSWBits.set(AMDILDeviceInfo::LongOps);
+  mSWBits.set(AMDILDeviceInfo::LocalMem);
+}
+
+size_t AMDIL770Device::getWavefrontSize() const
+{
+  return AMDILDevice::WavefrontSize;
+}
+
+AMDIL710Device::AMDIL710Device(AMDILSubtarget *ST) : AMDIL7XXDevice(ST)
+{
+}
+
+AMDIL710Device::~AMDIL710Device()
+{
+}
+
+size_t AMDIL710Device::getWavefrontSize() const
+{
+  return AMDILDevice::QuarterWavefrontSize;
+}
diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.h b/src/gallium/drivers/radeon/AMDIL7XXDevice.h
new file mode 100644 (file)
index 0000000..87238e9
--- /dev/null
@@ -0,0 +1,77 @@
+//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDIL7XXDEVICEIMPL_H_
+#define _AMDIL7XXDEVICEIMPL_H_
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+class AMDILSubtarget;
+
+//===----------------------------------------------------------------------===//
+// 7XX generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+// The AMDIL7XXDevice class represents the generic 7XX device. All 7XX
+// devices are derived from this class. The AMDIL7XXDevice supports only
+// the minimal features required to be considered OpenCL 1.0 compliant.
+class AMDIL7XXDevice : public AMDILDevice {
+public:
+  AMDIL7XXDevice(AMDILSubtarget *ST);
+  virtual ~AMDIL7XXDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getResourceID(uint32_t DeviceID) const;
+  virtual uint32_t getMaxNumUAVs() const;
+  FunctionPass*
+    getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+  AsmPrinter* 
+    getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const;
+  FunctionPass*
+    getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+
+protected:
+  virtual void setCaps();
+}; // AMDIL7XXDevice
+
+// The AMDIL770Device class represents the RV770 chip and its
+// derivative cards. The difference from the base class is that this
+// device adds support for double precision and has a larger wavefront
+// size.
+class AMDIL770Device : public AMDIL7XXDevice {
+public:
+  AMDIL770Device(AMDILSubtarget *ST);
+  virtual ~AMDIL770Device();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDIL770Device
+
+// The AMDIL710Device class derives from the 7XX base class, but since
+// it covers a smaller derivative chip, we need to override some of the
+// functions in order to report its capabilities correctly.
+class AMDIL710Device : public AMDIL7XXDevice {
+public:
+  AMDIL710Device(AMDILSubtarget *ST);
+  virtual ~AMDIL710Device();
+  virtual size_t getWavefrontSize() const;
+}; // AMDIL710Device
+
+} // namespace llvm
+#endif // _AMDIL7XXDEVICEIMPL_H_
diff --git a/src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp b/src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp
new file mode 100644 (file)
index 0000000..cddde31
--- /dev/null
@@ -0,0 +1,548 @@
+//===-- AMDIL7XXIOExpansion.cpp - IO expansion for 7XX devices -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// @file AMDIL7XXIOExpansion.cpp
+// @details Implementation of the IO Printing class for 7XX devices
+//
+#include "AMDILCompilerErrors.h"
+#include "AMDILCompilerWarnings.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILIOExpansion.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+AMDIL7XXIOExpansion::AMDIL7XXIOExpansion(TargetMachine &tm
+    AMDIL_OPT_LEVEL_DECL) : AMDIL789IOExpansion(tm  AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+AMDIL7XXIOExpansion::~AMDIL7XXIOExpansion() {
+}
+const char *AMDIL7XXIOExpansion::getPassName() const
+{
+  return "AMDIL 7XX IO Expansion Pass";
+}
+
+  void
+AMDIL7XXIOExpansion::expandGlobalLoad(MachineInstr *MI)
+{
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandLoadStartCode(MI);
+  uint32_t ID = getPointerID(MI);
+  mKM->setOutputInst();
+  switch(getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_v4i32), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(ID);
+      break;
+    case 4:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(ID);
+      break;
+    case 8:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_v2i32), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(ID);
+      break;
+    case 1:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+                (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1012)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(0));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1012)
+        .addImm(mMFI->addi32Literal(0))
+        .addImm(mMFI->addi32Literal(24));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1012)
+        .addImm(mMFI->addi32Literal(8))
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1012)
+        .addImm(mMFI->addi32Literal(16))
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(ID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i8), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      break;
+    case 2:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(1));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(16))
+        .addImm(mMFI->addi32Literal(0));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(ID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i16), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      break;
+  }
+  // These instructions go after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, MI, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+
+  void
+AMDIL7XXIOExpansion::expandRegionLoad(MachineInstr *MI)
+{
+  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
+    mMFI->addErrorMsg(
+        amd::CompilerErrorMessage[REGION_MEMORY_ERROR]);
+    return;
+  }
+  if (!HWRegion || !isHardwareRegion(MI)) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t gID = getPointerID(MI);
+  assert(gID && "Found a GDS load that was incorrectly marked as zero ID!\n");
+  if (!gID) {
+    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Z), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_W), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+    case 1:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      // The original instruction would be placed right here, so everything
+      // created after this point needs to go into the afterInst vector.
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(24));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(24));
+      break;
+    case 2:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      // The original instruction would be placed right here, so everything
+      // created after this point needs to go into the afterInst vector.
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(16));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(16));
+      break;
+    case 4:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+    case 8:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi64Literal(1ULL << 32));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+  }
+
+  // These instructions go after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, MI, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+  void
+AMDIL7XXIOExpansion::expandLocalLoad(MachineInstr *MI)
+{
+  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  if (!HWLocal || !isHardwareLocal(MI)) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::LDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t lID = getPointerID(MI);
+  assert(lID && "Found a LDS load that was incorrectly marked as zero ID!\n");
+  if (!lID) {
+    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL;
+  // These instructions go before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+    case 8:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOADVEC), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      break;
+    case 4:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      break;
+    case 1:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(24));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(24));
+      break;
+    case 2:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(16));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(16));
+      break;
+  }
+
+  // These instructions go after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, MI, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+
+  void
+AMDIL7XXIOExpansion::expandGlobalStore(MachineInstr *MI)
+{
+  uint32_t ID = getPointerID(MI);
+  mKM->setOutputInst();
+  DebugLoc DL = MI->getDebugLoc();
+  // These instructions go before the current MI.
+  expandStoreSetupCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_v4i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(ID);
+      break;
+    case 1:
+      mMFI->addErrorMsg(
+          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(ID);
+      break;
+    case 2:
+      mMFI->addErrorMsg(
+          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(ID);
+      break;
+    case 4:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(ID);
+      break;
+    case 8:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_v2i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(ID);
+      break;
+  }
+}
+
+  void
+AMDIL7XXIOExpansion::expandRegionStore(MachineInstr *MI)
+{
+  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
+    mMFI->addErrorMsg(
+        amd::CompilerErrorMessage[REGION_MEMORY_ERROR]);
+    return;
+  }
+  if (!HWRegion || !isHardwareRegion(MI)) {
+    return expandGlobalStore(MI);
+  }
+  DebugLoc DL = MI->getDebugLoc();
+  mKM->setOutputInst();
+  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t gID = getPointerID(MI);
+  assert(gID && "Found a GDS store that was incorrectly marked as zero ID!\n");
+  if (!gID) {
+    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+
+  // These instructions go before the current MI.
+  expandStoreSetupCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Z), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_W), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 1:
+      mMFI->addErrorMsg(
+          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0xFF));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+              (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1006)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(0xFFFFFF00))
+        .addImm(mMFI->addi32Literal(0x00FFFFFF));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1007)
+        .addImm(mMFI->addi32Literal(0xFF00FFFF));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1007)
+        .addImm(mMFI->addi32Literal(0xFFFF00FF));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1007);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 2:
+      mMFI->addErrorMsg(
+          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0x0000FFFF));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(1));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(0x0000FFFF))
+        .addImm(mMFI->addi32Literal(0xFFFF0000));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(16))
+        .addImm(mMFI->addi32Literal(0));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 4:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 8:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi64Literal(1ULL << 32));
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+  }
+}
+
+  void
+AMDIL7XXIOExpansion::expandLocalStore(MachineInstr *MI)
+{
+  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  if (!HWLocal || !isHardwareLocal(MI)) {
+    return expandGlobalStore(MI);
+  }
+  uint32_t lID = getPointerID(MI);
+  assert(lID && "Found a LDS store that was incorrectly marked as zero ID!\n");
+  if (!lID) {
+    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL = MI->getDebugLoc();
+  // These instructions go before the current MI.
+  expandStoreSetupCode(MI);
+  BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSSTOREVEC), AMDIL::MEM)
+    .addReg(AMDIL::R1010)
+    .addReg(AMDIL::R1011)
+    .addImm(lID);
+}
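The sub-dword load expansions above (the 1- and 2-byte cases) all follow the same pattern: derive a bit shift from the low address bits, align the address down to a dword boundary, load the full dword, then shift the wanted lane down to bit 0. A minimal host-side sketch of that addressing math, mirroring the region/local byte path (which multiplies `addr & 3` by 8); `loadByteViaDword` is a hypothetical helper standing in for the emitted UAVRAWLOAD/LDSLOAD-plus-SHR machine instruction sequence, not part of the backend:

```cpp
#include <cassert>
#include <cstdint>

// Sketch of the sub-dword load expansion: split the byte address into an
// aligned dword address plus a bit shift, load the whole dword, then shift
// so the requested byte lands at bit 0 (little-endian lanes assumed).
uint8_t loadByteViaDword(const uint32_t *mem, uint32_t byteAddr) {
  uint32_t shift = (byteAddr & 3) * 8;        // the AND-with-3 / UMUL-by-8 pair
  uint32_t aligned = byteAddr & 0xFFFFFFFCu;  // the AND with 0xFFFFFFFC
  uint32_t word = mem[aligned / 4];           // stands in for the dword load
  return (uint8_t)(word >> shift);            // stands in for the SHR by R1008
}
```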
diff --git a/src/gallium/drivers/radeon/AMDILAlgorithms.tpp b/src/gallium/drivers/radeon/AMDILAlgorithms.tpp
new file mode 100644 (file)
index 0000000..058475f
--- /dev/null
@@ -0,0 +1,93 @@
+//===------ AMDILAlgorithms.tpp - AMDIL Template Algorithms Header --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides template algorithms that extend the STL algorithms and
+// are useful for the AMDIL backend
+//
+//===----------------------------------------------------------------------===//
+
+// A template function that loops through the iterators and passes the second
+// argument along with each iterator to the function. This is based on the
+// for_each STL algorithm, but allows a reference to the second argument. In
+// the 'safe' variant below, a true return value means the function
+// invalidated the current iterator, so the loop steps back before advancing
+// to the next iterator.
+template<class InputIterator, class Function, typename Arg>
+Function binaryForEach(InputIterator First, InputIterator Last, Function F,
+                       Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    F(*First, Second);
+  }
+  return F;
+}
+
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryForEach(InputIterator First, InputIterator Last, Function F,
+                           Arg &Second)
+{
+  for ( ; First!=Last; ++First ) {
+    if (F(*First, Second)) {
+      --First;
+    }
+  }
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with the passed in argument. See binaryForEach for further
+// explanation
+template<class InputIterator, class Function, typename Arg>
+Function binaryNestedForEach(InputIterator First, InputIterator Last,
+                             Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    binaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+template<class InputIterator, class Function, typename Arg>
+Function safeBinaryNestedForEach(InputIterator First, InputIterator Last,
+                                 Function F, Arg &Second)
+{
+  for ( ; First != Last; ++First) {
+    safeBinaryForEach(First->begin(), First->end(), F, Second);
+  }
+  return F;
+}
+
+// Unlike the STL, a pointer to the iterator itself is passed in with the
+// 'safe' versions of these functions. This allows the function to handle
+// situations such as invalidated iterators.
+template<class InputIterator, class Function>
+Function safeForEach(InputIterator First, InputIterator Last, Function F)
+{
+  for ( ; First != Last; ++First)
+    F(&First);
+  return F;
+}
+
+// A template function that has two levels of looping before calling the
+// function with a pointer to the current iterator. See binaryForEach for
+// further explanation
+template<class InputIterator, class SecondIterator, class Function>
+Function safeNestedForEach(InputIterator First, InputIterator Last,
+                              SecondIterator S, Function F)
+{
+  for ( ; First != Last; ++First) {
+    SecondIterator sf, sl;
+    for (sf = First->begin(), sl = First->end();
+         sf != sl; )  {
+      if (!F(&sf)) {
+        ++sf;
+      } 
+    }
+  }
+  return F;
+}
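The 'safe' iteration contract above is easiest to see on a concrete container. In this usage sketch (a plain `std::list` rather than LLVM's instruction lists; `eraseEvens` is a hypothetical callback, not part of the patch), the callback returns true when it erased the element and already repositioned the iterator, so the caller advances only on false — the same convention `safeNestedForEach` expects from `F`:

```cpp
#include <cassert>
#include <list>

// Hypothetical callback following the safeNestedForEach inner-loop contract:
// return true when the current element was erased (the iterator has already
// been moved to the next valid position by erase()), false when the element
// was left in place and the caller should advance.
static bool eraseEvens(std::list<int> &l, std::list<int>::iterator *it) {
  if (**it % 2 == 0) {
    *it = l.erase(*it); // erase() yields the next valid iterator
    return true;        // tell the caller not to advance again
  }
  return false;
}
```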
diff --git a/src/gallium/drivers/radeon/AMDILAsmBackend.cpp b/src/gallium/drivers/radeon/AMDILAsmBackend.cpp
new file mode 100644 (file)
index 0000000..63b688d
--- /dev/null
@@ -0,0 +1,82 @@
+//===------ AMDILAsmBackend.cpp - AMDIL Assembly Backend ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//
+#include "AMDILAsmBackend.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+namespace llvm {
+  ASM_BACKEND_CLASS* createAMDILAsmBackend(const ASM_BACKEND_CLASS &T,
+                                          const std::string &TT)
+  {
+    return new AMDILAsmBackend(T);
+  }
+} // namespace llvm
+
+//===--------------------- Default AMDIL Asm Backend ---------------------===//
+AMDILAsmBackend::AMDILAsmBackend(const ASM_BACKEND_CLASS &T) 
+  : ASM_BACKEND_CLASS()
+{
+}
+
+MCObjectWriter *
+AMDILAsmBackend::createObjectWriter(raw_ostream &OS) const
+{
+  return 0;
+}
+
+bool 
+AMDILAsmBackend::doesSectionRequireSymbols(const MCSection &Section) const
+{
+  return false;
+}
+
+bool 
+AMDILAsmBackend::isSectionAtomizable(const MCSection &Section) const
+{
+  return true;
+}
+
+bool 
+AMDILAsmBackend::isVirtualSection(const MCSection &Section) const
+{
+  return false;
+  //const MCSectionELF &SE = static_cast<const MCSectionELF&>(Section);
+  //return SE.getType() == MCSectionELF::SHT_NOBITS;
+}
+void 
+AMDILAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                          uint64_t Value) const
+{
+}
+
+bool 
+AMDILAsmBackend::MayNeedRelaxation(const MCInst &Inst) const
+{
+    return false;
+}
+
+void 
+AMDILAsmBackend::RelaxInstruction(const MCInst &Inst,
+                                       MCInst &Res) const
+{
+}
+
+bool 
+AMDILAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const
+{
+  return false;
+}
+
+unsigned
+AMDILAsmBackend::getNumFixupKinds() const
+{
+  return 0;
+}
diff --git a/src/gallium/drivers/radeon/AMDILAsmBackend.h b/src/gallium/drivers/radeon/AMDILAsmBackend.h
new file mode 100644 (file)
index 0000000..ae02768
--- /dev/null
@@ -0,0 +1,49 @@
+//===-- AMDILAsmBackend.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDIL_ASM_BACKEND_H_
+#define _AMDIL_ASM_BACKEND_H_
+#include "AMDIL.h"
+#include "llvm/MC/MCAsmBackend.h"
+
+#define ASM_BACKEND_CLASS MCAsmBackend
+
+using namespace llvm;
+namespace llvm {
+  class AMDILAsmBackend : public ASM_BACKEND_CLASS {
+  public:
+    AMDILAsmBackend(const ASM_BACKEND_CLASS &T);
+    virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const;
+    virtual bool doesSectionRequireSymbols(const MCSection &Section) const;
+    virtual bool isSectionAtomizable(const MCSection &Section) const;
+    virtual bool isVirtualSection(const MCSection &Section) const;
+    virtual void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                          uint64_t Value) const;
+    virtual bool
+      MayNeedRelaxation(const MCInst &Inst
+      ) const;
+    virtual void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
+    virtual bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
+    unsigned getNumFixupKinds() const;
+
+  virtual void applyFixup(const MCFixup &Fixup, char * Data, unsigned DataSize,
+                          uint64_t value) const { }
+  virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
+  virtual bool fixupNeedsRelaxation(const MCFixup &fixup, uint64_t value,
+                                    const MCInstFragment *DF,
+                                    const MCAsmLayout &Layout) const
+                                    { return false; }
+  virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const
+                                {}
+  virtual bool writeNopData(uint64_t data, llvm::MCObjectWriter * writer) const
+  { return false; }
+
+  }; // class AMDILAsmBackend;
+} // llvm namespace
+
+#endif // _AMDIL_ASM_BACKEND_H_
diff --git a/src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp b/src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp
new file mode 100644 (file)
index 0000000..1a73929
--- /dev/null
@@ -0,0 +1,149 @@
+//===-- AMDILAsmPrinter7XX.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDIL7XXAsmPrinter.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDIL7XXAsmPrinter.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Metadata.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Type.h"
+
+using namespace llvm;
+
+// TODO: Add support for verbose.
+  AMDIL7XXAsmPrinter::AMDIL7XXAsmPrinter(TargetMachine& TM, MCStreamer &Streamer)
+: AMDILAsmPrinter(TM, Streamer)
+{
+}
+
+AMDIL7XXAsmPrinter::~AMDIL7XXAsmPrinter()
+{
+}
+///
+/// @param name
+/// @brief Strips the "__OpenCL_" prefix and "_kernel" suffix from the
+/// name and returns the stripped name when both tokens are present.
+///
+  static
+std::string Strip(const std::string &name)
+{
+  size_t start = name.find("__OpenCL_");
+  size_t end = name.find("_kernel");
+  if (start == std::string::npos
+      || end == std::string::npos
+      || (start == end)) {
+    return name;
+  } else {
+    return name.substr(9, name.length()-16);
+  }
+}
+  void
+AMDIL7XXAsmPrinter::emitMacroFunc(const MachineInstr *MI,
+    llvm::raw_ostream &O)
+{
+  const AMDILSubtarget *curTarget = mTM->getSubtargetImpl();
+  const char *name = "unknown";
+  llvm::StringRef nameRef;
+  if (MI->getOperand(0).isGlobal()) {
+    nameRef = MI->getOperand(0).getGlobal()->getName();
+    name = nameRef.data();
+    if (curTarget->device()->usesHardware(
+          AMDILDeviceInfo::DoubleOps)
+        && !::strncmp(name, "__sqrt_f64", 10) ) {
+      name = "__sqrt_f64_7xx";
+    }
+  }
+  emitMCallInst(MI, O, name);
+}
+
+  bool
+AMDIL7XXAsmPrinter::runOnMachineFunction(MachineFunction &lMF)
+{
+  this->MF = &lMF;
+  mMeta->setMF(&lMF);
+  mMFI = lMF.getInfo<AMDILMachineFunctionInfo>();
+  SetupMachineFunction(lMF);
+  std::string kernelName = MF->getFunction()->getName();
+  mName = Strip(kernelName);
+
+  mKernelName = kernelName;
+  EmitFunctionHeader();
+  EmitFunctionBody();
+  return false;
+}
+
+  void
+AMDIL7XXAsmPrinter::EmitInstruction(const MachineInstr *II)
+{
+  std::string FunStr;
+  raw_string_ostream OFunStr(FunStr);
+  formatted_raw_ostream O(OFunStr);
+  const AMDILSubtarget *curTarget = mTM->getSubtargetImpl();
+  if (mDebugMode) {
+    O << ";" ;
+    II->print(O);
+  }
+  if (isMacroFunc(II)) {
+    emitMacroFunc(II, O);
+    O.flush();
+    OutStreamer.EmitRawText(StringRef(FunStr));
+    return;
+  }
+  if (isMacroCall(II)) {
+    const char *name;
+    name = mTM->getInstrInfo()->getName(II->getOpcode()) + 5;
+    int macronum = amd::MacroDBFindMacro(name);
+    O << "\t;"<< name<<"\n";
+    O << "\tmcall("<<macronum<<")";
+    if (curTarget->device()->isSupported(
+          AMDILDeviceInfo::MacroDB)) {
+      mMacroIDs.insert(macronum);
+    } else {
+      mMFI->addCalledIntr(macronum);
+    }
+  }
+
+  // Print the assembly for the instruction.
+  // We want to make sure that we do HW constants
+  // before we do arena segment
+  if (mMeta->useCompilerWrite(II)) {
+    // TODO: This is a hack to get around some
+    // conformance failures. 
+    O << "\tif_logicalz cb0[0].x\n";
+    O << "\tuav_raw_store_id("
+      << curTarget->device()->getResourceID(AMDILDevice::RAW_UAV_ID)
+      << ") ";
+    O << "mem0.x___, cb0[3].x, r0.0\n";
+    O << "\tendif\n";
+    mMFI->addMetadata(";memory:compilerwrite");
+  } else {
+    printInstruction(II, O);
+  }
+  O.flush();
+  OutStreamer.EmitRawText(StringRef(FunStr));
+}
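The `Strip()` helper above can be restated as a standalone function for clarity; the magic numbers 9 and 16 are the length of the `"__OpenCL_"` prefix and the combined length of that prefix plus the `"_kernel"` suffix. This is a hypothetical restatement (`stripKernelName` is not a name in the patch):

```cpp
#include <cassert>
#include <string>

// Standalone restatement of the Strip() kernel-name demangler: remove the
// OpenCL mangling ("__OpenCL_" prefix, "_kernel" suffix) when both are
// present, otherwise return the name unchanged.
std::string stripKernelName(const std::string &name) {
  size_t start = name.find("__OpenCL_");
  size_t end = name.find("_kernel");
  if (start == std::string::npos || end == std::string::npos || start == end)
    return name;
  // 9 == strlen("__OpenCL_"); 16 == 9 + strlen("_kernel")
  return name.substr(9, name.length() - 16);
}
```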
diff --git a/src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp b/src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp
new file mode 100644 (file)
index 0000000..4a9732a
--- /dev/null
@@ -0,0 +1,162 @@
+//===-- AMDILAsmPrinterEG.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILEGAsmPrinter.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILDevices.h"
+#include "AMDILEGAsmPrinter.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Metadata.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Type.h"
+
+using namespace llvm;
+
+
+// TODO: Add support for verbose.
+AMDILEGAsmPrinter::AMDILEGAsmPrinter(TargetMachine& TM, MCStreamer &Streamer)
+: AMDILAsmPrinter(TM, Streamer)
+{
+}
+
+AMDILEGAsmPrinter::~AMDILEGAsmPrinter()
+{
+}
+//
+// @param name
+// @brief Strips the "__OpenCL_" prefix and "_kernel" suffix from the
+// name and returns the stripped name when both tokens are present.
+//
+  static
+std::string Strip(const std::string &name)
+{
+  size_t start = name.find("__OpenCL_");
+  size_t end = name.find("_kernel");
+  if (start == std::string::npos
+      || end == std::string::npos
+      || (start == end)) {
+    return name;
+  } else {
+    return name.substr(9, name.length()-16);
+  }
+}
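The magic numbers in `Strip` are easy to misread: 9 is `strlen("__OpenCL_")` and 16 is `9 + strlen("_kernel")`, so `substr(9, length - 16)` keeps exactly the kernel's own name. A stand-alone sketch of the same logic (the helper name is illustrative):

```cpp
#include <cassert>
#include <string>

// Mirror of Strip above: drop the "__OpenCL_" prefix (9 chars) and the
// "_kernel" suffix (7 chars) when both are present. 16 == 9 + 7, so
// substr(9, length - 16) is the middle piece.
std::string stripKernelName(const std::string &name) {
  size_t start = name.find("__OpenCL_");
  size_t end = name.find("_kernel");
  if (start == std::string::npos || end == std::string::npos || start == end)
    return name; // not a mangled kernel name; leave it alone
  return name.substr(9, name.length() - 16);
}
```
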
+void
+AMDILEGAsmPrinter::emitMacroFunc(const MachineInstr *MI,
+    llvm::raw_ostream &O)
+{
+  const AMDILSubtarget *curTarget = mTM->getSubtargetImpl();
+  const char *name = "unknown";
+  llvm::StringRef nameRef;
+  if (MI->getOperand(0).isGlobal()) {
+    nameRef = MI->getOperand(0).getGlobal()->getName();
+    name = nameRef.data();
+  }
+  if (!::strncmp(name, "__fma_f32", 9) && curTarget->device()->usesHardware(
+        AMDILDeviceInfo::FMA)) {
+    name = "__hwfma_f32";
+  }
+  emitMCallInst(MI, O, name);
+}
+
+  bool
+AMDILEGAsmPrinter::runOnMachineFunction(MachineFunction &lMF)
+{
+  this->MF = &lMF;
+  mMeta->setMF(&lMF);
+  mMFI = lMF.getInfo<AMDILMachineFunctionInfo>();
+  SetupMachineFunction(lMF);
+  std::string kernelName = MF->getFunction()->getName();
+  mName = Strip(kernelName);
+
+  mKernelName = kernelName;
+  EmitFunctionHeader();
+  EmitFunctionBody();
+  return false;
+}
+  void
+AMDILEGAsmPrinter::EmitInstruction(const MachineInstr *II)
+{
+  std::string FunStr;
+  raw_string_ostream OFunStr(FunStr);
+  formatted_raw_ostream O(OFunStr);
+  const AMDILSubtarget *curTarget = mTM->getSubtargetImpl();
+  if (mDebugMode) {
+    O << ";" ;
+    II->print(O);
+  }
+  if (isMacroFunc(II)) {
+    emitMacroFunc(II, O);
+    O.flush();
+    OutStreamer.EmitRawText(StringRef(FunStr));
+    return;
+  }
+  if (isMacroCall(II)) {
+    const char *name;
+    name = mTM->getInstrInfo()->getName(II->getOpcode()) + 5;
+    if (!::strncmp(name, "__fma_f32", 9)
+        && curTarget->device()->usesHardware(
+          AMDILDeviceInfo::FMA)) {
+      name = "__hwfma_f32";
+    }
+    //assert(0 &&
+    //"Found a macro that is still in use!");
+    int macronum = amd::MacroDBFindMacro(name);
+    O << "\t;"<< name<<"\n";
+    O << "\tmcall("<<macronum<<")";
+    if (curTarget->device()->isSupported(
+          AMDILDeviceInfo::MacroDB)) {
+      mMacroIDs.insert(macronum);
+    } else {
+      mMFI->addCalledIntr(macronum);
+    }
+  }
+
+  // Print the assembly for the instruction.
+  // We want to make sure that we emit HW constants
+  // before we emit the arena segment.
+  // TODO: This is a hack to get around some
+  // conformance failures.
+  if (mMeta->useCompilerWrite(II)) {
+    O << "\tif_logicalz cb0[0].x\n";
+    if (mMFI->usesMem(AMDILDevice::RAW_UAV_ID)) {
+      O << "\tuav_raw_store_id("
+        << curTarget->device()->getResourceID(AMDILDevice::RAW_UAV_ID)
+        << ") ";
+      O << "mem0.x___, cb0[3].x, r0.0\n";
+    } else {
+      O << "\tuav_arena_store_id("
+        << curTarget->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)
+        << ")_size(dword) ";
+      O << "cb0[3].x, r0.0\n";
+    }
+    O << "\tendif\n";
+    mMFI->addMetadata(";memory:compilerwrite");
+  } else {
+    printInstruction(II, O);
+  }
+  O.flush();
+  OutStreamer.EmitRawText(StringRef(FunStr));
+}
diff --git a/src/gallium/drivers/radeon/AMDILBarrierDetect.cpp b/src/gallium/drivers/radeon/AMDILBarrierDetect.cpp
new file mode 100644 (file)
index 0000000..1bc9651
--- /dev/null
@@ -0,0 +1,254 @@
+//===----- AMDILBarrierDetect.cpp - Barrier Detect pass -*- C++ -*- ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "BarrierDetect"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+#include "AMDILAlgorithms.tpp"
+#include "AMDILCompilerWarnings.h"
+#include "AMDILDevices.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// The barrier detect pass determines whether a barrier has been duplicated
+// in the source program, which can cause undefined behaviour if more than a
+// single wavefront is executed in a group: LLVM has no execution barrier, so
+// duplicating the barrier function breaks its semantics. To work around
+// this, we detect the duplicated barrier and make the work-group execute in
+// single-wavefront mode, essentially making the barrier a no-op.
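The work-around boils down to remembering each barrier call's ID and flagging repeats. A stand-alone sketch of that bookkeeping (class and method names are illustrative, not the pass's real interface; the pass itself keeps the IDs in `bVecMap`):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical sketch of duplicate-barrier detection: each barrier call
// site carries a numeric ID; seeing the same ID twice means the barrier
// was duplicated (e.g. by inlining or unrolling), which is the condition
// that forces single-wavefront execution.
class BarrierTracker {
  std::vector<int64_t> seenIDs; // mirrors the pass's bVecMap
public:
  // Returns true if this barrier ID was already recorded (a duplicate).
  bool recordBarrier(int64_t id) {
    if (std::find(seenIDs.begin(), seenIDs.end(), id) != seenIDs.end())
      return true;
    seenIDs.push_back(id);
    return false;
  }
};
```
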
+
+namespace
+{
+  class LLVM_LIBRARY_VISIBILITY AMDILBarrierDetect : public FunctionPass
+  {
+    TargetMachine &TM;
+    static char ID;
+  public:
+    AMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+    ~AMDILBarrierDetect();
+    const char *getPassName() const;
+    bool runOnFunction(Function &F);
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+    void getAnalysisUsage(AnalysisUsage &AU) const;
+  private:
+    bool detectBarrier(BasicBlock::iterator *BBI);
+    bool detectMemFence(BasicBlock::iterator *BBI);
+    bool mChanged;
+    SmallVector<int64_t, DEFAULT_VEC_SLOTS> bVecMap;
+    const AMDILSubtarget *mStm;
+
+    // Constants used to define memory type.
+    static const unsigned int LOCAL_MEM_FENCE = 1<<0;
+    static const unsigned int GLOBAL_MEM_FENCE = 1<<1;
+    static const unsigned int REGION_MEM_FENCE = 1<<2;
+  };
+  char AMDILBarrierDetect::ID = 0;
+} // anonymous namespace
+
+namespace llvm
+{
+  FunctionPass *
+  createAMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL)
+  {
+    return new AMDILBarrierDetect(TM  AMDIL_OPT_LEVEL_VAR);
+  }
+} // llvm namespace
+
+AMDILBarrierDetect::AMDILBarrierDetect(TargetMachine &TM
+                                       AMDIL_OPT_LEVEL_DECL)
+  :
+  FunctionPass(ID),
+  TM(TM)
+{
+}
+
+AMDILBarrierDetect::~AMDILBarrierDetect()
+{
+}
+
+bool AMDILBarrierDetect::detectBarrier(BasicBlock::iterator *BBI)
+{
+  SmallVector<int64_t, DEFAULT_VEC_SLOTS>::iterator bIter;
+  int64_t bID;
+  Instruction *inst = (*BBI);
+  CallInst *CI = dyn_cast<CallInst>(inst);
+
+  if (!CI || !CI->getNumOperands()) {
+    return false;
+  }
+  const Value *funcVal = CI->getOperand(CI->getNumOperands() - 1);
+  if (funcVal && strncmp(funcVal->getName().data(), "__amd_barrier", 13)) {
+    return false;
+  }
+
+  if (inst->getNumOperands() >= 3) {
+    const Value *V = inst->getOperand(0);
+    const ConstantInt *Cint = dyn_cast<ConstantInt>(V);
+    if (!Cint) {
+      return false;
+    }
+    bID = Cint->getSExtValue();
+    bIter = std::find(bVecMap.begin(), bVecMap.end(), bID);
+    if (bIter == bVecMap.end()) {
+      bVecMap.push_back(bID);
+    } else {
+      if (mStm->device()->isSupported(AMDILDeviceInfo::BarrierDetect)) {
+        AMDILMachineFunctionInfo *MFI =
+          getAnalysis<MachineFunctionAnalysis>().getMF()
+          .getInfo<AMDILMachineFunctionInfo>();
+        MFI->addMetadata(";limitgroupsize");
+        MFI->addErrorMsg(amd::CompilerWarningMessage[BAD_BARRIER_OPT]);
+      }
+    }
+  }
+  if (mStm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    AMDILMachineFunctionInfo *MFI =
+      getAnalysis<MachineFunctionAnalysis>().getMF()
+          .getInfo<AMDILMachineFunctionInfo>();
+    MFI->addErrorMsg(amd::CompilerWarningMessage[LIMIT_BARRIER]);
+    MFI->addMetadata(";limitgroupsize");
+    MFI->setUsesLocal();
+  }
+  const Value *V = inst->getOperand(inst->getNumOperands()-2);
+  const ConstantInt *Cint = dyn_cast<ConstantInt>(V);
+  Function *iF = dyn_cast<Function>(inst->getOperand(inst->getNumOperands()-1));
+  if (!Cint || !iF) {
+    return false;
+  }
+  Module *M = iF->getParent();
+  bID = Cint->getSExtValue();
+  if (bID > 0) {
+    const char *name = "barrier";
+    if (bID == GLOBAL_MEM_FENCE) {
+      name = "barrierGlobal";
+    } else if (bID == LOCAL_MEM_FENCE
+        && mStm->device()->usesHardware(AMDILDeviceInfo::LocalMem)) {
+      name = "barrierLocal";
+    } else if (bID == REGION_MEM_FENCE
+               && mStm->device()->usesHardware(AMDILDeviceInfo::RegionMem)) {
+      name = "barrierRegion";
+    }
+    Function *nF =
+      dyn_cast<Function>(M->getOrInsertFunction(name, iF->getFunctionType()));
+    inst->setOperand(inst->getNumOperands()-1, nF);
+    return false;
+  }
+
+  return false;
+}
+
+bool AMDILBarrierDetect::detectMemFence(BasicBlock::iterator *BBI)
+{
+  int64_t bID;
+  Instruction *inst = (*BBI);
+  CallInst *CI = dyn_cast<CallInst>(inst);
+
+  if (!CI || CI->getNumOperands() != 2) {
+    return false;
+  }
+
+  const Value *V = inst->getOperand(inst->getNumOperands()-2);
+  const ConstantInt *Cint = dyn_cast<ConstantInt>(V);
+  Function *iF = dyn_cast<Function>(inst->getOperand(inst->getNumOperands()-1));
+
+  const char *fence_local_name;
+  const char *fence_global_name;
+  const char *fence_region_name;
+  const char* fence_name = "mem_fence";
+  if (!iF || !Cint) {
+    return false;
+  }
+
+  if (strncmp(iF->getName().data(), "mem_fence", 9) == 0) {
+    fence_local_name = "mem_fence_local";
+    fence_global_name = "mem_fence_global";
+    fence_region_name = "mem_fence_region";
+  } else if (strncmp(iF->getName().data(), "read_mem_fence", 14) == 0) {
+    fence_local_name = "read_mem_fence_local";
+    fence_global_name = "read_mem_fence_global";
+    fence_region_name = "read_mem_fence_region";
+  } else if (strncmp(iF->getName().data(), "write_mem_fence", 15) == 0) {
+    fence_local_name = "write_mem_fence_local";
+    fence_global_name = "write_mem_fence_global";
+    fence_region_name = "write_mem_fence_region";
+  } else {
+    return false;
+  }
+
+  Module *M = iF->getParent();
+  bID = Cint->getSExtValue();
+  if (bID > 0) {
+    const char *name = fence_name;
+    if (bID == GLOBAL_MEM_FENCE) {
+      name = fence_global_name;
+    } else if (bID == LOCAL_MEM_FENCE
+        && mStm->device()->usesHardware(AMDILDeviceInfo::LocalMem)) {
+      name = fence_local_name;
+    } else if (bID == REGION_MEM_FENCE
+               && mStm->device()->usesHardware(AMDILDeviceInfo::RegionMem)) {
+      name = fence_region_name;
+    }
+    Function *nF =
+      dyn_cast<Function>(M->getOrInsertFunction(name, iF->getFunctionType()));
+    inst->setOperand(inst->getNumOperands()-1, nF);
+    return false;
+  }
+
+  return false;
+
+}
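The fence renaming above dispatches on a flag bitmask (`LOCAL_MEM_FENCE = 1`, `GLOBAL_MEM_FENCE = 2`, `REGION_MEM_FENCE = 4`, as defined in the pass). A minimal sketch of that dispatch, with `hasLocalHW`/`hasRegionHW` standing in for the subtarget capability queries (those parameter names are assumptions for the demo):

```cpp
#include <cassert>
#include <string>

// Fence flag bits, matching the constants defined in the pass.
static const unsigned LOCAL_MEM_FENCE  = 1 << 0;
static const unsigned GLOBAL_MEM_FENCE = 1 << 1;
static const unsigned REGION_MEM_FENCE = 1 << 2;

// Pick the specialized builtin that replaces a generic fence call.
// Unknown flags, or flags whose memory the device does not implement in
// hardware, keep the generic entry point.
std::string specializeFence(const std::string &base, unsigned flags,
                            bool hasLocalHW, bool hasRegionHW) {
  if (flags == GLOBAL_MEM_FENCE)
    return base + "_global";
  if (flags == LOCAL_MEM_FENCE && hasLocalHW)
    return base + "_local";
  if (flags == REGION_MEM_FENCE && hasRegionHW)
    return base + "_region";
  return base;
}
```
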
+
+bool AMDILBarrierDetect::runOnFunction(Function &MF)
+{
+  mChanged = false;
+  bVecMap.clear();
+  mStm = &TM.getSubtarget<AMDILSubtarget>();
+  Function *F = &MF;
+  safeNestedForEach(F->begin(), F->end(), F->begin()->begin(),
+      std::bind1st(
+        std::mem_fun(
+          &AMDILBarrierDetect::detectBarrier), this));
+  safeNestedForEach(F->begin(), F->end(), F->begin()->begin(),
+      std::bind1st(
+        std::mem_fun(
+          &AMDILBarrierDetect::detectMemFence), this));
+  return mChanged;
+}
+
+const char* AMDILBarrierDetect::getPassName() const
+{
+  return "AMDIL Barrier Detect Pass";
+}
+
+bool AMDILBarrierDetect::doInitialization(Module &M)
+{
+  return false;
+}
+
+bool AMDILBarrierDetect::doFinalization(Module &M)
+{
+  return false;
+}
+
+void AMDILBarrierDetect::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
diff --git a/src/gallium/drivers/radeon/AMDILBase.td b/src/gallium/drivers/radeon/AMDILBase.td
new file mode 100644 (file)
index 0000000..2706b21
--- /dev/null
@@ -0,0 +1,104 @@
+//===- AMDILBase.td - AMDIL Target Machine ---------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AMDIL Subtarget features.
+//===----------------------------------------------------------------------===//
+def FeatureFP64     : SubtargetFeature<"fp64",
+        "CapsOverride[AMDILDeviceInfo::DoubleOps]",
+        "true",
+        "Enable 64bit double precision operations">;
+def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
+        "CapsOverride[AMDILDeviceInfo::ByteStores]",
+        "true",
+        "Enable byte addressable stores">;
+def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
+        "CapsOverride[AMDILDeviceInfo::BarrierDetect]",
+        "true",
+        "Enable duplicate barrier detection (HD5XXX or later).">;
+def FeatureImages : SubtargetFeature<"images",
+        "CapsOverride[AMDILDeviceInfo::Images]",
+        "true",
+        "Enable image functions">;
+def FeatureMultiUAV : SubtargetFeature<"multi_uav",
+        "CapsOverride[AMDILDeviceInfo::MultiUAV]",
+        "true",
+        "Generate multiple UAV code (HD5XXX family or later)">;
+def FeatureMacroDB : SubtargetFeature<"macrodb",
+        "CapsOverride[AMDILDeviceInfo::MacroDB]",
+        "true",
+        "Use internal macrodb, instead of macrodb in driver">;
+def FeatureNoAlias : SubtargetFeature<"noalias",
+        "CapsOverride[AMDILDeviceInfo::NoAlias]",
+        "true",
+        "assert that all kernel argument pointers are not aliased">;
+def FeatureNoInline : SubtargetFeature<"no-inline",
+        "CapsOverride[AMDILDeviceInfo::NoInline]",
+        "true",
+        "specify whether to not inline functions">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "mIs64bit",
+        "false",
+        "Specify if 64bit addressing should be used.">;
+
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+        "mIs32on64bit",
+        "false",
+        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+def FeatureDebug : SubtargetFeature<"debug",
+        "CapsOverride[AMDILDeviceInfo::Debug]",
+        "true",
+        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+
+include "AMDILRegisterInfo.td"
+include "AMDILCallingConv.td"
+include "AMDILInstrInfo.td"
+
+def AMDILInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// AMDIL processors supported.
+//===----------------------------------------------------------------------===//
+//include "Processors.td"
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+def AMDILAsmWriter : AsmWriter {
+    string AsmWriterClassName = "AsmPrinter";
+    int Variant = 0;
+}
+
+def AMDILAsmParser : AsmParser {
+    string AsmParserClassName = "AsmParser";
+    int Variant = 0;
+
+    string CommentDelimiter = ";";
+
+    string RegisterPrefix = "r";
+
+}
+
+
+def AMDIL : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDILInstrInfo;
+  let AssemblyWriters = [AMDILAsmWriter];
+  let AssemblyParsers = [AMDILAsmParser];
+}
diff --git a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
new file mode 100644 (file)
index 0000000..a7d3946
--- /dev/null
@@ -0,0 +1,3257 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "structcfg"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILCompilerErrors.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define FirstNonDebugInstr(A) A->begin()
+using namespace llvm;
+
+// bixia TODO: move this out to analysis lib. Make this work for both target
+// AMDIL and CBackend.
+// TODO: move-begin.
+
+//===----------------------------------------------------------------------===//
+//
+// Statistics for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+
+STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
+    "matched");
+STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
+    "matched");
+STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
+    "pattern matched");
+STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
+    "pattern matched");
+STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
+    "matched");
+STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
+STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
+
+//===----------------------------------------------------------------------===//
+//
+// Miscellaneous utility for CFGStructurizer.
+//
+//===----------------------------------------------------------------------===//
+namespace llvmCFGStruct
+{
+#define SHOWNEWINSTR(i) \
+  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+
+#define SHOWNEWBLK(b, msg) \
+if (DEBUGME) { \
+  errs() << msg << "BB" << b->getNumber() << " size " << b->size(); \
+  errs() << "\n"; \
+}
+
+#define SHOWBLK_DETAIL(b, msg) \
+if (DEBUGME) { \
+  if (b) { \
+  errs() << msg << "BB" << b->getNumber() << " size " << b->size(); \
+  b->print(errs()); \
+  errs() << "\n"; \
+  } \
+}
+
+#define INVALIDSCCNUM -1
+#define INVALIDREGNUM 0
+
+template<class LoopinfoT>
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
+  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
+       iterEnd = LoopInfo.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->print(OS, 0);
+  }
+}
+
+template<class NodeT>
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} //end namespace llvmCFGStruct
+
+
+//===----------------------------------------------------------------------===//
+//
+// MachinePostDominatorTree
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILCompilerErrors.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+/// PostDominatorTree Class - Concrete subclass of DominatorTree that is used
+/// to compute a post-dominator tree.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  DominatorTreeBase<MachineBasicBlock> *DT;
+  MachinePostDominatorTree() : MachineFunctionPass(ID)
+  {
+    // The 'true' argument selects a post-dominator tree.
+    DT = new DominatorTreeBase<MachineBasicBlock>(true);
+  }
+
+  ~MachinePostDominatorTree();
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  inline const std::vector<MachineBasicBlock *> &getRoots() const {
+    return DT->getRoots();
+  }
+
+  inline MachineDomTreeNode *getRootNode() const {
+    return DT->getRootNode();
+  }
+
+  inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline MachineBasicBlock *
+  findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) {
+    return DT->findNearestCommonDominator(A, B);
+  }
+
+  virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const {
+    DT->print(OS);
+  }
+};
+} //end of namespace llvm
+
+char MachinePostDominatorTree::ID = 0;
+static RegisterPass<MachinePostDominatorTree>
+machinePostDominatorTreePass("machinepostdomtree",
+                             "MachinePostDominator Tree Construction",
+                             true, true);
+
+//const PassInfo *const llvm::MachinePostDominatorsID
+//= &machinePostDominatorTreePass;
+
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+  DT->recalculate(F);
+  //DEBUG(DT->dump());
+  return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+  delete DT;
+}
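`MachinePostDominatorTree` wraps `DominatorTreeBase` in post-dominator mode. As a concept refresher: B post-dominates A iff every path from A to the exit passes through B. A minimal iterative dataflow sketch of that computation on a toy adjacency-list graph (not LLVM's API; node numbering is made up for the demo):

```cpp
#include <cassert>
#include <set>
#include <vector>

// Compute post-dominator sets by iterating to a fixed point: pdom(exit)
// = {exit}; for every other node v, pdom(v) = {v} union the intersection
// of pdom over v's successors. This is the textbook dataflow formulation,
// not the faster Lengauer-Tarjan algorithm LLVM uses internally.
std::vector<std::set<int>> postDominators(
    const std::vector<std::vector<int>> &succs, int exitNode) {
  int n = (int)succs.size();
  std::set<int> all;
  for (int i = 0; i < n; ++i)
    all.insert(i);
  std::vector<std::set<int>> pdom(n, all); // optimistic initialization
  pdom[exitNode] = {exitNode};
  bool changed = true;
  while (changed) {
    changed = false;
    for (int v = 0; v < n; ++v) {
      if (v == exitNode)
        continue;
      std::set<int> meet = all; // intersect successors' pdom sets
      for (int s : succs[v]) {
        std::set<int> tmp;
        for (int x : meet)
          if (pdom[s].count(x))
            tmp.insert(x);
        meet = tmp;
      }
      meet.insert(v); // every node post-dominates itself
      if (meet != pdom[v]) {
        pdom[v] = meet;
        changed = true;
      }
    }
  }
  return pdom;
}
```

On a diamond CFG (0 branches to 1 and 2, both rejoin at exit 3), node 3 post-dominates everything while 1 and 2 post-dominate only themselves, which is why structurization treats 3 as the "land" block for the if-pattern.
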
+
+//===----------------------------------------------------------------------===//
+//
+// supporting data structure for CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+template<class PassT>
+struct CFGStructTraits {
+};
+
+template <class InstrT>
+class BlockInformation {
+public:
+  bool isRetired;
+  int  sccNum;
+  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
+  //Instructions defining the corresponding successor.
+  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
+};
+
+template <class BlockT, class InstrT, class RegiT>
+class LandInformation {
+public:
+  BlockT *landBlk;
+  std::set<RegiT> breakInitRegs;  //Registers that need "reg = 0" init
+                                  //before entering WHILELOOP(thisloop).
+  std::set<RegiT> contInitRegs;   //Registers that need "reg = 0" init
+                                  //after entering WHILELOOP(thisloop).
+  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
+                                     //land block, branch cond on this reg.
+  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
+                                     //endif" after ENDLOOP(thisloop) break
+                                     //outerLoopOf(thisLoop).
+  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
+                                    //endif" after ENDLOOP(thisloop) continue on
+                                    //outerLoopOf(thisLoop).
+  LandInformation() : landBlk(NULL) {}
+};
+
+} //end of namespace llvmCFGStruct
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
+template<class PassT>
+class  CFGStructurizer
+{
+public:
+  typedef enum {
+    Not_SinglePath = 0,
+    SinglePath_InPath = 1,
+    SinglePath_NotInPath = 2
+  } PathToKind;
+
+public:
+  typedef typename PassT::InstructionType         InstrT;
+  typedef typename PassT::FunctionType            FuncT;
+  typedef typename PassT::DominatortreeType       DomTreeT;
+  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
+  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
+  typedef typename PassT::LoopinfoType            LoopInfoT;
+
+  typedef GraphTraits<FuncT *>                    FuncGTraits;
+  //typedef FuncGTraits::nodes_iterator BlockIterator;
+  typedef typename FuncT::iterator                BlockIterator;
+
+  typedef typename FuncGTraits::NodeType          BlockT;
+  typedef GraphTraits<BlockT *>                   BlockGTraits;
+  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
+  //typedef BlockGTraits::succ_iterator InstructionIterator;
+  typedef typename BlockT::iterator               InstrIterator;
+
+  typedef CFGStructTraits<PassT>                  CFGTraits;
+  typedef BlockInformation<InstrT>                BlockInfo;
+  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
+
+  typedef int                                     RegiT;
+  typedef typename PassT::LoopType                LoopT;
+  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
+  typedef std::map<LoopT *, LoopLandInfo *>       LoopLandInfoMap;
+                                                  //landing info for loop break
+  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+
+public:
+  CFGStructurizer();
+  ~CFGStructurizer();
+
+  /// Perform the CFG structurization
+  bool run(FuncT &Func, PassT &Pass);
+
+  /// Perform the CFG preparation
+  bool prepare(FuncT &Func, PassT &Pass);
+
+private:
+  void   orderBlocks();
+  void   printOrderedBlocks(llvm::raw_ostream &OS);
+  int patternMatch(BlockT *CurBlock);
+  int patternMatchGroup(BlockT *CurBlock);
+
+  int serialPatternMatch(BlockT *CurBlock);
+  int ifPatternMatch(BlockT *CurBlock);
+  int switchPatternMatch(BlockT *CurBlock);
+  int loopendPatternMatch(BlockT *CurBlock);
+  int loopPatternMatch(BlockT *CurBlock);
+
+  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
+  //int loopWithoutBreak(BlockT *);
+
+  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
+                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
+  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
+                           BlockT *ContBlock, LoopT *contLoop);
+  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
+  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                       BlockT *FalseBlock);
+  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
+                          BlockT *FalseBlock);
+  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                              BlockT *FalseBlock, BlockT **LandBlockPtr);
+  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
+                                   BlockT *FalseBlock, BlockT *LandBlock,
+                                   bool Detail = false);
+  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
+                          bool AllowSideEntry = true);
+  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
+                        bool AllowSideEntry = true);
+  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
+  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
+
+  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
+                            BlockT *TrueBlock, BlockT *FalseBlock,
+                            BlockT *LandBlock);
+  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
+  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
+                           BlockT *ExitLandBlock, RegiT SetReg);
+  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
+                           RegiT SetReg);
+  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
+                                std::set<BlockT*> &ExitBlockSet,
+                                BlockT *ExitLandBlk);
+  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
+                                BlockTSmallerVector &ExitingBlocks,
+                                BlockTSmallerVector &ExitBlocks);
+  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
+  void removeUnconditionalBranch(BlockT *SrcBlock);
+  void removeRedundantConditionalBranch(BlockT *SrcBlock);
+  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
+
+  void removeSuccessor(BlockT *SrcBlock);
+  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
+  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
+
+  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
+                          InstrIterator InsertPos);
+
+  void recordSccnum(BlockT *SrcBlock, int SCCNum);
+  int getSCCNum(BlockT *srcBlk);
+
+  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
+  bool isRetiredBlock(BlockT *SrcBlock);
+  bool isActiveLoophead(BlockT *CurBlock);
+  bool needMigrateBlock(BlockT *Block);
+
+  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
+                              BlockTSmallerVector &exitBlocks,
+                              std::set<BlockT*> &ExitBlockSet);
+  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
+  BlockT *getLoopLandBlock(LoopT *LoopRep);
+  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
+
+  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
+  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
+
+  bool hasBackEdge(BlockT *curBlock);
+  unsigned getLoopDepth  (LoopT *LoopRep);
+  int countActiveBlock(
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
+  BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
+  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+
+private:
+  DomTreeT *domTree;
+  PostDomTreeT *postDomTree;
+  LoopInfoT *loopInfo;
+  PassT *passRep;
+  FuncT *funcRep;
+
+  BlockInfoMap blockInfoMap;
+  LoopLandInfoMap loopLandInfoMap;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
+
+};  //template class CFGStructurizer
+
+template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
+  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+}
+
+template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
+  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
+       E = blockInfoMap.end(); I != E; ++I) {
+    delete I->second;
+  }
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass) {
+  passRep = &pass;
+  funcRep = &func;
+
+  bool changed = false;
+  //func.RenumberBlocks();
+
+  // TODO: If the flow graph is not reducible, make it so?
+
+  if (DEBUGME) {
+    errs() << "AMDILCFGStructurizer::prepare\n";
+    //func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  //domTree = CFGTraits::getDominatorTree(pass);
+  //if (DEBUGME) {
+  //  domTree->print(errs());
+  //}
+
+  //FIXME: gcc complains on this.
+  //postDomTree = &pass.getAnalysis<PostDomTreeT>();
+  //postDomTree = CFGTraits::getPostDominatorTree(pass);
+  //if (DEBUGME) {
+  //  postDomTree->print(errs());
+  //}
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
+
+  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
+       iterEnd = loopInfo->end();
+       iter != iterEnd; ++iter) {
+    LoopT* loopRep = (*iter);
+    BlockTSmallerVector exitingBlks;
+    loopRep->getExitingBlocks(exitingBlks);
+
+    if (exitingBlks.size() == 0) {
+      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
+      if (dummyExitBlk != NULL)
+        retBlks.push_back(dummyExitBlk);
+    }
+  }
+
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
+       iterBlk != iterEndBlk;
+       ++iterBlk) {
+    BlockT *curBlk = *iterBlk;
+    removeUnconditionalBranch(curBlk);
+    removeRedundantConditionalBranch(curBlk);
+    if (CFGTraits::isReturnBlock(curBlk)) {
+      retBlks.push_back(curBlk);
+    }
+    assert(curBlk->succ_size() <= 2);
+    //assert(curBlk->size() > 0);
+    //removeEmptyBlock(curBlk) ??
+  } //for
+
+  if (retBlks.size() >= 2) {
+    addDummyExitBlock(retBlks);
+    changed = true;
+  }
+
+  return changed;
+} //CFGStructurizer::prepare
+
+template<class PassT>
+bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass) {
+  passRep = &pass;
+  funcRep = &func;
+
+  //func.RenumberBlocks();
+
+  //Assume reducible CFG...
+  if (DEBUGME) {
+    errs() << "AMDILCFGStructurizer::run\n";
+    //errs() << func.getFunction()->getNameStr() << "\n";
+    func.viewCFG();
+    //func.viewCFGOnly();
+    //func.dump();
+  }
+
+#if 1
+  //FIXME: gcc complains on this.
+  //domTree = &pass.getAnalysis<DomTreeT>();
+  domTree = CFGTraits::getDominatorTree(pass);
+  if (DEBUGME) {
+    domTree->print(errs(), (const llvm::Module*)0);
+  }
+#endif
+
+  //FIXME: gcc complains on this.
+  //postDomTree = &pass.getAnalysis<PostDomTreeT>();
+  postDomTree = CFGTraits::getPostDominatorTree(pass);
+  if (DEBUGME) {
+    postDomTree->print(errs());
+  }
+
+  //FIXME: gcc complains on this.
+  //loopInfo = &pass.getAnalysis<LoopInfoT>();
+  loopInfo = CFGTraits::getLoopInfo(pass);
+  if (DEBUGME) {
+    errs() << "LoopInfo:\n";
+    PrintLoopinfo(*loopInfo, errs());
+  }
+
+  orderBlocks();
+//#define STRESSTEST
+#ifdef STRESSTEST
+  //Use the worst block ordering to test the algorithm.
+  ReverseVector(orderedBlks);
+#endif
+
+  if (DEBUGME) {
+    errs() << "Ordered blocks:\n";
+    printOrderedBlocks(errs());
+  }
+  int numIter = 0;
+  bool finish = false;
+  BlockT *curBlk;
+  bool makeProgress = false;
+  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
+                                        orderedBlks.end());
+
+  do {
+    ++numIter;
+    if (DEBUGME) {
+      errs() << "numIter = " << numIter
+             << ", numRemainedBlk = " << numRemainedBlk << "\n";
+    }
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin();
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlkEnd = orderedBlks.end();
+
+    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      sccBeginIter = iterBlk;
+    BlockT *sccBeginBlk = NULL;
+    int sccNumBlk = 0;  // The number of active blocks, init to a
+                        // maximum possible number.
+    int sccNumIter;     // Number of iteration in this SCC.
+
+    while (iterBlk != iterBlkEnd) {
+      curBlk = *iterBlk;
+
+      if (sccBeginBlk == NULL) {
+        sccBeginIter = iterBlk;
+        sccBeginBlk = curBlk;
+        sccNumIter = 0;
+        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
+        if (DEBUGME) {
+          errs() << "start processing SCC" << getSCCNum(sccBeginBlk) << "\n";
+        }
+      }
+
+      if (!isRetiredBlock(curBlk)) {
+        patternMatch(curBlk);
+      }
+
+      ++iterBlk;
+
+      bool contNextScc = true;
+      if (iterBlk == iterBlkEnd
+          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+        // Just finished one SCC.
+        ++sccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
+          if (DEBUGME) {
+            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
+                   << ", sccNumIter = " << sccNumIter
+                   << ": doesn't make any progress\n";
+          }
+          contNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
+          sccNumBlk = sccRemainedNumBlk;
+          iterBlk = sccBeginIter;
+          contNextScc = false;
+          if (DEBUGME) {
+            errs() << "repeat processing SCC" << getSCCNum(curBlk)
+                   << ", sccNumIter = " << sccNumIter << "\n";
+            func.viewCFG();
+            //func.viewCFGOnly();
+          }
+        } else {
+          // Finish the current scc.
+          contNextScc = true;
+        }
+      } else {
+        // Continue on the next block in the current SCC.
+        contNextScc = false;
+      }
+
+      if (contNextScc) {
+        sccBeginBlk = NULL;
+      }
+    } //while, "one iteration" over the function.
+
+    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
+    if (entryBlk->succ_size() == 0) {
+      finish = true;
+      if (DEBUGME) {
+        errs() << "Reduce to one block\n";
+      }
+    } else {
+      int newnumRemainedBlk
+        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      // TODO: Consider cloned blocks.
+      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
+        makeProgress = true;
+        numRemainedBlk = newnumRemainedBlk;
+      } else {
+        makeProgress = false;
+        if (DEBUGME) {
+          errs() << "No progress\n";
+        }
+      }
+    }
+  } while (!finish && makeProgress);
+
+  // Misc wrap up to maintain the consistency of the Function representation.
+  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+
+  // Detach retired Block, release memory.
+  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
+       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    if ((*iterMap).second && (*iterMap).second->isRetired) {
+      assert(((*iterMap).first)->getNumber() != -1);
+      if (DEBUGME) {
+        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
+      }
+      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+    }
+    delete (*iterMap).second;
+  }
+  blockInfoMap.clear();
+
+  // clear loopLandInfoMap
+  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
+       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    delete (*iterMap).second;
+  }
+  loopLandInfoMap.clear();
+
+  if (DEBUGME) {
+    func.viewCFG();
+    //func.dump();
+  }
+
+  if (!finish) {
+    MachineFunction *MF = &func;
+    AMDILMachineFunctionInfo *mMFI =
+      MF->getInfo<AMDILMachineFunctionInfo>();
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[IRREDUCIBLE_CF]);
+  }
+
+  return true;
+} //CFGStructurizer::run
+
+/// Print the ordered Blocks.
+///
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  // Walk through all the blocks in the function to check for unreachable ones.
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  } //end of for
+} //orderBlocks
+
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+    errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+    errs() << "End patternMatch BB" << curBlk->getNumber()
+           << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  //numMatch += switchPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+      && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpintoIf can handle the case where landBlk == NULL, but the
+  // new block created for landBlk == NULL may introduce new challenges to the
+  // reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+      || (falseBlk && falseBlk->pred_size() > 1))) {
+     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loops from the outside in, so a "continue" to an outer
+  // loop won't be mistaken for a "break" out of the current loop.
+  int num = 0;
+  for (typename std::vector<LoopT *>::reverse_iterator
+       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
+       iter != iterEnd; ++iter) {
+    loopRep = *iter;
+
+    if (getLoopLandBlock(loopRep) != NULL) {
+      continue;
+    }
+
+    BlockT *loopHeader = loopRep->getHeader();
+
+    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+
+    if (numBreak == -1) {
+      break;
+    }
+
+    int numCont = loopcontPatternMatch(loopRep, loopHeader);
+    num += numBreak + numCont;
+  }
+
+  return num;
+} //loopendPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 0) {
+    return 0;
+  }
+
+  int numLoop = 0;
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+    if (loopLand) {
+      BlockT *landBlk = loopLand->landBlk;
+      assert(landBlk);
+      if (!isRetiredBlock(landBlk)) {
+        mergeLooplandBlock(curBlk, loopLand);
+        ++numLoop;
+      }
+    }
+    loopRep = loopRep->getParentLoop();
+  }
+
+  numLoopPatternMatch += numLoop;
+
+  return numLoop;
+} //loopPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
+                                                  BlockT *loopHeader) {
+  BlockTSmallerVector exitingBlks;
+  loopRep->getExitingBlocks(exitingBlks);
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
+  }
+
+  if (exitingBlks.size() == 0) {
+    setLoopLandBlock(loopRep);
+    return 0;
+  }
+
+  // Compute the corresponding exitBlks and exit block set.
+  BlockTSmallerVector exitBlks;
+  std::set<BlockT *> exitBlkSet;
+  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
+       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *exitingBlk = *iter;
+    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+    exitBlks.push_back(exitBlk);
+    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  }
+
+  assert(exitBlkSet.size() > 0);
+  assert(exitBlks.size() == exitingBlks.size());
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  }
+
+  // Find exitLandBlk.
+  BlockT *exitLandBlk = NULL;
+  int numCloned = 0;
+  int numSerial = 0;
+
+  if (exitBlkSet.size() == 1) {
+    exitLandBlk = *exitBlkSet.begin();
+  } else {
+    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+
+    if (exitLandBlk == NULL) {
+      return -1;
+    }
+
+    bool allInPath = true;
+    bool allNotInPath = true;
+    for (typename std::set<BlockT*>::const_iterator
+         iter = exitBlkSet.begin(),
+         iterEnd = exitBlkSet.end();
+         iter != iterEnd; ++iter) {
+      BlockT *exitBlk = *iter;
+
+      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
+      if (DEBUGME) {
+        errs() << "BB" << exitBlk->getNumber()
+               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
+               << pathKind << "\n";
+      }
+
+      allInPath = allInPath && (pathKind == SinglePath_InPath);
+      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+
+      if (!allInPath && !allNotInPath) {
+        if (DEBUGME) {
+          errs() << "singlePath check fail\n";
+        }
+        return -1;
+      }
+    } // check all exit blocks
+
+    if (allNotInPath) {
+#if 1
+
+      // TODO: Simplify, maybe separate function?
+      //funcRep->viewCFG();
+      LoopT *parentLoopRep = loopRep->getParentLoop();
+      BlockT *parentLoopHeader = NULL;
+      if (parentLoopRep)
+        parentLoopHeader = parentLoopRep->getHeader();
+
+      if (exitLandBlk == parentLoopHeader &&
+          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
+                                               loopRep,
+                                               exitBlkSet,
+                                               exitLandBlk)) != NULL) {
+        if (DEBUGME) {
+          errs() << "relocateLoopcontBlock success\n";
+        }
+      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
+                                                      exitingBlks,
+                                                      exitBlks)) != NULL) {
+        if (DEBUGME) {
+          errs() << "insertEndbranchBlock success\n";
+        }
+      } else {
+        if (DEBUGME) {
+          errs() << "loop exit fail\n";
+        }
+        return -1;
+      }
+#else
+      return -1;
+#endif
+    }
+
+    // Handle side entry to exit path.
+    exitBlks.clear();
+    exitBlkSet.clear();
+    for (typename BlockTSmallerVector::iterator iterExiting =
+           exitingBlks.begin(),
+         iterExitingEnd = exitingBlks.end();
+         iterExiting != iterExitingEnd; ++iterExiting) {
+      BlockT *exitingBlk = *iterExiting;
+      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+      BlockT *newExitBlk = exitBlk;
+
+      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
+        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
+        ++numCloned;
+      }
+
+      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+
+      exitBlks.push_back(newExitBlk);
+      exitBlkSet.insert(newExitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      numSerial += serialPatternMatch(exitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      if (exitBlk->pred_size() > 1) {
+        if (exitBlk != exitLandBlk) {
+          return -1;
+        }
+      } else {
+        if (exitBlk != exitLandBlk &&
+            (exitBlk->succ_size() != 1 ||
+            *exitBlk->succ_begin() != exitLandBlk)) {
+          return -1;
+        }
+      }
+    }
+  } // else
+
+  // LoopT *exitLandLoop = loopInfo->getLoopFor(exitLandBlk);
+  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
+
+  // Fold the break into the breaking block, handling breaks across loop levels.
+  assert(exitingBlks.size() == exitBlks.size());
+  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
+       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
+       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
+    BlockT *exitBlk = *iterExit;
+    BlockT *exitingBlk = *iterExiting;
+    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
+    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
+    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
+  }
+
+  int numBreak = static_cast<int>(exitingBlks.size());
+  numLoopbreakPatternMatch += numBreak;
+  numClonedBlock += numCloned;
+  return numBreak + numSerial + numCloned;
+} //loopbreakPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
+                                                 BlockT *loopHeader) {
+  int numCont = 0;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
+  for (typename InvBlockGTraits::ChildIteratorType iter =
+       InvBlockGTraits::child_begin(loopHeader),
+       iterEnd = InvBlockGTraits::child_end(loopHeader);
+       iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    if (loopRep->contains(curBlk)) {
+      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
+                          loopHeader, loopRep);
+      contBlk.push_back(curBlk);
+      ++numCont;
+    }
+  }
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
+       iter = contBlk.begin(), iterEnd = contBlk.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->removeSuccessor(loopHeader);
+  }
+
+  numLoopcontPatternMatch += numCont;
+
+  return numCont;
+} //loopcontPatternMatch
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
+                                                         BlockT *src2Blk) {
+  // Return true iff src1Blk->succ_size() == 0 and src1Blk and src2Blk are in
+  // the same loop with LoopLandInfo. Since we don't explicitly keep track of
+  // loopContBlks and loopBreakBlks, this is a way to get that information.
+  //
+  if (src1Blk->succ_size() == 0) {
+    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
+    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
+      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+      if (theEntry != NULL) {
+        if (DEBUGME) {
+          errs() << "isLoopContBreakBlock yes src1 = BB"
+                 << src1Blk->getNumber()
+                 << " src2 = BB" << src2Blk->getNumber() << "\n";
+        }
+        return true;
+      }
+    }
+  }
+  return false;
+}  //isSameloopDetachedContbreak
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
+                                             BlockT *trueBlk,
+                                             BlockT *falseBlk) {
+  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
+  if (num == 0) {
+    if (DEBUGME) {
+      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    }
+    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
+  }
+  return num;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
+                                                BlockT *trueBlk,
+                                                BlockT *falseBlk) {
+  int num = 0;
+  BlockT *downBlk;
+
+  // trueBlk could be the common post-dominator.
+  downBlk = trueBlk;
+
+  if (DEBUGME) {
+    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
+           << " true = BB" << trueBlk->getNumber()
+           << ", numSucc=" << trueBlk->succ_size()
+           << " false = BB" << falseBlk->getNumber() << "\n";
+  }
+
+  while (downBlk) {
+    if (DEBUGME) {
+      errs() << "check down = BB" << downBlk->getNumber();
+    }
+
+    if (//postDomTree->dominates(downBlk, falseBlk) &&
+        singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
+      if (DEBUGME) {
+        errs() << " working\n";
+      }
+
+      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
+      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+
+      numClonedBlock += num;
+      num += serialPatternMatch(*headBlk->succ_begin());
+      num += serialPatternMatch(*(++headBlk->succ_begin()));
+      num += ifPatternMatch(headBlk);
+      assert(num > 0);
+
+      break;
+    }
+    if (DEBUGME) {
+      errs() << " not working\n";
+    }
+    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+  errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+#if 0
+  if (DEBUGME) {
+    errs() << "improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+#endif
+
+  // unsigned landPredSize = landBlk ? landBlk->pred_size() : 0;
+  // May want to consider landBlk->pred_size(), as it represents the number of
+  // "initReg = ..." assignments that need to be inserted.
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk or falseBlk, also migrate the other
+  // one if it has more than one predecessor. Without doing this, a
+  // predecessor other than headBlk could see an undefined value in initReg.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch}
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      If landBlk->pred_size() > 2, put the above if-else inside
+  //      if (initReg != 2) {...}
+  //
+  // add initReg = initVal to headBlk
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    ++numNewBlk;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  // Insert AMDIL::ENDIF to avoid the special case "input landBlk == NULL".
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDIL::ENDIF, passRep));
+
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDIL::IF_LOGICALZ_i32, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDIL::IF_LOGICALNZ_i32,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // Need to unconditionally insert the assignment to ensure that a path
+    // from a predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDIL::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // Need to unconditionally insert the assignment to ensure that a path
+    // from a predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 0).
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+  //CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                              LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                              LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    assert(exitingLoop == exitLoop);
+  }
+
+  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+
+} //handleLoopbreak
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
+                                                  LoopT *contingLoop,
+                                                 BlockT *contBlk,
+                                                  LoopT *contLoop) {
+  if (DEBUGME) {
+    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
+           << " header = BB" << contBlk->getNumber() << "\n";
+
+    errs() << "Trying to continue loop-depth = "
+           << getLoopDepth(contLoop)
+           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (contingLoop != contLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopContInitReg(contLoop, initReg);
+    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
+      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
+      contingLoop = contingLoop->getParentLoop();
+    }
+    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
+    addLoopContOnReg(contingLoop, initReg);
+  }
+
+  settleLoopcontBlock(contingBlk, contBlk, initReg);
+  //contingBlk->removeSuccessor(loopHeader);
+} //handleLoopcontBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "serialPattern BB" << dstBlk->getNumber()
+           << " <= BB" << srcBlk->getNumber() << "\n";
+  }
+  //removeUnconditionalBranch(dstBlk);
+  dstBlk->splice(dstBlk->end(), srcBlk, FirstNonDebugInstr(srcBlk),
+                 srcBlk->end());
+
+  dstBlk->removeSuccessor(srcBlk);
+  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+
+  removeSuccessor(srcBlk);
+  retireBlock(dstBlk, srcBlk);
+} //mergeSerialBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
+                                                  BlockT *curBlk,
+                                                  BlockT *trueBlk,
+                                                  BlockT *falseBlk,
+                                                  BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "ifPattern BB" << curBlk->getNumber();
+    errs() << "{  ";
+    if (trueBlk) {
+      errs() << "BB" << trueBlk->getNumber();
+    }
+    errs() << "  } else ";
+    errs() << "{  ";
+    if (falseBlk) {
+      errs() << "BB" << falseBlk->getNumber();
+    }
+    errs() << "  }\n ";
+    errs() << "landBlock: ";
+    if (landBlk == NULL) {
+      errs() << "NULL";
+    } else {
+      errs() << "BB" << landBlk->getNumber();
+    }
+    errs() << "\n";
+  }
+
+  int oldOpcode = branchInstr->getOpcode();
+  DebugLoc branchDL = branchInstr->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(curBlk, branchInstr);
+  CFGTraits::insertCondBranchBefore(branchInstrPos,
+                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
+                                    passRep,
+                                    branchDL);
+
+  if (trueBlk) {
+    curBlk->splice(branchInstrPos, trueBlk, FirstNonDebugInstr(trueBlk),
+                   trueBlk->end());
+    curBlk->removeSuccessor(trueBlk);
+    if (landBlk && trueBlk->succ_size() != 0) {
+      trueBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, trueBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ELSE, passRep);
+
+  if (falseBlk) {
+    curBlk->splice(branchInstrPos, falseBlk, FirstNonDebugInstr(falseBlk),
+                   falseBlk->end());
+    curBlk->removeSuccessor(falseBlk);
+    if (landBlk && falseBlk->succ_size() != 0) {
+      falseBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, falseBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep);
+
+  //curBlk->remove(branchInstrPos);
+  branchInstr->eraseFromParent();
+
+  if (landBlk && trueBlk && falseBlk) {
+    curBlk->addSuccessor(landBlk);
+  }
+
+} //mergeIfthenelseBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                                LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are initialized at the beginning of the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the BREAK_LOGICALZ_i32 or
+   * AMDIL::BREAK_LOGICALNZ statement in the current dstBlk.
+   * Search for the DebugLoc in that statement.
+   * If not found, insert the empty/default DebugLoc. */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDIL::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the continue statement in the current
+   * dstBlk. Search for the DebugLoc in the continue statement.
+   * If not found, insert the empty/default DebugLoc. */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDIL::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are checked after the ENDLOOP: break the loop
+  // enclosing this loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakOnRegs.begin(),
+       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::BREAK_LOGICALNZ_i32, passRep,
+                                   *iter);
+  }
+
+  // Loop contOnRegs are checked after the ENDLOOP: continue the loop
+  // enclosing this loop.
+  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
+       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::CONTINUE_LOGICALNZ_i32,
+                                   passRep, *iter);
+  }
+
+  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+
+  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
+       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
+    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
+  }
+
+  removeSuccessor(landBlk);
+  retireBlock(dstBlk, landBlk);
+} //mergeLooplandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
+                                                 BlockT *exitBlk,
+                                                 BlockT *exitLandBlk,
+                                                 RegiT  setReg) {
+  if (DEBUGME) {
+    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
+           << " exit = BB" << exitBlk->getNumber()
+           << " land = BB" << exitLandBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
+  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+
+  DebugLoc DL = branchInstr->getDebugLoc();
+
+  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+  int oldOpcode = branchInstr->getOpcode();
+
+  //    transform exitingBlk to
+  //    if ( ) {
+  //       exitBlk (if exitBlk != exitLandBlk)
+  //       setReg = 1
+  //       break
+  //    }endif
+  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(exitingBlk, branchInstr);
+
+  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
+    //break_logical
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBreakNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBreakZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+  } else {
+    int newOpcode =
+    (trueBranch == exitBlk) ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                            : CFGTraits::getBranchZeroOpcode(oldOpcode);
+    CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL);
+    if (exitBlk != exitLandBlk) {
+      //splice is insert-before ...
+      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
+                         exitBlk->end());
+    }
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+    }
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::BREAK, passRep);
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep);
+  } //if_logical
+
+  // now branchInstr can be erased safely
+  //exitingBlk->eraseFromParent(branchInstr);
+  branchInstr->eraseFromParent();
+
+  //now take care of successors, retire blocks
+  exitingBlk->removeSuccessor(exitBlk);
+  if (exitBlk != exitLandBlk) {
+    //splice is insert-before ...
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT   setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    //    transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
+
+    bool useContinueLogical =
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (!useContinueLogical) {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if any exist, go before the break-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if any exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    //contingBlk->eraseFromParent(branchInstr);
+    branchInstr->eraseFromParent();
+  } else {
+    /* If we've arrived here then we've already erased the branch instruction.
+     * Travel back up the basic block to find the last reference of our debug
+     * location; we've just inserted that reference here so it should be
+     * representative. */
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if any exist, go before the break-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep,
+                                CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if any exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep,
+                                CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// BBs in exitBlkSet were determined to be in the break-path for loopRep.
+// Before we can place code for those BBs inside the loop body of loopRep,
+// check whether they were determined to be cont-BBs for parentLoopRep
+// earlier.
+// If so, generate a new BB newBlk:
+//    (1) make newBlk the common successor of the BBs in exitBlkSet
+//    (2) change the continue-instr in the BBs in exitBlkSet to a break-instr
+//    (3) generate a continue-instr in newBlk
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
+                                              LoopT *loopRep,
+                                              std::set<BlockT *> &exitBlkSet,
+                                              BlockT *exitLandBlk) {
+  std::set<BlockT *> endBlkSet;
+
+//  BlockT *parentLoopHead = parentLoopRep->getHeader();
+
+
+  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
+       iterEnd = exitBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *exitBlk = *iter;
+    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
+
+    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
+      return NULL;
+
+    endBlkSet.insert(endBlk);
+  }
+
+  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(newBlk);  //insert to function
+  CFGTraits::insertInstrEnd(newBlk, AMDIL::CONTINUE, passRep);
+  SHOWNEWBLK(newBlk, "New continue block: ");
+
+  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
+       iterEnd = endBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *endBlk = *iter;
+    InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
+    if (contInstr) {
+      contInstr->eraseFromParent();
+    }
+    endBlk->addSuccessor(newBlk);
+    if (DEBUGME) {
+      errs() << "Add new continue Block to BB"
+             << endBlk->getNumber() << " successors\n";
+    }
+  }
+
+  return newBlk;
+} //relocateLoopcontBlock
+
+
+// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as the
+// LoopLandBlock. This BB branches on the loop endBranchInit register to the
+// paths corresponding to the loop's exiting branches.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
+                                              BlockTSmallerVector &exitingBlks,
+                                              BlockTSmallerVector &exitBlks) {
+  const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+  RegiT endBranchReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+  assert(endBranchReg >= 0);
+
+  // reg = 0 before entering the loop
+  addLoopEndbranchInitReg(loopRep, endBranchReg);
+
+  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
+  assert(numBlks >= 2 && numBlks == exitBlks.size());
+
+  BlockT *preExitingBlk = exitingBlks[0];
+  BlockT *preExitBlk = exitBlks[0];
+  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(preBranchBlk);  //insert to function
+  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
+
+  BlockT *newLandBlk = preBranchBlk;
+
+  CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
+                                        newLandBlk);
+  preExitingBlk->removeSuccessor(preExitBlk);
+  preExitingBlk->addSuccessor(newLandBlk);
+
+  //it is redundant to add reg = 0 to exitingBlks[0]
+
+  // For the 1st..nth exiting paths (the last iteration handles two paths),
+  // create the branch to the previous path and the current path.
+  for (uint32_t i = 1; i < numBlks; ++i) {
+    BlockT *curExitingBlk = exitingBlks[i];
+    BlockT *curExitBlk = exitBlks[i];
+    BlockT *curBranchBlk;
+
+    if (i == numBlks - 1) {
+      curBranchBlk = curExitBlk;
+    } else {
+      curBranchBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(curBranchBlk);  //insert to function
+      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
+    }
+
+    // Add reg = i to exitingBlks[i].
+    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
+                                       endBranchReg, i);
+
+    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
+    // (exitingBlks[i], newLandBlk).
+    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
+                                          newLandBlk);
+    curExitingBlk->removeSuccessor(curExitBlk);
+    curExitingBlk->addSuccessor(newLandBlk);
+
+    // add to preBranchBlk the branch instruction:
+    // if (endBranchReg == preVal)
+    //    preExitBlk
+    // else
+    //    curBranchBlk
+    //
+    // preValReg = i - 1
+
+    DebugLoc DL;
+    RegiT preValReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::LOADCONST_i32), preValReg)
+      .addImm(i - 1); // preVal
+
+    // condResReg = (endBranchReg == preValReg)
+    RegiT condResReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::IEQ), condResReg)
+      .addReg(endBranchReg).addReg(preValReg);
+
+    BuildMI(preBranchBlk, DL, tii->get(AMDIL::BRANCH_COND_i32))
+      .addMBB(preExitBlk).addReg(condResReg);
+
+    preBranchBlk->addSuccessor(preExitBlk);
+    preBranchBlk->addSuccessor(curBranchBlk);
+
+    // Update preExitingBlk, preExitBlk, preBranchBlk.
+    preExitingBlk = curExitingBlk;
+    preExitBlk = curExitBlk;
+    preBranchBlk = curBranchBlk;
+
+  }  //end for 1 .. n blocks
+
+  return newLandBlk;
+} //addLoopEndbranchBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::PathToKind
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
+                                     bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return SinglePath_InPath;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == dstBlk) {
+      return SinglePath_InPath;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return Not_SinglePath;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return SinglePath_NotInPath;
+  }
+
+  return Not_SinglePath;
+} //singlePathTo
+
+// If there is a single path from srcBlk to dstBlk, return the last block
+// before dstBlk. If there is a single path from srcBlk to the end that does
+// not pass through dstBlk, return the last block in the path. Otherwise,
+// return NULL.
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
+                                      bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return srcBlk;
+  }
+
+  if (srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    BlockT *preBlk = srcBlk;
+
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == NULL) {
+      return preBlk;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return NULL;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  return NULL;
+
+} //singlePathEnd
+
+template<class PassT>
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
+                                               BlockT *dstBlk) {
+  int cloned = 0;
+  assert(preBlk->isSuccessor(srcBlk));
+  while (srcBlk && srcBlk != dstBlk) {
+    assert(srcBlk->succ_size() == 1);
+    if (srcBlk->pred_size() > 1) {
+      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
+      ++cloned;
+    }
+
+    preBlk = srcBlk;
+    srcBlk = *srcBlk->succ_begin();
+  }
+
+  return cloned;
+} //cloneOnSideEntryTo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
+                                                 BlockT *predBlk) {
+  assert(predBlk->isSuccessor(curBlk) &&
+         "curBlk is not a successor of predBlk");
+
+  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
+  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  //srcBlk, oldBlk, newBlk
+
+  predBlk->removeSuccessor(curBlk);
+  predBlk->addSuccessor(cloneBlk);
+
+  // add all successors to cloneBlk
+  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+
+  numClonedInstr += curBlk->size();
+
+  if (DEBUGME) {
+    errs() << "Cloned block: " << "BB"
+           << curBlk->getNumber() << " size " << curBlk->size() << "\n";
+  }
+
+  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
+
+  return cloneBlk;
+} //cloneBlockForPredecessor
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
+                                               BlockT *exitingBlk) {
+  BlockT *exitBlk = NULL;
+
+  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
+       iterSuccEnd = exitingBlk->succ_end();
+       iterSucc != iterSuccEnd; ++iterSucc) {
+    BlockT *curBlk = *iterSucc;
+    if (!loopRep->contains(curBlk)) {
+      assert(exitBlk == NULL);
+      exitBlk = curBlk;
+    }
+  }
+
+  assert(exitBlk != NULL);
+
+  return exitBlk;
+} //exitingBlock2ExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
+                                                BlockT *dstBlk,
+                                                InstrIterator insertPos) {
+  InstrIterator spliceEnd;
+  // look for the input branch instr, not the AMDIL branch instr
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+  if (branchInstr == NULL) {
+    if (DEBUGME) {
+      errs() << "migrateInstruction doesn't see a branch instr\n";
+    }
+    spliceEnd = srcBlk->end();
+  } else {
+    if (DEBUGME) {
+      errs() << "migrateInstruction sees a branch instr\n";
+      branchInstr->dump();
+    }
+    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
+  }
+  if (DEBUGME) {
+    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
+           << " srcSize = " << srcBlk->size() << "\n";
+  }
+
+  //splice insert before insertPos
+  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+
+  if (DEBUGME) {
+    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
+           << " srcSize = " << srcBlk->size() << "\n";
+  }
+} //migrateInstruction
+
+// normalizeInfiniteLoopExit changes
+//   B1:
+//        uncond_br LoopHeader
+//
+// to
+//   B1:
+//        cond_br 1 LoopHeader dummyExit
+// and returns the newly added dummy exit block.
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
+  BlockT *loopHeader;
+  BlockT *loopLatch;
+  loopHeader = LoopRep->getHeader();
+  loopLatch = LoopRep->getLoopLatch();
+  BlockT *dummyExitBlk = NULL;
+  if (loopHeader!=NULL && loopLatch!=NULL) {
+    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
+    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
+      dummyExitBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(dummyExitBlk);  //insert to function
+      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+
+      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
+
+      typename BlockT::iterator insertPos =
+        CFGTraits::getInstrPos(loopLatch, branchInstr);
+      unsigned immReg =
+        funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
+      InstrT *newInstr = 
+        CFGTraits::insertInstrBefore(insertPos, AMDIL::BRANCH_COND_i32, passRep);
+      MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false);
+
+      SHOWNEWINSTR(newInstr);
+
+      branchInstr->eraseFromParent();
+      loopLatch->addSuccessor(dummyExitBlk);
+    }
+  }
+
+  return dummyExitBlk;
+} //normalizeInfiniteLoopExit
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
+  InstrT *branchInstr;
+
+  // Two unconditional branches were observed in one basic block in the
+  // example test_fc_do_while_or.c; the upstream needs fixing so this loop
+  // can be removed.
+  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
+          && CFGTraits::isUncondBranch(branchInstr)) {
+    if (DEBUGME) {
+      errs() << "Removing unconditional branch instruction";
+      branchInstr->dump();
+    }
+    branchInstr->eraseFromParent();
+  }
+} //removeUnconditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
+  if (srcBlk->succ_size() == 2) {
+    BlockT *blk1 = *srcBlk->succ_begin();
+    BlockT *blk2 = *(++srcBlk->succ_begin());
+
+    if (blk1 == blk2) {
+      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+      if (DEBUGME) {
+        errs() << "Removing unneeded conditional branch instruction";
+        branchInstr->dump();
+      }
+      branchInstr->eraseFromParent();
+      SHOWNEWBLK(blk1, "Removing redundant successor");
+      srcBlk->removeSuccessor(blk1);
+    }
+  }
+} //removeRedundantConditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
+                                               DEFAULT_VEC_SLOTS> &retBlks) {
+  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(dummyExitBlk);  //insert to function
+  CFGTraits::insertInstrEnd(dummyExitBlk, AMDIL::RETURN, passRep);
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
+         retBlks.begin(),
+       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
+    if (curInstr) {
+      curInstr->eraseFromParent();
+    }
+#if 0
+    if (curBlk->size()==0 && curBlk->pred_size() == 1) {
+      if (DEBUGME) {
+        errs() << "Replace empty block BB" <<  curBlk->getNumber()
+          << " with dummyExitBlock\n";
+      }
+      BlockT *predb = *curBlk->pred_begin();
+      predb->removeSuccessor(curBlk);
+      curBlk = predb;
+    } //handle empty curBlk
+#endif
+    curBlk->addSuccessor(dummyExitBlk);
+    if (DEBUGME) {
+      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+             << " successors\n";
+    }
+  } //for
+
+  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
+} //addDummyExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
+  while (srcBlk->succ_size()) {
+    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->sccNum = sccNum;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
+  }
+
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->isRetired = true;
+  //int i = srcBlk->succ_size();
+  //int j = srcBlk->pred_size();
+  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+         && "can't retire block yet");
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return (srcBlkInfo && srcBlkInfo->isRetired);
+}
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+
+    if (loopLand == NULL)
+      return true;
+
+    BlockT *landBlk = loopLand->landBlk;
+    assert(landBlk);
+    if (!isRetiredBlock(landBlk)) {
+      return true;
+    }
+
+    loopRep = loopRep->getParentLoop();
+  }
+
+  return false;
+} //isActiveLoophead
+
+template<class PassT>
+bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
+  const unsigned blockSizeThreshold = 30;
+  const unsigned cloneInstrThreshold = 100;
+
+  bool multiplePreds = blk && (blk->pred_size() > 1);
+
+  if (!multiplePreds)
+    return false;
+
+  unsigned blkSize = blk->size();
+  return ((blkSize > blockSizeThreshold)
+          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
+} //needMigrateBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
+                                            BlockTSmallerVector &exitBlks,
+                                            std::set<BlockT *> &exitBlkSet) {
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
+
+  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+       predIterEnd = landBlk->pred_end();
+       predIter != predIterEnd; ++predIter) {
+    BlockT *curBlk = *predIter;
+    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
+      inpathBlks.push_back(curBlk);
+    }
+  } //for
+
+  //if landBlk has predecessors that are not in the given loop,
+  //create a new block
+  BlockT *newLandBlk = landBlk;
+  if (inpathBlks.size() != landBlk->pred_size()) {
+    newLandBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(newLandBlk);  //insert to function
+    newLandBlk->addSuccessor(landBlk);
+    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
+         inpathBlks.begin(),
+         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
+      BlockT *curBlk = *iter;
+      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
+      //srcBlk, oldBlk, newBlk
+      curBlk->removeSuccessor(landBlk);
+      curBlk->addSuccessor(newLandBlk);
+    }
+    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
+      if (exitBlks[i] == landBlk) {
+        exitBlks[i] = newLandBlk;
+      }
+    }
+    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
+  }
+
+  setLoopLandBlock(loopRep, newLandBlk);
+
+  return newLandBlk;
+} // recordLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  assert(theEntry->landBlk == NULL);
+
+  if (blk == NULL) {
+    blk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(blk);  //insert to function
+    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
+  }
+
+  theEntry->landBlk = blk;
+
+  if (DEBUGME) {
+    errs() << "setLoopLandBlock loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  landing-block = BB" << blk->getNumber() << "\n";
+  }
+} // setLoopLandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+
+  theEntry->breakOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contOnRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContOnReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContOnReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->breakInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopBreakInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopBreakInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->contInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopContInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopContInitReg
+
+template<class PassT>
+void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
+                                                     RegiT regNum) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  if (theEntry == NULL) {
+    theEntry = new LoopLandInfo();
+  }
+  theEntry->endbranchInitRegs.insert(regNum);
+
+  if (DEBUGME) {
+    errs() << "addLoopEndbranchInitReg loop-header = BB"
+           << loopRep->getHeader()->getNumber()
+           << "  regNum = " << regNum << "\n";
+  }
+} // addLoopEndbranchInitReg
+
+template<class PassT>
+typename CFGStructurizer<PassT>::LoopLandInfo *
+CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry;
+} // getLoopLandInfo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
+  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
+
+  return theEntry ? theEntry->landBlk : NULL;
+} // getLoopLandBlock
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  if (loopRep == NULL)
+    return false;
+
+  BlockT *loopHeader = loopRep->getHeader();
+
+  return curBlk->isSuccessor(loopHeader);
+
+} //hasBackEdge
+
+template<class PassT>
+unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
+  return loopRep ? loopRep->getLoopDepth() : 0;
+} //getLoopDepth
+
+template<class PassT>
+int CFGStructurizer<PassT>::countActiveBlock
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
+  int count = 0;
+  while (iterStart != iterEnd) {
+    if (!isRetiredBlock(*iterStart)) {
+      ++count;
+    }
+    ++iterStart;
+  }
+
+  return count;
+} //countActiveBlock
+
+// This is a workaround for findNearestCommonDominator not being available
+// for post-dominator trees; a proper fix should go into Dominators.h.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT*
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+
+  if (postDomTree->dominates(blk1, blk2)) {
+    return blk1;
+  }
+  if (postDomTree->dominates(blk2, blk1)) {
+    return blk2;
+  }
+
+  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
+  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+
+  // Handle newly cloned node.
+  if (node1 == NULL && blk1->succ_size() == 1) {
+    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
+  }
+  if (node2 == NULL && blk2->succ_size() == 1) {
+    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
+  }
+
+  if (node1 == NULL || node2 == NULL) {
+    return NULL;
+  }
+
+  node1 = node1->getIDom();
+  while (node1) {
+    if (postDomTree->dominates(node1, node2)) {
+      return node1->getBlock();
+    }
+    node1 = node1->getIDom();
+  }
+
+  return NULL;
+}
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::findNearestCommonPostDom
+(typename std::set<BlockT *> &blks) {
+  BlockT *commonDom;
+  typename std::set<BlockT *>::const_iterator iter = blks.begin();
+  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
+  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
+    BlockT *curBlk = *iter;
+    if (curBlk != commonDom) {
+      commonDom = findNearestCommonPostDom(curBlk, commonDom);
+    }
+  }
+
+  if (DEBUGME) {
+    errs() << "Common post dominator for exit blocks is ";
+    if (commonDom) {
+      errs() << "BB" << commonDom->getNumber() << "\n";
+    } else {
+      errs() << "NULL\n";
+    }
+  }
+
+  return commonDom;
+} //findNearestCommonPostDom
+
+} //end namespace llvm
+
+//todo: move-end
+
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer for AMDIL
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGStructurizer : public MachineFunctionPass
+{
+public:
+  typedef MachineInstr              InstructionType;
+  typedef MachineFunction           FunctionType;
+  typedef MachineBasicBlock         BlockType;
+  typedef MachineLoopInfo           LoopinfoType;
+  typedef MachineDominatorTree      DominatortreeType;
+  typedef MachinePostDominatorTree  PostDominatortreeType;
+  typedef MachineDomTreeNode        DomTreeNodeType;
+  typedef MachineLoop               LoopType;
+//private:
+  TargetMachine &TM;
+  const TargetInstrInfo *TII;
+
+//public:
+//  static char ID;
+
+public:
+  AMDILCFGStructurizer(char &pid, TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  const TargetInstrInfo *getTargetInstrInfo() const;
+  //bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGStructurizer
+
+//char AMDILCFGStructurizer::ID = 0;
+} //end of namespace llvm
+AMDILCFGStructurizer::AMDILCFGStructurizer(char &pid, TargetMachine &tm
+                                           AMDIL_OPT_LEVEL_DECL)
+: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()) {
+}
+
+const TargetInstrInfo *AMDILCFGStructurizer::getTargetInstrInfo() const {
+  return TII;
+}
+//===----------------------------------------------------------------------===//
+//
+// CFGPrepare
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPrepare : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPrepare
+
+char AMDILCFGPrepare::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPrepare::AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR) {
+}
+const char *AMDILCFGPrepare::getPassName() const {
+  return "AMD IL Control Flow Graph Preparation Pass";
+}
+
+void AMDILCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGPerform
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm
+{
+class AMDILCFGPerform : public AMDILCFGStructurizer
+{
+public:
+  static char ID;
+
+public:
+  AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  virtual const char *getPassName() const;
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnMachineFunction(MachineFunction &F);
+
+private:
+
+};   //end of class AMDILCFGPerform
+
+char AMDILCFGPerform::ID = 0;
+} //end of namespace llvm
+
+AMDILCFGPerform::AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+const char *AMDILCFGPerform::getPassName() const {
+  return "AMD IL Control Flow Graph structurizer Pass";
+}
+
+void AMDILCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved<MachineFunctionAnalysis>();
+  AU.addRequired<MachineFunctionAnalysis>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineLoopInfo>();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructTraits<AMDILCFGStructurizer>
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvmCFGStruct
+{
+// This class is tailored to the AMDIL backend.
+template<>
+struct CFGStructTraits<AMDILCFGStructurizer>
+{
+  typedef int RegiT;
+
+  static int getBreakNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBreakZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getBranchZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueNzeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALNZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+  static int getContinueZeroOpcode(int oldOpcode) {
+    switch(oldOpcode) {
+      ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALZ);
+    default:
+      assert(0 && "internal error");
+    };
+    return -1;
+  }
+
+// the explicitly represented branch target is the true branch target
+#define getExplicitBranch getTrueBranch
+#define setExplicitBranch setTrueBranch
+
+  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
+    return instr->getOperand(0).getMBB();
+  }
+
+  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
+    instr->getOperand(0).setMBB(blk);
+  }
+
+  static MachineBasicBlock *
+  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(blk->succ_size() == 2);
+    MachineBasicBlock *trueBranch = getTrueBranch(instr);
+    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
+    MachineBasicBlock::succ_iterator iterNext = iter;
+    ++iterNext;
+
+    return (*iter == trueBranch) ? *iterNext : *iter;
+  }
+
+  static bool isCondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isUncondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDIL::BRANCH:
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isPhimove(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+      ExpandCaseToAllTypes(AMDIL::MOVE);
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
+    //get the DebugLoc of the last instruction in the block with debug info
+    DebugLoc DL;
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end();
+         ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (!instr->getDebugLoc().isUnknown()) {
+        DL = instr->getDebugLoc();
+      }
+    }
+    return DL;
+  }
+
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (isCondBranch(instr) || isUncondBranch(instr)) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  //
+  // A BB with a backward-edge could have move instructions after the branch
+  // instruction.  Such move instructions "belong to" the loop backward-edge.
+  //
+  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
+         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
+      // FIXME: Simplify
+      MachineInstr *instr = &*iter;
+      if (instr) {
+        if (isCondBranch(instr) || isUncondBranch(instr)) {
+          return instr;
+        } else if (!isPhimove(instr)) {
+          break;
+        }
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDIL::RETURN) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    if (iter != blk->rend()) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDIL::CONTINUE) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end();
+         ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getOpcode() == AMDIL::BREAK_LOGICALNZ_i32
+          || instr->getOpcode() == AMDIL::BREAK_LOGICALZ_i32) {
+        return instr;
+      }
+    }
+    return NULL;
+  }
+
+  static bool isReturnBlock(MachineBasicBlock *blk) {
+    MachineInstr *instr = getReturnInstr(blk);
+    bool isReturn = (blk->succ_size() == 0);
+    if (instr) {
+      assert(isReturn);
+    } else if (isReturn) {
+      if (DEBUGME) {
+        errs() << "BB" << blk->getNumber()
+               << " is a return block without a RETURN instr\n";
+      }
+    }
+
+    return isReturn;
+  }
+
+  static MachineBasicBlock::iterator
+  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(instr->getParent() == blk && "instruction doesn't belong to block");
+    MachineBasicBlock::iterator iter = blk->begin();
+    MachineBasicBlock::iterator iterEnd = blk->end();
+    while (iter != iterEnd && &(*iter) != instr) {
+      ++iter;
+    }
+
+    assert(iter != iterEnd);
+    return iter;
+  }//getInstrPos
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep) {
+    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrBefore
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
+                                         AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep) {
+    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
+  } //insertInstrEnd
+
+  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
+                             AMDILCFGStructurizer *passRep, DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    blk->push_back(newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+  } //insertInstrEnd
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
+                                         int newOpcode, 
+                                         AMDILCFGStructurizer *passRep) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DebugLoc());
+
+    blk->insert(instrPos, newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     DebugLoc DL) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DL);
+
+    blk->insert(instrPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
+                                         false);
+
+    SHOWNEWINSTR(newInstr);
+    //erase later oldInstr->eraseFromParent();
+  } //insertCondBranchBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock *blk,
+                                     MachineBasicBlock::iterator insertPos,
+                                     int newOpcode,
+                                     AMDILCFGStructurizer *passRep,
+                                     RegiT regNum,
+                                     DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    //insert before
+    blk->insert(insertPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchBefore
+
+  static void insertCondBranchEnd(MachineBasicBlock *blk,
+                                  int newOpcode,
+                                  AMDILCFGStructurizer *passRep,
+                                  RegiT regNum) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
+
+    blk->push_back(newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchEnd
+
+
+  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32),
+                                           DebugLoc());
+    MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addImm(regVal); //set src value
+
+    blk->insert(instrPos, newInstr);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertAssignInstrBefore(MachineBasicBlock *blk,
+                                      AMDILCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32),
+                                           DebugLoc());
+    MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addImm(regVal); //set src value
+
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+  } //insertAssignInstrBefore
+
+  static void insertCompareInstrBefore(MachineBasicBlock *blk,
+                                       MachineBasicBlock::iterator instrPos,
+                                       AMDILCFGStructurizer *passRep,
+                                       RegiT dstReg, RegiT src1Reg,
+                                       RegiT src2Reg) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(AMDIL::IEQ), DebugLoc());
+
+    MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
+    MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
+
+    blk->insert(instrPos, newInstr);
+    SHOWNEWINSTR(newInstr);
+
+  } //insertCompareInstrBefore
+
+  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
+                                 MachineBasicBlock *srcBlk) {
+    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
+         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
+      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
+    }
+  } //cloneSuccessorList
+
+  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
+    MachineFunction *func = srcBlk->getParent();
+    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
+    func->push_back(newBlk);  //insert to function
+    //newBlk->setNumber(srcBlk->getNumber());
+    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
+         iterEnd = srcBlk->end();
+         iter != iterEnd; ++iter) {
+      MachineInstr *instr = func->CloneMachineInstr(iter);
+      // This is a workaround for LLVM bugzilla 8420 because CloneMachineInstr
+      // does not clone the AsmPrinterFlags.
+      instr->setAsmPrinterFlag(
+         (llvm::MachineInstr::CommentFlag)iter->getAsmPrinterFlags());
+      newBlk->push_back(instr);
+    }
+    return newBlk;
+  }
+
+  //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose here
+  //because the AMDIL branch instruction is not recognized as a terminator.
+  //TODO: fix that and retire this routine.
+  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
+                                         MachineBasicBlock *oldBlk,
+                                         MachineBasicBlock *newBlk) {
+    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
+    if (branchInstr && isCondBranch(branchInstr) &&
+        getExplicitBranch(branchInstr) == oldBlk) {
+      setExplicitBranch(branchInstr, newBlk);
+    }
+  }
+
+  static void wrapup(MachineBasicBlock *entryBlk) {
+    assert((!entryBlk->getParent()->getJumpTableInfo()
+            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
+           && "found a jump table");
+
+     //collect continue right before endloop
+     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
+     MachineBasicBlock::iterator pre = entryBlk->begin();
+     MachineBasicBlock::iterator iterEnd = entryBlk->end();
+     MachineBasicBlock::iterator iter = pre;
+     while (iter != iterEnd) {
+       if (pre->getOpcode() == AMDIL::CONTINUE
+           && iter->getOpcode() == AMDIL::ENDLOOP) {
+         contInstr.push_back(pre);
+       }
+       pre = iter;
+       ++iter;
+     } //end while
+
+     //delete continue right before endloop
+     for (unsigned i = 0; i < contInstr.size(); ++i) {
+        contInstr[i]->eraseFromParent();
+     }
+
+     // TODO: fix up the jump table so a later phase won't be confused.
+     // If jumpTableInfo->isEmpty() is false we need to clean the jump table,
+     // but there isn't such an interface yet; alternatively, replace all the
+     // other blocks in the jump table with the entryBlk.
+
+  } //wrapup
+
+  static MachineDominatorTree *getDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineDominatorTree>();
+  }
+
+  static MachinePostDominatorTree*
+  getPostDominatorTree(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachinePostDominatorTree>();
+  }
+
+  static MachineLoopInfo *getLoopInfo(AMDILCFGStructurizer &pass) {
+    return &pass.getAnalysis<MachineLoopInfo>();
+  }
+}; // template class CFGStructTraits
+} //end of namespace llvm
+
+// createAMDILCFGPreparationPass- Returns a pass
+FunctionPass *llvm::createAMDILCFGPreparationPass(TargetMachine &tm
+                                                  AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPrepare(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPrepare::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().prepare(func,
+                                                                        *this);
+}
+
+// createAMDILCFGStructurizerPass- Returns a pass
+FunctionPass *llvm::createAMDILCFGStructurizerPass(TargetMachine &tm
+                                                   AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILCFGPerform(tm  AMDIL_OPT_LEVEL_VAR);
+}
+
+bool AMDILCFGPerform::runOnMachineFunction(MachineFunction &func) {
+  return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().run(func,
+                                                                    *this);
+}
+
+//end of file newline goes below
+
diff --git a/src/gallium/drivers/radeon/AMDILCallingConv.td b/src/gallium/drivers/radeon/AMDILCallingConv.td
new file mode 100644 (file)
index 0000000..c37ff0a
--- /dev/null
@@ -0,0 +1,75 @@
+//===- AMDILCallingConv.td - Calling Conventions AMDIL -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMDIL architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// AMDIL 32-bit C return-value convention.
+def RetCC_AMDIL32 : CallingConv<[
+ // Since IL has no return values, all values can be emulated on the stack
+ // The stack can then be mapped to a number of sequential virtual registers
+ // in IL
+
+ // Integer and FP scalar values get put on the stack at 16-byte alignment
+ // but with a size of 4 bytes
+ CCIfType<[i1, i8, i16, i32, f32, f64, i64], CCAssignToReg<
+ [
+ R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element short vector types get 16-byte alignment and a size of 8 bytes
+ CCIfType<[v2i32, v2f32, v2i8, v4i8, v2i16, v4i16], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 4-element short vector types get 16-byte alignment and a size of 16 bytes
+ CCIfType<[v4i32, v4f32], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element 64-bit vector types get aligned to 16 bytes with a size of 16 bytes
+ CCIfType<[v2f64, v2i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >, CCAssignToStack<16, 16>
+]>;
+
+// AMDIL 32-bit C Calling convention.
+def CC_AMDIL32 : CallingConv<[
+  // Since IL has parameter values, all values can be emulated on the stack.
+ // The stack can then be mapped to a number of sequential virtual registers
+ // in IL.
+ // Integer and FP scalar values get put on the stack at 16-byte alignment
+ // but with a size of 4 bytes
+ CCIfType<[i1, i8, i16, i32, f32, f64, i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element short vector types get 16-byte alignment and a size of 8 bytes
+ CCIfType<[v2i32, v2f32, v2i8, v4i8, v2i16, v4i16], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 4-element short vector types get 16-byte alignment and a size of 16 bytes
+ CCIfType<[v4i32, v4f32], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element 64-bit vector types get aligned to 16 bytes with a size of 16 bytes
+ CCIfType<[v2f64, v2i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, 
R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, 
R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >, CCAssignToStack<16, 16>
+]>;
+
diff --git a/src/gallium/drivers/radeon/AMDILCodeEmitter.h b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
new file mode 100644 (file)
index 0000000..b0ea145
--- /dev/null
@@ -0,0 +1,41 @@
+//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#ifndef AMDILCODEEMITTER_H
+#define AMDILCODEEMITTER_H
+
+namespace llvm {
+
+  /* XXX: Temp HACK to work around tablegen name generation */
+  class AMDILCodeEmitter {
+  public:
+    uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                   const MachineOperand &MO) const { return 0; }
+    virtual unsigned GPR4AlignEncode(const MachineInstr  &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI,
+                                     unsigned OpNo) const {
+      return 0;
+    }
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const {
+      return Value;
+    }
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI,
+                                      unsigned OpNo) const {
+      return 0;
+    }
+  };
+
+} // End namespace llvm
+
+#endif // AMDILCODEEMITTER_H
diff --git a/src/gallium/drivers/radeon/AMDILCompilerErrors.h b/src/gallium/drivers/radeon/AMDILCompilerErrors.h
new file mode 100644 (file)
index 0000000..7d935f5
--- /dev/null
@@ -0,0 +1,75 @@
+//===-- AMDILCompilerErrors.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDIL_COMPILER_ERRORS_H_
+#define _AMDIL_COMPILER_ERRORS_H_
+// Compiler errors generated by the backend that will cause
+// the runtime to abort compilation. These are mainly for
+// device constraint violations or invalid code.
+namespace amd {
+
+#define INVALID_COMPUTE 0
+#define GENERIC_ERROR 1
+#define INTERNAL_ERROR 2
+#define MISSING_FUNCTION_CALL 3
+#define RESERVED_FUNCTION 4
+#define BYTE_STORE_ERROR 5
+#define UNKNOWN_TYPE_NAME 6
+#define NO_IMAGE_SUPPORT 7
+#define NO_ATOMIC_32 8
+#define NO_ATOMIC_64 9
+#define IRREDUCIBLE_CF 10
+#define INSUFFICIENT_RESOURCES 11
+#define INSUFFICIENT_LOCAL_RESOURCES 12
+#define INSUFFICIENT_PRIVATE_RESOURCES 13
+#define INSUFFICIENT_IMAGE_RESOURCES 14
+#define DOUBLE_NOT_SUPPORTED 15
+#define INVALID_CONSTANT_WRITE 16
+#define INSUFFICIENT_CONSTANT_RESOURCES 17
+#define INSUFFICIENT_COUNTER_RESOURCES 18
+#define INSUFFICIENT_REGION_RESOURCES 19
+#define REGION_MEMORY_ERROR 20
+#define MEMOP_NO_ALLOCATION 21
+#define RECURSIVE_FUNCTION 22
+#define INCORRECT_COUNTER_USAGE 23
+#define INVALID_INTRINSIC_USAGE 24
+#define NUM_ERROR_MESSAGES 25
+
+
+  static const char *CompilerErrorMessage[NUM_ERROR_MESSAGES] =
+  {
+    "E000:Compute Shader Not Supported!   ",
+    "E001:Generic Compiler Error Message! ",
+    "E002:Internal Compiler Error Message!",
+    "E003:Missing Function Call Detected! ",
+    "E004:Reserved Function Call Detected!",
+    "E005:Byte Addressable Stores Invalid!",
+    "E006:Kernel Arg Type Name Is Invalid!",
+    "E007:Image 1.0 Extension Unsupported!",
+    "E008:32bit Atomic Op are Unsupported!",
+    "E009:64bit Atomic Op are Unsupported!",
+    "E010:Irreducible ControlFlow Detected",
+    "E011:Insufficient Resources Detected!",
+    "E012:Insufficient Local Resources!   ",
+    "E013:Insufficient Private Resources! ",
+    "E014:Images not currently supported! ",
+    "E015:Double precision not supported! ",
+    "E016:Invalid Constant Memory Write!  ",
+    "E017:Max number Constant Ptr reached!",
+    "E018:Max number of Counters reached! ",
+    "E019:Insufficient Region Resources!  ",
+    "E020:Region address space invalid!   ",
+    "E021:MemOp with no memory allocated! ",
+    "E022:Recursive Function detected!    ",
+    "E023:Illegal Inc+Dec to same counter!",
+    "E024:Illegal usage of intrinsic inst!"
+  };
+
+}
+
+#endif // _AMDIL_COMPILER_ERRORS_H_
diff --git a/src/gallium/drivers/radeon/AMDILCompilerWarnings.h b/src/gallium/drivers/radeon/AMDILCompilerWarnings.h
new file mode 100644 (file)
index 0000000..c257980
--- /dev/null
@@ -0,0 +1,31 @@
+//===-- AMDILCompilerWarnings.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDIL_COMPILER_WARNINGS_H_
+#define _AMDIL_COMPILER_WARNINGS_H_
+/// Compiler backend generated warnings that might cause
+/// issues with compilation. These warnings become errors if
+/// -Werror is specified on the command line.
+namespace amd {
+
+#define LIMIT_BARRIER 0
+#define BAD_BARRIER_OPT 1
+#define RECOVERABLE_ERROR 2
+#define NUM_WARN_MESSAGES 3
+    /// All warnings must be prefixed with the W token or they might be
+    /// treated as errors.
+    static const char *CompilerWarningMessage[NUM_WARN_MESSAGES] =
+    {
+        "W000:Barrier caused limited groupsize",
+        "W001:Dangerous Barrier Opt Detected! ",
+        "W002:Recoverable BE Error Detected!  "
+
+    };
+}
+
+#endif // _AMDIL_COMPILER_WARNINGS_H_
diff --git a/src/gallium/drivers/radeon/AMDILConversions.td b/src/gallium/drivers/radeon/AMDILConversions.td
new file mode 100644 (file)
index 0000000..0db66ae
--- /dev/null
@@ -0,0 +1,1022 @@
+//===-- AMDILConversions.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+def actos_i16:Pat < (i16 (anyext GPRI8:$src)),
+(IL_ASSHORT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def uctos_i16:Pat < (i16 (zext GPRI8:$src)),
+(IL_ASSHORT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def sctos_i16:Pat < (i16 (sext GPRI8:$src)),
+(IL_ASSHORT_i32
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def actoi_i32:Pat < (i32 (anyext GPRI8:$src)),
+(IL_ASINT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def uctoi_i32:Pat < (i32 (zext GPRI8:$src)),
+(IL_ASINT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def sctoi_i32:Pat < (i32 (sext GPRI8:$src)),
+(IL_ASINT_i32
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))) >;
+
+
+def actol_i64:Pat < (i64 (anyext GPRI8:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24)),
+ (LOADCONST_i32 0)) >;
+
+
+def uctol_i64:Pat < (i64 (zext GPRI8:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24)),
+ (LOADCONST_i32 0)) >;
+
+
+def sctol_i64:Pat < (i64 (sext GPRI8:$src)),
+(LCREATE
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24)),
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 31))) >;
+
+
+def astoi_i32:Pat < (i32 (anyext GPRI16:$src)),
+(IL_ASINT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16))) >;
+
+
+def ustoi_i32:Pat < (i32 (zext GPRI16:$src)),
+(IL_ASINT_i32
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16))) >;
+
+
+def sstoi_i32:Pat < (i32 (sext GPRI16:$src)),
+(IL_ASINT_i32
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16))) >;
+
+
+def astol_i64:Pat < (i64 (anyext GPRI16:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16)),
+ (LOADCONST_i32 0)) >;
+
+
+def ustol_i64:Pat < (i64 (zext GPRI16:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16)),
+ (LOADCONST_i32 0)) >;
+
+
+def sstol_i64:Pat < (i64 (sext GPRI16:$src)),
+(LCREATE
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16)),
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 31))) >;
+
+
+def aitol_i64:Pat < (i64 (anyext GPRI32:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i32 GPRI32:$src),
+   (LOADCONST_i32 0)),
+  (LOADCONST_i32 0)),
+ (LOADCONST_i32 0)) >;
+
+
+def uitol_i64:Pat < (i64 (zext GPRI32:$src)),
+(LCREATE
+ (USHR_i32
+  (SHL_i32
+(IL_ASINT_i32 GPRI32:$src),
+   (LOADCONST_i32 0)),
+  (LOADCONST_i32 0)),
+ (LOADCONST_i32 0)) >;
+
+
+def sitol_i64:Pat < (i64 (sext GPRI32:$src)),
+(LCREATE
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i32 GPRI32:$src),
+   (LOADCONST_i32 0)),
+  (LOADCONST_i32 0)),
+ (SHR_i32
+  (SHL_i32
+(IL_ASINT_i32 GPRI32:$src),
+   (LOADCONST_i32 0)),
+  (LOADCONST_i32 31))) >;
+
+
+
+def sctof_f32:Pat < (f32 (sint_to_fp GPRI8:$src)),
+(f32
+ (ITOF
+  (SHR_i32
+   (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+    (LOADCONST_i32 24)),
+   (LOADCONST_i32 24)))) >;
+
+
+def uctof_f32:Pat < (f32 (uint_to_fp GPRI8:$src)),
+(f32
+ (UTOF
+  (USHR_i32
+   (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+    (LOADCONST_i32 24)),
+   (LOADCONST_i32 24)))) >;
+
+
+def ftosc_i8:Pat < (i8 (fp_to_sint GPRF32:$src)),
+(i8
+ (IL_ASCHAR_i32
+  (BINARY_AND_i32
+(FTOI GPRF32:$src),
+   (LOADCONST_i32 0x000000FF)))) >;
+
+
+def ftouc_i8:Pat < (i8 (fp_to_uint GPRF32:$src)),
+(i8
+ (IL_ASCHAR_i32
+  (BINARY_AND_i32
+(FTOU GPRF32:$src),
+   (LOADCONST_i32 0x000000FF)))) >;
+
+
+def sctod_f64:Pat < (f64 (sint_to_fp GPRI8:$src)),
+(f64 (FTOD
+      (ITOF
+       (SHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))))) >;
+
+
+def uctod_f64:Pat < (f64 (uint_to_fp GPRI8:$src)),
+(f64 (FTOD
+      (UTOF
+       (USHR_i32
+  (SHL_i32
+(IL_ASINT_i8 GPRI8:$src),
+   (LOADCONST_i32 24)),
+  (LOADCONST_i32 24))))) >;
+
+
+def dtosc_i8:Pat < (i8 (fp_to_sint GPRF64:$src)),
+(i8
+ (IL_ASCHAR_i32
+  (BINARY_AND_i32
+(FTOI (DTOF GPRF64:$src)),
+   (LOADCONST_i32 0x000000FF)))) >;
+
+
+def dtouc_i8:Pat < (i8 (fp_to_uint GPRF64:$src)),
+(i8
+ (IL_ASCHAR_i32
+  (BINARY_AND_i32
+(FTOU (DTOF GPRF64:$src)),
+   (LOADCONST_i32 0x000000FF)))) >;
+
+
+def sstof_f32:Pat < (f32 (sint_to_fp GPRI16:$src)),
+(f32
+ (ITOF
+  (SHR_i32
+   (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+    (LOADCONST_i32 16)),
+   (LOADCONST_i32 16)))) >;
+
+
+def ustof_f32:Pat < (f32 (uint_to_fp GPRI16:$src)),
+(f32
+ (UTOF
+  (USHR_i32
+   (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+    (LOADCONST_i32 16)),
+   (LOADCONST_i32 16)))) >;
+
+
+def ftoss_i16:Pat < (i16 (fp_to_sint GPRF32:$src)),
+(i16
+ (IL_ASSHORT_i32
+  (BINARY_AND_i32
+(FTOI GPRF32:$src),
+   (LOADCONST_i32 0x0000FFFF)))) >;
+
+
+def ftous_i16:Pat < (i16 (fp_to_uint GPRF32:$src)),
+(i16
+ (IL_ASSHORT_i32
+  (BINARY_AND_i32
+(FTOU GPRF32:$src),
+   (LOADCONST_i32 0x0000FFFF)))) >;
+
+
+def sstod_f64:Pat < (f64 (sint_to_fp GPRI16:$src)),
+(f64 (FTOD
+      (ITOF
+       (SHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16))))) >;
+
+
+def ustod_f64:Pat < (f64 (uint_to_fp GPRI16:$src)),
+(f64 (FTOD
+      (UTOF
+       (USHR_i32
+  (SHL_i32
+(IL_ASINT_i16 GPRI16:$src),
+   (LOADCONST_i32 16)),
+  (LOADCONST_i32 16))))) >;
+
+
+def dtoss_i16:Pat < (i16 (fp_to_sint GPRF64:$src)),
+(i16
+ (IL_ASSHORT_i32
+  (BINARY_AND_i32
+(FTOI (DTOF GPRF64:$src)),
+   (LOADCONST_i32 0x0000FFFF)))) >;
+
+
+def dtous_i16:Pat < (i16 (fp_to_uint GPRF64:$src)),
+(i16
+ (IL_ASSHORT_i32
+  (BINARY_AND_i32
+(FTOU (DTOF GPRF64:$src)),
+   (LOADCONST_i32 0x0000FFFF)))) >;
+
+
+
+
+
+def stoc_i8:Pat < (i8 (trunc GPRI16:$src)),
+(IL_ASCHAR_i32
+   (IL_ASINT_i16
+(BINARY_AND_i16 GPRI16:$src,
+     (LOADCONST_i16 0x000000FF)))
+  ) >;
+
+
+def itoc_i8:Pat < (i8 (trunc GPRI32:$src)),
+(IL_ASCHAR_i32
+   (IL_ASINT_i32
+(BINARY_AND_i32 GPRI32:$src,
+     (LOADCONST_i32 0x000000FF)))
+  ) >;
+
+
+def itos_i16:Pat < (i16 (trunc GPRI32:$src)),
+(IL_ASSHORT_i32
+   (IL_ASINT_i32
+(BINARY_AND_i32 GPRI32:$src,
+     (LOADCONST_i32 0x0000FFFF)))
+  ) >;
+
+
+def ltoc_i8:Pat < (i8 (trunc GPRI64:$src)),
+(IL_ASCHAR_i32
+   (BINARY_AND_i32
+(LLO GPRI64:$src),
+    (LOADCONST_i32 0x000000FF))
+  ) >;
+
+
+def ltos_i16:Pat < (i16 (trunc GPRI64:$src)),
+(IL_ASSHORT_i32
+   (BINARY_AND_i32
+(LLO GPRI64:$src),
+    (LOADCONST_i32 0x0000FFFF))
+  ) >;
+
+
+def ltoi_i32:Pat < (i32 (trunc GPRI64:$src)),
+(IL_ASINT_i32
+   (BINARY_AND_i32
+(LLO GPRI64:$src),
+    (LOADCONST_i32 0xFFFFFFFF))
+  ) >;
+
+
+def actos_v2i16:Pat < (v2i16 (anyext GPRV2I8:$src)),
+(IL_ASV2SHORT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def uctos_v2i16:Pat < (v2i16 (zext GPRV2I8:$src)),
+(IL_ASV2SHORT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def sctos_v2i16:Pat < (v2i16 (sext GPRV2I8:$src)),
+(IL_ASV2SHORT_v2i32
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def actoi_v2i32:Pat < (v2i32 (anyext GPRV2I8:$src)),
+(IL_ASV2INT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def uctoi_v2i32:Pat < (v2i32 (zext GPRV2I8:$src)),
+(IL_ASV2INT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def sctoi_v2i32:Pat < (v2i32 (sext GPRV2I8:$src)),
+(IL_ASV2INT_v2i32
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24)))) >;
+
+
+def actol_v2i64:Pat < (v2i64 (anyext GPRV2I8:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def uctol_v2i64:Pat < (v2i64 (zext GPRV2I8:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def sctol_v2i64:Pat < (v2i64 (sext GPRV2I8:$src)),
+(LCREATE_v2i64
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 24))),
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 24))),
+  (VCREATE_v2i32 (LOADCONST_i32 31)))) >;
+
+
+def astoi_v2i32:Pat < (v2i32 (anyext GPRV2I16:$src)),
+(IL_ASV2INT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16)))) >;
+
+
+def ustoi_v2i32:Pat < (v2i32 (zext GPRV2I16:$src)),
+(IL_ASV2INT_v2i32
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16)))) >;
+
+
+def sstoi_v2i32:Pat < (v2i32 (sext GPRV2I16:$src)),
+(IL_ASV2INT_v2i32
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16)))) >;
+
+
+def astol_v2i64:Pat < (v2i64 (anyext GPRV2I16:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def ustol_v2i64:Pat < (v2i64 (zext GPRV2I16:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def sstol_v2i64:Pat < (v2i64 (sext GPRV2I16:$src)),
+(LCREATE_v2i64
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 16))),
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 16))),
+  (VCREATE_v2i32 (LOADCONST_i32 31)))) >;
+
+
+def aitol_v2i64:Pat < (v2i64 (anyext GPRV2I32:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i32 GPRV2I32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0))),
+  (VCREATE_v2i32 (LOADCONST_i32 0))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def uitol_v2i64:Pat < (v2i64 (zext GPRV2I32:$src)),
+(LCREATE_v2i64
+ (USHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i32 GPRV2I32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0))),
+  (VCREATE_v2i32 (LOADCONST_i32 0))),
+ (VCREATE_v2i32 (LOADCONST_i32 0))) >;
+
+
+def sitol_v2i64:Pat < (v2i64 (sext GPRV2I32:$src)),
+(LCREATE_v2i64
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i32 GPRV2I32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0))),
+  (VCREATE_v2i32 (LOADCONST_i32 0))),
+ (SHRVEC_v2i32
+  (SHLVEC_v2i32
+(IL_ASV2INT_v2i32 GPRV2I32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0))),
+  (VCREATE_v2i32 (LOADCONST_i32 31)))) >;
+
+
+
+def sctof_v2f32:Pat < (v2f32 (sint_to_fp GPRV2I8:$src)),
+(v2f32
+ (ITOF_v2f32
+  (SHRVEC_v2i32
+   (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 24))),
+   (VCREATE_v2i32 (LOADCONST_i32 24))))) >;
+
+
+def uctof_v2f32:Pat < (v2f32 (uint_to_fp GPRV2I8:$src)),
+(v2f32
+ (UTOF_v2f32
+  (USHRVEC_v2i32
+   (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 24))),
+   (VCREATE_v2i32 (LOADCONST_i32 24))))) >;
+
+
+def ftosc_v2i8:Pat < (v2i8 (fp_to_sint GPRV2F32:$src)),
+(v2i8
+ (IL_ASV2CHAR_v2i32
+  (BINARY_AND_v2i32
+(FTOI_v2i32 GPRV2F32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >;
+
+
+def ftouc_v2i8:Pat < (v2i8 (fp_to_uint GPRV2F32:$src)),
+(v2i8
+ (IL_ASV2CHAR_v2i32
+  (BINARY_AND_v2i32
+(FTOU_v2i32 GPRV2F32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >;
+
+def sctod_v2f64:Pat < (v2f64 (sint_to_fp GPRV2I8:$src)),
+(v2f64 
+ (VINSERT_v2f64
+ (VCREATE_v2f64 
+ (FTOD
+  (VEXTRACT_v2f32
+  (ITOF_v2f32
+   (SHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 24))),
+    (VCREATE_v2i32 (LOADCONST_i32 24)))),
+  1)
+ )),
+ (FTOD
+  (VEXTRACT_v2f32
+  (ITOF_v2f32
+   (SHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 24))),
+    (VCREATE_v2i32 (LOADCONST_i32 24)))),
+  2)
+  ), 1, 256)
+ ) >;
+
+def uctod_v2f64:Pat < (v2f64 (uint_to_fp GPRV2I8:$src)),
+(v2f64 
+ (VINSERT_v2f64
+ (VCREATE_v2f64 
+ (FTOD
+  (VEXTRACT_v2f32
+  (UTOF_v2f32
+   (USHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 24))),
+    (VCREATE_v2i32 (LOADCONST_i32 24)))),
+  1)
+ )),
+ (FTOD
+  (VEXTRACT_v2f32
+  (UTOF_v2f32
+   (USHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i8 GPRV2I8:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 24))),
+    (VCREATE_v2i32 (LOADCONST_i32 24)))),
+  2)
+  ), 1, 256)
+ ) >;
+
+
+def dtosc_v2i8:Pat < (v2i8 (fp_to_sint GPRV2F64:$src)),
+(v2i8
+ (IL_ASV2CHAR_v2i32
+  (BINARY_AND_v2i32
+(FTOI_v2i32 (VINSERT_v2f32 
+             (VCREATE_v2f32 
+              (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))),
+             (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)),
+   (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >;
+
+
+def dtouc_v2i8:Pat < (v2i8 (fp_to_uint GPRV2F64:$src)),
+(v2i8
+ (IL_ASV2CHAR_v2i32
+  (BINARY_AND_v2i32
+(FTOU_v2i32 (VINSERT_v2f32 
+             (VCREATE_v2f32 
+              (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))),
+             (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)),
+   (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >;
+
+
+def sstof_v2f32:Pat < (v2f32 (sint_to_fp GPRV2I16:$src)),
+(v2f32
+ (ITOF_v2f32
+  (SHRVEC_v2i32
+   (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 16))),
+   (VCREATE_v2i32 (LOADCONST_i32 16))))) >;
+
+
+def ustof_v2f32:Pat < (v2f32 (uint_to_fp GPRV2I16:$src)),
+(v2f32
+ (UTOF_v2f32
+  (USHRVEC_v2i32
+   (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 16))),
+   (VCREATE_v2i32 (LOADCONST_i32 16))))) >;
+
+
+def ftoss_v2i16:Pat < (v2i16 (fp_to_sint GPRV2F32:$src)),
+(v2i16
+ (IL_ASV2SHORT_v2i32
+  (BINARY_AND_v2i32
+(FTOI_v2i32 GPRV2F32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+
+def ftous_v2i16:Pat < (v2i16 (fp_to_uint GPRV2F32:$src)),
+(v2i16
+ (IL_ASV2SHORT_v2i32
+  (BINARY_AND_v2i32
+(FTOU_v2i32 GPRV2F32:$src),
+   (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+
+def sstod_v2f64:Pat < (v2f64 (sint_to_fp GPRV2I16:$src)),
+(v2f64 
+ (VINSERT_v2f64
+ (VCREATE_v2f64 
+ (FTOD
+  (VEXTRACT_v2f32
+  (ITOF_v2f32
+   (SHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 16))),
+    (VCREATE_v2i32 (LOADCONST_i32 16)))),
+  1)
+ )),
+ (FTOD
+  (VEXTRACT_v2f32
+  (ITOF_v2f32
+   (SHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 16))),
+    (VCREATE_v2i32 (LOADCONST_i32 16)))),
+  2)
+  ), 1, 256)
+ ) >;
+
+def ustod_v2f64:Pat < (v2f64 (uint_to_fp GPRV2I16:$src)),
+(v2f64 
+ (VINSERT_v2f64
+ (VCREATE_v2f64 
+ (FTOD
+  (VEXTRACT_v2f32
+  (UTOF_v2f32
+   (USHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 16))),
+    (VCREATE_v2i32 (LOADCONST_i32 16)))),
+  1)
+ )),
+ (FTOD
+  (VEXTRACT_v2f32
+  (UTOF_v2f32
+   (USHRVEC_v2i32
+    (SHLVEC_v2i32
+(IL_ASV2INT_v2i16 GPRV2I16:$src),
+     (VCREATE_v2i32 (LOADCONST_i32 16))),
+    (VCREATE_v2i32 (LOADCONST_i32 16)))),
+  2)
+  ), 1, 256)
+ ) >;
+
+
+def dtoss_v2i16:Pat < (v2i16 (fp_to_sint GPRV2F64:$src)),
+(v2i16
+ (IL_ASV2SHORT_v2i32
+  (BINARY_AND_v2i32
+(FTOI_v2i32 (VINSERT_v2f32 
+             (VCREATE_v2f32 
+              (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))),
+             (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)),
+   (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+
+def dtous_v2i16:Pat < (v2i16 (fp_to_uint GPRV2F64:$src)),
+(v2i16
+ (IL_ASV2SHORT_v2i32
+  (BINARY_AND_v2i32
+(FTOU_v2i32 (VINSERT_v2f32 
+             (VCREATE_v2f32 
+              (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))),
+             (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)),
+   (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+def stoc_v2i8:Pat < (v2i8 (trunc GPRV2I16:$src)),
+(IL_ASV2CHAR_v2i32
+   (IL_ASV2INT_v2i16
+(BINARY_AND_v2i16 GPRV2I16:$src,
+     (VCREATE_v2i16 (LOADCONST_i16 0x000000FF))))
+  ) >;
+
+
+def itoc_v2i8:Pat < (v2i8 (trunc GPRV2I32:$src)),
+(IL_ASV2CHAR_v2i32
+   (IL_ASV2INT_v2i32
+(BINARY_AND_v2i32 GPRV2I32:$src,
+     (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))
+  ) >;
+
+
+def itos_v2i16:Pat < (v2i16 (trunc GPRV2I32:$src)),
+(IL_ASV2SHORT_v2i32
+   (IL_ASV2INT_v2i32
+(BINARY_AND_v2i32 GPRV2I32:$src,
+     (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))
+  ) >;
+
+
+def ltoc_v2i8:Pat < (v2i8 (trunc GPRV2I64:$src)),
+(IL_ASV2CHAR_v2i32
+   (BINARY_AND_v2i32
+(LLO_v2i64 GPRV2I64:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 0x000000FF)))
+  ) >;
+
+
+def ltos_v2i16:Pat < (v2i16 (trunc GPRV2I64:$src)),
+(IL_ASV2SHORT_v2i32
+   (BINARY_AND_v2i32
+(LLO_v2i64 GPRV2I64:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF)))
+  ) >;
+
+
+def ltoi_v2i32:Pat < (v2i32 (trunc GPRV2I64:$src)),
+(IL_ASV2INT_v2i32
+   (BINARY_AND_v2i32
+(LLO_v2i64 GPRV2I64:$src),
+    (VCREATE_v2i32 (LOADCONST_i32 0xFFFFFFFF)))
+  ) >;
+
+
+
+
+def actos_v4i16:Pat < (v4i16 (anyext GPRV4I8:$src)),
+(IL_ASV4SHORT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def uctos_v4i16:Pat < (v4i16 (zext GPRV4I8:$src)),
+(IL_ASV4SHORT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def sctos_v4i16:Pat < (v4i16 (sext GPRV4I8:$src)),
+(IL_ASV4SHORT_v4i32
+ (SHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def actoi_v4i32:Pat < (v4i32 (anyext GPRV4I8:$src)),
+(IL_ASV4INT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def uctoi_v4i32:Pat < (v4i32 (zext GPRV4I8:$src)),
+(IL_ASV4INT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def sctoi_v4i32:Pat < (v4i32 (sext GPRV4I8:$src)),
+(IL_ASV4INT_v4i32
+ (SHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 24))),
+  (VCREATE_v4i32 (LOADCONST_i32 24)))) >;
+
+
+def astoi_v4i32:Pat < (v4i32 (anyext GPRV4I16:$src)),
+(IL_ASV4INT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i16 GPRV4I16:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 16))),
+  (VCREATE_v4i32 (LOADCONST_i32 16)))) >;
+
+
+def ustoi_v4i32:Pat < (v4i32 (zext GPRV4I16:$src)),
+(IL_ASV4INT_v4i32
+ (USHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i16 GPRV4I16:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 16))),
+  (VCREATE_v4i32 (LOADCONST_i32 16)))) >;
+
+
+def sstoi_v4i32:Pat < (v4i32 (sext GPRV4I16:$src)),
+(IL_ASV4INT_v4i32
+ (SHRVEC_v4i32
+  (SHLVEC_v4i32
+(IL_ASV4INT_v4i16 GPRV4I16:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 16))),
+  (VCREATE_v4i32 (LOADCONST_i32 16)))) >;
+
+
+
+def sctof_v4f32:Pat < (v4f32 (sint_to_fp GPRV4I8:$src)),
+(v4f32
+ (ITOF_v4f32
+  (SHRVEC_v4i32
+   (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+    (VCREATE_v4i32 (LOADCONST_i32 24))),
+   (VCREATE_v4i32 (LOADCONST_i32 24))))) >;
+
+
+def uctof_v4f32:Pat < (v4f32 (uint_to_fp GPRV4I8:$src)),
+(v4f32
+ (UTOF_v4f32
+  (USHRVEC_v4i32
+   (SHLVEC_v4i32
+(IL_ASV4INT_v4i8 GPRV4I8:$src),
+    (VCREATE_v4i32 (LOADCONST_i32 24))),
+   (VCREATE_v4i32 (LOADCONST_i32 24))))) >;
+
+
+def ftosc_v4i8:Pat < (v4i8 (fp_to_sint GPRV4F32:$src)),
+(v4i8
+ (IL_ASV4CHAR_v4i32
+  (BINARY_AND_v4i32
+(FTOI_v4i32 GPRV4F32:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 0x000000FF))))) >;
+
+
+def ftouc_v4i8:Pat < (v4i8 (fp_to_uint GPRV4F32:$src)),
+(v4i8
+ (IL_ASV4CHAR_v4i32
+  (BINARY_AND_v4i32
+(FTOU_v4i32 GPRV4F32:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 0x000000FF))))) >;
+
+
+def sstof_v4f32:Pat < (v4f32 (sint_to_fp GPRV4I16:$src)),
+(v4f32
+ (ITOF_v4f32
+  (SHRVEC_v4i32
+   (SHLVEC_v4i32
+(IL_ASV4INT_v4i16 GPRV4I16:$src),
+    (VCREATE_v4i32 (LOADCONST_i32 16))),
+   (VCREATE_v4i32 (LOADCONST_i32 16))))) >;
+
+
+def ustof_v4f32:Pat < (v4f32 (uint_to_fp GPRV4I16:$src)),
+(v4f32
+ (UTOF_v4f32
+  (USHRVEC_v4i32
+   (SHLVEC_v4i32
+(IL_ASV4INT_v4i16 GPRV4I16:$src),
+    (VCREATE_v4i32 (LOADCONST_i32 16))),
+   (VCREATE_v4i32 (LOADCONST_i32 16))))) >;
+
+
+def ftoss_v4i16:Pat < (v4i16 (fp_to_sint GPRV4F32:$src)),
+(v4i16
+ (IL_ASV4SHORT_v4i32
+  (BINARY_AND_v4i32
+(FTOI_v4i32 GPRV4F32:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+
+def ftous_v4i16:Pat < (v4i16 (fp_to_uint GPRV4F32:$src)),
+(v4i16
+ (IL_ASV4SHORT_v4i32
+  (BINARY_AND_v4i32
+(FTOU_v4i32 GPRV4F32:$src),
+   (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF))))) >;
+
+
+
+
+
+def stoc_v4i8:Pat < (v4i8 (trunc GPRV4I16:$src)),
+(IL_ASV4CHAR_v4i32
+   (IL_ASV4INT_v4i16
+(BINARY_AND_v4i16 GPRV4I16:$src,
+     (VCREATE_v4i16 (LOADCONST_i16 0x000000FF))))
+  ) >;
+
+
+def itoc_v4i8:Pat < (v4i8 (trunc GPRV4I32:$src)),
+(IL_ASV4CHAR_v4i32
+   (IL_ASV4INT_v4i32
+(BINARY_AND_v4i32 GPRV4I32:$src,
+     (VCREATE_v4i32 (LOADCONST_i32 0x000000FF))))
+  ) >;
+
+
+def itos_v4i16:Pat < (v4i16 (trunc GPRV4I32:$src)),
+(IL_ASV4SHORT_v4i32
+   (IL_ASV4INT_v4i32
+(BINARY_AND_v4i32 GPRV4I32:$src,
+     (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF))))
+  ) >;
+
+
diff --git a/src/gallium/drivers/radeon/AMDILDevice.cpp b/src/gallium/drivers/radeon/AMDILDevice.cpp
new file mode 100644 (file)
index 0000000..aa6d8af
--- /dev/null
@@ -0,0 +1,137 @@
+//===-- AMDILDevice.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+// Default implementation for all of the classes.
+AMDILDevice::AMDILDevice(AMDILSubtarget *ST) : mSTM(ST)
+{
+  mHWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities);
+  mSWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities);
+  setCaps();
+  mDeviceFlag = OCL_DEVICE_ALL;
+}
+
+AMDILDevice::~AMDILDevice()
+{
+    mHWBits.clear();
+    mSWBits.clear();
+}
+
+size_t AMDILDevice::getMaxGDSSize() const
+{
+  return 0;
+}
+
+uint32_t 
+AMDILDevice::getDeviceFlag() const
+{
+  return mDeviceFlag;
+}
+
+size_t AMDILDevice::getMaxNumCBs() const
+{
+  if (usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    return HW_MAX_NUM_CB;
+  }
+
+  return 0;
+}
+
+size_t AMDILDevice::getMaxCBSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    return MAX_CB_SIZE;
+  }
+
+  return 0;
+}
+
+size_t AMDILDevice::getMaxScratchSize() const
+{
+  return 65536;
+}
+
+uint32_t AMDILDevice::getStackAlignment() const
+{
+  return 16;
+}
+
+void AMDILDevice::setCaps()
+{
+  mSWBits.set(AMDILDeviceInfo::HalfOps);
+  mSWBits.set(AMDILDeviceInfo::ByteOps);
+  mSWBits.set(AMDILDeviceInfo::ShortOps);
+  mSWBits.set(AMDILDeviceInfo::HW64BitDivMod);
+  if (mSTM->isOverride(AMDILDeviceInfo::NoInline)) {
+    mSWBits.set(AMDILDeviceInfo::NoInline);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::MacroDB)) {
+    mSWBits.set(AMDILDeviceInfo::MacroDB);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::ConstantMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::ConstantMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::PrivateMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::PrivateMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::BarrierDetect)) {
+    mSWBits.set(AMDILDeviceInfo::BarrierDetect);
+  }
+  mSWBits.set(AMDILDeviceInfo::ByteLDSOps);
+  mSWBits.set(AMDILDeviceInfo::LongOps);
+}
+
+AMDILDeviceInfo::ExecutionMode
+AMDILDevice::getExecutionMode(AMDILDeviceInfo::Caps Caps) const
+{
+  if (mHWBits[Caps]) {
+    assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDILDeviceInfo::Hardware;
+  }
+
+  if (mSWBits[Caps]) {
+    assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
+    return AMDILDeviceInfo::Software;
+  }
+
+  return AMDILDeviceInfo::Unsupported;
+
+}
+
+bool AMDILDevice::isSupported(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) != AMDILDeviceInfo::Unsupported;
+}
+
+bool AMDILDevice::usesHardware(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) == AMDILDeviceInfo::Hardware;
+}
+
+bool AMDILDevice::usesSoftware(AMDILDeviceInfo::Caps Mode) const
+{
+  return getExecutionMode(Mode) == AMDILDeviceInfo::Software;
+}
+
+std::string
+AMDILDevice::getDataLayout() const
+{
+    return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
+      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+      "-n8:16:32:64");
+}
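The `setCaps()` / `getExecutionMode()` pair above maintains an invariant: for any capability, the hardware and software bits are mutually exclusive. A minimal standalone sketch of that invariant, with a hypothetical `CapsModel` type that is not part of the driver:

```cpp
#include <bitset>
#include <cassert>

// Hypothetical stand-in for the mHWBits/mSWBits pair: a capability is
// either done natively (Hardware), emulated (Software), or Unsupported.
// Setting both bits for one capability is a programming error, exactly
// as the asserts in getExecutionMode() enforce.
enum ExecMode { Unsupported, Software, Hardware };

struct CapsModel {
    std::bitset<32> hw, sw;

    ExecMode mode(unsigned cap) const {
        if (hw[cap]) { assert(!sw[cap] && "both HW and SW set"); return Hardware; }
        if (sw[cap]) return Software;
        return Unsupported;
    }
    bool usesHardware(unsigned cap) const { return mode(cap) == Hardware; }
    bool usesSoftware(unsigned cap) const { return mode(cap) == Software; }
    bool isSupported(unsigned cap) const  { return mode(cap) != Unsupported; }
};
```

As in the driver, `isSupported` is simply "not Unsupported", so a capability set in either vector reports as supported.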
diff --git a/src/gallium/drivers/radeon/AMDILDevice.h b/src/gallium/drivers/radeon/AMDILDevice.h
new file mode 100644 (file)
index 0000000..3382121
--- /dev/null
@@ -0,0 +1,132 @@
+//===---- AMDILDevice.h - Define Device Data for AMDIL -----*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILDEVICEIMPL_H_
+#define _AMDILDEVICEIMPL_H_
+#include "AMDIL.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+  class AMDILAsmPrinter;
+  class AMDILIOExpansion;
+  class AMDILPointerManager;
+  class AsmPrinter;
+  class MCStreamer;
+//===----------------------------------------------------------------------===//
+// Interface for data that is specific to a single device
+//===----------------------------------------------------------------------===//
+class AMDILDevice {
+public:
+  AMDILDevice(AMDILSubtarget *ST);
+  virtual ~AMDILDevice();
+
+  // Enum values for the various memory types.
+  enum {
+    RAW_UAV_ID   = 0,
+    ARENA_UAV_ID = 1,
+    LDS_ID       = 2,
+    GDS_ID       = 3,
+    SCRATCH_ID   = 4,
+    CONSTANT_ID  = 5,
+    GLOBAL_ID    = 6,
+    MAX_IDS      = 7
+  } IO_TYPE_IDS;
+
+  // Returns the max LDS size that the hardware supports.  Size is in
+  // bytes.
+  virtual size_t getMaxLDSSize() const = 0;
+
+  // Returns the max GDS size that the hardware supports if the GDS is
+  // supported by the hardware.  Size is in bytes.
+  virtual size_t getMaxGDSSize() const;
+
+  // Returns the max number of hardware constant address spaces that
+  // are supported by this device.
+  virtual size_t getMaxNumCBs() const;
+
+  // Returns the max number of bytes a single hardware constant buffer
+  // can support.  Size is in bytes.
+  virtual size_t getMaxCBSize() const;
+
+  // Returns the max number of bytes allowed by the hardware scratch
+  // buffer.  Size is in bytes.
+  virtual size_t getMaxScratchSize() const;
+
+  // Get the flag that corresponds to the device.
+  virtual uint32_t getDeviceFlag() const;
+
+  // Returns the number of work-items that exist in a single hardware
+  // wavefront.
+  virtual size_t getWavefrontSize() const = 0;
+
+  // Get the generational name of this specific device.
+  virtual uint32_t getGeneration() const = 0;
+
+  // Get the stack alignment of this specific device.
+  virtual uint32_t getStackAlignment() const;
+
+  // Get the resource ID for this specific device.
+  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
+
+  // Get the max number of UAV's for this device.
+  virtual uint32_t getMaxNumUAVs() const = 0;
+
+  // Interface to get the IO Expansion pass for each device.
+  virtual FunctionPass* 
+    getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const = 0;
+
+  // Interface to get the Asm printer for each device.
+  virtual AsmPrinter*
+    getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const = 0;
+
+  // Interface to get the Pointer manager pass for each device.
+  virtual FunctionPass* 
+    getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const = 0;
+
+
+  // API exposing the more detailed capabilities of each family of
+  // cards.  If a capability is supported, then either usesHardware or
+  // usesSoftware returns true; if usesHardware returns true, then
+  // usesSoftware must return false for the same capability.  Hardware
+  // execution means that the feature is done natively by the hardware
+  // and is not emulated in software.  Software execution means that
+  // the feature could be done in the hardware, but software emulates
+  // it, possibly using the hardware for support, because the hardware
+  // does not fully comply with the OpenCL specs.
+  bool isSupported(AMDILDeviceInfo::Caps Mode) const;
+  bool usesHardware(AMDILDeviceInfo::Caps Mode) const;
+  bool usesSoftware(AMDILDeviceInfo::Caps Mode) const;
+  virtual std::string getDataLayout() const;
+  static const unsigned int MAX_LDS_SIZE_700 = 16384;
+  static const unsigned int MAX_LDS_SIZE_800 = 32768;
+  static const unsigned int WavefrontSize = 64;
+  static const unsigned int HalfWavefrontSize = 32;
+  static const unsigned int QuarterWavefrontSize = 16;
+protected:
+  virtual void setCaps();
+  llvm::BitVector mHWBits;
+  llvm::BitVector mSWBits;
+  AMDILSubtarget *mSTM;
+  uint32_t mDeviceFlag;
+private:
+  AMDILDeviceInfo::ExecutionMode
+  getExecutionMode(AMDILDeviceInfo::Caps Caps) const;
+}; // AMDILDevice
+
+} // namespace llvm
+#endif // _AMDILDEVICEIMPL_H_
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
new file mode 100644 (file)
index 0000000..89b8312
--- /dev/null
@@ -0,0 +1,87 @@
+//===-- AMDILDeviceInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILDevices.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+namespace llvm {
+    AMDILDevice*
+getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64bit, bool is64on32bit)
+{
+    if (deviceName.c_str()[2] == '7') {
+        switch (deviceName.c_str()[3]) {
+            case '1':
+                return new AMDIL710Device(ptr);
+            case '7':
+                return new AMDIL770Device(ptr);
+            default:
+                return new AMDIL7XXDevice(ptr);
+        };
+    } else if (deviceName == "cypress") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCypressDevice(ptr);
+    } else if (deviceName == "juniper") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILEvergreenDevice(ptr);
+    } else if (deviceName == "redwood") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+      return new AMDILRedwoodDevice(ptr);
+    } else if (deviceName == "cedar") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCedarDevice(ptr);
+    } else if (deviceName == "barts"
+      || deviceName == "turks") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILNIDevice(ptr);
+    } else if (deviceName == "cayman") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILCaymanDevice(ptr);
+    } else if (deviceName == "caicos") {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDILNIDevice(ptr);
+    } else if (deviceName == "SI") {
+        return new AMDILSIDevice(ptr);
+    } else {
+#if DEBUG
+      assert(!is64bit && "This device does not support 64bit pointers!");
+      assert(!is64on32bit && "This device does not support 64bit"
+          " on 32bit pointers!");
+#endif
+        return new AMDIL7XXDevice(ptr);
+    }
+}
+}
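`getDeviceFromName()` above dispatches on the codename string: rv7xx-style names are distinguished by their fourth character, the remaining families by full codename, with `"SI"` as the newly added case and 7XX as the fallback. A simplified, hypothetical mirror of that dispatch (the enum and function names are illustrative only; the real function also threads the subtarget pointer and 64-bit flags through, and distinguishes device subclasses within each family):

```cpp
#include <cassert>
#include <string>

enum DeviceClass { Dev7XX, Dev710, Dev770, DevEvergreen, DevNI, DevSI };

// Illustrative-only reimplementation of the name-based dispatch.
inline DeviceClass classifyDevice(const std::string &name) {
    if (name.size() > 3 && name[2] == '7') {
        switch (name[3]) {
        case '1': return Dev710;   // e.g. rv710
        case '7': return Dev770;   // e.g. rv770
        default:  return Dev7XX;   // other rv7xx parts
        }
    }
    if (name == "cypress" || name == "juniper" ||
        name == "redwood" || name == "cedar")
        return DevEvergreen;
    if (name == "barts" || name == "turks" ||
        name == "cayman" || name == "caicos")
        return DevNI;
    if (name == "SI")
        return DevSI;
    return Dev7XX; // unknown names fall back to the 7XX baseline
}
```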
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.h b/src/gallium/drivers/radeon/AMDILDeviceInfo.h
new file mode 100644 (file)
index 0000000..c4acf91
--- /dev/null
@@ -0,0 +1,89 @@
+//===-- AMDILDeviceInfo.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDILDEVICEINFO_H_
+#define _AMDILDEVICEINFO_H_
+
+
+#include <string>
+
+namespace llvm
+{
+  class AMDILDevice;
+  class AMDILSubtarget;
+  namespace AMDILDeviceInfo
+  {
+    // Each Capabilities can be executed using a hardware instruction,
+    // emulated with a sequence of software instructions, or not
+    // supported at all.
+    enum ExecutionMode {
+      Unsupported = 0, // Unsupported feature on the card (default value)
+      Software,        // Set if the feature is emulated in software
+      Hardware         // Set if the feature exists natively in hardware
+    };
+
+    // Any change to this needs a corresponding update to the
+    // twiki page GPUMetadataABI.
+    enum Caps {
+      HalfOps          = 0x1,  // Half float is supported or not.
+      DoubleOps        = 0x2,  // Double is supported or not.
+      ByteOps          = 0x3,  // Byte (char) is supported or not.
+      ShortOps         = 0x4,  // Short is supported or not.
+      LongOps          = 0x5,  // Long is supported or not.
+      Images           = 0x6,  // Images are supported or not.
+      ByteStores       = 0x7,  // ByteStores available (!HD4XXX).
+      ConstantMem      = 0x8,  // Constant/CB memory.
+      LocalMem         = 0x9,  // Local/LDS memory.
+      PrivateMem       = 0xA,  // Scratch/Private/Stack memory.
+      RegionMem        = 0xB,  // OCL GDS Memory Extension.
+      FMA              = 0xC,  // Use HW FMA or SW FMA.
+      ArenaSegment     = 0xD,  // Use for Arena UAV per pointer 12-1023.
+      MultiUAV         = 0xE,  // Use for UAV per Pointer 0-7.
+      Reserved0        = 0xF,  // ReservedFlag
+      NoAlias          = 0x10, // Cached loads.
+      Signed24BitOps   = 0x11, // Peephole Optimization.
+      // Debug mode implies that no hardware features or optimizations
+      // are performed and that all memory accesses go through a single
+      // UAV (Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
+      Debug            = 0x12, // Debug mode is enabled.
+      CachedMem        = 0x13, // Cached mem is available or not.
+      BarrierDetect    = 0x14, // Detect duplicate barriers.
+      Reserved1        = 0x15, // Reserved flag
+      ByteLDSOps       = 0x16, // Flag to specify if byte LDS ops are available.
+      ArenaVectors     = 0x17, // Flag to specify if vector loads from arena work.
+      TmrReg           = 0x18, // Flag to specify if Tmr register is supported.
+      NoInline         = 0x19, // Flag to specify that no inlining should occur.
+      MacroDB          = 0x1A, // Flag to specify that backend handles macrodb.
+      HW64BitDivMod    = 0x1B, // Flag for backend to generate 64bit div/mod.
+      ArenaUAV         = 0x1C, // Flag to specify that arena uav is supported.
+      PrivateUAV       = 0x1D, // Flag to specify that private memory uses uav's.
+      // If more capabilities are required, then
+      // this number needs to be increased.
+      // All capabilities must come before this
+      // number.
+      MaxNumberCapabilities = 0x20
+    };
+    // These have to be in order, with older generations
+    // having lower enumeration values.
+    enum Generation {
+      HD4XXX = 0, // 7XX based devices.
+      HD5XXX, // Evergreen based devices.
+      HD6XXX, // NI/Evergreen+ based devices.
+      HD7XXX,
+      HDTEST, // Experimental feature testing device.
+      HDNUMGEN
+    };
+
+
+  } // namespace AMDILDeviceInfo
+  llvm::AMDILDevice*
+    getDeviceFromName(const std::string &name, llvm::AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false);
+} // namespace llvm
+#endif // _AMDILDEVICEINFO_H_
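One easy misreading of the `Caps` enum above: despite the hex-style spelling, the values are bit *indices* into the subtarget's `BitVector`s, not bitmask values, so `MaxNumberCapabilities = 0x20` means the vectors hold 32 flags. A small sketch of that convention (the helper names are local to this example):

```cpp
#include <cassert>
#include <cstdint>

// The Caps values act as indices: HalfOps = 0x1 selects bit 1 of the
// capability vector; it is not the mask 0x1 itself.
enum Caps : unsigned {
    HalfOps = 0x1,
    DoubleOps = 0x2,
    HW64BitDivMod = 0x1B,
    MaxNumberCapabilities = 0x20
};

inline std::uint64_t setCap(std::uint64_t bits, Caps c) { return bits | (1ULL << c); }
inline bool          hasCap(std::uint64_t bits, Caps c) { return (bits >> c) & 1ULL; }
```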
diff --git a/src/gallium/drivers/radeon/AMDILDevices.h b/src/gallium/drivers/radeon/AMDILDevices.h
new file mode 100644 (file)
index 0000000..3fc5fa0
--- /dev/null
@@ -0,0 +1,19 @@
+//===-- AMDILDevices.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef __AMDIL_DEVICES_H_
+#define __AMDIL_DEVICES_H_
+// Include all of the device specific header files
+// This file is for Internal use only!
+#include "AMDIL7XXDevice.h"
+#include "AMDILDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSIDevice.h"
+
+#endif // __AMDIL_DEVICES_H_
diff --git a/src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp b/src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp
new file mode 100644 (file)
index 0000000..185fc70
--- /dev/null
@@ -0,0 +1,1093 @@
+//===-- AMDILEGIOExpansion.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// @file AMDILEGIOExpansion.cpp
+// @details Implementation of IO expansion class for evergreen and NI devices.
+//
+#include "AMDILCompilerErrors.h"
+#include "AMDILCompilerWarnings.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILIOExpansion.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+AMDILEGIOExpansion::AMDILEGIOExpansion(TargetMachine &tm
+     AMDIL_OPT_LEVEL_DECL) : AMDILImageExpansion(tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+AMDILEGIOExpansion::~AMDILEGIOExpansion() {
+}
+const char *AMDILEGIOExpansion::getPassName() const
+{
+  return "AMDIL EG/NI IO Expansion Pass";
+}
+  bool
+AMDILEGIOExpansion::isImageIO(MachineInstr *MI)
+{
+  if (!MI->getOperand(0).isGlobal()) {
+    return false;
+  }
+  const llvm::StringRef& nameRef = MI->getOperand(0).getGlobal()->getName();
+  const char *name = nameRef.data();
+  if (nameRef.size() > 8 && !strncmp(name, "__amdil_", 8)) {
+    name += 8;
+    if (!strncmp(name, "sample_data", 11)
+        || !strncmp(name, "write_image", 11)
+        || !strncmp(name, "get_image2d_params", 18)
+        || !strncmp(name, "get_image3d_params", 18)) {
+      return true;
+    }
+  }
+  return false;
+}
+bool
+AMDILEGIOExpansion::isIOInstruction(MachineInstr *MI)
+{
+  if (!MI) {
+    return false;
+  }
+  switch (MI->getOpcode()) {
+    default:
+      return AMDILIOExpansion::isIOInstruction(MI);
+    case AMDIL::IMAGE2D_READ:
+    case AMDIL::IMAGE2D_READ_UNNORM:
+    case AMDIL::IMAGE2D_WRITE:
+    case AMDIL::IMAGE2D_INFO0:
+    case AMDIL::IMAGE2D_INFO1:
+    case AMDIL::IMAGE3D_READ:
+    case AMDIL::IMAGE3D_READ_UNNORM:
+    case AMDIL::IMAGE3D_WRITE:
+    case AMDIL::IMAGE3D_INFO0:
+    case AMDIL::IMAGE3D_INFO1:
+      return true;
+  };
+  return false;
+}
+void 
+AMDILEGIOExpansion::expandIOInstruction(MachineInstr *MI)
+{
+  assert(isIOInstruction(MI) && "Must be an IO instruction to "
+      "be passed to this function!");
+  switch (MI->getOpcode()) {
+    default:
+      AMDILIOExpansion::expandIOInstruction(MI);
+      break;
+    case AMDIL::IMAGE2D_READ:
+    case AMDIL::IMAGE3D_READ:
+    case AMDIL::IMAGE2D_READ_UNNORM:
+    case AMDIL::IMAGE3D_READ_UNNORM:
+        expandImageLoad(mBB, MI);
+        break;
+    case AMDIL::IMAGE2D_WRITE:
+    case AMDIL::IMAGE3D_WRITE:
+        expandImageStore(mBB, MI);
+        break;
+    case AMDIL::IMAGE2D_INFO0:
+    case AMDIL::IMAGE2D_INFO1:
+    case AMDIL::IMAGE3D_INFO0:
+    case AMDIL::IMAGE3D_INFO1:
+        expandImageParam(mBB, MI);
+        break;
+  };
+}
+  bool
+AMDILEGIOExpansion::isCacheableOp(MachineInstr *MI)
+{
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  // We only support caching on UAV11 - JeffG
+  if (curRes.bits.ResourceID == 11) {
+    return curRes.bits.CacheableRead;
+  } else {
+    return false;
+  }
+}
+  bool
+AMDILEGIOExpansion::isArenaOp(MachineInstr *MI)
+{
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  return curRes.bits.ResourceID 
+    == mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)
+    || curRes.bits.ResourceID >= ARENA_SEGMENT_RESERVED_UAVS;
+}
+  void
+AMDILEGIOExpansion::expandPackedData(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  if (!isPackedData(MI)) {
+    return;
+  }
+  // There is a bug in the CAL compiler that incorrectly
+  // errors when the UBIT_INSERT instruction is used.
+  if (mSTM->calVersion() < CAL_VERSION_SC_137) {
+    AMDIL789IOExpansion::expandPackedData(MI);
+    return;
+  }
+  DebugLoc DL;
+  // If we have packed data, then the shift size is no longer
+  // the same as the load size and we need to adjust accordingly
+  switch(getPackedID(MI)) {
+    default:
+      break;
+    case PACK_V2I8:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(8)).addImm(mMFI->addi32Literal(8))
+          .addReg(AMDIL::R1012).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V4I8:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1012)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), 
+            AMDIL::R1011)
+          .addImm(mMFI->addi64Literal(8ULL | (8ULL << 32)))
+          .addImm(mMFI->addi64Literal(8ULL | (8ULL << 32)))
+          .addReg(AMDIL::R1012).addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16)).addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1012).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V2I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16)).addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1012).addReg(AMDIL::R1011);
+      }
+      break;
+    case PACK_V4I16:
+      {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1012)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), AMDIL::R1011)
+          .addImm(mMFI->addi64Literal(16ULL | (16ULL << 32)))
+          .addImm(mMFI->addi64Literal(16ULL | (16ULL << 32)))
+          .addReg(AMDIL::R1012).addReg(AMDIL::R1011);
+      }
+      break;
+    case UNPACK_V2I8:
+    case UNPACK_V4I8:
+    case UNPACK_V2I16:
+    case UNPACK_V4I16:
+      AMDIL789IOExpansion::expandPackedData(MI);
+      break;
+  };
+}
+
+  void
+AMDILEGIOExpansion::expandGlobalLoad(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool usesArena = isArenaOp(MI);
+  bool cacheable = isCacheableOp(MI);
+  uint32_t ID = getPointerID(MI);
+  mKM->setOutputInst();
+  if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID)
+      && !mMFI->usesMem(AMDILDevice::ARENA_UAV_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  // These instructions are generated before the current MI.
+  expandLoadStartCode(MI);
+  expandArenaSetup(MI);
+  DebugLoc DL;
+  if (getMemorySize(MI) == 1) {
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i8), AMDIL::R1011)
+          .addReg(AMDIL::R1010)
+          .addImm(ID);
+      } else {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+                (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1012)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(0));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1012)
+          .addImm(mMFI->addi32Literal(0))
+          .addImm(mMFI->addi32Literal(24));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1012)
+          .addImm(mMFI->addi32Literal(8))
+          .addReg(AMDIL::R1008);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1012)
+          .addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1008);
+        if (cacheable) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        }
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_v4i8), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addReg(AMDIL::R1008);
+      }
+  } else if (getMemorySize(MI) == 2) {
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i16), AMDIL::R1011)
+          .addReg(AMDIL::R1010)
+          .addImm(ID);
+      } else {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(1));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(16))
+          .addImm(mMFI->addi32Literal(0));
+        if (cacheable) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        }
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i16), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addReg(AMDIL::R1008);
+      }
+  } else if (getMemorySize(MI) == 4) {
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1010)
+          .addImm(ID);
+      } else {
+        if (cacheable) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        }
+      }
+  } else if (getMemorySize(MI) == 8) {
+    if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1010)
+          .addImm(ID);
+        if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Y_i32), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(2);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1007)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addReg(AMDIL::R1008);
+        }
+      } else {
+        if (cacheable) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_v2i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_v2i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        }
+      }
+  } else {
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1010)
+          .addImm(ID);
+        if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Y_i32), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Z_i32), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_W_i32), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(2);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1007)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addReg(AMDIL::R1008);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(3);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1007)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(4);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1006)
+            .addReg(AMDIL::R1007)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1008)
+            .addReg(AMDIL::R1006)
+            .addReg(AMDIL::R1008);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addReg(AMDIL::R1008);
+        }
+      } else {
+        if (cacheable) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_v4i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_v4i32),
+              AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID);
+        }
+      }
+  }
+  // These instructions are generated after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, I, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+
+  void
+AMDILEGIOExpansion::expandRegionLoad(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
+    mMFI->addErrorMsg(
+        amd::CompilerErrorMessage[REGION_MEMORY_ERROR]);
+    return;
+  }
+  if (!HWRegion || !isHardwareRegion(MI)) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  DebugLoc DL;
+  unsigned mulOp = 0;
+  uint32_t gID = getPointerID(MI);
+  assert(gID && "Found a GDS load that was incorrectly marked as zero ID!\n");
+  if (!gID) {
+    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  // These instructions are generated before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Z), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_W), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+    case 1:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::RegionMem))
+        ? AMDIL::UMUL_i32 : AMDIL::UMUL24_i32;
+      BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      // The original instruction would normally go right here, so everything
+      // created after this point needs to go into the afterInst vector.
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(8))
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1011);
+      break;
+    case 2:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::RegionMem))
+        ? AMDIL::UMUL_i32 : AMDIL::UMUL24_i32;
+      BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(16))
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1011);
+      break;
+    case 4:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+    case 8:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi64Literal(1ULL << 32));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
+        .addReg(AMDIL::R1010)
+        .addImm(gID);
+      break;
+  }
+
+  // These instructions are generated after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, I, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+  void
+AMDILEGIOExpansion::expandLocalLoad(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  if (!HWLocal || !isHardwareLocal(MI)) {
+    return expandGlobalLoad(MI);
+  }
+  if (!mMFI->usesMem(AMDILDevice::LDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t lID = getPointerID(MI);
+  assert(lID && "Found a LDS load that was incorrectly marked as zero ID!\n");
+  if (!lID) {
+    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL;
+  unsigned mulOp = 0;
+  // These instructions are generated before the current MI.
+  expandLoadStartCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOADVEC_v4i32), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      break;
+    case 8:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOADVEC_v2i32), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      break;
+    case 4:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+        .addReg(AMDIL::R1010)
+        .addImm(lID);
+      break;
+    case 1:
+      if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem))
+          ? AMDIL::UMUL_i32 : AMDIL::UMUL24_i32;
+        BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(8));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+          .addReg(AMDIL::R1010)
+          .addImm(lID);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(8))
+          .addReg(AMDIL::R1008)
+          .addReg(AMDIL::R1011);
+      } else {
+        if (isSWSExtLoadInst(MI)) { 
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_i8), AMDIL::R1011) 
+            .addReg(AMDIL::R1010)
+            .addImm(lID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_u8), AMDIL::R1011) 
+            .addReg(AMDIL::R1010)
+            .addImm(lID);
+        }
+      }
+      break;
+    case 2:
+      if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem))
+          ? AMDIL::UMUL_i32 : AMDIL::UMUL24_i32;
+        BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(8));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(0xFFFFFFFC));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) 
+          .addReg(AMDIL::R1010)
+          .addImm(lID);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(16))
+          .addReg(AMDIL::R1008)
+          .addReg(AMDIL::R1011);
+      } else {
+        if (isSWSExtLoadInst(MI)) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_i16), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(lID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_u16), AMDIL::R1011)
+            .addReg(AMDIL::R1010)
+            .addImm(lID);
+        }
+      }
+      break;
+  }
+
+  // These instructions are generated after the current MI.
+  expandPackedData(MI);
+  expandExtendLoad(MI);
+  BuildMI(*mBB, I, MI->getDebugLoc(),
+      mTII->get(getMoveInstFromID(
+          MI->getDesc().OpInfo[0].RegClass)))
+    .addOperand(MI->getOperand(0))
+    .addReg(AMDIL::R1011);
+  MI->getOperand(0).setReg(AMDIL::R1011);
+}
+  void
+AMDILEGIOExpansion::expandGlobalStore(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool usesArena = isArenaOp(MI);
+  uint32_t ID = getPointerID(MI);
+  mKM->setOutputInst();
+  if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID)
+      && !mMFI->usesMem(AMDILDevice::ARENA_UAV_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  DebugLoc DL;
+  // These instructions are expanded before the current MI.
+  expandStoreSetupCode(MI);
+  expandArenaSetup(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+        if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_Y_i32), AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_Z_i32), AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(ID);
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_W_i32), AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(2);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008)
+            .addReg(AMDIL::R1011)
+            .addImm(2);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1008)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(3);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008)
+            .addReg(AMDIL::R1011)
+            .addImm(3);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1008)
+            .addImm(ID);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(4);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008)
+            .addReg(AMDIL::R1011)
+            .addImm(4);
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008)
+            .addReg(AMDIL::R1008)
+            .addImm(ID);
+        }
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_v4i32), AMDIL::MEM)
+          .addReg(AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      }
+      break;
+    case 1:
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFF));
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i8), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+          .addReg(AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      }
+      break;
+    case 2:
+      if (usesArena) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFFFF));
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i16), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+          .addReg(AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      }
+      break;
+    case 4:
+      if (usesArena) {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
+          .addReg(AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      }
+      break;
+    case 8:
+      if (usesArena) {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+        if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) {
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_Y_i32), AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(ID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
+            .addReg(AMDIL::R1010)
+            .addImm(2);
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008)
+            .addReg(AMDIL::R1011)
+            .addImm(2);
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1007)
+            .addReg(AMDIL::R1008)
+            .addImm(ID);
+        }
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_v2i32), AMDIL::MEM)
+          .addReg(AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(ID);
+      }
+      break;
+  }
+}
+  void
+AMDILEGIOExpansion::expandRegionStore(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  if (!HWRegion || !isHardwareRegion(MI)) {
+    return expandGlobalStore(MI);
+  }
+  mKM->setOutputInst();
+  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t gID = getPointerID(MI);
+  assert(gID && "Found a GDS store that was incorrectly marked as zero ID!\n");
+  if (!gID) {
+    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  DebugLoc DL;
+  unsigned mulOp = AMDIL::UMUL24_i32; // HWRegion is always true past the check above
+  // These instructions are expanded before the current MI.
+  expandStoreSetupCode(MI);
+  expandArenaSetup(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE_Z), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE_W), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 1:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0xFF));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+              (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+      BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1006)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(8));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(0xFFFFFF00))
+        .addImm(mMFI->addi32Literal(0x00FFFFFF));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1007)
+        .addImm(mMFI->addi32Literal(0xFF00FFFF));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1008)
+        .addReg(AMDIL::R1007)
+        .addImm(mMFI->addi32Literal(0xFFFF00FF));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1007);
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_R_MSKOR), AMDIL::R1010)
+        .addReg(AMDIL::R1012)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 2:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0x0000FFFF));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi32Literal(3));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(1));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(0x0000FFFF))
+        .addImm(mMFI->addi32Literal(0xFFFF0000));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+        .addReg(AMDIL::R1008)
+        .addImm(mMFI->addi32Literal(16))
+        .addImm(mMFI->addi32Literal(0));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addReg(AMDIL::R1008);
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_R_MSKOR), AMDIL::R1010)
+        .addReg(AMDIL::R1012)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 4:
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+    case 8:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi64Literal(1ULL << 32));
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(gID);
+      break;
+  }
+
+}
+
+  void
+AMDILEGIOExpansion::expandLocalStore(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  if (!HWLocal || !isHardwareLocal(MI)) {
+    return expandGlobalStore(MI);
+  }
+  DebugLoc DL;
+  if (!mMFI->usesMem(AMDILDevice::LDS_ID)
+      && mKM->isKernel()) {
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
+  }
+  uint32_t lID = getPointerID(MI);
+  assert(lID && "Found a LDS store that was incorrectly marked as zero ID!\n");
+  if (!lID) {
+    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
+    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
+  }
+  unsigned mulOp = AMDIL::UMUL24_i32; // HWLocal is always true past the check above
+  // These instructions are expanded before the current MI.
+  expandStoreSetupCode(MI);
+  switch (getMemorySize(MI)) {
+    default:
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTOREVEC_v4i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(lID);
+      break;
+    case 8:
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTOREVEC_v2i32), AMDIL::MEM)
+        .addReg(AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(lID);
+      break;
+    case 4:
+      BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE), AMDIL::R1010)
+        .addReg(AMDIL::R1011)
+        .addImm(lID);
+      break;
+    case 1:
+      if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0xFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008);
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, 
+                (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
+        BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1006)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(8));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(0xFFFFFF00))
+          .addImm(mMFI->addi32Literal(0x00FFFFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
+          .addReg(AMDIL::R1008)
+          .addReg(AMDIL::R1007)
+          .addImm(mMFI->addi32Literal(0xFF00FFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012)
+          .addReg(AMDIL::R1008)
+          .addReg(AMDIL::R1007)
+          .addImm(mMFI->addi32Literal(0xFFFF00FF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addReg(AMDIL::R1007);
+        if (mSTM->calVersion() >= CAL_VERSION_SC_137) {
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_MSKOR_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1012)
+            .addReg(AMDIL::R1011)
+            .addImm(lID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::ATOM_L_ADD_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1012)
+            .addImm(lID);
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_OR_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(lID);
+        }
+      } else {
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE_i8), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(lID);
+      }
+      break;
+    case 2:
+      if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) {
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addImm(mMFI->addi32Literal(0x0000FFFF));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1010)
+          .addImm(mMFI->addi32Literal(3));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(1));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(0x0000FFFF))
+          .addImm(mMFI->addi32Literal(0xFFFF0000));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
+          .addReg(AMDIL::R1008)
+          .addImm(mMFI->addi32Literal(16))
+          .addImm(mMFI->addi32Literal(0));
+        BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
+          .addReg(AMDIL::R1011)
+          .addReg(AMDIL::R1008);
+        if (mSTM->calVersion() >= CAL_VERSION_SC_137) {
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_MSKOR_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1012)
+            .addReg(AMDIL::R1011)
+            .addImm(lID);
+        } else {
+          BuildMI(*mBB, I, DL, mTII->get(AMDIL::ATOM_L_ADD_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1012)
+            .addImm(lID);
+          BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_OR_NORET), 
+              AMDIL::R1010)
+            .addReg(AMDIL::R1011)
+            .addImm(lID);
+        }
+      } else { 
+        BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE_i16), AMDIL::R1010)
+          .addReg(AMDIL::R1011)
+          .addImm(lID);
+      }
+      break;
+  }
+}
+
+
+  void
+AMDILEGIOExpansion::expandStoreSetupCode(MachineInstr *MI)
+{
+  AMDIL789IOExpansion::expandStoreSetupCode(MI);
+}
+  void
+AMDILEGIOExpansion::expandArenaSetup(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  if (!isArenaOp(MI)) {
+    return;
+  }
+  const MCInstrDesc &TID = MI->getDesc();
+  const MCOperandInfo &TOI = TID.OpInfo[0];
+  unsigned short RegClass = TOI.RegClass;
+  DebugLoc DL;
+  switch (RegClass) {
+    case AMDIL::GPRV4I16RegClassID:
+    case AMDIL::GPRI64RegClassID:
+    case AMDIL::GPRF64RegClassID:
+    case AMDIL::GPRV2I32RegClassID:
+    case AMDIL::GPRV2F32RegClassID:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v2i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi64Literal(4ULL << 32));
+      break;
+    default:
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010);
+      BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
+        .addReg(AMDIL::R1010)
+        .addImm(mMFI->addi128Literal(4ULL << 32, 8ULL | (12ULL << 32)));
+      break;
+    case AMDIL::GPRI8RegClassID:
+    case AMDIL::GPRV2I8RegClassID:
+    case AMDIL::GPRI16RegClassID:
+    case AMDIL::GPRV2I16RegClassID:
+    case AMDIL::GPRV4I8RegClassID:
+    case AMDIL::GPRI32RegClassID:
+    case AMDIL::GPRF32RegClassID:
+      break;
+  }
+}
+
diff --git a/src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp b/src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp
new file mode 100644 (file)
index 0000000..84ae9a3
--- /dev/null
@@ -0,0 +1,71 @@
+//===-- AMDILELFWriterInfo.cpp - Elf Writer Info for AMDIL ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//   This file implements ELF writer information for the AMDIL backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILELFWriterInfo.h"
+#include "AMDIL.h"
+#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetELFWriterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Implementation of the AMDILELFWriterInfo class
+//===----------------------------------------------------------------------===//
+AMDILELFWriterInfo::AMDILELFWriterInfo(bool is64bit, bool endian)
+  : TargetELFWriterInfo(is64bit, endian)
+{
+}
+
+AMDILELFWriterInfo::~AMDILELFWriterInfo() {
+}
+
+unsigned AMDILELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
+  assert(0 && "getRelocationType is not implemented for AMDIL");
+  return 0;
+}
+
+bool AMDILELFWriterInfo::hasRelocationAddend() const {
+  assert(0 && "hasRelocationAddend is not implemented for AMDIL");
+  return false;
+}
+
+long int AMDILELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
+                                                      long int Modifier) const {
+  assert(0 && "getDefaultAddendForRelTy is not implemented for AMDIL");
+  return 0;
+}
+
+unsigned AMDILELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
+  assert(0 && "getRelocationTySize is not implemented for AMDIL");
+  return 0;
+}
+
+bool AMDILELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
+  assert(0 && "isPCRelativeRel is not implemented for AMDIL");
+  return false;
+}
+
+unsigned AMDILELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
+  assert(0 && "getAbsoluteLabelMachineRelTy is not implemented for AMDIL");
+  return 0;
+}
+
+long int AMDILELFWriterInfo::computeRelocation(unsigned SymOffset,
+                                               unsigned RelOffset,
+                                               unsigned RelTy) const {
+  assert(0 && "computeRelocation is not implemented for AMDIL");
+  return 0;
+}
diff --git a/src/gallium/drivers/radeon/AMDILELFWriterInfo.h b/src/gallium/drivers/radeon/AMDILELFWriterInfo.h
new file mode 100644 (file)
index 0000000..0bcffd2
--- /dev/null
@@ -0,0 +1,54 @@
+//===-- AMDILELFWriterInfo.h - Elf Writer Info for AMDIL ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+//   This file implements ELF writer information for the AMDIL backend.
+//
+//===---------------------------------------------------------------------===//
+#ifndef _AMDIL_ELF_WRITER_INFO_H_
+#define _AMDIL_ELF_WRITER_INFO_H_
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+  class AMDILELFWriterInfo : public TargetELFWriterInfo {
+  public:
+    AMDILELFWriterInfo(bool is64Bit_, bool isLittleEndian_);
+    virtual ~AMDILELFWriterInfo();
+
+    /// getRelocationType - Returns the target-specific ELF relocation type.
+    /// 'MachineRelTy' contains the object-code-independent relocation type.
+    virtual unsigned getRelocationType(unsigned MachineRelTy) const;
+
+    /// hasRelocationAddend - True if the target uses an addend in the
+    /// ELF relocation entry.
+    virtual bool hasRelocationAddend() const;
+
+    /// getDefaultAddendForRelTy - Gets the default addend value for a
+    /// relocation entry based on the target ELF relocation type.
+    virtual long int getDefaultAddendForRelTy(unsigned RelTy,
+                                              long int Modifier = 0) const;
+
+    /// getRelocationTySize - Returns the size of the relocatable field in bits.
+    virtual unsigned getRelocationTySize(unsigned RelTy) const;
+
+    /// isPCRelativeRel - True if the relocation type is PC-relative.
+    virtual bool isPCRelativeRel(unsigned RelTy) const;
+
+    /// getAbsoluteLabelMachineRelTy - Returns the machine relocation type
+    /// used to reference an absolute label.
+    virtual unsigned getAbsoluteLabelMachineRelTy() const;
+
+    /// computeRelocation - Some relocatable fields can be relocated
+    /// directly, avoiding emission of a relocation symbol; compute the
+    /// final relocation value for such a symbol.
+    virtual long int computeRelocation(unsigned SymOffset,
+                                       unsigned RelOffset,
+                                       unsigned RelTy) const;
+  };
+} // namespace llvm
+#endif // _AMDIL_ELF_WRITER_INFO_H_
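For context, the header above follows LLVM's TargetELFWriterInfo hook pattern: the generic ELF emitter queries a target subclass for relocation behaviour instead of hard-coding it. The stand-in below is a hedged sketch of that pattern only — the class and member names are illustrative, not the real LLVM API:

```cpp
// Illustrative stand-in for the TargetELFWriterInfo hook pattern used by
// AMDILELFWriterInfo above; names here are hypothetical, not LLVM's API.
struct ELFWriterInfoSketch {
  virtual ~ELFWriterInfoSketch() {}
  // True if relocations carry an explicit addend (RELA-style entries)
  // rather than storing it in the relocated field itself (REL-style).
  virtual bool hasRelocationAddend() const = 0;
  virtual long getDefaultAddendForRelTy(unsigned relTy,
                                        long modifier = 0) const = 0;
};

struct AMDILLikeWriterInfo : ELFWriterInfoSketch {
  bool hasRelocationAddend() const { return false; }  // REL-style target
  long getDefaultAddendForRelTy(unsigned, long modifier) const {
    return modifier;  // no target-specific addend: pass the modifier through
  }
};
```

A generic emitter would hold a pointer to the base class and branch on `hasRelocationAddend()` when choosing between `.rel` and `.rela` sections.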
diff --git a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
new file mode 100644 (file)
index 0000000..445fd60
--- /dev/null
@@ -0,0 +1,522 @@
+//===-- AMDILEnumeratedTypes.td - IL enumerated types and opcodes ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// ILEnumeratedTypes.td - The IL Enumerated Types
+//===--------------------------------------------------------------------===//
+
+// Section 5.1  IL Shader
+class ILShader<bits<8> val> {
+    bits<8> Value = val;
+}
+// Table 5-1
+def IL_SHADER_PIXEL : ILShader<0>;
+def IL_SHADER_COMPUTE : ILShader<1>;
+
+// Section 5.2 IL RegType
+class ILRegType<bits<6> val> {
+    bits<6> Value = val;
+}
+// Table 5-2
+def IL_REGTYPE_TEMP      : ILRegType<0>;
+def IL_REGTYPE_WINCOORD  : ILRegType<1>;
+def IL_REGTYPE_CONST_BUF : ILRegType<2>;
+def IL_REGTYPE_LITERAL   : ILRegType<3>;
+def IL_REGTYPE_ITEMP     : ILRegType<4>;
+def IL_REGTYPE_GLOBAL    : ILRegType<5>;
+
+// Section 5.3 IL Component Select
+class ILComponentSelect<bits<3> val, string text> {
+     bits<3> Value = val;
+     string Text = text;
+}
+// Table 5-3
+def IL_COMPSEL_X : ILComponentSelect<0, "x">;
+def IL_COMPSEL_Y : ILComponentSelect<1, "y">;
+def IL_COMPSEL_Z : ILComponentSelect<2, "z">;
+def IL_COMPSEL_W : ILComponentSelect<3, "w">;
+def IL_COMPSEL_0 : ILComponentSelect<4, "0">;
+def IL_COMPSEL_1 : ILComponentSelect<5, "1">;
+
+// Section 5.4 IL Mod Dst Comp
+class ILModDstComp<bits<2> val, string text> {
+    bits<2> Value = val;
+    string Text = text;
+}
+// Table 5-4
+def IL_MODCOMP_NOWRITE : ILModDstComp<0, "_">;
+def IL_MODCOMP_WRITE_X : ILModDstComp<1, "x">;
+def IL_MODCOMP_WRITE_y : ILModDstComp<1, "y">;
+def IL_MODCOMP_WRITE_z : ILModDstComp<1, "z">;
+def IL_MODCOMP_WRITE_w : ILModDstComp<1, "w">;
+def IL_MODCOMP_0       : ILModDstComp<2, "0">;
+def IL_MODCOMP_1       : ILModDstComp<3, "1">;
+
+// Section 5.5 IL Import Usage
+class ILImportUsage<bits<1> val, string usage> {
+    bits<1> Value = val;
+    string Text = usage;
+}
+// Table 5-5
+def IL_IMPORTUSAGE_WINCOORD : ILImportUsage<0, "_usage(wincoord)">;
+
+// Section 5.6 Il Shift Scale
+class ILShiftScale<bits<4> val, string scale> {
+    bits<4> Value = val;
+    string Text = scale;
+}
+
+// Table 5-6
+def IL_SHIFT_NONE   : ILShiftScale<0, "">;
+def IL_SHIFT_X2     : ILShiftScale<1, "_x2">;
+def IL_SHIFT_X4     : ILShiftScale<2, "_x4">;
+def IL_SHIFT_X8     : ILShiftScale<3, "_x8">;
+def IL_SHIFT_D2     : ILShiftScale<4, "_d2">;
+def IL_SHIFT_D4     : ILShiftScale<5, "_d4">;
+def IL_SHIFT_D8     : ILShiftScale<6, "_d8">;
+
+// Section 5.7 IL Divide Component
+class ILDivComp<bits<3> val, string divcomp> {
+    bits<3> Value = val;
+    string Text = divcomp;
+}
+
+// Table 5-7
+def IL_DIVCOMP_NONE : ILDivComp<0, "_divcomp(none)">;
+def IL_DIVCOMP_Y    : ILDivComp<1, "_divcomp(y)">;
+def IL_DIVCOMP_Z    : ILDivComp<2, "_divcomp(z)">;
+def IL_DIVCOMP_W    : ILDivComp<3, "_divcomp(w)">;
+//def IL_DIVCOMP_UNKNOWN : ILDivComp<4, "_divcomp(unknown)">;
+
+// Section 5.8 IL Relational Op
+class ILRelOp<bits<3> val, string op> {
+    bits<3> Value = val;
+    string Text = op;
+}
+
+// Table 5-8
+def IL_RELOP_EQ : ILRelOp<0, "_relop(eq)">;
+def IL_RELOP_NE : ILRelOp<1, "_relop(ne)">;
+def IL_RELOP_GT : ILRelOp<2, "_relop(gt)">;
+def IL_RELOP_GE : ILRelOp<3, "_relop(ge)">;
+def IL_RELOP_LT : ILRelOp<4, "_relop(lt)">;
+def IL_RELOP_LE : ILRelOp<5, "_relop(le)">;
+
+// Section 5.9 IL Zero Op
+class ILZeroOp<bits<3> val, string behavior> {
+    bits<3> Value = val;
+    string Text = behavior;
+}
+
+// Table 5-9
+def IL_ZEROOP_FLTMAX    : ILZeroOp<0, "_zeroop(fltmax)">;
+def IL_ZEROOP_0         : ILZeroOp<1, "_zeroop(zero)">;
+def IL_ZEROOP_INFINITY  : ILZeroOp<2, "_zeroop(infinity)">;
+def IL_ZEROOP_INF_ELSE_MAX : ILZeroOp<3, "_zeroop(inf_else_max)">;
+
+// Section 5.10 IL Cmp Value
+class ILCmpValue<bits<3> val, string num> {
+    bits<3> Value = val;
+    string Text = num;
+}
+
+// Table 5-10
+def IL_CMPVAL_0_0     : ILCmpValue<0, "0.0">;
+def IL_CMPVAL_0_5     : ILCmpValue<1, "0.5">;
+def IL_CMPVAL_1_0     : ILCmpValue<2, "1.0">;
+def IL_CMPVAL_NEG_0_5 : ILCmpValue<3, "-0.5">;
+def IL_CMPVAL_NEG_1_0 : ILCmpValue<4, "-1.0">;
+
+// Section 5.11 IL Addressing
+class ILAddressing<bits<3> val> {
+    bits<3> Value = val;
+}
+
+// Table 5-11
+def IL_ADDR_ABSOLUTE     : ILAddressing<0>;
+def IL_ADDR_RELATIVE     : ILAddressing<1>;
+def IL_ADDR_REG_RELATIVE : ILAddressing<2>;
+
+// Section 5.11 IL Element Format
+class ILElementFormat<bits<5> val> {
+    bits<5> Value = val;
+}
+
+// Table 5-11
+def IL_ELEMENTFORMAT_UNKNOWN : ILElementFormat<0>;
+def IL_ELEMENTFORMAT_SNORM   : ILElementFormat<1>;
+def IL_ELEMENTFORMAT_UNORM   : ILElementFormat<2>;
+def IL_ELEMENTFORMAT_SINT    : ILElementFormat<3>;
+def IL_ELEMENTFORMAT_UINT    : ILElementFormat<4>;
+def IL_ELEMENTFORMAT_FLOAT   : ILElementFormat<5>;
+def IL_ELEMENTFORMAT_SRGB    : ILElementFormat<6>;
+def IL_ELEMENTFORMAT_MIXED   : ILElementFormat<7>;
+def IL_ELEMENTFORMAT_Last    : ILElementFormat<8>;
+
+// Section 5.12 IL Op Code
+class ILOpCode<bits<16> val = -1, string cmd> {
+    bits<16> Value = val;
+    string Text = cmd;
+}
+
+// Table 5-12
+def IL_DCL_CONST_BUFFER         : ILOpCode<0, "dcl_cb">;
+def IL_DCL_INDEXED_TEMP_ARRAY   : ILOpCode<1, "dcl_index_temp_array">;
+def IL_DCL_INPUT                : ILOpCode<2, "dcl_input">;
+def IL_DCL_LITERAL              : ILOpCode<3, "dcl_literal">;
+def IL_DCL_OUTPUT               : ILOpCode<4, "dcl_output">;
+def IL_DCL_RESOURCE             : ILOpCode<5, "dcl_resource">;
+def IL_OP_ABS                   : ILOpCode<6, "abs">;
+def IL_OP_ADD                   : ILOpCode<7, "add">;
+def IL_OP_AND                   : ILOpCode<8, "iand">;
+def IL_OP_BREAK                 : ILOpCode<9, "break">;
+def IL_OP_BREAK_LOGICALNZ       : ILOpCode<10, "break_logicalnz">;
+def IL_OP_BREAK_LOGICALZ        : ILOpCode<11, "break_logicalz">;
+def IL_OP_BREAKC                : ILOpCode<12, "breakc">;
+def IL_OP_CALL                  : ILOpCode<13, "call">;
+def IL_OP_CALL_LOGICALNZ        : ILOpCode<14, "call_logicalnz">;
+def IL_OP_CALL_LOGICALZ         : ILOpCode<15, "call_logicalz">;
+def IL_OP_CASE                  : ILOpCode<16, "case">;
+def IL_OP_CLG                   : ILOpCode<17, "clg">;
+def IL_OP_CMOV                  : ILOpCode<18, "cmov">;
+def IL_OP_CMOV_LOGICAL          : ILOpCode<19, "cmov_logical">;
+def IL_OP_CMP                   : ILOpCode<20, "cmp">;
+def IL_OP_CONTINUE              : ILOpCode<21, "continue">;
+def IL_OP_CONTINUE_LOGICALNZ    : ILOpCode<22, "continue_logicalnz">;
+def IL_OP_CONTINUE_LOGICALZ     : ILOpCode<23, "continue_logicalz">;
+def IL_OP_CONTINUEC             : ILOpCode<24, "continuec">;
+def IL_OP_COS                   : ILOpCode<25, "cos">;
+def IL_OP_COS_VEC               : ILOpCode<26, "cos_vec">;
+def IL_OP_D_2_F                 : ILOpCode<27, "d2f">;
+def IL_OP_D_ADD                 : ILOpCode<28, "dadd">;
+def IL_OP_D_EQ                  : ILOpCode<29, "deq">;
+def IL_OP_D_FRC                 : ILOpCode<30, "dfrac">;
+def IL_OP_D_FREXP               : ILOpCode<31, "dfrexp">;
+def IL_OP_D_GE                  : ILOpCode<32, "dge">;
+def IL_OP_D_LDEXP               : ILOpCode<33, "dldexp">;
+def IL_OP_D_LT                  : ILOpCode<34, "dlt">;
+def IL_OP_D_MAD                 : ILOpCode<35, "dmad">;
+def IL_OP_D_MUL                 : ILOpCode<36, "dmul">;
+def IL_OP_D_NE                  : ILOpCode<37, "dne">;
+def IL_OP_DEFAULT               : ILOpCode<38, "default">;
+def IL_OP_DISCARD_LOGICALNZ     : ILOpCode<39, "discard_logicalnz">;
+def IL_OP_DISCARD_LOGICALZ      : ILOpCode<40, "discard_logicalz">;
+def IL_OP_DIV                   : ILOpCode<41, "div_zeroop(infinity)">;
+def IL_OP_DP2                   : ILOpCode<42, "dp2">;
+def IL_OP_DP3                   : ILOpCode<43, "dp3">;
+def IL_OP_DP4                   : ILOpCode<44, "dp4">;
+def IL_OP_ELSE                  : ILOpCode<45, "else">;
+def IL_OP_END                   : ILOpCode<46, "end">;
+def IL_OP_ENDFUNC               : ILOpCode<47, "endfunc">;
+def IL_OP_ENDIF                 : ILOpCode<48, "endif">;
+def IL_OP_ENDLOOP               : ILOpCode<49, "endloop">;
+def IL_OP_ENDMAIN               : ILOpCode<50, "endmain">;
+def IL_OP_ENDSWITCH             : ILOpCode<51, "endswitch">;
+def IL_OP_EQ                    : ILOpCode<52, "eq">;
+def IL_OP_EXP                   : ILOpCode<53, "exp">;
+def IL_OP_EXP_VEC               : ILOpCode<54, "exp_vec">;
+def IL_OP_F_2_D                 : ILOpCode<55, "f2d">;
+def IL_OP_FLR                   : ILOpCode<56, "flr">;
+def IL_OP_FRC                   : ILOpCode<57, "frc">;
+def IL_OP_FTOI                  : ILOpCode<58, "ftoi">;
+def IL_OP_FTOU                  : ILOpCode<59, "ftou">;
+def IL_OP_FUNC                  : ILOpCode<60, "func">;
+def IL_OP_GE                    : ILOpCode<61, "ge">;
+def IL_OP_I_ADD                 : ILOpCode<62, "iadd">;
+def IL_OP_I_EQ                  : ILOpCode<63, "ieq">;
+def IL_OP_I_GE                  : ILOpCode<64, "ige">;
+def IL_OP_I_LT                  : ILOpCode<65, "ilt">;
+def IL_OP_I_MAD                 : ILOpCode<66, "imad">;
+def IL_OP_I_MAX                 : ILOpCode<67, "imax">;
+def IL_OP_I_MIN                 : ILOpCode<68, "imin">;
+def IL_OP_I_MUL                 : ILOpCode<69, "imul">;
+def IL_OP_I_MUL_HIGH            : ILOpCode<70, "imul_high">;
+def IL_OP_I_NE                  : ILOpCode<71, "ine">;
+def IL_OP_I_NEGATE              : ILOpCode<72, "inegate">;
+def IL_OP_I_NOT                 : ILOpCode<73, "inot">;
+def IL_OP_I_OR                  : ILOpCode<74, "ior">;
+def IL_OP_I_SHL                 : ILOpCode<75, "ishl">;
+def IL_OP_I_SHR                 : ILOpCode<76, "ishr">;
+def IL_OP_I_XOR                 : ILOpCode<77, "ixor">;
+def IL_OP_IF_LOGICALNZ          : ILOpCode<78, "if_logicalnz">;
+def IL_OP_IF_LOGICALZ           : ILOpCode<79, "if_logicalz">;
+def IL_OP_IFC                   : ILOpCode<80, "ifc">;
+def IL_OP_ITOF                  : ILOpCode<81, "itof">;
+def IL_OP_LN                    : ILOpCode<82, "ln">;
+def IL_OP_LOG                   : ILOpCode<83, "log">;
+def IL_OP_LOG_VEC               : ILOpCode<84, "log_vec">;
+def IL_OP_LOOP                  : ILOpCode<85, "loop">;
+def IL_OP_LT                    : ILOpCode<86, "lt">;
+def IL_OP_MAD                   : ILOpCode<87, "mad_ieee">;
+def IL_OP_MAX                   : ILOpCode<88, "max_ieee">;
+def IL_OP_MIN                   : ILOpCode<89, "min_ieee">;
+def IL_OP_MOD                   : ILOpCode<90, "mod_ieee">;
+def IL_OP_MOV                   : ILOpCode<91, "mov">;
+def IL_OP_MUL_IEEE              : ILOpCode<92, "mul_ieee">;
+def IL_OP_NE                    : ILOpCode<93, "ne">;
+def IL_OP_NRM                   : ILOpCode<94, "nrm_nrm4_zeroop(zero)">;
+def IL_OP_POW                   : ILOpCode<95, "pow">;
+def IL_OP_RCP                   : ILOpCode<96, "rcp">;
+def IL_OP_RET                   : ILOpCode<97, "ret">;
+def IL_OP_RET_DYN               : ILOpCode<98, "ret_dyn">;
+def IL_OP_RET_LOGICALNZ         : ILOpCode<99, "ret_logicalnz">;
+def IL_OP_RET_LOGICALZ          : ILOpCode<100, "ret_logicalz">;
+def IL_OP_RND                   : ILOpCode<101, "rnd">;
+def IL_OP_ROUND_NEAR            : ILOpCode<102, "round_nearest">;
+def IL_OP_ROUND_NEG_INF         : ILOpCode<103, "round_neginf">;
+def IL_OP_ROUND_POS_INF         : ILOpCode<104, "round_plusinf">;
+def IL_OP_ROUND_ZERO            : ILOpCode<105, "round_z">;
+def IL_OP_RSQ                   : ILOpCode<106, "rsq">;
+def IL_OP_RSQ_VEC               : ILOpCode<107, "rsq_vec">;
+def IL_OP_SAMPLE                : ILOpCode<108, "sample">;
+def IL_OP_SAMPLE_L              : ILOpCode<109, "sample_l">;
+def IL_OP_SET                   : ILOpCode<110, "set">;
+def IL_OP_SGN                   : ILOpCode<111, "sgn">;
+def IL_OP_SIN                   : ILOpCode<112, "sin">;
+def IL_OP_SIN_VEC               : ILOpCode<113, "sin_vec">;
+def IL_OP_SUB                   : ILOpCode<114, "sub">;
+def IL_OP_SWITCH                : ILOpCode<115, "switch">;
+def IL_OP_TRC                   : ILOpCode<116, "trc">;
+def IL_OP_U_DIV                 : ILOpCode<117, "udiv">;
+def IL_OP_U_GE                  : ILOpCode<118, "uge">;
+def IL_OP_U_LT                  : ILOpCode<119, "ult">;
+def IL_OP_U_MAD                 : ILOpCode<120, "umad">;
+def IL_OP_U_MAX                 : ILOpCode<121, "umax">;
+def IL_OP_U_MIN                 : ILOpCode<122, "umin">;
+def IL_OP_U_MOD                 : ILOpCode<123, "umod">;
+def IL_OP_U_MUL                 : ILOpCode<124, "umul">;
+def IL_OP_U_MUL_HIGH            : ILOpCode<125, "umul_high">;
+def IL_OP_U_SHR                 : ILOpCode<126, "ushr">;
+def IL_OP_UTOF                  : ILOpCode<127, "utof">;
+def IL_OP_WHILE                 : ILOpCode<128, "whileloop">;
+// SC IL instructions that are not in CAL IL
+def IL_OP_ACOS                  : ILOpCode<129, "acos">;
+def IL_OP_ASIN                  : ILOpCode<130, "asin">;
+def IL_OP_EXN                   : ILOpCode<131, "exn">;
+def IL_OP_UBIT_REVERSE          : ILOpCode<132, "ubit_reverse">;
+def IL_OP_UBIT_EXTRACT          : ILOpCode<133, "ubit_extract">;
+def IL_OP_IBIT_EXTRACT          : ILOpCode<134, "ibit_extract">;
+def IL_OP_SQRT                  : ILOpCode<135, "sqrt">;
+def IL_OP_SQRT_VEC              : ILOpCode<136, "sqrt_vec">;
+def IL_OP_ATAN                  : ILOpCode<137, "atan">;
+def IL_OP_TAN                   : ILOpCode<137, "tan">;
+def IL_OP_D_DIV                 : ILOpCode<138, "ddiv">;
+def IL_OP_F_NEG                 : ILOpCode<139, "mov">;
+def IL_OP_GT                    : ILOpCode<140, "gt">;
+def IL_OP_LE                    : ILOpCode<141, "lt">;
+def IL_OP_DIST                  : ILOpCode<142, "dist">;
+def IL_OP_LEN                   : ILOpCode<143, "len">;
+def IL_OP_MACRO                 : ILOpCode<144, "mcall">;
+def IL_OP_INTR                  : ILOpCode<145, "call">;
+def IL_OP_I_FFB_HI              : ILOpCode<146, "ffb_hi">;
+def IL_OP_I_FFB_LO              : ILOpCode<147, "ffb_lo">;
+def IL_OP_BARRIER               : ILOpCode<148, "fence_threads_memory_lds">;
+def IL_OP_BARRIER_LOCAL         : ILOpCode<149, "fence_threads_lds">;
+def IL_OP_BARRIER_GLOBAL        : ILOpCode<150, "fence_threads_memory">;
+def IL_OP_FENCE                 : ILOpCode<151, "fence_lds_memory">;
+def IL_OP_FENCE_READ_ONLY       : ILOpCode<152, "fence_lds_mem_read_only">;
+def IL_OP_FENCE_WRITE_ONLY      : ILOpCode<153, "fence_lds_mem_write_only">;
+def IL_PSEUDO_INST              : ILOpCode<154, ";Pseudo Op">;
+def IL_OP_UNPACK_0              : ILOpCode<155, "unpack0">;
+def IL_OP_UNPACK_1              : ILOpCode<156, "unpack1">;
+def IL_OP_UNPACK_2              : ILOpCode<157, "unpack2">;
+def IL_OP_UNPACK_3              : ILOpCode<158, "unpack3">;
+def IL_OP_PI_REDUCE             : ILOpCode<159, "pireduce">;
+def IL_OP_IBIT_COUNT            : ILOpCode<160, "icbits">;
+def IL_OP_I_FFB_SGN             : ILOpCode<161, "ffb_shi">;
+def IL_OP_F2U4                  : ILOpCode<162, "f_2_u4">;
+def IL_OP_BIT_ALIGN             : ILOpCode<163, "bitalign">;
+def IL_OP_BYTE_ALIGN            : ILOpCode<164, "bytealign">;
+def IL_OP_U4_LERP               : ILOpCode<165, "u4lerp">;
+def IL_OP_SAD                   : ILOpCode<166, "sad">;
+def IL_OP_SAD_HI                : ILOpCode<167, "sadhi">;
+def IL_OP_SAD4                  : ILOpCode<168, "sad4">;
+def IL_OP_UBIT_INSERT           : ILOpCode<169, "ubit_insert">;
+def IL_OP_I_CARRY               : ILOpCode<170, "icarry">;
+def IL_OP_I_BORROW              : ILOpCode<171, "iborrow">;
+def IL_OP_U_MAD24               : ILOpCode<172, "umad24">;
+def IL_OP_U_MUL24               : ILOpCode<173, "umul24">;
+def IL_OP_I_MAD24               : ILOpCode<174, "imad24">;
+def IL_OP_I_MUL24               : ILOpCode<175, "imul24">;
+def IL_OP_CLAMP                 : ILOpCode<176, "clamp">;
+def IL_OP_LERP                  : ILOpCode<177, "lrp">;
+def IL_OP_FMA                   : ILOpCode<178, "fma">;
+def IL_OP_D_MIN                 : ILOpCode<179, "dmin">;
+def IL_OP_D_MAX                 : ILOpCode<180, "dmax">;
+def IL_OP_D_SQRT                : ILOpCode<181, "dsqrt">;
+def IL_OP_DP2_ADD               : ILOpCode<182, "dp2add">;
+def IL_OP_F16_TO_F32            : ILOpCode<183, "f162f">;
+def IL_OP_F32_TO_F16            : ILOpCode<184, "f2f16">;
+def IL_REG_LOCAL_ID_FLAT        : ILOpCode<185, "vTidInGrpFlat">;
+def IL_REG_LOCAL_ID             : ILOpCode<186, "vTidInGrp">;
+def IL_REG_GLOBAL_ID_FLAT       : ILOpCode<187, "vAbsTidFlat">;
+def IL_REG_GLOBAL_ID            : ILOpCode<188, "vAbsTid">;
+def IL_REG_GROUP_ID_FLAT        : ILOpCode<189, "vThreadGrpIDFlat">;
+def IL_REG_GROUP_ID             : ILOpCode<190, "vThreadGrpID">;
+def IL_OP_D_RCP                 : ILOpCode<191, "drcp_zeroop(infinity)">;
+def IL_OP_D_RSQ                 : ILOpCode<192, "drsq_zeroop(infinity)">;
+def IL_OP_D_MOV                 : ILOpCode<193, "dmov">;
+def IL_OP_D_MOVC                : ILOpCode<194, "dmovc">;
+def IL_OP_NOP                   : ILOpCode<195, "nop">;
+def IL_OP_UAV_ADD               : ILOpCode<196, "uav_add">;
+def IL_OP_UAV_AND               : ILOpCode<197, "uav_and">;
+def IL_OP_UAV_MAX               : ILOpCode<198, "uav_max">;
+def IL_OP_UAV_MIN               : ILOpCode<199, "uav_min">;
+def IL_OP_UAV_OR                : ILOpCode<200, "uav_or">;
+def IL_OP_UAV_RSUB              : ILOpCode<201, "uav_rsub">;
+def IL_OP_UAV_SUB               : ILOpCode<202, "uav_sub">;
+def IL_OP_UAV_UMAX              : ILOpCode<203, "uav_umax">;
+def IL_OP_UAV_UMIN              : ILOpCode<204, "uav_umin">;
+def IL_OP_UAV_XOR               : ILOpCode<205, "uav_xor">;
+def IL_OP_UAV_INC               : ILOpCode<206, "uav_uinc">;
+def IL_OP_UAV_DEC               : ILOpCode<207, "uav_udec">;
+def IL_OP_UAV_CMP               : ILOpCode<208, "uav_cmp">;
+def IL_OP_UAV_READ_ADD          : ILOpCode<209, "uav_read_add">;
+def IL_OP_UAV_READ_AND          : ILOpCode<210, "uav_read_and">;
+def IL_OP_UAV_READ_MAX          : ILOpCode<211, "uav_read_max">;
+def IL_OP_UAV_READ_MIN          : ILOpCode<212, "uav_read_min">;
+def IL_OP_UAV_READ_OR           : ILOpCode<213, "uav_read_or">;
+def IL_OP_UAV_READ_RSUB         : ILOpCode<214, "uav_read_rsub">;
+def IL_OP_UAV_READ_SUB          : ILOpCode<215, "uav_read_sub">;
+def IL_OP_UAV_READ_UMAX         : ILOpCode<216, "uav_read_umax">;
+def IL_OP_UAV_READ_UMIN         : ILOpCode<217, "uav_read_umin">;
+def IL_OP_UAV_READ_XOR          : ILOpCode<218, "uav_read_xor">;
+def IL_OP_UAV_READ_INC          : ILOpCode<219, "uav_read_uinc">;
+def IL_OP_UAV_READ_DEC          : ILOpCode<220, "uav_read_udec">;
+def IL_OP_UAV_READ_XCHG         : ILOpCode<221, "uav_read_xchg">;
+def IL_OP_UAV_READ_CMPXCHG      : ILOpCode<222, "uav_read_cmp_xchg">;
+def IL_OP_LDS_ADD               : ILOpCode<223, "lds_add">;
+def IL_OP_LDS_AND               : ILOpCode<224, "lds_and">;
+def IL_OP_LDS_MAX               : ILOpCode<225, "lds_max">;
+def IL_OP_LDS_MIN               : ILOpCode<226, "lds_min">;
+def IL_OP_LDS_OR                : ILOpCode<227, "lds_or">;
+def IL_OP_LDS_RSUB              : ILOpCode<228, "lds_rsub">;
+def IL_OP_LDS_SUB               : ILOpCode<229, "lds_sub">;
+def IL_OP_LDS_UMAX              : ILOpCode<230, "lds_umax">;
+def IL_OP_LDS_UMIN              : ILOpCode<231, "lds_umin">;
+def IL_OP_LDS_XOR               : ILOpCode<232, "lds_xor">;
+def IL_OP_LDS_INC               : ILOpCode<233, "lds_inc">;
+def IL_OP_LDS_DEC               : ILOpCode<234, "lds_dec">;
+def IL_OP_LDS_CMP               : ILOpCode<235, "lds_cmp">;
+def IL_OP_LDS_READ_ADD          : ILOpCode<236, "lds_read_add">;
+def IL_OP_LDS_READ_AND          : ILOpCode<237, "lds_read_and">;
+def IL_OP_LDS_READ_MAX          : ILOpCode<238, "lds_read_max">;
+def IL_OP_LDS_READ_MIN          : ILOpCode<239, "lds_read_min">;
+def IL_OP_LDS_READ_OR           : ILOpCode<240, "lds_read_or">;
+def IL_OP_LDS_READ_RSUB         : ILOpCode<241, "lds_read_rsub">;
+def IL_OP_LDS_READ_SUB          : ILOpCode<242, "lds_read_sub">;
+def IL_OP_LDS_READ_UMAX         : ILOpCode<243, "lds_read_umax">;
+def IL_OP_LDS_READ_UMIN         : ILOpCode<244, "lds_read_umin">;
+def IL_OP_LDS_READ_XOR          : ILOpCode<245, "lds_read_xor">;
+def IL_OP_LDS_READ_INC          : ILOpCode<246, "lds_read_inc">;
+def IL_OP_LDS_READ_DEC          : ILOpCode<247, "lds_read_dec">;
+def IL_OP_LDS_READ_XCHG         : ILOpCode<248, "lds_read_xchg">;
+def IL_OP_LDS_READ_CMPXCHG      : ILOpCode<249, "lds_read_cmp_xchg">;
+def IL_OP_GDS_ADD               : ILOpCode<250, "gds_add">;
+def IL_OP_GDS_AND               : ILOpCode<251, "gds_and">;
+def IL_OP_GDS_MAX               : ILOpCode<252, "gds_max">;
+def IL_OP_GDS_MIN               : ILOpCode<253, "gds_min">;
+def IL_OP_GDS_OR                : ILOpCode<254, "gds_or">;
+def IL_OP_GDS_RSUB              : ILOpCode<255, "gds_rsub">;
+def IL_OP_GDS_SUB               : ILOpCode<256, "gds_sub">;
+def IL_OP_GDS_UMAX              : ILOpCode<257, "gds_umax">;
+def IL_OP_GDS_UMIN              : ILOpCode<258, "gds_umin">;
+def IL_OP_GDS_MSKOR             : ILOpCode<259, "gds_mskor">;
+def IL_OP_GDS_XOR               : ILOpCode<260, "gds_xor">;
+def IL_OP_GDS_INC               : ILOpCode<261, "gds_inc">;
+def IL_OP_GDS_DEC               : ILOpCode<262, "gds_dec">;
+def IL_OP_GDS_CMP               : ILOpCode<263, "gds_cmp">;
+def IL_OP_GDS_READ_ADD          : ILOpCode<264, "gds_read_add">;
+def IL_OP_GDS_READ_AND          : ILOpCode<265, "gds_read_and">;
+def IL_OP_GDS_READ_MAX          : ILOpCode<266, "gds_read_max">;
+def IL_OP_GDS_READ_MIN          : ILOpCode<267, "gds_read_min">;
+def IL_OP_GDS_READ_OR           : ILOpCode<268, "gds_read_or">;
+def IL_OP_GDS_READ_RSUB         : ILOpCode<269, "gds_read_rsub">;
+def IL_OP_GDS_READ_SUB          : ILOpCode<270, "gds_read_sub">;
+def IL_OP_GDS_READ_UMAX         : ILOpCode<271, "gds_read_umax">;
+def IL_OP_GDS_READ_UMIN         : ILOpCode<272, "gds_read_umin">;
+def IL_OP_GDS_READ_MSKOR        : ILOpCode<273, "gds_read_mskor">;
+def IL_OP_GDS_READ_XOR          : ILOpCode<274, "gds_read_xor">;
+def IL_OP_GDS_READ_INC          : ILOpCode<275, "gds_read_inc">;
+def IL_OP_GDS_READ_DEC          : ILOpCode<276, "gds_read_dec">;
+def IL_OP_GDS_READ_XCHG         : ILOpCode<277, "gds_read_xchg">;
+def IL_OP_GDS_READ_CMPXCHG      : ILOpCode<278, "gds_read_cmp_xchg">;
+def IL_OP_APPEND_BUF_ALLOC      : ILOpCode<279, "append_buf_alloc">;
+def IL_OP_APPEND_BUF_CONSUME    : ILOpCode<280, "append_buf_consume">;
+def IL_OP_I64_ADD               : ILOpCode<281, "i64add">;
+def IL_OP_I64_MAX               : ILOpCode<282, "i64max">;
+def IL_OP_U64_MAX               : ILOpCode<283, "u64max">;
+def IL_OP_I64_MIN               : ILOpCode<284, "i64min">;
+def IL_OP_U64_MIN               : ILOpCode<285, "u64min">;
+def IL_OP_I64_NEGATE            : ILOpCode<286, "i64negate">;
+def IL_OP_I64_SHL               : ILOpCode<287, "i64shl">;
+def IL_OP_I64_SHR               : ILOpCode<288, "i64shr">;
+def IL_OP_U64_SHR               : ILOpCode<289, "u64shr">;
+def IL_OP_I64_EQ                : ILOpCode<290, "i64eq">;
+def IL_OP_I64_GE                : ILOpCode<291, "i64ge">;
+def IL_OP_U64_GE                : ILOpCode<292, "u64ge">;
+def IL_OP_I64_LT                : ILOpCode<293, "i64lt">;
+def IL_OP_U64_LT                : ILOpCode<294, "u64lt">;
+def IL_OP_I64_NE                : ILOpCode<295, "i64ne">;
+def IL_OP_U_MULHI24             : ILOpCode<296, "umul24_high">;
+def IL_OP_I_MULHI24             : ILOpCode<297, "imul24_high">;
+def IL_OP_GDS_LOAD              : ILOpCode<298, "gds_load">;
+def IL_OP_GDS_STORE             : ILOpCode<299, "gds_store">;
+def IL_OP_LDS_LOAD              : ILOpCode<300, "lds_load">;
+def IL_OP_LDS_LOAD_VEC          : ILOpCode<301, "lds_load_vec">;
+def IL_OP_LDS_LOAD_BYTE         : ILOpCode<302, "lds_load_byte">;
+def IL_OP_LDS_LOAD_UBYTE        : ILOpCode<303, "lds_load_ubyte">;
+def IL_OP_LDS_LOAD_SHORT        : ILOpCode<304, "lds_load_short">;
+def IL_OP_LDS_LOAD_USHORT       : ILOpCode<305, "lds_load_ushort">;
+def IL_OP_LDS_STORE             : ILOpCode<306, "lds_store">;
+def IL_OP_LDS_STORE_VEC         : ILOpCode<307, "lds_store_vec">;
+def IL_OP_LDS_STORE_BYTE        : ILOpCode<308, "lds_store_byte">;
+def IL_OP_LDS_STORE_SHORT       : ILOpCode<309, "lds_store_short">;
+def IL_OP_RAW_UAV_LOAD          : ILOpCode<310, "uav_raw_load">;
+def IL_OP_RAW_UAV_STORE         : ILOpCode<311, "uav_raw_store">;
+def IL_OP_ARENA_UAV_LOAD        : ILOpCode<312, "uav_arena_load">;
+def IL_OP_ARENA_UAV_STORE       : ILOpCode<313, "uav_arena_store">;
+def IL_OP_LDS_MSKOR             : ILOpCode<314, "lds_mskor">;
+def IL_OP_LDS_READ_MSKOR        : ILOpCode<315, "lds_read_mskor">;
+def IL_OP_UAV_BYTE_LOAD         : ILOpCode<316, "uav_byte_load">;
+def IL_OP_UAV_UBYTE_LOAD        : ILOpCode<317, "uav_ubyte_load">;
+def IL_OP_UAV_SHORT_LOAD        : ILOpCode<318, "uav_short_load">;
+def IL_OP_UAV_USHORT_LOAD       : ILOpCode<319, "uav_ushort_load">;
+def IL_OP_UAV_BYTE_STORE        : ILOpCode<320, "uav_byte_store">;
+def IL_OP_UAV_SHORT_STORE       : ILOpCode<320, "uav_short_store">;
+def IL_OP_UAV_STORE             : ILOpCode<321, "uav_store">;
+def IL_OP_UAV_LOAD              : ILOpCode<322, "uav_load">;
+def IL_OP_MUL                   : ILOpCode<323, "mul">;
+def IL_OP_DIV_INF               : ILOpCode<324, "div_zeroop(infinity)">;
+def IL_OP_DIV_FLTMAX            : ILOpCode<325, "div_zeroop(fltmax)">;
+def IL_OP_DIV_ZERO              : ILOpCode<326, "div_zeroop(zero)">;
+def IL_OP_DIV_INFELSEMAX        : ILOpCode<327, "div_zeroop(inf_else_max)">;
+def IL_OP_FTOI_FLR              : ILOpCode<328, "ftoi_flr">;
+def IL_OP_FTOI_RPI              : ILOpCode<329, "ftoi_rpi">;
+def IL_OP_F32_TO_F16_NEAR       : ILOpCode<330, "f2f16_near">;
+def IL_OP_F32_TO_F16_NEG_INF    : ILOpCode<331, "f2f16_neg_inf">;
+def IL_OP_F32_TO_F16_PLUS_INF   : ILOpCode<332, "f2f16_plus_inf">;
+def IL_OP_I64_MUL               : ILOpCode<333, "i64mul">;
+def IL_OP_U64_MUL               : ILOpCode<334, "u64mul">;
+def IL_OP_CU_ID                 : ILOpCode<355, "cu_id">;
+def IL_OP_WAVE_ID               : ILOpCode<356, "wave_id">;
+def IL_OP_I64_SUB               : ILOpCode<357, "i64sub">;
+def IL_OP_I64_DIV               : ILOpCode<358, "i64div">;
+def IL_OP_U64_DIV               : ILOpCode<359, "u64div">;
+def IL_OP_I64_MOD               : ILOpCode<360, "i64mod">;
+def IL_OP_U64_MOD               : ILOpCode<361, "u64mod">;
+def IL_DCL_GWS_THREAD_COUNT     : ILOpCode<362, "dcl_gws_thread_count">;
+def IL_DCL_SEMAPHORE            : ILOpCode<363, "dcl_semaphore">;
+def IL_OP_SEMAPHORE_INIT        : ILOpCode<364, "init_semaphore">;
+def IL_OP_SEMAPHORE_WAIT        : ILOpCode<365, "semaphore_wait">;
+def IL_OP_SEMAPHORE_SIGNAL      : ILOpCode<366, "semaphore_signal">;
+def IL_OP_BARRIER_REGION        : ILOpCode<377, "fence_threads_gds">;
+def IL_OP_BFI                   : ILOpCode<394, "bfi">;
+def IL_OP_BFM                   : ILOpCode<395, "bfm">;
+def IL_DBG_STRING               : ILOpCode<396, "dbg_string">;
+def IL_DBG_LINE                 : ILOpCode<397, "dbg_line">;
+def IL_DBG_TEMPLOC              : ILOpCode<398, "dbg_temploc">;
diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
new file mode 100644 (file)
index 0000000..1af2806
--- /dev/null
@@ -0,0 +1,211 @@
+//===-- AMDILEvergreenDevice.cpp - Evergreen Device Information --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILEvergreenDevice.h"
+#ifdef UPSTREAM_LLVM
+#include "AMDILEGAsmPrinter.h"
+#endif
+#include "AMDILIOExpansion.h"
+#include "AMDILPointerManager.h"
+
+using namespace llvm;
+
+AMDILEvergreenDevice::AMDILEvergreenDevice(AMDILSubtarget *ST)
+: AMDILDevice(ST) {
+  setCaps();
+  std::string name = ST->getDeviceName();
+  if (name == "cedar") {
+    mDeviceFlag = OCL_DEVICE_CEDAR;
+  } else if (name == "redwood") {
+    mDeviceFlag = OCL_DEVICE_REDWOOD;
+  } else if (name == "cypress") {
+    mDeviceFlag = OCL_DEVICE_CYPRESS;
+  } else {
+    mDeviceFlag = OCL_DEVICE_JUNIPER;
+  }
+}
+
+AMDILEvergreenDevice::~AMDILEvergreenDevice() {
+}
+
+size_t AMDILEvergreenDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+size_t AMDILEvergreenDevice::getMaxGDSSize() const {
+  if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+    return MAX_LDS_SIZE_800;
+  } else {
+    return 0;
+  }
+}
+uint32_t AMDILEvergreenDevice::getMaxNumUAVs() const {
+  return 12;
+}
+
+uint32_t AMDILEvergreenDevice::getResourceID(uint32_t id) const {
+  switch(id) {
+  default:
+    assert(0 && "ID type passed in is unknown!");
+    break;
+  case CONSTANT_ID:
+  case RAW_UAV_ID:
+    if (mSTM->calVersion() >= CAL_VERSION_GLOBAL_RETURN_BUFFER) {
+      return GLOBAL_RETURN_RAW_UAV_ID;
+    } else {
+      return DEFAULT_RAW_UAV_ID;
+    }
+  case GLOBAL_ID:
+  case ARENA_UAV_ID:
+    return DEFAULT_ARENA_UAV_ID;
+  case LDS_ID:
+    if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+      return DEFAULT_LDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case GDS_ID:
+    if (usesHardware(AMDILDeviceInfo::RegionMem)) {
+      return DEFAULT_GDS_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  case SCRATCH_ID:
+    if (usesHardware(AMDILDeviceInfo::PrivateMem)) {
+      return DEFAULT_SCRATCH_ID;
+    } else {
+      return DEFAULT_ARENA_UAV_ID;
+    }
+  }
+  return 0;
+}
+
+size_t AMDILEvergreenDevice::getWavefrontSize() const {
+  return AMDILDevice::WavefrontSize;
+}
+
+uint32_t AMDILEvergreenDevice::getGeneration() const {
+  return AMDILDeviceInfo::HD5XXX;
+}
+
+void AMDILEvergreenDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::ArenaSegment);
+  mHWBits.set(AMDILDeviceInfo::ArenaUAV);
+  if (mSTM->calVersion() >= CAL_VERSION_SC_140) {
+    mHWBits.set(AMDILDeviceInfo::HW64BitDivMod);
+    mSWBits.reset(AMDILDeviceInfo::HW64BitDivMod);
+  }
+  mSWBits.set(AMDILDeviceInfo::Signed24BitOps);
+  if (mSTM->isOverride(AMDILDeviceInfo::ByteStores)) {
+    mHWBits.set(AMDILDeviceInfo::ByteStores);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::Debug)) {
+    mSWBits.set(AMDILDeviceInfo::LocalMem);
+    mSWBits.set(AMDILDeviceInfo::RegionMem);
+  } else {
+    mHWBits.set(AMDILDeviceInfo::LocalMem);
+    mHWBits.set(AMDILDeviceInfo::RegionMem);
+  }
+  mHWBits.set(AMDILDeviceInfo::Images);
+  if (mSTM->isOverride(AMDILDeviceInfo::NoAlias)) {
+    mHWBits.set(AMDILDeviceInfo::NoAlias);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_GLOBAL_RETURN_BUFFER) {
+    mHWBits.set(AMDILDeviceInfo::CachedMem);
+  }
+  if (mSTM->isOverride(AMDILDeviceInfo::MultiUAV)) {
+    mHWBits.set(AMDILDeviceInfo::MultiUAV);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_SC_136) {
+    mHWBits.set(AMDILDeviceInfo::ByteLDSOps);
+    mSWBits.reset(AMDILDeviceInfo::ByteLDSOps);
+    mHWBits.set(AMDILDeviceInfo::ArenaVectors);
+  } else {
+    mSWBits.set(AMDILDeviceInfo::ArenaVectors);
+  }
+  if (mSTM->calVersion() > CAL_VERSION_SC_137) {
+    mHWBits.set(AMDILDeviceInfo::LongOps);
+    mSWBits.reset(AMDILDeviceInfo::LongOps);
+  }
+  mHWBits.set(AMDILDeviceInfo::TmrReg);
+}
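setCaps() keeps two bit sets per device: features done in hardware (mHWBits) and features emulated in software (mSWBits), and when a CAL version makes a feature available in hardware the software fallback bit is cleared. A minimal sketch of that bookkeeping, with hypothetical feature indices:

```cpp
#include <bitset>
#include <cassert>

// Hypothetical feature indices for illustration.
enum Feature { LongOps = 0, ByteLDSOps = 1, NumFeatures = 2 };

struct Caps {
  std::bitset<NumFeatures> hw;  // hardware-supported features
  std::bitset<NumFeatures> sw;  // software-emulated features
  // Mirror the "mHWBits.set(F); mSWBits.reset(F);" idiom above.
  void promoteToHW(Feature f) { hw.set(f); sw.reset(f); }
};
```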
+FunctionPass* 
+AMDILEvergreenDevice::getIOExpansion(
+    TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const
+{
+  return new AMDILEGIOExpansion(TM AMDIL_OPT_LEVEL_VAR);
+}
+
+AsmPrinter*
+AMDILEvergreenDevice::getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const
+{
+#ifdef UPSTREAM_LLVM
+  return new AMDILEGAsmPrinter(TM, Streamer);
+#else
+  return NULL;
+#endif
+}
+
+FunctionPass*
+AMDILEvergreenDevice::getPointerManager(
+    TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const
+{
+  return new AMDILEGPointerManager(TM AMDIL_OPT_LEVEL_VAR);
+}
+
+AMDILCypressDevice::AMDILCypressDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILCypressDevice::~AMDILCypressDevice() {
+}
+
+void AMDILCypressDevice::setCaps() {
+  if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDILDeviceInfo::DoubleOps);
+    mHWBits.set(AMDILDeviceInfo::FMA);
+  }
+}
+
+
+AMDILCedarDevice::AMDILCedarDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILCedarDevice::~AMDILCedarDevice() {
+}
+
+void AMDILCedarDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::FMA);
+}
+
+size_t AMDILCedarDevice::getWavefrontSize() const {
+  return AMDILDevice::QuarterWavefrontSize;
+}
+
+AMDILRedwoodDevice::AMDILRedwoodDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDILRedwoodDevice::~AMDILRedwoodDevice()
+{
+}
+
+void AMDILRedwoodDevice::setCaps() {
+  mSWBits.set(AMDILDeviceInfo::FMA);
+}
+
+size_t AMDILRedwoodDevice::getWavefrontSize() const {
+  return AMDILDevice::HalfWavefrontSize;
+}
diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.h b/src/gallium/drivers/radeon/AMDILEvergreenDevice.h
new file mode 100644 (file)
index 0000000..726b479
--- /dev/null
@@ -0,0 +1,93 @@
+//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===----------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILEVERGREENDEVICE_H_
+#define _AMDILEVERGREENDEVICE_H_
+#include "AMDILDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===----------------------------------------------------------------------===//
+// Evergreen generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+
+// The AMDILEvergreenDevice is the base device class for all of the Evergreen
+// series of cards. This class contains information required to differentiate
+// the Evergreen device from the generic AMDILDevice. This device represents
+// the capabilities of the 'Juniper' cards, also known as the HD57XX.
+class AMDILEvergreenDevice : public AMDILDevice {
+public:
+  AMDILEvergreenDevice(AMDILSubtarget *ST);
+  virtual ~AMDILEvergreenDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getMaxGDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getMaxNumUAVs() const;
+  virtual uint32_t getResourceID(uint32_t) const;
+  virtual FunctionPass*
+    getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+  virtual AsmPrinter*
+    getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const;
+  virtual FunctionPass*
+    getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+protected:
+  virtual void setCaps();
+}; // AMDILEvergreenDevice
+
+// The AMDILCypressDevice is similar to the AMDILEvergreenDevice, except it has
+// support for double precision operations. This device is used to represent
+// both the Cypress and Hemlock cards, which are commercially known as HD58XX
+// and HD59XX cards.
+class AMDILCypressDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCypressDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCypressDevice();
+private:
+  virtual void setCaps();
+}; // AMDILCypressDevice
+
+
+// The AMDILCedarDevice is the class that represents all of the 'Cedar' based
+// devices. This class differs from the base AMDILEvergreenDevice in that the
+// device is roughly a quarter of a 'Juniper'. These are commercially known as the
+// HD54XX and HD53XX series of cards.
+class AMDILCedarDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCedarDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCedarDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILCedarDevice
+
+// The AMDILRedwoodDevice is the class that represents all of the 'Redwood' based
+// devices. This class differs from the base class, in that these devices are
+// considered about half of a 'Juniper' device. These are commercially known as
+// the HD55XX and HD56XX series of cards.
+class AMDILRedwoodDevice : public AMDILEvergreenDevice {
+public:
+  AMDILRedwoodDevice(AMDILSubtarget *ST);
+  virtual ~AMDILRedwoodDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILRedwoodDevice
+  
+} // namespace llvm
+#endif // _AMDILEVERGREENDEVICE_H_
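The header above declares a small hierarchy in which each derived device overrides getWavefrontSize(): Cedar is a quarter wavefront and Redwood a half wavefront relative to the Juniper baseline. A standalone sketch of that virtual dispatch; the concrete sizes (64/32/16) are assumptions matching typical full/half/quarter values, not constants taken from the driver:

```cpp
#include <cassert>
#include <cstddef>

// Base device: full wavefront (assumed 64 lanes for illustration).
struct Device {
  virtual ~Device() {}
  virtual std::size_t getWavefrontSize() const { return 64; }
};

// Cedar-like device: quarter wavefront.
struct CedarDevice : Device {
  std::size_t getWavefrontSize() const override { return 16; }
};

// Redwood-like device: half wavefront.
struct RedwoodDevice : Device {
  std::size_t getWavefrontSize() const override { return 32; }
};
```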
diff --git a/src/gallium/drivers/radeon/AMDILFormats.td b/src/gallium/drivers/radeon/AMDILFormats.td
new file mode 100644 (file)
index 0000000..99489e7
--- /dev/null
@@ -0,0 +1,450 @@
+//==- AMDILFormats.td - AMDIL Instruction Formats ----*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===--------------------------------------------------------------------===//
+include "AMDILTokenDesc.td"
+
+//===--------------------------------------------------------------------===//
+// The parent IL instruction class that inherits the Instruction class. This
+// class sets the corresponding namespace, the output and input dag lists, the
+// pattern to match, and the string to print out for the assembly printer.
+//===--------------------------------------------------------------------===//
+class ILFormat<ILOpCode op, dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+     let Namespace = "AMDIL";
+     dag OutOperandList = outs;
+     dag InOperandList = ins;
+     ILOpCode operation = op;
+     let Pattern = pattern;
+     let AsmString = !strconcat(asmstr, "\n");
+     let isPseudo = 1;
+     bit hasIEEEFlag = 0;
+     bit hasZeroOpFlag = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// The base class for vector insert instructions. It is a single dest, quad
+// source instruction where the last two source operands must be 32bit
+// immediate values that encode the swizzle of the source register.
+// The src2 and src3 operands must also be inversions of each other, such
+// that if src2 is 0x1000300(x0z0), src3 must be 0x20004(0y0w). The values
+// are encoded as a 32bit integer with each byte representing a swizzle value.
+// The encoding is as follows for 32bit register types:
+// 0x00 -> '_'
+// 0x01 -> 'x'
+// 0x02 -> 'y'
+// 0x03 -> 'z'
+// 0x04 -> 'w'
+// 0x05 -> 'x'
+// 0x06 -> 'y'
+// 0x07 -> 'z'
+// 0x08 -> 'w'
+// 0x09 -> '0'
+// The encoding is as follows for 64bit register types:
+// 0x00 -> "__"
+// 0x01 -> "xy"
+// 0x02 -> "zw"
+// 0x03 -> "xy"
+// 0x04 -> "zw"
+// 0x05 -> "00"
+//===--------------------------------------------------------------------===//
+class InsertVectorClass<ILOpCode op, RegisterClass DReg, RegisterClass SReg,
+      SDNode OpNode, string asmstr> :
+      ILFormat<op, (outs DReg:$dst),
+      (ins DReg:$src0, SReg:$src1, i32imm:$src2, i32imm:$src3),
+      !strconcat(asmstr, " $dst, $src0, $src1"),
+      [(set DReg:$dst, (OpNode DReg:$src0, SReg:$src1,
+                     timm:$src2, timm:$src3))]>;
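The swizzle encoding documented above can be sketched as a standalone per-byte decoder using the 32bit-register table (0x00 -> '_', 0x01..0x04 and 0x05..0x08 -> x/y/z/w, 0x09 -> '0'), reading bytes from most to least significant. This is an illustration of the documented table only, not code from the backend; note the worked example in the comment writes '0' where the table maps 0x00 to '_':

```cpp
#include <cassert>
#include <cstdint>
#include <string>

// Decode a 32bit swizzle immediate into four swizzle characters,
// one per byte, most significant byte first.
std::string decodeSwizzle32(std::uint32_t enc) {
  static const char table[10] = {'_', 'x', 'y', 'z', 'w',
                                 'x', 'y', 'z', 'w', '0'};
  std::string out;
  for (int shift = 24; shift >= 0; shift -= 8) {
    unsigned byte = (enc >> shift) & 0xFFu;
    out += (byte < 10) ? table[byte] : '?';  // '?' marks values outside the table
  }
  return out;
}
```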
+
+//===--------------------------------------------------------------------===//
+// Class that has one input parameter and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0" and
+// handles the unary math operators.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, and an additional
+// ILSrc and ILSrcMod if the addressing is register relative for input and output register 0.
+//===--------------------------------------------------------------------===//
+class OneInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+     ILDst       dst_reg;
+     ILDstMod    dst_mod;
+     ILRelAddr   dst_rel;
+     ILSrc       dst_reg_rel;
+     ILSrcMod    dst_reg_rel_mod;
+     ILSrc       src0_reg;
+     ILSrcMod    src0_mod;
+     ILRelAddr   src0_rel;
+     ILSrc       src0_reg_rel;
+     ILSrcMod    src0_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// A simplified version of OneInOneOut class where the pattern is standard
+// and does not need special cases. This requires that the pattern has
+// an SDNode and takes a source and destination register that is of type
+// RegisterClass. This is the standard unary op class.
+//===--------------------------------------------------------------------===//
+class UnaryOp<ILOpCode op, SDNode OpNode,
+      RegisterClass dRegs, RegisterClass sRegs>
+      : OneInOneOut<op, (outs dRegs:$dst), (ins sRegs:$src),
+      !strconcat(op.Text, " $dst, $src"),
+      [(set dRegs:$dst, (OpNode sRegs:$src))]>;
+
+//===--------------------------------------------------------------------===//
+// This class is similar to the UnaryOp class; however, there is no
+// result value to assign.
+//===--------------------------------------------------------------------===//
+class UnaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src0_reg;
+     ILSrcMod    src0_mod;
+     ILRelAddr   src0_rel;
+     ILSrc       src0_reg_rel;
+     ILSrcMod    src0_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have two input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1" and
+// handles the binary math operators and comparison operations.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, and an additional
+// ILSrc and ILSrcMod if the addressing is register relative for input register 1.
+//===--------------------------------------------------------------------===//
+class TwoInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : OneInOneOut<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src1_reg;
+     ILSrcMod    src1_mod;
+     ILRelAddr   src1_rel;
+     ILSrc       src1_reg_rel;
+     ILSrcMod    src1_reg_rel_mod;
+}
+//===--------------------------------------------------------------------===//
+// A simplification of the TwoInOneOut pattern for Binary Operations.
+// This class is a helper class that assumes the simple pattern of
+// $dst = op $src0 $src1.
+// Other type of matching patterns need to use the TwoInOneOut class.
+//===--------------------------------------------------------------------===//
+class BinaryOp<ILOpCode op, SDNode OpNode, RegisterClass dReg,
+      RegisterClass sReg0, RegisterClass sReg1>
+      : TwoInOneOut<op, (outs dReg:$dst), (ins sReg0:$src0, sReg1:$src1),
+      !strconcat(op.Text, " $dst, $src0, $src1"),
+      [(set dReg:$dst, (OpNode sReg0:$src0, sReg1:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// The base class for vector extract instructions. The vector extract
+// instructions take as an input value a source register and a 32bit integer
+// with the same encoding as specified in InsertVectorClass and produces
+// a result with only the swizzled component in the destination register.
+//===--------------------------------------------------------------------===//
+class ExtractVectorClass<RegisterClass DReg, RegisterClass SReg, SDNode OpNode>
+: TwoInOneOut<IL_OP_MOV, (outs DReg:$dst), (ins SReg:$src0, i32imm:$src1),
+     "mov $dst, $src0",
+     [(set DReg:$dst, (OpNode SReg:$src0, timm:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// The base class for vector concatenation. This class creates either a vec2
+// or a vec4 of 32bit data types or a vec2 of 64bit data types. This is done
+// by swizzling either the 'x' or 'xy' components of the source operands
+// into the destination register.
+//===--------------------------------------------------------------------===//
+class VectorConcatClass<RegisterClass Dst, RegisterClass Src, SDNode OpNode>
+      : TwoInOneOut<IL_OP_I_ADD, (outs Dst:$dst), (ins Src:$src0, Src:$src1),
+      "iadd $dst, $src0, $src1",
+      [(set Dst:$dst, (OpNode Src:$src0, Src:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// Similar to the UnaryOpNoRet class, but takes as arguments two input
+// operands. Used mainly for barrier instructions on the PC platform.
+//===--------------------------------------------------------------------===//
+class BinaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : UnaryOpNoRet<op, outs, ins, asmstr, pattern>
+{
+     ILSrc       src1_reg;
+     ILSrcMod    src1_mod;
+     ILRelAddr   src1_rel;
+     ILSrc       src1_reg_rel;
+     ILSrcMod    src1_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have three input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1, Src2" and
+// handles the mad and conditional mov instruction.
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, and an additional
+// ILSrc and ILSrcMod if the addressing is register relative.
+// This class is the parent class of TernaryOp.
+//===--------------------------------------------------------------------===//
+class ThreeInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : TwoInOneOut<op, outs, ins, asmstr, pattern> {
+           ILSrc       src2_reg;
+           ILSrcMod    src2_mod;
+           ILRelAddr   src2_rel;
+           ILSrc       src2_reg_rel;
+           ILSrcMod    src2_reg_rel_mod;
+      }
+
+//===--------------------------------------------------------------------===//
+// The g version of the Three Input pattern uses a standard pattern but
+// allows specification of the register classes to further generalize the class.
+// This class is mainly used in the generic multiclasses in AMDILMultiClass.td.
+//===--------------------------------------------------------------------===//
+class TernaryOp<ILOpCode op, SDNode OpNode,
+      RegisterClass dReg,
+      RegisterClass sReg0,
+      RegisterClass sReg1,
+      RegisterClass sReg2>
+      : ThreeInOneOut<op, (outs dReg:$dst),
+      (ins sReg0:$src0, sReg1:$src1, sReg2:$src2),
+      !strconcat(op.Text, " $dst, $src0, $src1, $src2"),
+      [(set dReg:$dst,
+                (OpNode sReg0:$src0, sReg1:$src1, sReg2:$src2))]>;
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have four input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1, Src2, Src3".
+// It sets the binary tokens ILSrc, ILSrcMod, and ILRelAddr, and an additional
+// ILSrc and ILSrcMod if the addressing is register relative.
+//===--------------------------------------------------------------------===//
+class FourInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ThreeInOneOut<op, outs, ins, asmstr, pattern> {
+           ILSrc       src3_reg;
+           ILSrcMod    src3_mod;
+           ILRelAddr   src3_rel;
+           ILSrc       src3_reg_rel;
+           ILSrcMod    src3_reg_rel_mod;
+      }
+
+
+//===--------------------------------------------------------------------===//
+// The macro class is an extension of OneInOneOut, but is tailored for
+// macros only, where all the register types are the same.
+//===--------------------------------------------------------------------===//
+class UnaryMacro<RegisterClass Dst, RegisterClass Src0, SDNode OpNode>
+: OneInOneOut<IL_OP_MACRO, (outs Dst:$dst),
+     (ins Src0:$src0),
+     "($dst),($src0)",
+     [(set Dst:$dst, (OpNode Src0:$src0))]>;
+
+//===--------------------------------------------------------------------===//
+// The macro class is an extension of TwoInOneOut but is tailored for
+// macros only where all the register types are the same
+//===--------------------------------------------------------------------===//
+class BinaryMacro<RegisterClass Dst,
+      RegisterClass Src0,
+      RegisterClass Src1,
+      SDNode OpNode>
+      : TwoInOneOut<IL_OP_MACRO, (outs Dst:$dst),
+      (ins Src0: $src0, Src1:$src1),
+      "($dst),($src0, $src1)",
+      [(set Dst:$dst, (OpNode Src0:$src0, Src1:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// Classes for dealing with atomic instructions w/ 32bit pointers
+//===--------------------------------------------------------------------===//
+class Append<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class AppendNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR:$id))]>;
+
+class UniAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class UniAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI32:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, timm:$id))]>;
+
+class BinAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class BinAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI32:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, timm:$id))]>;
+
+class TriAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+class CmpXChg<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class TriAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class CmpXChgNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+
+//===--------------------------------------------------------------------===//
+// Classes for dealing with atomic instructions w/ 64bit pointers
+//===--------------------------------------------------------------------===//
+class Append64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR64:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class AppendNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR64:$id))]>;
+
+class UniAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class UniAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI64:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, timm:$id))]>;
+
+class BinAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class BinAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI64:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, timm:$id))]>;
+
+class TriAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+class CmpXChg64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class TriAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class CmpXChgNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+//===--------------------------------------------------------------------===//
+// Intrinsic classes
+// Generic versions of the above classes but for target-specific intrinsics
+// instead of SDNode patterns.
+//===--------------------------------------------------------------------===//
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+     class VoidIntLong :
+          Intrinsic<[llvm_i64_ty], [], []>;
+     class VoidIntInt :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class VoidIntBool :
+          Intrinsic<[llvm_i32_ty], [], []>;
+     class UnaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>;
+     class UnaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], []>;
+     class ConvertIntFTOI :
+          Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], []>;
+     class ConvertIntITOF :
+          Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], []>;
+     class UnaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty], []>;
+     class UnaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty], []>;
+     class BinaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], []>;
+     class BinaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], []>;
+     class BinaryIntNoRetInt :
+          Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
+     class BinaryIntNoRetFloat :
+          Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
+     class TernaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], []>;
+     class TernaryIntFloat :
+          Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>], []>;
+     class QuaternaryIntInt :
+          Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+          LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], []>;
+     class UnaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicInt :
+          Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+     class UnaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class BinaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+     class TernaryAtomicIntNoRet :
+          Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
diff --git a/src/gallium/drivers/radeon/AMDILFrameLowering.cpp b/src/gallium/drivers/radeon/AMDILFrameLowering.cpp
new file mode 100644 (file)
index 0000000..87eca87
--- /dev/null
@@ -0,0 +1,53 @@
+//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILFrameLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+using namespace llvm;
+AMDILFrameLowering::AMDILFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl)
+{
+}
+
+AMDILFrameLowering::~AMDILFrameLowering()
+{
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AMDILFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getObjectOffset(FI);
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDILFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const
+{
+  NumEntries = 0;
+  return 0;
+}
+void
+AMDILFrameLowering::emitPrologue(MachineFunction &MF) const
+{
+}
+void
+AMDILFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+}
+bool
+AMDILFrameLowering::hasFP(const MachineFunction &MF) const
+{
+  return false;
+}
diff --git a/src/gallium/drivers/radeon/AMDILFrameLowering.h b/src/gallium/drivers/radeon/AMDILFrameLowering.h
new file mode 100644 (file)
index 0000000..b1d919e
--- /dev/null
@@ -0,0 +1,46 @@
+//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILFRAME_LOWERING_H_
+#define _AMDILFRAME_LOWERING_H_
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+/// Information about the stack frame layout on the AMDIL targets. It holds
+/// the direction of the stack growth, the known stack alignment on entry to
+/// each function, and the offset to the locals area.
+/// See TargetFrameInfo for more comments.
+
+namespace llvm {
+  class AMDILFrameLowering : public TargetFrameLowering {
+    public:
+      AMDILFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned
+          TransAl = 1);
+      virtual ~AMDILFrameLowering();
+      virtual int getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const;
+      virtual const SpillSlot *
+        getCalleeSavedSpillSlots(unsigned &NumEntries) const;
+      virtual void emitPrologue(MachineFunction &MF) const;
+      virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+      virtual bool hasFP(const MachineFunction &MF) const;
+  }; // class AMDILFrameLowering
+} // namespace llvm
+#endif // _AMDILFRAME_LOWERING_H_
diff --git a/src/gallium/drivers/radeon/AMDILGlobalManager.cpp b/src/gallium/drivers/radeon/AMDILGlobalManager.cpp
new file mode 100644 (file)
index 0000000..eafd36e
--- /dev/null
@@ -0,0 +1,1353 @@
+//===-- AMDILGlobalManager.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILGlobalManager.h"
+#include "AMDILDevices.h"
+#include "AMDILKernelManager.h"
+#include "AMDILSubtarget.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstdio>
+
+using namespace llvm;
+
+AMDILGlobalManager::AMDILGlobalManager(bool debugMode) {
+  mOffset = 0;
+  mReservedBuffs = 0;
+  symTab = NULL;
+  mCurrentCPOffset = 0;
+  mDebugMode = debugMode;
+}
+
+AMDILGlobalManager::~AMDILGlobalManager() {
+}
+
+void AMDILGlobalManager::print(llvm::raw_ostream &O) {
+  if (!mDebugMode) {
+    return;
+  }
+  O << ";AMDIL Global Manager State Dump:\n";
+  O << ";\tSubtarget: " << mSTM << "\tSymbol Table: " << symTab
+    << "\n";
+  O << ";\tConstant Offset: " << mOffset << "\tCP Offset: "
+    << mCurrentCPOffset << "\tReserved Buffers: " << mReservedBuffs
+    << "\n";
+  if (!mImageNameMap.empty()) {
+    llvm::DenseMap<uint32_t, llvm::StringRef>::iterator imb, ime;
+    O << ";\tGlobal Image Mapping: \n";
+    for (imb = mImageNameMap.begin(), ime = mImageNameMap.end(); imb != ime;
+         ++imb) {
+      O << ";\t\tImage ID: " << imb->first << "\tName: "
+        << imb->second << "\n";
+    }
+  }
+  std::set<llvm::StringRef>::iterator sb, se;
+  if (!mByteStore.empty()) {
+    O << ";Byte Store Kernels: \n";
+    for (sb = mByteStore.begin(), se = mByteStore.end(); sb != se; ++sb) {
+      O << ";\t\t" << *sb << "\n";
+    }
+  }
+  if (!mIgnoreStr.empty()) {
+    O << ";\tIgnored Data Strings: \n";
+    for (sb = mIgnoreStr.begin(), se = mIgnoreStr.end(); sb != se; ++sb) {
+      O << ";\t\t" << *sb << "\n";
+    }
+  }
+}
+
+void AMDILGlobalManager::dump() {
+  print(errs());
+}
+
+static const constPtr *getConstPtr(const kernel &krnl, const std::string &arg) {
+  llvm::SmallVector<constPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end;
+  for (begin = krnl.constPtr.begin(), end = krnl.constPtr.end();
+       begin != end; ++begin) {
+    if (begin->name == arg) {
+      return &(*begin);
+    }
+  }
+  return NULL;
+}
+#if 0
+static bool structContainsSub32bitType(const StructType *ST) {
+  StructType::element_iterator eib, eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    Type *ptr = *eib;
+    uint32_t size = (uint32_t)GET_SCALAR_SIZE(ptr);
+    if (!size) {
+      if (const StructType *ST = dyn_cast<StructType>(ptr)) {
+        if (structContainsSub32bitType(ST)) {
+          return true;
+        }
+      }
+    } else if (size < 32) {
+      return true;
+    }
+  }
+  return false;
+}
+#endif
+
+void AMDILGlobalManager::processModule(const Module &M,
+                                       const AMDILTargetMachine *mTM)
+{
+  Module::const_global_iterator GI;
+  Module::const_global_iterator GE;
+  symTab = "NoSymTab";
+  mSTM = mTM->getSubtargetImpl();
+  for (GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {
+    const GlobalValue *GV = GI;
+    if (mDebugMode) {
+      GV->dump();
+      errs() << "\n";
+    }
+    llvm::StringRef GVName = GV->getName();
+    const char *name = GVName.data();
+    if (!strncmp(name, "sgv", 3)) {
+      mKernelArgs[GVName] = parseSGV(GV);
+    } else if (!strncmp(name, "fgv", 3)) {
+      // we can ignore this since we don't care about the filename
+      // string
+    } else if (!strncmp(name, "lvgv", 4)) {
+      mLocalArgs[GVName] = parseLVGV(GV);
+    } else if (!strncmp(name, "llvm.image.annotations", 22)) {
+      if (strstr(name, "__OpenCL")
+          && strstr(name, "_kernel")) {
+        // We only want to parse the image information if the
+        // image belongs to a kernel; we might have to parse the
+        // information separately if a function is found that is
+        // not inlined.
+        parseImageAnnotate(GV);
+      }
+    } else if (!strncmp(name, "llvm.global.annotations", 23)) {
+      parseGlobalAnnotate(GV);
+    } else if (!strncmp(name, "llvm.constpointer.annotations", 29)) {
+      if (strstr(name, "__OpenCL")
+          && strstr(name, "_kernel")) {
+        // We only want to parse constant pointer information
+        // if it belongs to a kernel.
+        parseConstantPtrAnnotate(GV);
+      }
+    } else if (!strncmp(name, "llvm.readonlypointer.annotations", 32)) {
+      // These are skipped as we handle them later in AMDILPointerManager.cpp
+    } else if (GV->getType()->getAddressSpace() == 3) { // must match the local address space number in cl_kernel.h
+      parseAutoArray(GV, false);
+    } else if (strstr(name, "clregion")) {
+      parseAutoArray(GV, true);
+    } else if (!GV->use_empty()
+               && mIgnoreStr.find(GVName) == mIgnoreStr.end()) {
+      parseConstantPtr(GV);
+    }
+  }
+  allocateGlobalCB();
+
+  safeForEach(M.begin(), M.end(),
+      std::bind1st(
+        std::mem_fun(&AMDILGlobalManager::checkConstPtrsUseHW),
+        this));
+}
+
+void AMDILGlobalManager::allocateGlobalCB(void) {
+  uint32_t maxCBSize = mSTM->device()->getMaxCBSize();
+  uint32_t offset = 0;
+  uint32_t curCB = 0;
+  uint32_t swoffset = 0;
+  for (StringMap<constPtr>::iterator cpb = mConstMems.begin(),
+       cpe = mConstMems.end(); cpb != cpe; ++cpb) {
+    bool constHW = mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+    cpb->second.usesHardware = false;
+    if (constHW) {
+      // If we have a limit on the max CB Size, then we need to make sure that
+      // the constant sizes fall within the limits.
+      if (cpb->second.size <= maxCBSize) {
+        if (offset + cpb->second.size > maxCBSize) {
+          offset = 0;
+          curCB++;
+        }
+        if (curCB < mSTM->device()->getMaxNumCBs()) {
+          cpb->second.cbNum = curCB + CB_BASE_OFFSET;
+          cpb->second.offset = offset;
+          offset += (cpb->second.size + 15) & (~15);
+          cpb->second.usesHardware = true;
+          continue;
+        }
+      }
+    }
+    cpb->second.cbNum = 0;
+    cpb->second.offset = swoffset;
+    swoffset += (cpb->second.size + 15) & (~15);
+  }
+  if (!mConstMems.empty()) {
+    mReservedBuffs = curCB + 1;
+  }
+}
+
+bool AMDILGlobalManager::checkConstPtrsUseHW(llvm::Module::const_iterator *FCI)
+{
+  Function::const_arg_iterator AI, AE;
+  const Function *func = *FCI;
+  std::string name = func->getName();
+  if (!strstr(name.c_str(), "__OpenCL")
+      || !strstr(name.c_str(), "_kernel")) {
+    return false;
+  }
+  kernel &krnl =  mKernels[name];
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    for (AI = func->arg_begin(), AE = func->arg_end();
+         AI != AE; ++AI) {
+      const Argument *Arg = &(*AI);
+      const PointerType *P = dyn_cast<PointerType>(Arg->getType());
+      if (!P) {
+        continue;
+      }
+      if (P->getAddressSpace() != AMDILAS::CONSTANT_ADDRESS) {
+        continue;
+      }
+      const constPtr *ptr = getConstPtr(krnl, Arg->getName());
+      if (ptr) {
+        continue;
+      }
+      constPtr constAttr;
+      constAttr.name = Arg->getName();
+      constAttr.size = this->mSTM->device()->getMaxCBSize();
+      constAttr.base = Arg;
+      constAttr.isArgument = true;
+      constAttr.isArray = false;
+      constAttr.offset = 0;
+      constAttr.usesHardware =
+        mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+      if (constAttr.usesHardware) {
+        constAttr.cbNum = krnl.constPtr.size() + 2;
+      } else {
+        constAttr.cbNum = 0;
+      }
+      krnl.constPtr.push_back(constAttr);
+    }
+  }
+  // Now let's make sure that only the N largest buffers
+  // get allocated in hardware if we have too many buffers.
+  uint32_t numPtrs = krnl.constPtr.size();
+  if (numPtrs > (this->mSTM->device()->getMaxNumCBs() - mReservedBuffs)) {
+    // TODO: Change this routine so it sorts
+    // constPtr entries instead of pulling the sizes out,
+    // then grabs the N largest and disables the rest.
+    llvm::SmallVector<uint32_t, 16> sizes;
+    for (uint32_t x = 0; x < numPtrs; ++x) {
+      sizes.push_back(krnl.constPtr[x].size);
+    }
+    std::sort(sizes.begin(), sizes.end());
+    uint32_t numToDisable = numPtrs - (mSTM->device()->getMaxNumCBs() -
+                                       mReservedBuffs);
+    uint32_t safeSize = sizes[numToDisable-1];
+    for (uint32_t x = 0; x < numPtrs && numToDisable; ++x) {
+      if (krnl.constPtr[x].size <= safeSize) {
+        krnl.constPtr[x].usesHardware = false;
+        --numToDisable;
+      }
+    }
+  }
+  // Renumber all of the valid CBs so that
+  // they increase linearly.
+  uint32_t CBid = 2 + mReservedBuffs;
+  for (uint32_t x = 0; x < numPtrs; ++x) {
+    if (krnl.constPtr[x].usesHardware) {
+      krnl.constPtr[x].cbNum = CBid++;
+    }
+  }
+  for (StringMap<constPtr>::iterator cpb = mConstMems.begin(),
+       cpe = mConstMems.end(); cpb != cpe; ++cpb) {
+    if (cpb->second.usesHardware) {
+      krnl.constPtr.push_back(cpb->second);
+    }
+  }
+  for (uint32_t x = 0; x < krnl.constPtr.size(); ++x) {
+    constPtr &c = krnl.constPtr[x];
+    uint32_t cbNum = c.cbNum - CB_BASE_OFFSET;
+    if (cbNum < HW_MAX_NUM_CB && c.cbNum >= CB_BASE_OFFSET) {
+      if ((c.size + c.offset) > krnl.constSizes[cbNum]) {
+        krnl.constSizes[cbNum] =
+          ((c.size + c.offset) + 15) & ~15;
+      }
+    } else {
+      krnl.constPtr[x].usesHardware = false;
+    }
+  }
+  return false;
+}
+
+int32_t AMDILGlobalManager::getArrayOffset(const llvm::StringRef &a) const {
+  StringMap<arraymem>::const_iterator iter = mArrayMems.find(a);
+  if (iter != mArrayMems.end()) {
+    return iter->second.offset;
+  } else {
+    return -1;
+  }
+}
+
+int32_t AMDILGlobalManager::getConstOffset(const llvm::StringRef &a) const {
+  StringMap<constPtr>::const_iterator iter = mConstMems.find(a);
+  if (iter != mConstMems.end()) {
+    return iter->second.offset;
+  } else {
+    return -1;
+  }
+}
+
+bool AMDILGlobalManager::getConstHWBit(const llvm::StringRef &name) const {
+  StringMap<constPtr>::const_iterator iter = mConstMems.find(name);
+  if (iter != mConstMems.end()) {
+    return iter->second.usesHardware;
+  } else {
+    return false;
+  }
+}
+
+// As of right now we only care about the required group size,
+// so we can skip the variable encoding.
+kernelArg AMDILGlobalManager::parseSGV(const GlobalValue *G) {
+  kernelArg nArg;
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  memset(&nArg, 0, sizeof(nArg));
+  for (int x = 0; x < 3; ++x) {
+    nArg.reqGroupSize[x] = mSTM->getDefaultSize(x);
+    nArg.reqRegionSize[x] = mSTM->getDefaultSize(x);
+  }
+  if (!GV || !GV->hasInitializer()) {
+    return nArg;
+  }
+  const Constant *CV = GV->getInitializer();
+  const ConstantDataArray *CA = dyn_cast_or_null<ConstantDataArray>(CV);
+
+  if (!CA || !CA->isString()) {
+    return nArg;
+  }
+  std::string init = CA->getAsString();
+  size_t pos = init.find("RWG");
+  if (pos != llvm::StringRef::npos) {
+    pos += 3;
+    std::string LWS = init.substr(pos, init.length() - pos);
+    const char *lws = LWS.c_str();
+    sscanf(lws, "%u,%u,%u", &(nArg.reqGroupSize[0]),
+           &(nArg.reqGroupSize[1]),
+           &(nArg.reqGroupSize[2]));
+    nArg.mHasRWG = true;
+  }
+  pos = init.find("RWR");
+  if (pos != llvm::StringRef::npos) {
+    pos += 3;
+    std::string LWS = init.substr(pos, init.length() - pos);
+    const char *lws = LWS.c_str();
+    sscanf(lws, "%u,%u,%u", &(nArg.reqRegionSize[0]),
+           &(nArg.reqRegionSize[1]),
+           &(nArg.reqRegionSize[2]));
+    nArg.mHasRWR = true;
+  }
+  return nArg;
+}
+
+localArg AMDILGlobalManager::parseLVGV(const GlobalValue *G) {
+  localArg nArg;
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  nArg.name = "";
+  if (!GV || !GV->hasInitializer()) {
+    return nArg;
+  }
+  const ConstantArray *CA =
+    dyn_cast_or_null<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return nArg;
+  }
+  for (size_t x = 0, y = CA->getNumOperands(); x < y; ++x) {
+    const Value *local = CA->getOperand(x);
+    const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(local);
+    if (!CE || !CE->getNumOperands()) {
+      continue;
+    }
+    nArg.name = (*(CE->op_begin()))->getName();
+    if (mArrayMems.find(nArg.name) != mArrayMems.end()) {
+      nArg.local.push_back(&(mArrayMems[nArg.name]));
+    }
+  }
+  return nArg;
+}
+
+void AMDILGlobalManager::parseConstantPtrAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(G);
+  const ConstantArray *CA =
+    dyn_cast_or_null<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return;
+  }
+  uint32_t numOps = CA->getNumOperands();
+  for (uint32_t x = 0; x < numOps; ++x) {
+    const Value *V = CA->getOperand(x);
+    const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V);
+    if (!CS) {
+      continue;
+    }
+    assert(CS->getNumOperands() == 2 && "There can only be 2"
+           " fields, a name and size");
+    const ConstantExpr *nameField = dyn_cast<ConstantExpr>(CS->getOperand(0));
+    const ConstantInt *sizeField = dyn_cast<ConstantInt>(CS->getOperand(1));
+    assert(nameField && "There must be a constant name field");
+    assert(sizeField && "There must be a constant size field");
+    const GlobalVariable *nameGV =
+      dyn_cast<GlobalVariable>(nameField->getOperand(0));
+    const ConstantDataArray *nameArray =
+      dyn_cast<ConstantDataArray>(nameGV->getInitializer());
+    // Let's add this string to the set of strings we should skip processing.
+    mIgnoreStr.insert(nameGV->getName());
+    if (mConstMems.find(nameGV->getName())
+        != mConstMems.end()) {
+      // If we already processed this string as a constant, let's remove it from
+      // the list of known constants.  This way we don't process unneeded data
+      // and don't generate code/metadata for strings that are never used.
+      mConstMems.erase(mConstMems.find(nameGV->getName()));
+    } else {
+      mIgnoreStr.insert(CS->getOperand(0)->getName());
+    }
+    constPtr constAttr;
+    constAttr.name = nameArray->getAsString();
+    constAttr.size = (sizeField->getZExtValue() + 15) & ~15;
+    constAttr.base = CS;
+    constAttr.isArgument = true;
+    constAttr.isArray = false;
+    constAttr.cbNum = 0;
+    constAttr.offset = 0;
+    constAttr.usesHardware = (constAttr.size <= mSTM->device()->getMaxCBSize());
+    // Now that we have all our constant information,
+    // lets update the kernel
+    llvm::StringRef kernelName = G->getName().data() + 30;
+    kernel k;
+    if (mKernels.find(kernelName) != mKernels.end()) {
+      k = mKernels[kernelName];
+    } else {
+      k.curSize = 0;
+      k.curRSize = 0;
+      k.curHWSize = 0;
+      k.curHWRSize = 0;
+      k.constSize = 0;
+      k.lvgv = NULL;
+      k.sgv = NULL;
+      memset(k.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+    }
+    constAttr.cbNum = k.constPtr.size() + 2;
+    k.constPtr.push_back(constAttr);
+    mKernels[kernelName] = k;
+  }
+}
+
+void AMDILGlobalManager::parseImageAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  const ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return;
+  }
+  if (isa<GlobalValue>(CA)) {
+    return;
+  }
+  uint32_t e = CA->getNumOperands();
+  if (!e) {
+    return;
+  }
+  kernel k;
+  llvm::StringRef name = G->getName().data() + 23;
+  if (mKernels.find(name) != mKernels.end()) {
+    k = mKernels[name];
+  } else {
+    k.curSize = 0;
+    k.curRSize = 0;
+    k.curHWSize = 0;
+    k.curHWRSize = 0;
+    k.constSize = 0;
+    k.lvgv = NULL;
+    k.sgv = NULL;
+    memset(k.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+  }
+  for (uint32_t i = 0; i != e; ++i) {
+    const Value *V = CA->getOperand(i);
+    const Constant *C = dyn_cast<Constant>(V);
+    const ConstantStruct *CS = dyn_cast<ConstantStruct>(C);
+    if (CS && CS->getNumOperands() == 2) {
+      if (mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()) !=
+          mConstMems.end()) {
+        // If we already processed this string as a constant, let's remove it
+        // from the list of known constants.  This way we don't process unneeded
+        // data and don't generate code/metadata for strings that are never
+        // used.
+        mConstMems.erase(
+            mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()));
+      } else {
+        mIgnoreStr.insert(CS->getOperand(0)->getOperand(0)->getName());
+      }
+      const ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(1));
+      uint32_t val = (uint32_t)CI->getZExtValue();
+      if (val == 1) {
+        k.readOnly.insert(i);
+      } else if (val == 2) {
+        k.writeOnly.insert(i);
+      } else {
+        assert(!"Unknown image type value!");
+      }
+    }
+  }
+  mKernels[name] = k;
+}
+
+void AMDILGlobalManager::parseAutoArray(const GlobalValue *GV, bool isRegion) {
+  const GlobalVariable *G = dyn_cast<GlobalVariable>(GV);
+  Type *Ty = (G) ? G->getType() : NULL;
+  arraymem tmp;
+  tmp.isHW = true;
+  tmp.offset = 0;
+  tmp.vecSize = getTypeSize(Ty, true);
+  tmp.isRegion = isRegion;
+  mArrayMems[GV->getName()] = tmp;
+}
+
+void AMDILGlobalManager::parseConstantPtr(const GlobalValue *GV) {
+  const GlobalVariable *G = dyn_cast<GlobalVariable>(GV);
+  Type *Ty = (G) ? G->getType() : NULL;
+  constPtr constAttr;
+  constAttr.name = G->getName();
+  constAttr.size = getTypeSize(Ty, true);
+  constAttr.base = GV;
+  constAttr.isArgument = false;
+  constAttr.isArray = true;
+  constAttr.offset = 0;
+  constAttr.cbNum = 0;
+  constAttr.usesHardware = false;
+  mConstMems[GV->getName()] = constAttr;
+}
+
+void AMDILGlobalManager::parseGlobalAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  if (!GV->hasInitializer()) {
+    return;
+  }
+  const Constant *CT = GV->getInitializer();
+  if (!CT || isa<GlobalValue>(CT)) {
+    return;
+  }
+  const ConstantArray *CA = dyn_cast<ConstantArray>(CT);
+  if (!CA) {
+    return;
+  }
+
+  unsigned int nKernels = CA->getNumOperands();
+  for (unsigned int i = 0, e = nKernels; i != e; ++i) {
+    parseKernelInformation(CA->getOperand(i));
+  }
+}
+
+void AMDILGlobalManager::parseKernelInformation(const Value *V) {
+  if (isa<GlobalValue>(V)) {
+    return;
+  }
+  const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V);
+  if (!CS) {
+    return;
+  }
+  uint32_t N = CS->getNumOperands();
+  if (N != 5) {
+    return;
+  }
+  kernel tmp;
+
+  tmp.curSize = 0;
+  tmp.curRSize = 0;
+  tmp.curHWSize = 0;
+  tmp.curHWRSize = 0;
+  // The first operand is always a pointer to the kernel.
+  const Constant *CV = dyn_cast<Constant>(CS->getOperand(0));
+  llvm::StringRef kernelName = "";
+  if (CV->getNumOperands()) {
+    kernelName = (*(CV->op_begin()))->getName();
+  }
+
+  // If we have images, then we have already created the kernel and we just need
+  // to get the kernel information.
+  if (mKernels.find(kernelName) != mKernels.end()) {
+    tmp = mKernels[kernelName];
+  } else {
+    tmp.curSize = 0;
+    tmp.curRSize = 0;
+    tmp.curHWSize = 0;
+    tmp.curHWRSize = 0;
+    tmp.constSize = 0;
+    tmp.lvgv = NULL;
+    tmp.sgv = NULL;
+    memset(tmp.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+  }
+
+
+  // The second operand is SGV, there can only be one so we don't need to worry
+  // about parsing out multiple data points.
+  CV = dyn_cast<Constant>(CS->getOperand(1));
+
+  llvm::StringRef sgvName;
+  if (CV->getNumOperands()) {
+    sgvName = (*(CV->op_begin()))->getName();
+  }
+
+  if (mKernelArgs.find(sgvName) != mKernelArgs.end()) {
+    tmp.sgv = &mKernelArgs[sgvName];
+  }
+  // The third operand is FGV, which is skipped.
+  // The fourth operand is LVGV.
+  // There can be multiple local arrays, so we
+  // need to handle each one separately.
+  CV = dyn_cast<Constant>(CS->getOperand(3));
+  llvm::StringRef lvgvName = "";
+  if (CV->getNumOperands()) {
+    lvgvName = (*(CV->op_begin()))->getName();
+  }
+  if (mLocalArgs.find(lvgvName) != mLocalArgs.end()) {
+    localArg *ptr = &mLocalArgs[lvgvName];
+    tmp.lvgv = ptr;
+    llvm::SmallVector<arraymem *, DEFAULT_VEC_SLOTS>::iterator ib, ie;
+    for (ib = ptr->local.begin(), ie = ptr->local.end(); ib != ie; ++ib) {
+      if ((*ib)->isRegion) {
+        if ((*ib)->isHW) {
+          (*ib)->offset = tmp.curHWRSize;
+          tmp.curHWRSize += ((*ib)->vecSize + 15) & ~15;
+        } else {
+          (*ib)->offset = tmp.curRSize;
+          tmp.curRSize += ((*ib)->vecSize + 15) & ~15;
+        }
+      } else {
+        if ((*ib)->isHW) {
+          (*ib)->offset = tmp.curHWSize;
+          tmp.curHWSize += ((*ib)->vecSize + 15) & ~15;
+        } else {
+          (*ib)->offset = tmp.curSize;
+          tmp.curSize += ((*ib)->vecSize + 15) & ~15;
+        }
+      }
+    }
+  }
+
+  // The fifth operand is NULL
+  mKernels[kernelName] = tmp;
+}
+
+const kernel &AMDILGlobalManager::getKernel(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  assert(isKernel(name) && "Must be a kernel to call getKernel");
+  return iter->second;
+}
+
+bool AMDILGlobalManager::isKernel(const llvm::StringRef &name) const {
+  return (mKernels.find(name) != mKernels.end());
+}
+
+bool AMDILGlobalManager::isWriteOnlyImage(const llvm::StringRef &name,
+                                          uint32_t iID) const {
+  const StringMap<kernel>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return false;
+  }
+  return kiter->second.writeOnly.count(iID);
+}
+
+uint32_t
+AMDILGlobalManager::getNumWriteImages(const llvm::StringRef &name) const {
+  char *env = NULL;
+  env = getenv("GPU_DISABLE_RAW_UAV");
+  if (env && env[0] == '1') {
+    return 8;
+  }
+  const StringMap<kernel>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return 0;
+  } else {
+    return kiter->second.writeOnly.size();
+  }
+}
+
+bool AMDILGlobalManager::isReadOnlyImage(const llvm::StringRef &name,
+                                         uint32_t iID) const {
+  const StringMap<kernel>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return false;
+  }
+  return kiter->second.readOnly.count(iID);
+}
+
+bool AMDILGlobalManager::hasRWG(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    kernelArg *ptr = iter->second.sgv;
+    if (ptr) {
+      return ptr->mHasRWG;
+    }
+  }
+  return false;
+}
+
+bool AMDILGlobalManager::hasRWR(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    kernelArg *ptr = iter->second.sgv;
+    if (ptr) {
+      return ptr->mHasRWR;
+    }
+  }
+  return false;
+}
+
+uint32_t
+AMDILGlobalManager::getMaxGroupSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    kernelArg *sgv = iter->second.sgv;
+    if (sgv) {
+      return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2];
+    }
+  }
+  return mSTM->getDefaultSize(0) *
+         mSTM->getDefaultSize(1) *
+         mSTM->getDefaultSize(2);
+}
+
+uint32_t
+AMDILGlobalManager::getMaxRegionSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    kernelArg *sgv = iter->second.sgv;
+    if (sgv) {
+      return sgv->reqRegionSize[0] *
+             sgv->reqRegionSize[1] *
+             sgv->reqRegionSize[2];
+    }
+  }
+  return mSTM->getDefaultSize(0) *
+         mSTM->getDefaultSize(1) *
+         mSTM->getDefaultSize(2);
+}
+
+uint32_t AMDILGlobalManager::getRegionSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second.curRSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILGlobalManager::getLocalSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second.curSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILGlobalManager::getConstSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second.constSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILGlobalManager::getHWRegionSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second.curHWRSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILGlobalManager::getHWLocalSize(const llvm::StringRef &name) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second.curHWSize;
+  } else {
+    return 0;
+  }
+}
+
+int32_t AMDILGlobalManager::getArgID(const Argument *arg) {
+  DenseMap<const Argument *, int32_t>::iterator argiter = mArgIDMap.find(arg);
+  if (argiter != mArgIDMap.end()) {
+    return argiter->second;
+  } else {
+    return -1;
+  }
+}
+
+
+uint32_t
+AMDILGlobalManager::getLocal(const llvm::StringRef &name, uint32_t dim) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end() && iter->second.sgv) {
+    kernelArg *sgv = iter->second.sgv;
+    switch (dim) {
+    default: break;
+    case 0:
+    case 1:
+    case 2:
+      return sgv->reqGroupSize[dim];
+    case 3:
+      return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2];
+    };
+  }
+  switch (dim) {
+  default:
+    return 1;
+  case 3:
+    return mSTM->getDefaultSize(0) *
+           mSTM->getDefaultSize(1) *
+           mSTM->getDefaultSize(2);
+  case 2:
+  case 1:
+  case 0:
+    return mSTM->getDefaultSize(dim);
+  };
+  return 1;
+}
+
+uint32_t
+AMDILGlobalManager::getRegion(const llvm::StringRef &name, uint32_t dim) const {
+  StringMap<kernel>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end() && iter->second.sgv) {
+    kernelArg *sgv = iter->second.sgv;
+    switch (dim) {
+    default: break;
+    case 0:
+    case 1:
+    case 2:
+      return sgv->reqRegionSize[dim];
+    case 3:
+      return sgv->reqRegionSize[0] *
+             sgv->reqRegionSize[1] *
+             sgv->reqRegionSize[2];
+    };
+  }
+  switch (dim) {
+  default:
+    return 1;
+  case 3:
+    return mSTM->getDefaultSize(0) *
+           mSTM->getDefaultSize(1) *
+           mSTM->getDefaultSize(2);
+  case 2:
+  case 1:
+  case 0:
+    return mSTM->getDefaultSize(dim);
+  };
+  return 1;
+}
+
+StringMap<constPtr>::iterator AMDILGlobalManager::consts_begin() {
+  return mConstMems.begin();
+}
+
+
+StringMap<constPtr>::iterator AMDILGlobalManager::consts_end() {
+  return mConstMems.end();
+}
+
+bool AMDILGlobalManager::byteStoreExists(StringRef S) const {
+  return mByteStore.find(S) != mByteStore.end();
+}
+
+bool AMDILGlobalManager::usesHWConstant(const kernel &krnl,
+                                        const llvm::StringRef &arg) {
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->usesHardware;
+  } else {
+    return false;
+  }
+}
+
+uint32_t AMDILGlobalManager::getConstPtrSize(const kernel &krnl,
+                                             const llvm::StringRef &arg)
+{
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->size;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILGlobalManager::getConstPtrOff(const kernel &krnl,
+                                            const llvm::StringRef &arg)
+{
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->offset;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILGlobalManager::getConstPtrCB(const kernel &krnl,
+                                           const llvm::StringRef &arg)
+{
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->cbNum;
+  } else {
+    return 0;
+  }
+}
+
+void AMDILGlobalManager::calculateCPOffsets(const MachineFunction *MF,
+                                            kernel &krnl)
+{
+  const MachineConstantPool *MCP = MF->getConstantPool();
+  if (!MCP) {
+    return;
+  }
+  const std::vector<MachineConstantPoolEntry> consts = MCP->getConstants();
+  size_t numConsts = consts.size();
+  for (size_t x = 0; x < numConsts; ++x) {
+    krnl.CPOffsets.push_back(
+        std::make_pair(mCurrentCPOffset,
+                       consts[x].Val.ConstVal));
+    size_t curSize = getTypeSize(consts[x].Val.ConstVal->getType(), true);
+    // Align the size to the vector boundary
+    curSize = (curSize + 15) & (~15);
+    mCurrentCPOffset += curSize;
+  }
+}
+
+bool AMDILGlobalManager::isConstPtrArray(const kernel &krnl,
+                                         const llvm::StringRef &arg) {
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->isArray;
+  } else {
+    return false;
+  }
+}
+
+bool AMDILGlobalManager::isConstPtrArgument(const kernel &krnl,
+                                            const llvm::StringRef &arg)
+{
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->isArgument;
+  } else {
+    return false;
+  }
+}
+
+const Value *AMDILGlobalManager::getConstPtrValue(const kernel &krnl,
+                                                  const llvm::StringRef &arg) {
+  const constPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->base;
+  } else {
+    return NULL;
+  }
+}
+
+static void
+dumpZeroElements(const  StructType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(const IntegerType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(const   ArrayType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(const  VectorType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(const        Type * const T, llvm::raw_ostream &O, bool asBytes);
+
+void dumpZeroElements(const Type * const T, llvm::raw_ostream &O, bool asBytes) {
+  if (!T) {
+    return;
+  }
+  switch(T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::DoubleTyID:
+    if (asBytes) {
+      O << ":0:0:0:0:0:0:0:0";
+    } else {
+      O << ":0";
+    }
+    break;
+  case Type::FloatTyID:
+  case Type::PointerTyID:
+  case Type::FunctionTyID:
+    if (asBytes) {
+      O << ":0:0:0:0";
+    } else {
+      O << ":0";
+    }
+    break;
+  case Type::IntegerTyID:
+    dumpZeroElements(dyn_cast<IntegerType>(T), O, asBytes);
+    break;
+  case Type::StructTyID:
+    {
+      const StructType *ST = cast<StructType>(T);
+      if (!ST->isOpaque()) {
+        dumpZeroElements(dyn_cast<StructType>(T), O, asBytes);
+      } else { // A pre-LLVM 3.0 opaque type
+        if (asBytes) {
+          O << ":0:0:0:0";
+        } else {
+          O << ":0";
+        }
+      }
+    }
+    break;
+  case Type::ArrayTyID:
+    dumpZeroElements(dyn_cast<ArrayType>(T), O, asBytes);
+    break;
+  case Type::VectorTyID:
+    dumpZeroElements(dyn_cast<VectorType>(T), O, asBytes);
+    break;
+  };
+}
+
+void
+dumpZeroElements(const StructType * const ST, llvm::raw_ostream &O, bool asBytes) {
+  if (!ST) {
+    return;
+  }
+  Type *curType;
+  StructType::element_iterator eib = ST->element_begin();
+  StructType::element_iterator eie = ST->element_end();
+  for (;eib != eie; ++eib) {
+    curType = *eib;
+    dumpZeroElements(curType, O, asBytes);
+  }
+}
+
+void
+dumpZeroElements(const IntegerType * const IT, llvm::raw_ostream &O, bool asBytes) {
+  if (asBytes) {
+    unsigned byteWidth = (IT->getBitWidth() >> 3);
+    for (unsigned x = 0; x < byteWidth; ++x) {
+      O << ":0";
+    }
+  }
+}
+
+void
+dumpZeroElements(const ArrayType * const AT, llvm::raw_ostream &O, bool asBytes) {
+  size_t size = AT->getNumElements();
+  for (size_t x = 0; x < size; ++x) {
+    dumpZeroElements(AT->getElementType(), O, asBytes);
+  }
+}
+
+void
+dumpZeroElements(const VectorType * const VT, llvm::raw_ostream &O, bool asBytes) {
+  size_t size = VT->getNumElements();
+  for (size_t x = 0; x < size; ++x) {
+    dumpZeroElements(VT->getElementType(), O, asBytes);
+  }
+}
+
+void AMDILGlobalManager::printConstantValue(const Constant *CAval,
+                                            llvm::raw_ostream &O, bool asBytes) {
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CAval)) {
+    bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble;
+    if (isDouble) {
+      double val = CFP->getValueAPF().convertToDouble();
+      union dtol_union {
+        double d;
+        uint64_t l;
+        char c[8];
+      } conv;
+      conv.d = val;
+      if (!asBytes) {
+        O << ":";
+        O.write_hex(conv.l);
+      } else {
+        for (int i = 0; i < 8; ++i) {
+          O << ":";
+          O.write_hex((unsigned)conv.c[i] & 0xFF);
+        }
+      }
+    } else {
+      float val = CFP->getValueAPF().convertToFloat();
+      union ftoi_union {
+        float f;
+        uint32_t u;
+        char c[4];
+      } conv;
+      conv.f = val;
+      if (!asBytes) {
+        O << ":";
+        O.write_hex(conv.u);
+      } else {
+        for (int i = 0; i < 4; ++i) {
+          O << ":";
+          O.write_hex((unsigned)conv.c[i] & 0xFF);
+        }
+      }
+    }
+  } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CAval)) {
+    uint64_t zVal = CI->getValue().getZExtValue();
+    if (!asBytes) {
+      O << ":";
+      O.write_hex(zVal);
+    } else {
+      switch (CI->getBitWidth()) {
+      default:
+        {
+          union ltob_union {
+            uint64_t l;
+            char c[8];
+          } conv;
+          conv.l = zVal;
+          for (int i = 0; i < 8; ++i) {
+            O << ":";
+            O.write_hex((unsigned)conv.c[i] & 0xFF);
+          }
+        }
+        break;
+      case 8:
+        O << ":";
+        O.write_hex(zVal & 0xFF);
+        break;
+      case 16:
+        {
+          union stob_union {
+            uint16_t s;
+            char c[2];
+          } conv;
+          conv.s = (uint16_t)zVal;
+          O << ":";
+          O.write_hex((unsigned)conv.c[0] & 0xFF);
+          O << ":";
+          O.write_hex((unsigned)conv.c[1] & 0xFF);
+        }
+        break;
+      case 32:
+        {
+          union itob_union {
+            uint32_t i;
+            char c[4];
+          } conv;
+          conv.i = (uint32_t)zVal;
+          for (int i = 0; i < 4; ++i) {
+            O << ":";
+            O.write_hex((unsigned)conv.c[i] & 0xFF);
+          }
+        }
+        break;
+      }
+    }
+  } else if (const ConstantVector *CV = dyn_cast<ConstantVector>(CAval)) {
+    int y = CV->getNumOperands()-1;
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CV->getOperand(x), O, asBytes);
+    }
+    printConstantValue(CV->getOperand(x), O, asBytes);
+  } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CAval)) {
+    int y = CS->getNumOperands();
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CS->getOperand(x), O, asBytes);
+    }
+  } else if (const ConstantAggregateZero *CAZ
+      = dyn_cast<ConstantAggregateZero>(CAval)) {
+    int y = CAZ->getNumOperands();
+    if (y > 0) {
+      int x = 0;
+      for (; x < y; ++x) {
+        printConstantValue((llvm::Constant *)CAZ->getOperand(x),
+            O, asBytes);
+      }
+    } else {
+      if (asBytes) {
+        dumpZeroElements(CAval->getType(), O, asBytes);
+      } else {
+        int y = getNumElements(CAval->getType())-1;
+        for (int x = 0; x < y; ++x) {
+          O << ":0";
+        }
+        O << ":0";
+      }
+    }
+  } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CAval)) {
+    int y = CA->getNumOperands();
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CA->getOperand(x), O, asBytes);
+    }
+  } else if (dyn_cast<ConstantPointerNull>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else if (dyn_cast<ConstantExpr>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else if (dyn_cast<UndefValue>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else {
+    assert(0 && "Hit condition which was not expected");
+  }
+}
+
+static bool isStruct(Type * const T)
+{
+  if (!T) {
+    return false;
+  }
+  switch (T->getTypeID()) {
+  default:
+    return false;
+  case Type::PointerTyID:
+    return isStruct(T->getContainedType(0));
+  case Type::StructTyID:
+    return true;
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    return isStruct(dyn_cast<SequentialType>(T)->getElementType());
+  };
+
+}
+
+void AMDILGlobalManager::dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km,
+                                      uint32_t id) {
+  uint32_t size = 0;
+  for (StringMap<constPtr>::iterator cmb = consts_begin(),
+      cme = consts_end(); cmb != cme; ++cmb) {
+    if (id == cmb->second.cbNum) {
+      size += (cmb->second.size + 15) & (~15);
+    }
+  }
+  if (id == 0) {
+    O << ";#DATASTART:" << (size + mCurrentCPOffset) << "\n";
+    if (mCurrentCPOffset) {
+      for (StringMap<kernel>::iterator kcpb = mKernels.begin(),
+          kcpe = mKernels.end(); kcpb != kcpe; ++kcpb) {
+        const kernel& k = kcpb->second;
+        size_t numConsts = k.CPOffsets.size();
+        for (size_t x = 0; x < numConsts; ++x) {
+          size_t offset = k.CPOffsets[x].first;
+          const Constant *C = k.CPOffsets[x].second;
+          Type *Ty = C->getType();
+          size_t size = (isStruct(Ty) ? getTypeSize(Ty, true)
+                                      : getNumElements(Ty));
+          O << ";#" << km->getTypeName(Ty, symTab) << ":";
+          O << offset << ":" << size ;
+          printConstantValue(C, O, isStruct(Ty));
+          O << "\n";
+        }
+      }
+    }
+  } else {
+    O << ";#DATASTART:" << id << ":" << size << "\n";
+  }
+
+  for (StringMap<constPtr>::iterator cmb = consts_begin(), cme = consts_end();
+       cmb != cme; ++cmb) {
+    if (cmb->second.cbNum != id) {
+      continue;
+    }
+    const GlobalVariable *G = dyn_cast<GlobalVariable>(cmb->second.base);
+    Type *Ty = (G) ? G->getType() : NULL;
+    size_t offset = cmb->second.offset;
+    const Constant *C = (G) ? G->getInitializer() : NULL;
+    size_t size = (isStruct(Ty)
+        ? getTypeSize(Ty, true)
+        : getNumElements(Ty));
+    O << ";#" << km->getTypeName(Ty, symTab) << ":";
+    if (!id) {
+      O << (offset + mCurrentCPOffset) << ":" << size;
+    } else {
+      O << offset << ":" << size;
+    }
+    if (C) {
+      printConstantValue(C, O, isStruct(Ty));
+    } else {
+      assert(0 && "Cannot have a constant pointer"
+          " without an initializer!");
+    }
+    O <<"\n";
+  }
+  if (id == 0) {
+    O << ";#DATAEND\n";
+  } else {
+    O << ";#DATAEND:" << id << "\n";
+  }
+}
+
+void
+AMDILGlobalManager::dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km) {
+  if (mConstMems.empty() && !mCurrentCPOffset) {
+    return;
+  } else {
+    llvm::DenseSet<uint32_t> const_set;
+    for (StringMap<constPtr>::iterator cmb = consts_begin(), cme = consts_end();
+         cmb != cme; ++cmb) {
+      const_set.insert(cmb->second.cbNum);
+    }
+    if (mCurrentCPOffset) {
+      const_set.insert(0);
+    }
+    for (llvm::DenseSet<uint32_t>::iterator setb = const_set.begin(),
+           sete = const_set.end(); setb != sete; ++setb) {
+      dumpDataToCB(O, km, *setb);
+    }
+  }
+}
+
+/// Create a function ID if it is not known or return the known
+/// function ID.
+uint32_t AMDILGlobalManager::getOrCreateFunctionID(const GlobalValue* func) {
+  if (func->getName().size()) {
+    return getOrCreateFunctionID(func->getName());
+  }
+  uint32_t id;
+  if (mFuncPtrNames.find(func) == mFuncPtrNames.end()) {
+    id = mFuncPtrNames.size() + RESERVED_FUNCS + mFuncNames.size();
+    mFuncPtrNames[func] = id;
+  } else {
+    id = mFuncPtrNames[func];
+  }
+  return id;
+}
+uint32_t AMDILGlobalManager::getOrCreateFunctionID(const std::string &func) {
+  uint32_t id;
+  if (mFuncNames.find(func) == mFuncNames.end()) {
+    id = mFuncNames.size() + RESERVED_FUNCS + mFuncPtrNames.size();
+    mFuncNames[func] = id;
+  } else {
+    id = mFuncNames[func];
+  }
+  return id;
+}
diff --git a/src/gallium/drivers/radeon/AMDILGlobalManager.h b/src/gallium/drivers/radeon/AMDILGlobalManager.h
new file mode 100644 (file)
index 0000000..1b0361e
--- /dev/null
@@ -0,0 +1,256 @@
+//===-- AMDILGlobalManager.h - Global variable parsing and storage --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Class that handles parsing and storing global variables that are relevant to
+// the compilation of the module.
+//
+//==-----------------------------------------------------------------------===//
+
+#ifndef _AMDILGLOBALMANAGER_H_
+#define _AMDILGLOBALMANAGER_H_
+
+#include "AMDIL.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Module.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <set>
+#include <string>
+
+#define CB_BASE_OFFSET 2
+
+namespace llvm {
+
+class PointerType;
+class AMDILKernelManager;
+class AMDILSubtarget;
+class TypeSymbolTable;
+class Argument;
+class GlobalValue;
+class MachineFunction;
+
+/// structure that holds information for a single local/region address array
+typedef struct _arrayMemRec {
+  uint32_t vecSize; // size of each vector
+  uint32_t offset;  // offset into the memory section
+  bool isHW;        // flag to specify if HW is used or SW is used
+  bool isRegion;    // flag to specify if GDS is used or not
+} arraymem;
+/// Structure that holds information for all local/region address
+/// arrays in the kernel
+typedef struct _localArgRec {
+  llvm::SmallVector<arraymem *, DEFAULT_VEC_SLOTS> local;
+  std::string name; // Kernel Name
+} localArg;
+
+/// structure that holds information about a constant address
+/// space pointer that is a kernel argument
+typedef struct _constPtrRec {
+  const Value *base;
+  uint32_t size;
+  uint32_t offset;
+  uint32_t cbNum; // value of 0 means that it does not use hw CB
+  bool isArray;
+  bool isArgument;
+  bool usesHardware;
+  std::string name;
+} constPtr;
+
+/// Structure that holds information for each kernel argument
+typedef struct _kernelArgRec {
+  uint32_t reqGroupSize[3];
+  uint32_t reqRegionSize[3];
+  llvm::SmallVector<uint32_t, DEFAULT_VEC_SLOTS> argInfo;
+  bool mHasRWG;
+  bool mHasRWR;
+} kernelArg;
+
+/// Structure that holds information for each kernel
+typedef struct _kernelRec {
+  mutable uint32_t curSize;
+  mutable uint32_t curRSize;
+  mutable uint32_t curHWSize;
+  mutable uint32_t curHWRSize;
+  uint32_t constSize;
+  kernelArg *sgv;
+  localArg *lvgv;
+  llvm::SmallVector<struct _constPtrRec, DEFAULT_VEC_SLOTS> constPtr;
+  uint32_t constSizes[HW_MAX_NUM_CB];
+  llvm::SmallSet<uint32_t, OPENCL_MAX_READ_IMAGES> readOnly;
+  llvm::SmallSet<uint32_t, OPENCL_MAX_WRITE_IMAGES> writeOnly;
+  llvm::SmallVector<std::pair<uint32_t, const Constant *>,
+    DEFAULT_VEC_SLOTS> CPOffsets;
+} kernel;
+
+class AMDILGlobalManager {
+public:
+  AMDILGlobalManager(bool debugMode = false);
+  ~AMDILGlobalManager();
+
+  /// Process the given module and parse out the global variable metadata passed
+  /// down from the frontend compiler.
+  void processModule(const Module &MF, const AMDILTargetMachine* mTM);
+
+  /// Returns true if the given name is the name of a kernel function, and
+  /// false if it is a normal function.
+  bool isKernel(const llvm::StringRef &name) const;
+
+  /// Returns true if the image ID corresponds to a read only image.
+  bool isReadOnlyImage(const llvm::StringRef &name, uint32_t iID) const;
+
+  /// Returns true if the image ID corresponds to a write only image.
+  bool isWriteOnlyImage(const llvm::StringRef &name, uint32_t iID) const;
+
+  /// Returns the number of write only images for the kernel.
+  uint32_t getNumWriteImages(const llvm::StringRef &name) const;
+
+  /// Gets the group size of the kernel for the given dimension.
+  uint32_t getLocal(const llvm::StringRef &name, uint32_t dim) const;
+
+  /// Gets the region size of the kernel for the given dimension.
+  uint32_t getRegion(const llvm::StringRef &name, uint32_t dim) const;
+
+  /// Get the region memory size in 1d for the given function/kernel.
+  uint32_t getRegionSize(const llvm::StringRef &name) const;
+
+  /// Get the local memory size in 1d for the given function/kernel.
+  uint32_t getLocalSize(const llvm::StringRef &name) const;
+
+  /// Get the max group size in 1D for the given function/kernel.
+  uint32_t getMaxGroupSize(const llvm::StringRef &name) const;
+
+  /// Get the max region size in 1D for the given function/kernel.
+  uint32_t getMaxRegionSize(const llvm::StringRef &name) const;
+
+  /// Get the constant memory size in 1d for the given function/kernel.
+  uint32_t getConstSize(const llvm::StringRef &name) const;
+
+  /// Get the HW local size in 1d for the given function/kernel. We need to
+  /// separate SW local and HW local for the case where some local memory is
+  /// emulated in global and some is using the hardware features. The main
+  /// problem is that in OpenCL 1.0/1.1 cl_khr_byte_addressable_store allows
+  /// these actions to happen on all memory spaces, but the hardware can only
+  /// write byte address stores to UAV and LDS, not GDS or Stack.
+  uint32_t getHWLocalSize(const llvm::StringRef &name) const;
+  uint32_t getHWRegionSize(const llvm::StringRef &name) const;
+
+  /// Get the offset of the array for the kernel.
+  int32_t getArrayOffset(const llvm::StringRef &name) const;
+
+  /// Get the offset of the const memory for the kernel.
+  int32_t getConstOffset(const llvm::StringRef &name) const;
+
+  /// Get the boolean value if this particular constant uses HW or not.
+  bool getConstHWBit(const llvm::StringRef &name) const;
+
+  /// Get a reference to the kernel metadata information for the given function
+  /// name.
+  const kernel &getKernel(const llvm::StringRef &name) const;
+
+  /// Returns whether a reqd_workgroup_size attribute has been used or not.
+  bool hasRWG(const llvm::StringRef &name) const;
+
+  /// Returns whether a reqd_workregion_size attribute has been used or not.
+  bool hasRWR(const llvm::StringRef &name) const;
+
+
+  /// Dump the data section to the output stream for the given kernel.
+  void dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km);
+
+  /// Iterate through the constants that are global to the compilation unit.
+  StringMap<constPtr>::iterator consts_begin();
+  StringMap<constPtr>::iterator consts_end();
+
+  /// Query if the kernel has a byte store.
+  bool byteStoreExists(llvm::StringRef S) const;
+
+  /// Query if the kernel and argument uses hardware constant memory.
+  bool usesHWConstant(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query if the constant pointer is an argument.
+  bool isConstPtrArgument(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query if the constant pointer is an array that is globally scoped.
+  bool isConstPtrArray(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query the size of the constant pointer.
+  uint32_t getConstPtrSize(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query the offset of the constant pointer.
+  uint32_t getConstPtrOff(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query the constant buffer number for a constant pointer.
+  uint32_t getConstPtrCB(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Query the Value* that the constant pointer originates from.
+  const Value *getConstPtrValue(const kernel &krnl, const llvm::StringRef &arg);
+
+  /// Get the ID of the argument.
+  int32_t getArgID(const Argument *arg);
+
+  /// Get the unique function ID for the specific function name and create a new
+  /// unique ID if it is not found.
+  uint32_t getOrCreateFunctionID(const GlobalValue* func);
+  uint32_t getOrCreateFunctionID(const std::string& func);
+
+  /// Calculate the offsets of the constant pool for the given kernel and
+  /// machine function.
+  void calculateCPOffsets(const MachineFunction *MF, kernel &krnl);
+
+  /// Print the global manager to the output stream.
+  void print(llvm::raw_ostream& O);
+
+  /// Dump the global manager to the output stream - debug use.
+  void dump();
+
+private:
+  /// Various functions that parse global value information and store them in
+  /// the global manager. This approach is used instead of dynamic parsing as it
+  /// might require more space, but should allow caching of data that gets
+  /// requested multiple times.
+  kernelArg parseSGV(const GlobalValue *GV);
+  localArg  parseLVGV(const GlobalValue *GV);
+  void parseGlobalAnnotate(const GlobalValue *G);
+  void parseImageAnnotate(const GlobalValue *G);
+  void parseConstantPtrAnnotate(const GlobalValue *G);
+  void printConstantValue(const Constant *CAval,
+                          llvm::raw_ostream& O,
+                          bool asByte);
+  void parseKernelInformation(const Value *V);
+  void parseAutoArray(const GlobalValue *G, bool isRegion);
+  void parseConstantPtr(const GlobalValue *G);
+  void allocateGlobalCB();
+  void dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km, uint32_t id);
+  bool checkConstPtrsUseHW(Module::const_iterator *F);
+
+  llvm::StringMap<arraymem> mArrayMems;
+  llvm::StringMap<localArg> mLocalArgs;
+  llvm::StringMap<kernelArg> mKernelArgs;
+  llvm::StringMap<kernel> mKernels;
+  llvm::StringMap<constPtr> mConstMems;
+  llvm::StringMap<uint32_t> mFuncNames;
+  llvm::DenseMap<const GlobalValue*, uint32_t> mFuncPtrNames;
+  llvm::DenseMap<uint32_t, llvm::StringRef> mImageNameMap;
+  std::set<llvm::StringRef> mByteStore;
+  std::set<llvm::StringRef> mIgnoreStr;
+  llvm::DenseMap<const Argument *, int32_t> mArgIDMap;
+  const char *symTab;
+  const AMDILSubtarget *mSTM;
+  size_t mOffset;
+  uint32_t mReservedBuffs;
+  uint32_t mCurrentCPOffset;
+  bool mDebugMode;
+};
+} // namespace llvm
+#endif // _AMDILGLOBALMANAGER_H_
diff --git a/src/gallium/drivers/radeon/AMDILIOExpansion.cpp b/src/gallium/drivers/radeon/AMDILIOExpansion.cpp
new file mode 100644 (file)
index 0000000..68d8eef
--- /dev/null
@@ -0,0 +1,1160 @@
+//===----------- AMDILIOExpansion.cpp - IO Expansion Pass -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// The AMDIL IO Expansion class expands pseudo IO instructions into a sequence
+// of instructions that produces the correct results. These instructions are
+// not expanded earlier in the pipeline because any pass that runs before this
+// one can assume it is able to generate load/store instructions. Only passes
+// that generate no load/store instructions may therefore run after this pass.
+//===----------------------------------------------------------------------===//
+#include "AMDILIOExpansion.h"
+#include "AMDIL.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+
+char AMDILIOExpansion::ID = 0;
+namespace llvm {
+  FunctionPass*
+    createAMDILIOExpansion(TargetMachine &TM AMDIL_OPT_LEVEL_DECL)
+    {
+      return TM.getSubtarget<AMDILSubtarget>()
+        .device()->getIOExpansion(TM AMDIL_OPT_LEVEL_VAR);
+    }
+}
+
+AMDILIOExpansion::AMDILIOExpansion(TargetMachine &tm
+     AMDIL_OPT_LEVEL_DECL) :
+  MachineFunctionPass(ID), TM(tm)
+{
+  mSTM = &tm.getSubtarget<AMDILSubtarget>();
+  mDebug = DEBUGME;
+  mTII = tm.getInstrInfo();
+  mKM = NULL;
+}
+
+AMDILIOExpansion::~AMDILIOExpansion()
+{
+}
+bool
+AMDILIOExpansion::runOnMachineFunction(MachineFunction &MF)
+{
+  mKM = const_cast<AMDILKernelManager*>(mSTM->getKernelManager());
+  mMFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end();
+      MFI != MFE; ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+    for (MachineBasicBlock::iterator MBI = MBB->begin(), MBE = MBB->end();
+        MBI != MBE; ++MBI) {
+      MachineInstr *MI = MBI;
+      if (isIOInstruction(MI)) {
+        mBB = MBB;
+        saveInst = false;
+        expandIOInstruction(MI);
+        if (!saveInst) {
+          // erase returns the instruction after
+          // and we want the instruction before
+          MBI = MBB->erase(MI);
+          --MBI;
+        }
+      }
+    }
+  }
+  return false;
+}
+const char *AMDILIOExpansion::getPassName() const
+{
+  return "AMDIL Generic IO Expansion Pass";
+}
+bool
+AMDILIOExpansion::isIOInstruction(MachineInstr *MI)
+{
+  if (!MI) {
+    return false;
+  }
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+      ExpandCaseToAllTypes(AMDIL::CPOOLLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATELOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATESTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::REGIONSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::REGIONLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::LOCALLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::GLOBALLOAD)
+      ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::GLOBALSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE)
+        return true;
+  };
+  return false;
+}
+void
+AMDILIOExpansion::expandIOInstruction(MachineInstr *MI)
+{
+  assert(isIOInstruction(MI) && "Must be an IO instruction to "
+      "be passed to this function!");
+  switch (MI->getOpcode()) {
+    default:
+      assert(0 && "Not an IO Instruction!");
+      ExpandCaseToAllTypes(AMDIL::GLOBALLOAD);
+      ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD);
+      expandGlobalLoad(MI);
+      break;
+      ExpandCaseToAllTypes(AMDIL::REGIONLOAD);
+      ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD);
+      expandRegionLoad(MI);
+      break;
+      ExpandCaseToAllTypes(AMDIL::LOCALLOAD);
+      ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD);
+      expandLocalLoad(MI);
+      break;
+      ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD);
+      ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD);
+      expandConstantLoad(MI);
+      break;
+      ExpandCaseToAllTypes(AMDIL::PRIVATELOAD);
+      ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD);
+      expandPrivateLoad(MI);
+      break;
+      ExpandCaseToAllTypes(AMDIL::CPOOLLOAD);
+      ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD);
+      ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD);
+      expandConstantPoolLoad(MI);
+      break;
+      ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::GLOBALSTORE);
+      expandGlobalStore(MI);
+      break;
+      ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE);
+      ExpandCaseToAllTypes(AMDIL::PRIVATESTORE);
+      expandPrivateStore(MI);
+      break;
+      ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE);
+      ExpandCaseToAllTypes(AMDIL::REGIONSTORE);
+      expandRegionStore(MI);
+      break;
+      ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE);
+      ExpandCaseToAllTypes(AMDIL::LOCALSTORE);
+      expandLocalStore(MI);
+      break;
+  }
+}
+bool
+AMDILIOExpansion::isAddrCalcInstr(MachineInstr *MI)
+{
+  switch(MI->getOpcode()) {
+    ExpandCaseToAllTypes(AMDIL::PRIVATELOAD)
+    ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD)
+    ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD)
+    ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD)
+      {
+        // This section of code is a workaround for the problem of
+        // globally scoped constant address variables. The problem
+        // is that although they are declared in the constant
+        // address space, all variables must be allocated in the
+        // private address space. So when there is a load from
+        // the global address, it automatically goes into the private
+        // address space. However, the data section is placed in the
+        // constant address space so we need to check to see if our
+        // load base address is a global variable or not. Only if it
+        // is not a global variable can we do the address calculation
+        // into the private memory ring.
+
+        MachineMemOperand& memOp = (**MI->memoperands_begin());
+        const Value *V = memOp.getValue();
+        if (V) {
+          const GlobalValue *GV = dyn_cast<GlobalVariable>(V);
+          return mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem)
+            && !(GV);
+        } else {
+          return false;
+        }
+      }
+    ExpandCaseToAllTypes(AMDIL::CPOOLLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD);
+    return MI->getOperand(1).isReg();
+    ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE);
+    ExpandCaseToAllTypes(AMDIL::PRIVATESTORE);
+    return mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem);
+    ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE);
+    ExpandCaseToAllTypes(AMDIL::LOCALSTORE);
+    ExpandCaseToAllTypes(AMDIL::LOCALLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD);
+    return mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem);
+  };
+  return false;
+
+}
+bool
+AMDILIOExpansion::isExtendLoad(MachineInstr *MI)
+{
+  return isSExtLoadInst(TM.getInstrInfo(), MI) ||
+         isZExtLoadInst(TM.getInstrInfo(), MI) ||
+         isAExtLoadInst(TM.getInstrInfo(), MI) ||
+         isSWSExtLoadInst(MI);
+}
+
+bool
+AMDILIOExpansion::isHardwareRegion(MachineInstr *MI)
+{
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+      break;
+      ExpandCaseToAllTypes(AMDIL::REGIONLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::REGIONSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE)
+        return mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  };
+  return false;
+}
+bool
+AMDILIOExpansion::isHardwareLocal(MachineInstr *MI)
+{
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+      break;
+      ExpandCaseToAllTypes(AMDIL::LOCALLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE)
+        return mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  };
+  return false;
+}
+bool
+AMDILIOExpansion::isPackedData(MachineInstr *MI)
+{
+  switch(MI->getOpcode()) {
+    default:
+      if (isTruncStoreInst(TM.getInstrInfo(), MI)) {
+        switch (MI->getDesc().OpInfo[0].RegClass) {
+          default:
+            break;
+          case AMDIL::GPRV2I64RegClassID:
+          case AMDIL::GPRV2I32RegClassID:
+            switch (getMemorySize(MI)) {
+              case 2:
+              case 4:
+                return true;
+              default:
+                break;
+            }
+            break;
+          case AMDIL::GPRV4I32RegClassID:
+            switch (getMemorySize(MI)) {
+              case 4:
+              case 8:
+                return true;
+              default:
+                break;
+            }
+            break;
+        }
+      }
+      break;
+      ExpandCaseToPackedTypes(AMDIL::CPOOLLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CPOOLSEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CPOOLZEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CPOOLAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::GLOBALLOAD);
+      ExpandCaseToPackedTypes(AMDIL::GLOBALSEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::GLOBALZEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::GLOBALAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::LOCALLOAD);
+      ExpandCaseToPackedTypes(AMDIL::LOCALSEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::LOCALZEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::LOCALAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::REGIONLOAD);
+      ExpandCaseToPackedTypes(AMDIL::REGIONSEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::REGIONZEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::REGIONAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::PRIVATELOAD);
+      ExpandCaseToPackedTypes(AMDIL::PRIVATESEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::PRIVATEZEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::PRIVATEAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CONSTANTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CONSTANTSEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CONSTANTAEXTLOAD);
+      ExpandCaseToPackedTypes(AMDIL::CONSTANTZEXTLOAD);
+      ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE)
+      ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE);
+      ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE);
+      ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE);
+      ExpandCaseToPackedTypes(AMDIL::GLOBALSTORE);
+      ExpandCaseToPackedTypes(AMDIL::PRIVATESTORE);
+      ExpandCaseToPackedTypes(AMDIL::LOCALSTORE);
+      ExpandCaseToPackedTypes(AMDIL::REGIONSTORE);
+      return true;
+  }
+  return false;
+}
+
+  bool
+AMDILIOExpansion::isStaticCPLoad(MachineInstr *MI)
+{
+  switch(MI->getOpcode()) {
+    ExpandCaseToAllTypes(AMDIL::CPOOLLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD);
+    {
+      uint32_t x = 0;
+      uint32_t num = MI->getNumOperands();
+      for (x = 0; x < num; ++x) {
+        if (MI->getOperand(x).isCPI()) {
+          return true;
+        }
+      }
+    }
+    break;
+    default:
+    break;
+  }
+  return false;
+}
+
+  bool
+AMDILIOExpansion::isNbitType(Type *mType, uint32_t nBits, bool isScalar)
+{
+  if (!mType) {
+    return false;
+  }
+  if (PointerType *PT = dyn_cast<PointerType>(mType)) {
+    return isNbitType(PT->getElementType(), nBits);
+  } else if (isa<StructType>(mType)) {
+    return getTypeSize(mType) == nBits;
+  } else if (VectorType *VT = dyn_cast<VectorType>(mType)) {
+    size_t size = VT->getScalarSizeInBits();
+    return (isScalar ?
+        VT->getNumElements() * size == nBits : size == nBits);
+  } else if (ArrayType *AT = dyn_cast<ArrayType>(mType)) {
+    size_t size = AT->getScalarSizeInBits();
+    return (isScalar ?
+        AT->getNumElements() * size == nBits : size == nBits);
+  } else if (mType->isSized()) {
+    return mType->getScalarSizeInBits() == nBits;
+  } else {
+    assert(0 && "Found a type that we don't know how to handle!");
+    return false;
+  }
+}
+
+  bool
+AMDILIOExpansion::isHardwareInst(MachineInstr *MI)
+{
+  AMDILAS::InstrResEnc curRes;
+  curRes.u16all = MI->getAsmPrinterFlags();
+  return curRes.bits.HardwareInst;
+}
+
+REG_PACKED_TYPE
+AMDILIOExpansion::getPackedID(MachineInstr *MI)
+{
+  switch (MI->getOpcode()) {
+    default:
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i64i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i64i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i64i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i32i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i32i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i32i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i16i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i16i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i16i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i16i8:
+    case AMDIL::GLOBALSTORE_v2i8:
+    case AMDIL::LOCALSTORE_v2i8:
+    case AMDIL::REGIONSTORE_v2i8:
+    case AMDIL::PRIVATESTORE_v2i8:
+      return PACK_V2I8;
+    case AMDIL::GLOBALTRUNCSTORE_v4i32i8:
+    case AMDIL::REGIONTRUNCSTORE_v4i32i8:
+    case AMDIL::LOCALTRUNCSTORE_v4i32i8:
+    case AMDIL::PRIVATETRUNCSTORE_v4i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_v4i16i8:
+    case AMDIL::REGIONTRUNCSTORE_v4i16i8:
+    case AMDIL::LOCALTRUNCSTORE_v4i16i8:
+    case AMDIL::PRIVATETRUNCSTORE_v4i16i8:
+    case AMDIL::GLOBALSTORE_v4i8:
+    case AMDIL::LOCALSTORE_v4i8:
+    case AMDIL::REGIONSTORE_v4i8:
+    case AMDIL::PRIVATESTORE_v4i8:
+      return PACK_V4I8;
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i16:
+    case AMDIL::REGIONTRUNCSTORE_v2i64i16:
+    case AMDIL::LOCALTRUNCSTORE_v2i64i16:
+    case AMDIL::PRIVATETRUNCSTORE_v2i64i16:
+    case AMDIL::GLOBALTRUNCSTORE_v2i32i16:
+    case AMDIL::REGIONTRUNCSTORE_v2i32i16:
+    case AMDIL::LOCALTRUNCSTORE_v2i32i16:
+    case AMDIL::PRIVATETRUNCSTORE_v2i32i16:
+    case AMDIL::GLOBALSTORE_v2i16:
+    case AMDIL::LOCALSTORE_v2i16:
+    case AMDIL::REGIONSTORE_v2i16:
+    case AMDIL::PRIVATESTORE_v2i16:
+      return PACK_V2I16;
+    case AMDIL::GLOBALTRUNCSTORE_v4i32i16:
+    case AMDIL::REGIONTRUNCSTORE_v4i32i16:
+    case AMDIL::LOCALTRUNCSTORE_v4i32i16:
+    case AMDIL::PRIVATETRUNCSTORE_v4i32i16:
+    case AMDIL::GLOBALSTORE_v4i16:
+    case AMDIL::LOCALSTORE_v4i16:
+    case AMDIL::REGIONSTORE_v4i16:
+    case AMDIL::PRIVATESTORE_v4i16:
+      return PACK_V4I16;
+    case AMDIL::GLOBALLOAD_v2i8:
+    case AMDIL::GLOBALSEXTLOAD_v2i8:
+    case AMDIL::GLOBALAEXTLOAD_v2i8:
+    case AMDIL::GLOBALZEXTLOAD_v2i8:
+    case AMDIL::LOCALLOAD_v2i8:
+    case AMDIL::LOCALSEXTLOAD_v2i8:
+    case AMDIL::LOCALAEXTLOAD_v2i8:
+    case AMDIL::LOCALZEXTLOAD_v2i8:
+    case AMDIL::REGIONLOAD_v2i8:
+    case AMDIL::REGIONSEXTLOAD_v2i8:
+    case AMDIL::REGIONAEXTLOAD_v2i8:
+    case AMDIL::REGIONZEXTLOAD_v2i8:
+    case AMDIL::PRIVATELOAD_v2i8:
+    case AMDIL::PRIVATESEXTLOAD_v2i8:
+    case AMDIL::PRIVATEAEXTLOAD_v2i8:
+    case AMDIL::PRIVATEZEXTLOAD_v2i8:
+    case AMDIL::CONSTANTLOAD_v2i8:
+    case AMDIL::CONSTANTSEXTLOAD_v2i8:
+    case AMDIL::CONSTANTAEXTLOAD_v2i8:
+    case AMDIL::CONSTANTZEXTLOAD_v2i8:
+      return UNPACK_V2I8;
+    case AMDIL::GLOBALLOAD_v4i8:
+    case AMDIL::GLOBALSEXTLOAD_v4i8:
+    case AMDIL::GLOBALAEXTLOAD_v4i8:
+    case AMDIL::GLOBALZEXTLOAD_v4i8:
+    case AMDIL::LOCALLOAD_v4i8:
+    case AMDIL::LOCALSEXTLOAD_v4i8:
+    case AMDIL::LOCALAEXTLOAD_v4i8:
+    case AMDIL::LOCALZEXTLOAD_v4i8:
+    case AMDIL::REGIONLOAD_v4i8:
+    case AMDIL::REGIONSEXTLOAD_v4i8:
+    case AMDIL::REGIONAEXTLOAD_v4i8:
+    case AMDIL::REGIONZEXTLOAD_v4i8:
+    case AMDIL::PRIVATELOAD_v4i8:
+    case AMDIL::PRIVATESEXTLOAD_v4i8:
+    case AMDIL::PRIVATEAEXTLOAD_v4i8:
+    case AMDIL::PRIVATEZEXTLOAD_v4i8:
+    case AMDIL::CONSTANTLOAD_v4i8:
+    case AMDIL::CONSTANTSEXTLOAD_v4i8:
+    case AMDIL::CONSTANTAEXTLOAD_v4i8:
+    case AMDIL::CONSTANTZEXTLOAD_v4i8:
+      return UNPACK_V4I8;
+    case AMDIL::GLOBALLOAD_v2i16:
+    case AMDIL::GLOBALSEXTLOAD_v2i16:
+    case AMDIL::GLOBALAEXTLOAD_v2i16:
+    case AMDIL::GLOBALZEXTLOAD_v2i16:
+    case AMDIL::LOCALLOAD_v2i16:
+    case AMDIL::LOCALSEXTLOAD_v2i16:
+    case AMDIL::LOCALAEXTLOAD_v2i16:
+    case AMDIL::LOCALZEXTLOAD_v2i16:
+    case AMDIL::REGIONLOAD_v2i16:
+    case AMDIL::REGIONSEXTLOAD_v2i16:
+    case AMDIL::REGIONAEXTLOAD_v2i16:
+    case AMDIL::REGIONZEXTLOAD_v2i16:
+    case AMDIL::PRIVATELOAD_v2i16:
+    case AMDIL::PRIVATESEXTLOAD_v2i16:
+    case AMDIL::PRIVATEAEXTLOAD_v2i16:
+    case AMDIL::PRIVATEZEXTLOAD_v2i16:
+    case AMDIL::CONSTANTLOAD_v2i16:
+    case AMDIL::CONSTANTSEXTLOAD_v2i16:
+    case AMDIL::CONSTANTAEXTLOAD_v2i16:
+    case AMDIL::CONSTANTZEXTLOAD_v2i16:
+      return UNPACK_V2I16;
+    case AMDIL::GLOBALLOAD_v4i16:
+    case AMDIL::GLOBALSEXTLOAD_v4i16:
+    case AMDIL::GLOBALAEXTLOAD_v4i16:
+    case AMDIL::GLOBALZEXTLOAD_v4i16:
+    case AMDIL::LOCALLOAD_v4i16:
+    case AMDIL::LOCALSEXTLOAD_v4i16:
+    case AMDIL::LOCALAEXTLOAD_v4i16:
+    case AMDIL::LOCALZEXTLOAD_v4i16:
+    case AMDIL::REGIONLOAD_v4i16:
+    case AMDIL::REGIONSEXTLOAD_v4i16:
+    case AMDIL::REGIONAEXTLOAD_v4i16:
+    case AMDIL::REGIONZEXTLOAD_v4i16:
+    case AMDIL::PRIVATELOAD_v4i16:
+    case AMDIL::PRIVATESEXTLOAD_v4i16:
+    case AMDIL::PRIVATEAEXTLOAD_v4i16:
+    case AMDIL::PRIVATEZEXTLOAD_v4i16:
+    case AMDIL::CONSTANTLOAD_v4i16:
+    case AMDIL::CONSTANTSEXTLOAD_v4i16:
+    case AMDIL::CONSTANTAEXTLOAD_v4i16:
+    case AMDIL::CONSTANTZEXTLOAD_v4i16:
+      return UNPACK_V4I16;
+  };
+  return NO_PACKING;
+}
+
+  uint32_t
+AMDILIOExpansion::getPointerID(MachineInstr *MI)
+{
+  AMDILAS::InstrResEnc curInst;
+  getAsmPrinterFlags(MI, curInst);
+  return curInst.bits.ResourceID;
+}
+
+  uint32_t
+AMDILIOExpansion::getShiftSize(MachineInstr *MI)
+{
+  switch(getPackedID(MI)) {
+    default:
+      return 0;
+    case PACK_V2I8:
+    case PACK_V4I8:
+    case UNPACK_V2I8:
+    case UNPACK_V4I8:
+      return 1;
+    case PACK_V2I16:
+    case PACK_V4I16:
+    case UNPACK_V2I16:
+    case UNPACK_V4I16:
+      return 2;
+  }
+  return 0;
+}
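For readers unfamiliar with the packed-register scheme, the value returned by getShiftSize above (1 for i8 lanes, 2 for i16 lanes) matches the packed element size in bytes: sub-dword lanes live at byte offsets inside a single 32-bit register. A minimal standalone sketch of the v4i8 pack/unpack arithmetic that the PACK_V4I8/UNPACK_V4I8 expansions boil down to (the helper names here are illustrative, not part of the backend):

```cpp
#include <cstdint>

// Sketch only: each i8 lane of a v4i8 value sits at bit offset
// lane * elemBytes * 8 inside one 32-bit dword, where elemBytes is
// what getShiftSize() reports for i8 lanes (1).
static uint32_t packLaneV4I8(uint32_t dword, unsigned lane, uint8_t value) {
  const unsigned elemBytes = 1;               // element size in bytes
  const unsigned shift = lane * elemBytes * 8;
  dword &= ~(0xFFu << shift);                 // clear the target lane
  dword |= (uint32_t)value << shift;          // insert the new byte
  return dword;
}

static uint8_t unpackLaneV4I8(uint32_t dword, unsigned lane) {
  return (uint8_t)(dword >> (lane * 8));      // shift the lane down, mask via cast
}
```

For example, packing 0x11 into lane 0 and 0x44 into lane 3 of a zeroed dword yields 0x44000011.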
+  uint32_t
+AMDILIOExpansion::getMemorySize(MachineInstr *MI)
+{
+  if (MI->memoperands_empty()) {
+    return 4;
+  }
+  return (uint32_t)((*MI->memoperands_begin())->getSize());
+}
+
+  void
+AMDILIOExpansion::expandLongExtend(MachineInstr *MI,
+    uint32_t numComps, uint32_t size, bool signedShift)
+{
+  DebugLoc DL = MI->getDebugLoc();
+  switch(size) {
+    default:
+      assert(0 && "Found a case we don't handle!");
+      break;
+    case 8:
+      if (numComps == 1) {
+        expandLongExtendSub32(MI, AMDIL::SHL_i8, AMDIL::SHRVEC_v2i32, 
+            AMDIL::USHRVEC_i8,
+            24, (24ULL | (31ULL << 32)), 24, AMDIL::LCREATE, signedShift);
+      } else if (numComps == 2) {
+        expandLongExtendSub32(MI, AMDIL::SHL_v2i8, AMDIL::SHRVEC_v4i32, 
+            AMDIL::USHRVEC_v2i8,
+            24, (24ULL | (31ULL << 32)), 24, AMDIL::LCREATE_v2i64, signedShift);
+      } else {
+        assert(0 && "Found a case we don't handle!");
+      }
+      break;
+    case 16:
+      if (numComps == 1) {
+        expandLongExtendSub32(MI, AMDIL::SHL_i16, AMDIL::SHRVEC_v2i32, 
+            AMDIL::USHRVEC_i16,
+            16, (16ULL | (31ULL << 32)), 16, AMDIL::LCREATE, signedShift);
+      } else if (numComps == 2) {
+        expandLongExtendSub32(MI, AMDIL::SHL_v2i16, AMDIL::SHRVEC_v4i32, 
+            AMDIL::USHRVEC_v2i16,
+            16, (16ULL | (31ULL << 32)), 16, AMDIL::LCREATE_v2i64, signedShift);
+      } else {
+        assert(0 && "Found a case we don't handle!");
+      }
+      break;
+    case 32:
+      if (numComps == 1) {
+        if (signedShift) {
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHRVEC_i32), AMDIL::R1012)
+            .addReg(AMDIL::R1011)
+            .addImm(mMFI->addi32Literal(31));
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+            .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
+        } else {
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addImm(mMFI->addi32Literal(0));
+        }
+      } else if (numComps == 2) {
+        if (signedShift) {
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHRVEC_v2i32), AMDIL::R1012)
+            .addReg(AMDIL::R1011)
+            .addImm(mMFI->addi32Literal(31));
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addReg(AMDIL::R1012);
+        } else {
+          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
+            .addReg(AMDIL::R1011)
+            .addImm(mMFI->addi32Literal(0));
+        }
+      } else {
+        assert(0 && "Found a case we don't handle!");
+      }
+  };
+}
+  void 
+AMDILIOExpansion::expandLongExtendSub32(MachineInstr *MI, 
+    unsigned SHLop, unsigned SHRop, unsigned USHRop, 
+    unsigned SHLimm, uint64_t SHRimm, unsigned USHRimm, 
+    unsigned LCRop, bool signedShift)
+{
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*mBB, MI, DL, mTII->get(SHLop), AMDIL::R1011)
+    .addReg(AMDIL::R1011)
+    .addImm(mMFI->addi32Literal(SHLimm));
+  if (signedShift) {
+    BuildMI(*mBB, MI, DL, mTII->get(LCRop), AMDIL::R1011)
+      .addReg(AMDIL::R1011).addReg(AMDIL::R1011);
+    BuildMI(*mBB, MI, DL, mTII->get(SHRop), AMDIL::R1011)
+      .addReg(AMDIL::R1011)
+      .addImm(mMFI->addi64Literal(SHRimm));
+  } else {
+    BuildMI(*mBB, MI, DL, mTII->get(USHRop), AMDIL::R1011)
+      .addReg(AMDIL::R1011)
+      .addImm(mMFI->addi32Literal(USHRimm));
+    BuildMI(*mBB, MI, DL, mTII->get(LCRop), AMDIL::R1011)
+      .addReg(AMDIL::R1011)
+      .addImm(mMFI->addi32Literal(0));
+  }
+}
+
+  void
+AMDILIOExpansion::expandIntegerExtend(MachineInstr *MI, unsigned SHLop, 
+    unsigned SHRop, unsigned offset)
+{
+  DebugLoc DL = MI->getDebugLoc();
+  offset = mMFI->addi32Literal(offset);
+  BuildMI(*mBB, MI, DL,
+      mTII->get(SHLop), AMDIL::R1011)
+    .addReg(AMDIL::R1011).addImm(offset);
+  BuildMI(*mBB, MI, DL,
+      mTII->get(SHRop), AMDIL::R1011)
+    .addReg(AMDIL::R1011).addImm(offset);
+}
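expandIntegerExtend emits the classic shift-up/shift-down idiom: the SHL moves the narrow payload to the top of the 32-bit register, and the right shift brings it back down, sign-extending when the shift is arithmetic (the SHRVEC opcodes) and zero-extending when it is logical (the USHRVEC opcodes). A hedged C++ sketch of what the emitted pair computes for an i8 payload in an i32 register (offset = 24); the function names are illustrative:

```cpp
#include <cstdint>

// SHL_i32 by 24, then arithmetic shift right by 24: sign-extend i8 -> i32.
static int32_t sext8(uint32_t reg) {
  return (int32_t)(reg << 24) >> 24;   // SHL then SHRVEC
}

// SHL_i32 by 24, then logical shift right by 24: zero-extend i8 -> i32.
static uint32_t zext8(uint32_t reg) {
  return (reg << 24) >> 24;            // SHL then USHRVEC
}
```

With this, a register holding 0xFF reads back as -1 through the signed path and as 255 through the unsigned path; for i16 payloads the pass uses offset 16 instead.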
+  void
+AMDILIOExpansion::expandExtendLoad(MachineInstr *MI)
+{
+  if (!isExtendLoad(MI)) {
+    return;
+  }
+  Type *mType = NULL;
+  if (!MI->memoperands_empty()) {
+    MachineMemOperand *memOp = (*MI->memoperands_begin());
+    const Value *moVal = (memOp) ? memOp->getValue() : NULL;
+    mType = (moVal) ? moVal->getType() : NULL;
+  }
+  unsigned opcode = 0;
+  DebugLoc DL = MI->getDebugLoc();
+  if (isZExtLoadInst(TM.getInstrInfo(), MI) || isAExtLoadInst(TM.getInstrInfo(), MI) || isSExtLoadInst(TM.getInstrInfo(), MI)) {
+    switch(MI->getDesc().OpInfo[0].RegClass) {
+      default:
+        assert(0 && "Found an extending load that we don't handle!");
+        break;
+      case AMDIL::GPRI16RegClassID:
+        if (!isHardwareLocal(MI)
+            || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i16 : AMDIL::USHRVEC_i16;
+          expandIntegerExtend(MI, AMDIL::SHL_i16, opcode, 24);
+        }
+        break;
+      case AMDIL::GPRV2I16RegClassID:
+        opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v2i16 : AMDIL::USHRVEC_v2i16;
+        expandIntegerExtend(MI, AMDIL::SHL_v2i16, opcode, 24);
+        break;
+      case AMDIL::GPRV4I8RegClassID:        
+        opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i8 : AMDIL::USHRVEC_v4i8;
+        expandIntegerExtend(MI, AMDIL::SHL_v4i8, opcode, 24);
+        break;
+      case AMDIL::GPRV4I16RegClassID:
+        opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i16 : AMDIL::USHRVEC_v4i16;
+        expandIntegerExtend(MI, AMDIL::SHL_v4i16, opcode, 24);
+        break;
+      case AMDIL::GPRI32RegClassID:
+        // The source may be an i8 or i16 sign-extended value
+        if (isNbitType(mType, 8) || getMemorySize(MI) == 1) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i32 : AMDIL::USHRVEC_i32;
+          expandIntegerExtend(MI, AMDIL::SHL_i32, opcode, 24);
+        } else if (isNbitType(mType, 16) || getMemorySize(MI) == 2) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i32 : AMDIL::USHRVEC_i32;
+          expandIntegerExtend(MI, AMDIL::SHL_i32, opcode, 16);
+        } else {
+          assert(0 && "Found an extending load that we don't handle!");
+        }
+        break;
+      case AMDIL::GPRV2I32RegClassID:
+        // The source may be a v2i8 or v2i16 sign-extended value
+        if (isNbitType(mType, 8, false) || getMemorySize(MI) == 2) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v2i32 : AMDIL::USHRVEC_v2i32;
+          expandIntegerExtend(MI, AMDIL::SHL_v2i32, opcode, 24);
+        } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 4) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v2i32 : AMDIL::USHRVEC_v2i32;
+          expandIntegerExtend(MI, AMDIL::SHL_v2i32, opcode, 16);
+        } else {
+          assert(0 && "Found an extending load that we don't handle!");
+        }
+        break;
+      case AMDIL::GPRV4I32RegClassID:
+        // The source may be a v4i8 or v4i16 sign-extended value
+        if (isNbitType(mType, 8, false) || getMemorySize(MI) == 4) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i32 : AMDIL::USHRVEC_v4i32;
+          expandIntegerExtend(MI, AMDIL::SHL_v4i32, opcode, 24);
+        } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 8) {
+          opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i32 : AMDIL::USHRVEC_v4i32;
+          expandIntegerExtend(MI, AMDIL::SHL_v4i32, opcode, 16);
+        } else {
+          assert(0 && "Found an extending load that we don't handle!");
+        }
+        break;
+      case AMDIL::GPRI64RegClassID:
+        // The source may be an i8, i16 or i32 sign-extended value
+        if (isNbitType(mType, 8) || getMemorySize(MI) == 1) {
+          expandLongExtend(MI, 1, 8, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else if (isNbitType(mType, 16) || getMemorySize(MI) == 2) {
+          expandLongExtend(MI, 1, 16, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else if (isNbitType(mType, 32) || getMemorySize(MI) == 4) {
+          expandLongExtend(MI, 1, 32, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else {
+          assert(0 && "Found an extending load that we don't handle!");
+        }
+        break;
+      case AMDIL::GPRV2I64RegClassID:
+        // The source may be a v2i8, v2i16 or v2i32 sign-extended value
+        if (isNbitType(mType, 8, false) || getMemorySize(MI) == 2) {
+          expandLongExtend(MI, 2, 8, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 4) {
+          expandLongExtend(MI, 2, 16, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else if (isNbitType(mType, 32, false) || getMemorySize(MI) == 8) {
+          expandLongExtend(MI, 2, 32, isSExtLoadInst(TM.getInstrInfo(), MI));
+        } else {
+          assert(0 && "Found an extending load that we don't handle!");
+        }
+        break;
+      case AMDIL::GPRF32RegClassID:
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::HTOF_f32), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        break;
+      case AMDIL::GPRV2F32RegClassID:
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::HTOF_v2f32), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        break;
+      case AMDIL::GPRV4F32RegClassID:
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::HTOF_v4f32), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        break;
+      case AMDIL::GPRF64RegClassID:
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::FTOD), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        break;
+      case AMDIL::GPRV2F64RegClassID:
+        BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VEXTRACT_v2f32),
+            AMDIL::R1012).addReg(AMDIL::R1011).addImm(2);
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::FTOD), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+        BuildMI(*mBB, MI, DL, 
+            mTII->get(AMDIL::FTOD), AMDIL::R1012)
+          .addReg(AMDIL::R1012);
+        BuildMI(*mBB, MI, DL,
+            mTII->get(AMDIL::VINSERT_v2f64), AMDIL::R1011)
+          .addReg(AMDIL::R1011).addReg(AMDIL::R1012)
+          .addImm(1 << 8).addImm(1 << 8);
+        break;
+    };
+  } else if (isSWSExtLoadInst(MI)) {
+    switch(MI->getDesc().OpInfo[0].RegClass) {
+      case AMDIL::GPRI8RegClassID:
+        if (!isHardwareLocal(MI)
+            || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) {
+          expandIntegerExtend(MI, AMDIL::SHL_i8, AMDIL::SHRVEC_i8, 24);
+        }
+        break;
+      case AMDIL::GPRV2I8RegClassID:
+        expandIntegerExtend(MI, AMDIL::SHL_v2i8, AMDIL::SHRVEC_v2i8, 24);
+        break;
+      case AMDIL::GPRV4I8RegClassID:
+        expandIntegerExtend(MI, AMDIL::SHL_v4i8, AMDIL::SHRVEC_v4i8, 24);
+        break;
+      case AMDIL::GPRI16RegClassID:
+        if (!isHardwareLocal(MI)
+            || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) {
+          expandIntegerExtend(MI, AMDIL::SHL_i16, AMDIL::SHRVEC_i16, 16);
+        }
+        break;
+      case AMDIL::GPRV2I16RegClassID:
+        expandIntegerExtend(MI, AMDIL::SHL_v2i16, AMDIL::SHRVEC_v2i16, 16);
+        break;
+      case AMDIL::GPRV4I16RegClassID:
+        expandIntegerExtend(MI, AMDIL::SHL_v4i16, AMDIL::SHRVEC_v4i16, 16);
+        break;
+
+    };
+  }
+}
+
+  void
+AMDILIOExpansion::expandTruncData(MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  if (!isTruncStoreInst(TM.getInstrInfo(), MI)) {
+    return;
+  }
+  DebugLoc DL = MI->getDebugLoc();
+  switch (MI->getOpcode()) {
+    default: 
+      MI->dump();
+      assert(!"Found a trunc store instruction we don't handle!");
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_i64i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i8:
+    case AMDIL::LOCALTRUNCSTORE_i64i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i64i8:
+    case AMDIL::REGIONTRUNCSTORE_i64i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i64i8:
+    case AMDIL::PRIVATETRUNCSTORE_i64i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i64i8:
+      BuildMI(*mBB, MI, DL,
+          mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+      // Intentional fall-through: the now-32-bit data is masked to i8 below.
+    case AMDIL::GLOBALTRUNCSTORE_i16i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i16i8:
+    case AMDIL::GLOBALTRUNCSTORE_v4i16i8:
+    case AMDIL::LOCALTRUNCSTORE_i16i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i16i8:
+    case AMDIL::LOCALTRUNCSTORE_v4i16i8:
+    case AMDIL::REGIONTRUNCSTORE_i16i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i16i8:
+    case AMDIL::REGIONTRUNCSTORE_v4i16i8:
+    case AMDIL::PRIVATETRUNCSTORE_i16i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i16i8:
+    case AMDIL::PRIVATETRUNCSTORE_v4i16i8:
+    case AMDIL::GLOBALTRUNCSTORE_i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_v4i32i8:
+    case AMDIL::LOCALTRUNCSTORE_i32i8:
+    case AMDIL::LOCALTRUNCSTORE_v2i32i8:
+    case AMDIL::LOCALTRUNCSTORE_v4i32i8:
+    case AMDIL::REGIONTRUNCSTORE_i32i8:
+    case AMDIL::REGIONTRUNCSTORE_v2i32i8:
+    case AMDIL::REGIONTRUNCSTORE_v4i32i8:
+    case AMDIL::PRIVATETRUNCSTORE_i32i8:
+    case AMDIL::PRIVATETRUNCSTORE_v2i32i8:
+    case AMDIL::PRIVATETRUNCSTORE_v4i32i8:
+      BuildMI(*mBB, MI, DL, 
+          mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0xFF));
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_i64i16:
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i16:
+    case AMDIL::LOCALTRUNCSTORE_i64i16:
+    case AMDIL::LOCALTRUNCSTORE_v2i64i16:
+    case AMDIL::REGIONTRUNCSTORE_i64i16:
+    case AMDIL::REGIONTRUNCSTORE_v2i64i16:
+    case AMDIL::PRIVATETRUNCSTORE_i64i16:
+    case AMDIL::PRIVATETRUNCSTORE_v2i64i16:
+      BuildMI(*mBB, MI, DL,
+          mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+      // Intentional fall-through: the now-32-bit data is masked to i16 below.
+    case AMDIL::GLOBALTRUNCSTORE_i32i16:
+    case AMDIL::GLOBALTRUNCSTORE_v2i32i16:
+    case AMDIL::GLOBALTRUNCSTORE_v4i32i16:
+    case AMDIL::LOCALTRUNCSTORE_i32i16:
+    case AMDIL::LOCALTRUNCSTORE_v2i32i16:
+    case AMDIL::LOCALTRUNCSTORE_v4i32i16:
+    case AMDIL::REGIONTRUNCSTORE_i32i16:
+    case AMDIL::REGIONTRUNCSTORE_v2i32i16:
+    case AMDIL::REGIONTRUNCSTORE_v4i32i16:
+    case AMDIL::PRIVATETRUNCSTORE_i32i16:
+    case AMDIL::PRIVATETRUNCSTORE_v2i32i16:
+    case AMDIL::PRIVATETRUNCSTORE_v4i32i16:
+      BuildMI(*mBB, MI, DL, 
+          mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011)
+        .addReg(AMDIL::R1011)
+        .addImm(mMFI->addi32Literal(0xFFFF));
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_i64i32:
+    case AMDIL::LOCALTRUNCSTORE_i64i32:
+    case AMDIL::REGIONTRUNCSTORE_i64i32:
+    case AMDIL::PRIVATETRUNCSTORE_i64i32:
+      BuildMI(*mBB, MI, DL,
+          mTII->get(AMDIL::LLO), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i32:
+    case AMDIL::LOCALTRUNCSTORE_v2i64i32:
+    case AMDIL::REGIONTRUNCSTORE_v2i64i32:
+    case AMDIL::PRIVATETRUNCSTORE_v2i64i32:
+      BuildMI(*mBB, MI, DL,
+          mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011)
+          .addReg(AMDIL::R1011);
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_f64f32:
+    case AMDIL::LOCALTRUNCSTORE_f64f32:
+    case AMDIL::REGIONTRUNCSTORE_f64f32:
+    case AMDIL::PRIVATETRUNCSTORE_f64f32:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF),
+          AMDIL::R1011).addReg(AMDIL::R1011);
+      break;
+    case AMDIL::GLOBALTRUNCSTORE_v2f64f32:
+    case AMDIL::LOCALTRUNCSTORE_v2f64f32:
+    case AMDIL::REGIONTRUNCSTORE_v2f64f32:
+    case AMDIL::PRIVATETRUNCSTORE_v2f64f32:
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VEXTRACT_v2f64),
+          AMDIL::R1012).addReg(AMDIL::R1011).addImm(2);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF),
+          AMDIL::R1011).addReg(AMDIL::R1011);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF),
+          AMDIL::R1012).addReg(AMDIL::R1012);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VINSERT_v2f32),
+          AMDIL::R1011).addReg(AMDIL::R1011).addReg(AMDIL::R1012)
+        .addImm(1 << 8).addImm(1 << 8);
+      break;
+  }
+}
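The trunc-store expansion above reduces to two scalar operations: i64 sources are first narrowed to their low dword (the LLO step), and the store data is then masked down to the target width (the BINARY_AND with 0xFF or 0xFFFF). A standalone sketch of the store data this computes, with an illustrative helper name:

```cpp
#include <cstdint>

// Sketch of the trunc-store data path: take the low 32 bits of the
// source (LLO), then mask to the store width when it is sub-dword
// (BINARY_AND with 0xFF or 0xFFFF in the expansion above).
static uint32_t truncStoreData(uint64_t value, unsigned storeBits) {
  uint32_t lo = (uint32_t)value;              // LLO: low dword of an i64
  if (storeBits >= 32)
    return lo;                                // i64 -> i32 needs no mask
  uint32_t mask = (1u << storeBits) - 1;      // 0xFF for i8, 0xFFFF for i16
  return lo & mask;
}
```

So storing 0x1122334455667788 as i8 keeps only 0x88, as i16 keeps 0x7788, and as i32 keeps 0x55667788.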
+  void
+AMDILIOExpansion::expandAddressCalc(MachineInstr *MI)
+{
+  if (!isAddrCalcInstr(MI)) {
+    return;
+  }
+  DebugLoc DL = MI->getDebugLoc();
+  switch(MI->getOpcode()) {
+    ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::PRIVATESTORE)
+      ExpandCaseToAllTypes(AMDIL::PRIVATELOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD)
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), 
+          AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::T1);
+    break;
+    ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE)
+      ExpandCaseToAllTypes(AMDIL::LOCALLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::LOCALSTORE)
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), 
+          AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::T2);
+    break;
+    ExpandCaseToAllTypes(AMDIL::CPOOLLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD)
+      ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD)
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), 
+          AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::SDP);
+    break;
+    default:
+    return;
+  }
+}
+  void
+AMDILIOExpansion::expandLoadStartCode(MachineInstr *MI)
+{
+  DebugLoc DL = MI->getDebugLoc();
+  if (MI->getOperand(2).isReg()) {
+    BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32),
+        AMDIL::R1010).addReg(MI->getOperand(1).getReg())
+      .addReg(MI->getOperand(2).getReg());
+  } else {
+    BuildMI(*mBB, MI, DL, mTII->get(AMDIL::MOVE_i32),
+        AMDIL::R1010).addReg(MI->getOperand(1).getReg());
+  }
+  MI->getOperand(1).setReg(AMDIL::R1010);
+  expandAddressCalc(MI);
+}
+  void
+AMDILIOExpansion::emitStaticCPLoad(MachineInstr* MI, int swizzle, 
+    int id, bool ExtFPLoad)
+{
+  DebugLoc DL = MI->getDebugLoc();
+  switch(swizzle) {
+    default:
+      BuildMI(*mBB, MI, DL, mTII->get(ExtFPLoad 
+            ? AMDIL::DTOF : AMDIL::MOVE_i32), 
+          MI->getOperand(0).getReg())
+        .addImm(id);
+      break;
+    case 1:
+    case 2:
+    case 3:
+      BuildMI(*mBB, MI, DL, mTII->get(ExtFPLoad 
+            ? AMDIL::DTOF : AMDIL::MOVE_i32), AMDIL::R1001)
+        .addImm(id);
+      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VINSERT_v4i32),
+          MI->getOperand(0).getReg())
+        .addReg(MI->getOperand(0).getReg())
+        .addReg(AMDIL::R1001)
+        .addImm(swizzle + 1);
+      break;
+  };
+}
+  void
+AMDILIOExpansion::emitCPInst(MachineInstr* MI,
+    const Constant* C, AMDILKernelManager* KM, int swizzle, bool ExtFPLoad)
+{
+  if (const ConstantFP* CFP = dyn_cast<ConstantFP>(C)) {
+    if (CFP->getType()->isFloatTy()) {
+      uint32_t val = (uint32_t)(CFP->getValueAPF().bitcastToAPInt()
+          .getZExtValue());
+      uint32_t id = mMFI->addi32Literal(val);
+      if (!id) {
+        const APFloat &APF = CFP->getValueAPF();
+        union dtol_union {
+          double d;
+          uint64_t ul;
+        } conv;
+        if (&APF.getSemantics()
+            == (const llvm::fltSemantics*)&APFloat::IEEEsingle) {
+          float fval = APF.convertToFloat();
+          conv.d = (double)fval;
+        } else {
+          conv.d = APF.convertToDouble();
+        }
+        id = mMFI->addi64Literal(conv.ul);
+      }
+      emitStaticCPLoad(MI, swizzle, id, ExtFPLoad);
+    } else {
+      const APFloat &APF = CFP->getValueAPF();
+      union ftol_union {
+        double d;
+        uint64_t ul;
+      } conv;
+      if (&APF.getSemantics()
+          == (const llvm::fltSemantics*)&APFloat::IEEEsingle) {
+        float fval = APF.convertToFloat();
+        conv.d = (double)fval;
+      } else {
+        conv.d = APF.convertToDouble();
+      }
+      uint32_t id = mMFI->getLongLits(conv.ul);
+      if (!id) {
+        id = mMFI->getIntLits((uint32_t)conv.ul);
+      }
+      emitStaticCPLoad(MI, swizzle, id, ExtFPLoad);
+    }
+  } else if (const ConstantInt* CI = dyn_cast<ConstantInt>(C)) {
+    // CI is guaranteed non-NULL by the dyn_cast in the condition.
+    int64_t val = CI->getSExtValue();
+    if (CI->getBitWidth() == 64) {
+      emitStaticCPLoad(MI, swizzle, mMFI->addi64Literal(val), ExtFPLoad);
+    } else {
+      emitStaticCPLoad(MI, swizzle, mMFI->addi32Literal(val), ExtFPLoad);
+    }
+  } else if (const ConstantArray* CA = dyn_cast<ConstantArray>(C)) {
+    uint32_t size = CA->getNumOperands();
+    assert(size < 5 && "Cannot handle a constant array where size > 4");
+    if (size > 4) {
+      size = 4;
+    }
+    for (uint32_t x = 0; x < size; ++x) {
+      emitCPInst(MI, CA->getOperand(x), KM, x, ExtFPLoad);
+    }
+  } else if (const ConstantAggregateZero* CAZ
+      = dyn_cast<ConstantAggregateZero>(C)) {
+    if (CAZ->isNullValue()) {
+      emitStaticCPLoad(MI, swizzle, mMFI->addi32Literal(0), ExtFPLoad);
+    }
+  } else if (const ConstantStruct* CS = dyn_cast<ConstantStruct>(C)) {
+    uint32_t size = CS->getNumOperands();
+    assert(size < 5 && "Cannot handle a constant struct where size > 4");
+    if (size > 4) {
+      size = 4;
+    }
+    for (uint32_t x = 0; x < size; ++x) {
+      emitCPInst(MI, CS->getOperand(x), KM, x, ExtFPLoad);
+    }
+  } else if (const ConstantVector* CV = dyn_cast<ConstantVector>(C)) {
+    // TODO: Make this handle vectors natively up to the correct
+    // size
+    uint32_t size = CV->getNumOperands();
+    assert(size < 5 && "Cannot handle a constant vector where size > 4");
+    if (size > 4) {
+      size = 4;
+    }
+    for (uint32_t x = 0; x < size; ++x) {
+      emitCPInst(MI, CV->getOperand(x), KM, x, ExtFPLoad);
+    }
+  } else {
+    // TODO: Do we really need to handle ConstantPointerNull?
+    // What about BlockAddress, ConstantExpr and Undef?
+    // How would these even be generated by a valid CL program?
+    assert(0 && "Found a constant type that I don't know how to handle");
+  }
+}
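emitCPInst reads the 64-bit pattern of a floating-point constant through a union so the value can be registered as an integer literal (floats are first widened to double). The same type pun can be written with memcpy, which is the strictly portable form; a small sketch assuming IEEE-754 doubles, as the pass does:

```cpp
#include <cstdint>
#include <cstring>

// Portable equivalent of the union-based type pun in emitCPInst:
// copy the object representation of a double into a 64-bit integer.
static uint64_t doubleBits(double d) {
  uint64_t bits;
  std::memcpy(&bits, &d, sizeof(bits));  // no aliasing issues, unlike casts
  return bits;
}
```

For instance, 1.0 yields the well-known IEEE-754 pattern 0x3FF0000000000000, and a float constant such as 2.0f widened to double yields 0x4000000000000000.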
+
diff --git a/src/gallium/drivers/radeon/AMDILIOExpansion.h b/src/gallium/drivers/radeon/AMDILIOExpansion.h
new file mode 100644 (file)
index 0000000..af4709a
--- /dev/null
@@ -0,0 +1,320 @@
+//===----------- AMDILIOExpansion.h - IO Expansion Pass -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// The AMDIL IO Expansion pass expands pseudo IO instructions into sequences
+// of instructions that produce the correct results. These instructions are
+// not expanded earlier in the backend because every preceding pass may
+// assume it is still able to generate load/store instructions. Consequently,
+// only passes that cannot generate load/store instructions may run after
+// this pass.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILIOEXPANSION_H_
+#define _AMDILIOEXPANSION_H_
+#undef DEBUG_TYPE
+#undef DEBUGME
+#define DEBUG_TYPE "IOExpansion"
+#if !defined(NDEBUG)
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME (false)
+#endif
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MachineFunction;
+  class AMDILKernelManager;
+  class AMDILMachineFunctionInfo;
+  class AMDILSubtarget;
+  class MachineInstr;
+  class Constant;
+  class TargetInstrInfo;
+  class Type;
+  typedef enum {
+    NO_PACKING = 0,
+    PACK_V2I8,
+    PACK_V4I8,
+    PACK_V2I16,
+    PACK_V4I16,
+    UNPACK_V2I8,
+    UNPACK_V4I8,
+    UNPACK_V2I16,
+    UNPACK_V4I16,
+    UNPACK_LAST
+  } REG_PACKED_TYPE;
+  class AMDILIOExpansion : public MachineFunctionPass
+  {
+    public:
+      virtual ~AMDILIOExpansion();
+      virtual const char* getPassName() const;
+      bool runOnMachineFunction(MachineFunction &MF);
+      static char ID;
+    protected:
+      AMDILIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+      TargetMachine &TM;
+      //
+      // @param MI Machine instruction to check.
+      // @brief checks to see if the machine instruction
+      // is an I/O instruction or not.
+      //
+      // @return true if I/O, false otherwise.
+      //
+      virtual bool
+        isIOInstruction(MachineInstr *MI);
+      // Wrapper function that calls the appropriate I/O 
+      // expansion function based on the instruction type.
+      virtual void
+        expandIOInstruction(MachineInstr *MI);
+      virtual void
+        expandGlobalStore(MachineInstr *MI) = 0;
+      virtual void
+        expandLocalStore(MachineInstr *MI) = 0;
+      virtual void
+        expandRegionStore(MachineInstr *MI) = 0;
+      virtual void
+        expandPrivateStore(MachineInstr *MI) = 0;
+      virtual void
+        expandGlobalLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandRegionLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandLocalLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandPrivateLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandConstantLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandConstantPoolLoad(MachineInstr *MI) = 0;
+      bool
+        isAddrCalcInstr(MachineInstr *MI);
+      bool
+        isExtendLoad(MachineInstr *MI);
+      bool
+        isHardwareRegion(MachineInstr *MI);
+      bool
+        isHardwareLocal(MachineInstr *MI);
+      bool
+        isPackedData(MachineInstr *MI);
+      bool
+        isStaticCPLoad(MachineInstr *MI);
+      bool
+        isNbitType(Type *MI, uint32_t nBits, bool isScalar = true);
+      bool
+        isHardwareInst(MachineInstr *MI);
+      uint32_t
+        getMemorySize(MachineInstr *MI);
+      REG_PACKED_TYPE
+        getPackedID(MachineInstr *MI);
+      uint32_t
+        getShiftSize(MachineInstr *MI);
+      uint32_t
+        getPointerID(MachineInstr *MI);
+      void
+        expandTruncData(MachineInstr *MI);
+      void
+        expandLoadStartCode(MachineInstr *MI);
+      virtual void
+        expandStoreSetupCode(MachineInstr *MI) = 0;
+      void
+        expandAddressCalc(MachineInstr *MI);
+      void
+        expandLongExtend(MachineInstr *MI, 
+            uint32_t numComponents, uint32_t size, bool signedShift);
+      void 
+        expandLongExtendSub32(MachineInstr *MI, 
+            unsigned SHLop, unsigned SHRop, unsigned USHRop, 
+            unsigned SHLimm, uint64_t SHRimm, unsigned USHRimm, 
+            unsigned LCRop, bool signedShift);
+      void
+        expandIntegerExtend(MachineInstr *MI, unsigned, unsigned, unsigned);
+      void
+        expandExtendLoad(MachineInstr *MI);
+      virtual void
+        expandPackedData(MachineInstr *MI) = 0;
+      void
+        emitCPInst(MachineInstr* MI, const Constant* C,
+            AMDILKernelManager* KM, int swizzle, bool ExtFPLoad);
+
+      bool mDebug;
+      const AMDILSubtarget *mSTM;
+      AMDILKernelManager *mKM;
+      MachineBasicBlock *mBB;
+      AMDILMachineFunctionInfo *mMFI;
+      const TargetInstrInfo *mTII;
+      bool saveInst;
+    private:
+      void
+        emitStaticCPLoad(MachineInstr* MI, int swizzle, int id,
+            bool ExtFPLoad);
+  }; // class AMDILIOExpansion
+
+  // Intermediate class that holds I/O code expansion that is common to the
+  // 7XX, Evergreen and Northern Island family of chips.
+  class AMDIL789IOExpansion : public AMDILIOExpansion  {
+    public:
+      virtual ~AMDIL789IOExpansion();
+      virtual const char* getPassName() const;
+    protected:
+      AMDIL789IOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+      virtual void
+        expandGlobalStore(MachineInstr *MI) = 0;
+      virtual void
+        expandLocalStore(MachineInstr *MI) = 0;
+      virtual void
+        expandRegionStore(MachineInstr *MI) = 0;
+      virtual void
+        expandGlobalLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandRegionLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandLocalLoad(MachineInstr *MI) = 0;
+      virtual void
+        expandPrivateStore(MachineInstr *MI);
+      virtual void
+        expandConstantLoad(MachineInstr *MI);
+      virtual void
+        expandPrivateLoad(MachineInstr *MI);
+      virtual void
+        expandConstantPoolLoad(MachineInstr *MI);
+      void
+        expandStoreSetupCode(MachineInstr *MI);
+      virtual void
+        expandPackedData(MachineInstr *MI);
+    private:
+      void emitVectorAddressCalc(MachineInstr *MI, bool is32bit, 
+          bool needsSelect);
+      void emitVectorSwitchWrite(MachineInstr *MI, bool is32bit);
+      void emitComponentExtract(MachineInstr *MI, unsigned flag, unsigned src, 
+          unsigned dst, bool beforeInst);
+      void emitDataLoadSelect(MachineInstr *MI);
+  }; // class AMDIL789IOExpansion
+  // Class that handles I/O emission for the 7XX family of devices.
+  class AMDIL7XXIOExpansion : public AMDIL789IOExpansion {
+    public:
+      AMDIL7XXIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+      ~AMDIL7XXIOExpansion();
+      const char* getPassName() const;
+    protected:
+      void
+        expandGlobalStore(MachineInstr *MI);
+      void
+        expandLocalStore(MachineInstr *MI);
+      void
+        expandRegionStore(MachineInstr *MI);
+      void
+        expandGlobalLoad(MachineInstr *MI);
+      void
+        expandRegionLoad(MachineInstr *MI);
+      void
+        expandLocalLoad(MachineInstr *MI);
+  }; // class AMDIL7XXIOExpansion
+
+  // Class that expands image functions into the
+  // correct set of I/O instructions.
+  class AMDILImageExpansion : public AMDIL789IOExpansion {
+    public:
+      AMDILImageExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+      virtual ~AMDILImageExpansion();
+    protected:
+      //
+      // @param MI Instruction iterator that has the sample instruction
+      // that needs to be taken care of.
+      // @brief transforms the __amdil_sample_data function call into a
+      // sample instruction in IL.
+      //
+      // @warning This function only works correctly if all functions get
+      // inlined
+      //
+      virtual void
+        expandImageLoad(MachineBasicBlock *BB, MachineInstr *MI);
+      //
+      // @param MI Instruction iterator that has the write instruction that
+      // needs to be taken care of.
+      // @brief transforms the __amdil_write_data function call into a
+      // simple UAV write instruction in IL.
+      //
+      // @warning This function only works correctly if all functions get
+      // inlined
+      //
+      virtual void
+        expandImageStore(MachineBasicBlock *BB, MachineInstr *MI);
+      //
+      // @param MI Instruction iterator that has the image parameter
+      // instruction
+      // @brief transforms the __amdil_get_image_params function call into
+      // a copy of data from a specific constant buffer to the register
+      //
+      // @warning This function only works correctly if all functions get
+      // inlined
+      //
+      virtual void
+        expandImageParam(MachineBasicBlock *BB, MachineInstr *MI);
+
+      //
+      // @param MI Instruction that points to the image
+      // @brief transforms __amdil_sample_data into a sequence of
+      // if/else that selects the correct sample instruction.
+      //
+      // @warning This function is inefficient, but works even when
+      // functions are not inlined.
+      //
+      virtual void
+        expandInefficientImageLoad(MachineBasicBlock *BB, MachineInstr *MI);
+    private:
+      AMDILImageExpansion(); // Do not implement.
+
+  }; // class AMDILImageExpansion
+
+  // Class that expands IO instructions for Evergreen and Northern
+  // Island family of devices.
+  class AMDILEGIOExpansion : public AMDILImageExpansion  {
+    public:
+      AMDILEGIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+
+      virtual ~AMDILEGIOExpansion();
+      const char* getPassName() const;
+    protected:
+      virtual bool
+        isIOInstruction(MachineInstr *MI);
+      virtual void
+        expandIOInstruction(MachineInstr *MI);
+      bool
+        isImageIO(MachineInstr *MI);
+      virtual void
+        expandGlobalStore(MachineInstr *MI);
+      void
+        expandLocalStore(MachineInstr *MI);
+      void
+        expandRegionStore(MachineInstr *MI);
+      virtual void
+        expandGlobalLoad(MachineInstr *MI);
+      void
+        expandRegionLoad(MachineInstr *MI);
+      void
+        expandLocalLoad(MachineInstr *MI);
+      virtual bool
+        isCacheableOp(MachineInstr *MI);
+      void
+        expandStoreSetupCode(MachineInstr *MI);
+      void
+        expandPackedData(MachineInstr *MI);
+    private:
+      bool
+        isArenaOp(MachineInstr *MI);
+      void
+        expandArenaSetup(MachineInstr *MI);
+  }; // class AMDILEGIOExpansion
+} // namespace llvm
+#endif // _AMDILIOEXPANSION_H_
diff --git a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
new file mode 100644 (file)
index 0000000..ff04d9d
--- /dev/null
@@ -0,0 +1,457 @@
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the AMDIL target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILDevices.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMDILDAGToDAGISel - AMDIL specific code to select AMDIL machine instructions
+// for SelectionDAG operations.
+//
+namespace {
+class AMDILDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDIL Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDILSubtarget &Subtarget;
+public:
+  AMDILDAGToDAGISel(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL);
+  virtual ~AMDILDAGToDAGISel();
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+
+  SDNode *Select(SDNode *N);
+  // Complex pattern selectors
+  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+  static bool isGlobalStore(const StoreSDNode *N);
+  static bool isPrivateStore(const StoreSDNode *N);
+  static bool isLocalStore(const StoreSDNode *N);
+  static bool isRegionStore(const StoreSDNode *N);
+
+  static bool isCPLoad(const LoadSDNode *N);
+  static bool isConstantLoad(const LoadSDNode *N, int cbID);
+  static bool isGlobalLoad(const LoadSDNode *N);
+  static bool isPrivateLoad(const LoadSDNode *N);
+  static bool isLocalLoad(const LoadSDNode *N);
+  static bool isRegionLoad(const LoadSDNode *N);
+
+  virtual const char *getPassName() const;
+private:
+  SDNode *xformAtomicInst(SDNode *N);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDILGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+// createAMDILISelDag - This pass converts a legalized DAG into a AMDIL-specific
+// DAG, ready for instruction scheduling.
+//
+FunctionPass *llvm::createAMDILISelDag(AMDILTargetMachine &TM
+                                        AMDIL_OPT_LEVEL_DECL) {
+  return new AMDILDAGToDAGISel(TM AMDIL_OPT_LEVEL_VAR);
+}
+
+AMDILDAGToDAGISel::AMDILDAGToDAGISel(AMDILTargetMachine &TM
+                                      AMDIL_OPT_LEVEL_DECL)
+  : SelectionDAGISel(TM AMDIL_OPT_LEVEL_VAR), Subtarget(TM.getSubtarget<AMDILSubtarget>())
+{
+}
+
+AMDILDAGToDAGISel::~AMDILDAGToDAGISel() {
+}
+
+SDValue AMDILDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
+  return CurDAG->getTargetConstant(Imm, MVT::i32);
+}
+
+bool AMDILDAGToDAGISel::SelectADDRParam(
+    SDValue Addr, SDValue& R1, SDValue& R2) {
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+  return true;
+}
+
+bool AMDILDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+  return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDILDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+  }
+  return true;
+}
+
+SDNode *AMDILDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    return NULL;   // Already selected.
+  }
+  switch (Opc) {
+  default: break;
+  case ISD::FrameIndex:
+    {
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) {
+        unsigned int FI = FIN->getIndex();
+        EVT OpVT = N->getValueType(0);
+        unsigned int NewOpc = AMDIL::MOVE_i32;
+        SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
+        return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI);
+      }
+    }
+    break;
+  }
+  // For all atomic instructions, we need to add a constant
+  // operand that stores the resource ID in the instruction
+  if (Opc > AMDILISD::ADDADDR && Opc < AMDILISD::APPEND_ALLOC) {
+    N = xformAtomicInst(N);
+  }
+  return SelectCode(N);
+}
+
+bool AMDILDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  return (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+          && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+          && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
+}
+
+bool AMDILDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
+  if (check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
+    return true;
+  }
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (!MMO || !MMO->getValue()) {
+    return false;
+  }
+  // Loads of global values (or of pointers derived from them) that were
+  // marked private are really constant loads.
+  const Value *V = MMO->getValue();
+  const Value *BV = getBasePointerValue(V);
+  if (dyn_cast<GlobalValue>(V) || (BV && dyn_cast<GlobalValue>(BV))) {
+    return check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
+  }
+  return false;
+}
+
+bool AMDILDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
+  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+}
+
+bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+    if (MMO) {
+      const Value *V = MMO->getValue();
+      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDILDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
+  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+  if (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+      && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+      && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
+      && !check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
+      && !check_type(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
+      && !check_type(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
+  {
+    return true;
+  }
+  return false;
+}
+
+const char *AMDILDAGToDAGISel::getPassName() const {
+  return "AMDIL DAG->DAG Pattern Instruction Selection";
+}
+
+SDNode*
+AMDILDAGToDAGISel::xformAtomicInst(SDNode *N)
+{
+  uint32_t addVal = 1;
+  bool addOne = false;
+  // bool bitCastToInt = (N->getValueType(0) == MVT::f32);
+  unsigned opc = N->getOpcode();
+  switch (opc) {
+    default: return N;
+    case AMDILISD::ATOM_G_ADD:
+    case AMDILISD::ATOM_G_AND:
+    case AMDILISD::ATOM_G_MAX:
+    case AMDILISD::ATOM_G_UMAX:
+    case AMDILISD::ATOM_G_MIN:
+    case AMDILISD::ATOM_G_UMIN:
+    case AMDILISD::ATOM_G_OR:
+    case AMDILISD::ATOM_G_SUB:
+    case AMDILISD::ATOM_G_RSUB:
+    case AMDILISD::ATOM_G_XCHG:
+    case AMDILISD::ATOM_G_XOR:
+    case AMDILISD::ATOM_G_ADD_NORET:
+    case AMDILISD::ATOM_G_AND_NORET:
+    case AMDILISD::ATOM_G_MAX_NORET:
+    case AMDILISD::ATOM_G_UMAX_NORET:
+    case AMDILISD::ATOM_G_MIN_NORET:
+    case AMDILISD::ATOM_G_UMIN_NORET:
+    case AMDILISD::ATOM_G_OR_NORET:
+    case AMDILISD::ATOM_G_SUB_NORET:
+    case AMDILISD::ATOM_G_RSUB_NORET:
+    case AMDILISD::ATOM_G_XCHG_NORET:
+    case AMDILISD::ATOM_G_XOR_NORET:
+    case AMDILISD::ATOM_L_ADD:
+    case AMDILISD::ATOM_L_AND:
+    case AMDILISD::ATOM_L_MAX:
+    case AMDILISD::ATOM_L_UMAX:
+    case AMDILISD::ATOM_L_MIN:
+    case AMDILISD::ATOM_L_UMIN:
+    case AMDILISD::ATOM_L_OR:
+    case AMDILISD::ATOM_L_SUB:
+    case AMDILISD::ATOM_L_RSUB:
+    case AMDILISD::ATOM_L_XCHG:
+    case AMDILISD::ATOM_L_XOR:
+    case AMDILISD::ATOM_L_ADD_NORET:
+    case AMDILISD::ATOM_L_AND_NORET:
+    case AMDILISD::ATOM_L_MAX_NORET:
+    case AMDILISD::ATOM_L_UMAX_NORET:
+    case AMDILISD::ATOM_L_MIN_NORET:
+    case AMDILISD::ATOM_L_UMIN_NORET:
+    case AMDILISD::ATOM_L_OR_NORET:
+    case AMDILISD::ATOM_L_SUB_NORET:
+    case AMDILISD::ATOM_L_RSUB_NORET:
+    case AMDILISD::ATOM_L_XCHG_NORET:
+    case AMDILISD::ATOM_L_XOR_NORET:
+    case AMDILISD::ATOM_R_ADD:
+    case AMDILISD::ATOM_R_AND:
+    case AMDILISD::ATOM_R_MAX:
+    case AMDILISD::ATOM_R_UMAX:
+    case AMDILISD::ATOM_R_MIN:
+    case AMDILISD::ATOM_R_UMIN:
+    case AMDILISD::ATOM_R_OR:
+    case AMDILISD::ATOM_R_SUB:
+    case AMDILISD::ATOM_R_RSUB:
+    case AMDILISD::ATOM_R_XCHG:
+    case AMDILISD::ATOM_R_XOR:
+    case AMDILISD::ATOM_R_ADD_NORET:
+    case AMDILISD::ATOM_R_AND_NORET:
+    case AMDILISD::ATOM_R_MAX_NORET:
+    case AMDILISD::ATOM_R_UMAX_NORET:
+    case AMDILISD::ATOM_R_MIN_NORET:
+    case AMDILISD::ATOM_R_UMIN_NORET:
+    case AMDILISD::ATOM_R_OR_NORET:
+    case AMDILISD::ATOM_R_SUB_NORET:
+    case AMDILISD::ATOM_R_RSUB_NORET:
+    case AMDILISD::ATOM_R_XCHG_NORET:
+    case AMDILISD::ATOM_R_XOR_NORET:
+    case AMDILISD::ATOM_G_CMPXCHG:
+    case AMDILISD::ATOM_G_CMPXCHG_NORET:
+    case AMDILISD::ATOM_L_CMPXCHG:
+    case AMDILISD::ATOM_L_CMPXCHG_NORET:
+    case AMDILISD::ATOM_R_CMPXCHG:
+    case AMDILISD::ATOM_R_CMPXCHG_NORET:
+             break;
+    case AMDILISD::ATOM_G_DEC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_G_SUB;
+             }
+             break;
+    case AMDILISD::ATOM_G_INC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_G_ADD;
+             }
+             break;
+    case AMDILISD::ATOM_G_DEC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_G_SUB_NORET;
+             }
+             break;
+    case AMDILISD::ATOM_G_INC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_G_ADD_NORET;
+             }
+             break;
+    case AMDILISD::ATOM_L_DEC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_L_SUB;
+             }
+             break;
+    case AMDILISD::ATOM_L_INC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_L_ADD;
+             }
+             break;
+    case AMDILISD::ATOM_L_DEC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_L_SUB_NORET;
+             }
+             break;
+    case AMDILISD::ATOM_L_INC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_L_ADD_NORET;
+             }
+             break;
+    case AMDILISD::ATOM_R_DEC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_R_SUB;
+             }
+             break;
+    case AMDILISD::ATOM_R_INC:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_R_ADD;
+             }
+             break;
+    case AMDILISD::ATOM_R_DEC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_R_SUB_NORET;
+             }
+             break;
+    case AMDILISD::ATOM_R_INC_NORET:
+             addOne = true;
+             if (Subtarget.calVersion() >= CAL_VERSION_SC_136) {
+               addVal = (uint32_t)-1;
+             } else {
+               opc = AMDILISD::ATOM_R_ADD_NORET;
+             }
+             break;
+  }
+  // The largest we can have is a cmpxchg w/ a return value and an output chain.
+  // The cmpxchg function has 3 inputs and a single output along with an
+  // output chain and a target constant, giving a total of 6.
+  SDValue Ops[12];
+  unsigned x = 0;
+  unsigned y = N->getNumOperands();
+  for (x = 0; x < y; ++x) {
+    Ops[x] = N->getOperand(x);
+  }
+  if (addOne) {
+    Ops[x++] = SDValue(SelectCode(CurDAG->getConstant(addVal, MVT::i32).getNode()), 0);
+  }
+  Ops[x++] = CurDAG->getTargetConstant(0, MVT::i32);
+  SDVTList Tys = N->getVTList();
+  MemSDNode *MemNode = dyn_cast<MemSDNode>(N);
+  assert(MemNode && "Atomic should be of MemSDNode type!");
+  N = CurDAG->getMemIntrinsicNode(opc, N->getDebugLoc(), Tys, Ops, x,
+      MemNode->getMemoryVT(), MemNode->getMemOperand()).getNode();
+  return N;
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.cpp b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
new file mode 100644 (file)
index 0000000..6f78d15
--- /dev/null
@@ -0,0 +1,5612 @@
+//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that AMDIL uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILISelLowering.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILIntrinsicInfo.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
+#define ISDBITCAST  ISD::BITCAST
+#define MVTGLUE     MVT::Glue
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+#include "AMDILGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation Help Functions Begin
+//===----------------------------------------------------------------------===//
+static SDValue
+getConversionNode(SelectionDAG &DAG, SDValue& Src, SDValue& Dst, bool asType)
+{
+  DebugLoc DL = Src.getDebugLoc();
+  EVT svt = Src.getValueType().getScalarType();
+  EVT dvt = Dst.getValueType().getScalarType();
+  if (svt.isFloatingPoint() && dvt.isFloatingPoint()) {
+    if (dvt.bitsGT(svt)) {
+      Src = DAG.getNode(ISD::FP_EXTEND, DL, dvt, Src);
+    } else if (dvt.bitsLT(svt)) {
+      Src = DAG.getNode(ISD::FP_ROUND, DL, dvt, Src,
+          DAG.getConstant(1, MVT::i32));
+    }
+  } else if (svt.isInteger() && dvt.isInteger()) {
+    if (!svt.bitsEq(dvt)) {
+      Src = DAG.getSExtOrTrunc(Src, DL, dvt);
+    } else {
+      Src = DAG.getNode(AMDILISD::MOVE, DL, dvt, Src);
+    }
+  } else if (svt.isInteger()) {
+    unsigned opcode = (asType) ? ISDBITCAST : ISD::SINT_TO_FP;
+    if (!svt.bitsEq(dvt)) {
+      if (dvt.getSimpleVT().SimpleTy == MVT::f32) {
+        Src = DAG.getSExtOrTrunc(Src, DL, MVT::i32);
+      } else if (dvt.getSimpleVT().SimpleTy == MVT::f64) {
+        Src = DAG.getSExtOrTrunc(Src, DL, MVT::i64);
+      } else {
+        assert(0 && "We only support 32 and 64bit fp types");
+      }
+    }
+    Src = DAG.getNode(opcode, DL, dvt, Src);
+  } else if (dvt.isInteger()) {
+    unsigned opcode = (asType) ? ISDBITCAST : ISD::FP_TO_SINT;
+    if (svt.getSimpleVT().SimpleTy == MVT::f32) {
+      Src = DAG.getNode(opcode, DL, MVT::i32, Src);
+    } else if (svt.getSimpleVT().SimpleTy == MVT::f64) {
+      Src = DAG.getNode(opcode, DL, MVT::i64, Src);
+    } else {
+      assert(0 && "We only support 32 and 64bit fp types");
+    }
+    Src = DAG.getSExtOrTrunc(Src, DL, dvt);
+  }
+  return Src;
+}
+// CondCCodeToCC - Convert a DAG condition code to a AMDIL CC
+// condition.
+static AMDILCC::CondCodes
+CondCCodeToCC(ISD::CondCode CC, const MVT::SimpleValueType& type)
+{
+  switch (CC) {
+    default:
+      {
+        errs()<<"Condition Code: "<< (unsigned int)CC<<"\n";
+        assert(0 && "Unknown condition code!");
+      }
+    case ISD::SETO:
+      switch(type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_O;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_O;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUO:
+      switch(type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UO;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UO;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETGT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_GT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_GT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_GT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_GT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETGE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_GE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_GE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_GE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_GE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETLT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_LT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_LT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_LT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_LT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETLE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_LE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_LE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_LE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_LE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETNE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_NE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_NE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_NE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_NE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETEQ:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_I_EQ;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_EQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_EQ;
+        case MVT::i64:
+          return AMDILCC::IL_CC_L_EQ;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUGT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_GT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UGT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UGT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_GT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUGE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_GE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UGE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UGE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_GE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETULT:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_LT;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ULT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ULT;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_LT;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETULE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_LE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ULE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ULE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_LE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUNE:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_NE;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UNE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UNE;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_NE;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETUEQ:
+      switch (type) {
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+          return AMDILCC::IL_CC_U_EQ;
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_UEQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_UEQ;
+        case MVT::i64:
+          return AMDILCC::IL_CC_UL_EQ;
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOGT:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OGT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OGT;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOGE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OGE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OGE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOLT:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OLT;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OLT;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOLE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OLE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OLE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETONE:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_ONE;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_ONE;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+    case ISD::SETOEQ:
+      switch (type) {
+        case MVT::f32:
+          return AMDILCC::IL_CC_F_OEQ;
+        case MVT::f64:
+          return AMDILCC::IL_CC_D_OEQ;
+        case MVT::i1:
+        case MVT::i8:
+        case MVT::i16:
+        case MVT::i32:
+        case MVT::i64:
+        default:
+          assert(0 && "Opcode combination not generated correctly!");
+          return AMDILCC::COND_ERROR;
+      };
+  };
+}
+
+  static unsigned int
+translateToOpcode(uint64_t CCCode, unsigned int regClass)
+{
+  switch (CCCode) {
+    case AMDILCC::IL_CC_D_EQ:
+    case AMDILCC::IL_CC_D_OEQ:
+      if (regClass == AMDIL::GPRV2F64RegClassID) {
+        return (unsigned int)AMDIL::DEQ_v2f64;
+      } else {
+        return (unsigned int)AMDIL::DEQ;
+      }
+    case AMDILCC::IL_CC_D_LE:
+    case AMDILCC::IL_CC_D_OLE:
+    case AMDILCC::IL_CC_D_ULE:
+    case AMDILCC::IL_CC_D_GE:
+    case AMDILCC::IL_CC_D_OGE:
+    case AMDILCC::IL_CC_D_UGE:
+      return (unsigned int)AMDIL::DGE;
+    case AMDILCC::IL_CC_D_LT:
+    case AMDILCC::IL_CC_D_OLT:
+    case AMDILCC::IL_CC_D_ULT:
+    case AMDILCC::IL_CC_D_GT:
+    case AMDILCC::IL_CC_D_OGT:
+    case AMDILCC::IL_CC_D_UGT:
+      return (unsigned int)AMDIL::DLT;
+    case AMDILCC::IL_CC_D_NE:
+    case AMDILCC::IL_CC_D_UNE:
+      return (unsigned int)AMDIL::DNE;
+    case AMDILCC::IL_CC_F_EQ:
+    case AMDILCC::IL_CC_F_OEQ:
+      return (unsigned int)AMDIL::FEQ;
+    case AMDILCC::IL_CC_F_LE:
+    case AMDILCC::IL_CC_F_ULE:
+    case AMDILCC::IL_CC_F_OLE:
+    case AMDILCC::IL_CC_F_GE:
+    case AMDILCC::IL_CC_F_UGE:
+    case AMDILCC::IL_CC_F_OGE:
+      return (unsigned int)AMDIL::FGE;
+    case AMDILCC::IL_CC_F_LT:
+    case AMDILCC::IL_CC_F_OLT:
+    case AMDILCC::IL_CC_F_ULT:
+    case AMDILCC::IL_CC_F_GT:
+    case AMDILCC::IL_CC_F_OGT:
+    case AMDILCC::IL_CC_F_UGT:
+      if (regClass == AMDIL::GPRV2F32RegClassID) {
+        return (unsigned int)AMDIL::FLT_v2f32;
+      } else if (regClass == AMDIL::GPRV4F32RegClassID) {
+        return (unsigned int)AMDIL::FLT_v4f32;
+      } else {
+        return (unsigned int)AMDIL::FLT;
+      }
+    case AMDILCC::IL_CC_F_NE:
+    case AMDILCC::IL_CC_F_UNE:
+      return (unsigned int)AMDIL::FNE;
+    case AMDILCC::IL_CC_I_EQ:
+    case AMDILCC::IL_CC_U_EQ:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::IEQ;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::IEQ_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::IEQ_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_L_EQ:
+    case AMDILCC::IL_CC_UL_EQ:
+      return (unsigned int)AMDIL::LEQ;
+    case AMDILCC::IL_CC_I_GE:
+    case AMDILCC::IL_CC_I_LE:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::IGE;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::IGE_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::IGE_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_I_LT:
+    case AMDILCC::IL_CC_I_GT:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::ILT;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::ILT_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::ILT_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_L_GE:
+      return (unsigned int)AMDIL::LGE;
+    case AMDILCC::IL_CC_L_LE:
+      return (unsigned int)AMDIL::LLE;
+    case AMDILCC::IL_CC_L_LT:
+      return (unsigned int)AMDIL::LLT;
+    case AMDILCC::IL_CC_L_GT:
+      return (unsigned int)AMDIL::LGT;
+    case AMDILCC::IL_CC_I_NE:
+    case AMDILCC::IL_CC_U_NE:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::INE;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::INE_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::INE_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_U_GE:
+    case AMDILCC::IL_CC_U_LE:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::UGE;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::UGE_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::UGE_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_L_NE:
+    case AMDILCC::IL_CC_UL_NE:
+      return (unsigned int)AMDIL::LNE;
+    case AMDILCC::IL_CC_UL_GE:
+      return (unsigned int)AMDIL::ULGE;
+    case AMDILCC::IL_CC_UL_LE:
+      return (unsigned int)AMDIL::ULLE;
+    case AMDILCC::IL_CC_U_LT:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::ULT;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::ULT_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::ULT_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_U_GT:
+      if (regClass == AMDIL::GPRI32RegClassID
+          || regClass == AMDIL::GPRI8RegClassID
+          || regClass == AMDIL::GPRI16RegClassID) {
+        return (unsigned int)AMDIL::UGT;
+      } else if (regClass == AMDIL::GPRV2I32RegClassID
+          || regClass == AMDIL::GPRV2I8RegClassID
+          || regClass == AMDIL::GPRV2I16RegClassID) {
+        return (unsigned int)AMDIL::UGT_v2i32;
+      } else if (regClass == AMDIL::GPRV4I32RegClassID
+          || regClass == AMDIL::GPRV4I8RegClassID
+          || regClass == AMDIL::GPRV4I16RegClassID) {
+        return (unsigned int)AMDIL::UGT_v4i32;
+      } else {
+        assert(!"Unknown reg class!");
+      }
+    case AMDILCC::IL_CC_UL_LT:
+      return (unsigned int)AMDIL::ULLT;
+    case AMDILCC::IL_CC_UL_GT:
+      return (unsigned int)AMDIL::ULGT;
+    case AMDILCC::IL_CC_F_UEQ:
+    case AMDILCC::IL_CC_D_UEQ:
+    case AMDILCC::IL_CC_F_ONE:
+    case AMDILCC::IL_CC_D_ONE:
+    case AMDILCC::IL_CC_F_O:
+    case AMDILCC::IL_CC_F_UO:
+    case AMDILCC::IL_CC_D_O:
+    case AMDILCC::IL_CC_D_UO:
+      // These condition codes have no single-instruction lowering; they are
+      // expanded into multi-instruction sequences in generateCMPInstr.
+      return 0;
+
+  }
+  errs() << "Opcode: " << CCCode << "\n";
+  assert(0 && "Unknown opcode retrieved");
+  return 0;
+}
+SDValue
+AMDILTargetLowering::LowerMemArgument(
+    SDValue Chain,
+    CallingConv::ID CallConv,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl, SelectionDAG &DAG,
+    const CCValAssign &VA,
+    MachineFrameInfo *MFI,
+    unsigned i) const
+{
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+
+  bool AlwaysUseMutable = (CallConv==CallingConv::Fast) &&
+    getTargetMachine().Options.GuaranteedTailCallOpt;
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can
+  // be changed with more analysis.
+  // In case of tail call optimization, mark all arguments mutable, since they
+  // could be overwritten by the lowering of arguments in case of a tail call.
+  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+      VA.getLocMemOffset(), isImmutable);
+  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+  if (Flags.isByVal())
+    return FIN;
+  return DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+      MachinePointerInfo::getFixedStack(FI),
+      false, false, false, 0);
+}
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation Help Functions End
+//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Instruction generation functions
+//===----------------------------------------------------------------------===//
+uint32_t
+AMDILTargetLowering::addExtensionInstructions(
+    uint32_t reg, bool signedShift,
+    unsigned int simpleVT) const
+{
+  int shiftSize = 0;
+  uint32_t LShift, RShift;
+  switch(simpleVT)
+  {
+    default:
+      return reg;
+    case AMDIL::GPRI8RegClassID:
+      shiftSize = 24;
+      LShift = AMDIL::SHL_i8;
+      if (signedShift) {
+        RShift = AMDIL::SHR_i8;
+      } else {
+        RShift = AMDIL::USHR_i8;
+      }
+      break;
+    case AMDIL::GPRV2I8RegClassID:
+      shiftSize = 24;
+      LShift = AMDIL::SHL_v2i8;
+      if (signedShift) {
+        RShift = AMDIL::SHR_v2i8;
+      } else {
+        RShift = AMDIL::USHR_v2i8;
+      }
+      break;
+    case AMDIL::GPRV4I8RegClassID:
+      shiftSize = 24;
+      LShift = AMDIL::SHL_v4i8;
+      if (signedShift) {
+        RShift = AMDIL::SHR_v4i8;
+      } else {
+        RShift = AMDIL::USHR_v4i8;
+      }
+      break;
+    case AMDIL::GPRI16RegClassID:
+      shiftSize = 16;
+      LShift = AMDIL::SHL_i16;
+      if (signedShift) {
+        RShift = AMDIL::SHR_i16;
+      } else {
+        RShift = AMDIL::USHR_i16;
+      }
+      break;
+    case AMDIL::GPRV2I16RegClassID:
+      shiftSize = 16;
+      LShift = AMDIL::SHL_v2i16;
+      if (signedShift) {
+        RShift = AMDIL::SHR_v2i16;
+      } else {
+        RShift = AMDIL::USHR_v2i16;
+      }
+      break;
+    case AMDIL::GPRV4I16RegClassID:
+      shiftSize = 16;
+      LShift = AMDIL::SHL_v4i16;
+      if (signedShift) {
+        RShift = AMDIL::SHR_v4i16;
+      } else {
+        RShift = AMDIL::USHR_v4i16;
+      }
+      break;
+  };
+  uint32_t LoadReg = genVReg(simpleVT);
+  uint32_t tmp1 = genVReg(simpleVT);
+  uint32_t tmp2 = genVReg(simpleVT);
+  generateMachineInst(AMDIL::LOADCONST_i32, LoadReg).addImm(shiftSize);
+  generateMachineInst(LShift, tmp1, reg, LoadReg);
+  generateMachineInst(RShift, tmp2, tmp1, LoadReg);
+  return tmp2;
+}
+
+MachineOperand
+AMDILTargetLowering::convertToReg(MachineOperand op) const
+{
+  if (op.isReg()) {
+    return op;
+  } else if (op.isImm()) {
+    uint32_t loadReg
+      = genVReg(op.getParent()->getDesc().OpInfo[0].RegClass);
+    generateMachineInst(AMDIL::LOADCONST_i32, loadReg)
+      .addImm(op.getImm());
+    op.ChangeToRegister(loadReg, false);
+  } else if (op.isFPImm()) {
+    uint32_t loadReg
+      = genVReg(op.getParent()->getDesc().OpInfo[0].RegClass);
+    generateMachineInst(AMDIL::LOADCONST_f32, loadReg)
+      .addFPImm(op.getFPImm());
+    op.ChangeToRegister(loadReg, false);
+  } else if (op.isMBB()) {
+    op.ChangeToRegister(0, false);
+  } else if (op.isFI()) {
+    op.ChangeToRegister(0, false);
+  } else if (op.isCPI()) {
+    op.ChangeToRegister(0, false);
+  } else if (op.isJTI()) {
+    op.ChangeToRegister(0, false);
+  } else if (op.isGlobal()) {
+    op.ChangeToRegister(0, false);
+  } else if (op.isSymbol()) {
+    op.ChangeToRegister(0, false);
+  }/* else if (op.isMetadata()) {
+      op.ChangeToRegister(0, false);
+      }*/
+  return op;
+}
+
+void
+AMDILTargetLowering::generateCMPInstr(
+    MachineInstr *MI,
+    MachineBasicBlock *BB,
+    const TargetInstrInfo& TII)
+const
+{
+  MachineOperand DST = MI->getOperand(0);
+  MachineOperand CC = MI->getOperand(1);
+  MachineOperand LHS = MI->getOperand(2);
+  MachineOperand RHS = MI->getOperand(3);
+  int64_t ccCode = CC.getImm();
+  unsigned int simpleVT = MI->getDesc().OpInfo[0].RegClass;
+  unsigned int opCode = translateToOpcode(ccCode, simpleVT);
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock::iterator BBI = MI;
+  setPrivateData(BB, BBI, &DL, &TII);
+  if (!LHS.isReg()) {
+    LHS = convertToReg(LHS);
+  }
+  if (!RHS.isReg()) {
+    RHS = convertToReg(RHS);
+  }
+  switch (ccCode) {
+    case AMDILCC::IL_CC_I_EQ:
+    case AMDILCC::IL_CC_I_NE:
+    case AMDILCC::IL_CC_I_GE:
+    case AMDILCC::IL_CC_I_LT:
+      {
+        uint32_t lhsreg = addExtensionInstructions(
+            LHS.getReg(), true, simpleVT);
+        uint32_t rhsreg = addExtensionInstructions(
+            RHS.getReg(), true, simpleVT);
+        generateMachineInst(opCode, DST.getReg(), lhsreg, rhsreg);
+      }
+      break;
+    case AMDILCC::IL_CC_U_EQ:
+    case AMDILCC::IL_CC_U_NE:
+    case AMDILCC::IL_CC_U_GE:
+    case AMDILCC::IL_CC_U_LT:
+    case AMDILCC::IL_CC_D_EQ:
+    case AMDILCC::IL_CC_F_EQ:
+    case AMDILCC::IL_CC_F_OEQ:
+    case AMDILCC::IL_CC_D_OEQ:
+    case AMDILCC::IL_CC_D_NE:
+    case AMDILCC::IL_CC_F_NE:
+    case AMDILCC::IL_CC_F_UNE:
+    case AMDILCC::IL_CC_D_UNE:
+    case AMDILCC::IL_CC_D_GE:
+    case AMDILCC::IL_CC_F_GE:
+    case AMDILCC::IL_CC_D_OGE:
+    case AMDILCC::IL_CC_F_OGE:
+    case AMDILCC::IL_CC_D_LT:
+    case AMDILCC::IL_CC_F_LT:
+    case AMDILCC::IL_CC_F_OLT:
+    case AMDILCC::IL_CC_D_OLT:
+      generateMachineInst(opCode, DST.getReg(),
+          LHS.getReg(), RHS.getReg());
+      break;
+    case AMDILCC::IL_CC_I_GT:
+    case AMDILCC::IL_CC_I_LE:
+      {
+        uint32_t lhsreg = addExtensionInstructions(
+            LHS.getReg(), true, simpleVT);
+        uint32_t rhsreg = addExtensionInstructions(
+            RHS.getReg(), true, simpleVT);
+        generateMachineInst(opCode, DST.getReg(), rhsreg, lhsreg);
+      }
+      break;
+    case AMDILCC::IL_CC_U_GT:
+    case AMDILCC::IL_CC_U_LE:
+    case AMDILCC::IL_CC_F_GT:
+    case AMDILCC::IL_CC_D_GT:
+    case AMDILCC::IL_CC_F_OGT:
+    case AMDILCC::IL_CC_D_OGT:
+    case AMDILCC::IL_CC_F_LE:
+    case AMDILCC::IL_CC_D_LE:
+    case AMDILCC::IL_CC_D_OLE:
+    case AMDILCC::IL_CC_F_OLE:
+      generateMachineInst(opCode, DST.getReg(),
+          RHS.getReg(), LHS.getReg());
+      break;
+    case AMDILCC::IL_CC_F_UGT:
+    case AMDILCC::IL_CC_F_ULE:
+      {
+        uint32_t VReg[4] = {
+          genVReg(simpleVT), genVReg(simpleVT),
+          genVReg(simpleVT), genVReg(simpleVT)
+        };
+        generateMachineInst(opCode, VReg[0],
+            RHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[1],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[2],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_F_ULT:
+    case AMDILCC::IL_CC_F_UGE:
+      {
+        uint32_t VReg[4] = {
+          genVReg(simpleVT), genVReg(simpleVT),
+          genVReg(simpleVT), genVReg(simpleVT)
+        };
+        generateMachineInst(opCode, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[1],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[2],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_D_UGT:
+    case AMDILCC::IL_CC_D_ULE:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[4] = {
+          genVReg(regID), genVReg(regID),
+          genVReg(regID), genVReg(regID)
+        };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(opCode, VReg[0],
+            RHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[1],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[2],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_D_UGE:
+    case AMDILCC::IL_CC_D_ULT:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[4] = {
+          genVReg(regID), genVReg(regID),
+          genVReg(regID), genVReg(regID)
+        };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(opCode, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[1],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[2],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_F_UEQ:
+      {
+        uint32_t VReg[4] = {
+          genVReg(simpleVT), genVReg(simpleVT),
+          genVReg(simpleVT), genVReg(simpleVT)
+        };
+        generateMachineInst(AMDIL::FEQ, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[2],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_F_ONE:
+      {
+        uint32_t VReg[4] = {
+          genVReg(simpleVT), genVReg(simpleVT),
+          genVReg(simpleVT), genVReg(simpleVT)
+        };
+        generateMachineInst(AMDIL::FNE, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FEQ, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::FEQ, VReg[2],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+      }
+      break;
+    case AMDILCC::IL_CC_D_UEQ:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[4] = {
+          genVReg(regID), genVReg(regID),
+          genVReg(regID), genVReg(regID)
+        };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(AMDIL::DEQ, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[2],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+
+      }
+      break;
+    case AMDILCC::IL_CC_D_ONE:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[4] = {
+          genVReg(regID), genVReg(regID),
+          genVReg(regID), genVReg(regID)
+        };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(AMDIL::DNE, VReg[0],
+            LHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DEQ, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::DEQ, VReg[2],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            VReg[3], VReg[0], VReg[1]);
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            DST.getReg(), VReg[2], VReg[3]);
+
+      }
+      break;
+    case AMDILCC::IL_CC_F_O:
+      {
+        uint32_t VReg[2] = { genVReg(simpleVT), genVReg(simpleVT) };
+        generateMachineInst(AMDIL::FEQ, VReg[0],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FEQ, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            DST.getReg(), VReg[0], VReg[1]);
+      }
+      break;
+    case AMDILCC::IL_CC_D_O:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[2] = { genVReg(regID), genVReg(regID) };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(AMDIL::DEQ, VReg[0],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DEQ, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_AND_f32,
+            DST.getReg(), VReg[0], VReg[1]);
+      }
+      break;
+    case AMDILCC::IL_CC_F_UO:
+      {
+        uint32_t VReg[2] = { genVReg(simpleVT), genVReg(simpleVT) };
+        generateMachineInst(AMDIL::FNE, VReg[0],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::FNE, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[0], VReg[1]);
+      }
+      break;
+    case AMDILCC::IL_CC_D_UO:
+      {
+        uint32_t regID = AMDIL::GPRF64RegClassID;
+        uint32_t VReg[2] = { genVReg(regID), genVReg(regID) };
+        // The result of a double comparison is a 32-bit result
+        generateMachineInst(AMDIL::DNE, VReg[0],
+            RHS.getReg(), RHS.getReg());
+        generateMachineInst(AMDIL::DNE, VReg[1],
+            LHS.getReg(), LHS.getReg());
+        generateMachineInst(AMDIL::BINARY_OR_f32,
+            DST.getReg(), VReg[0], VReg[1]);
+      }
+      break;
+    case AMDILCC::IL_CC_L_LE:
+    case AMDILCC::IL_CC_L_GE:
+    case AMDILCC::IL_CC_L_EQ:
+    case AMDILCC::IL_CC_L_NE:
+    case AMDILCC::IL_CC_L_LT:
+    case AMDILCC::IL_CC_L_GT:
+    case AMDILCC::IL_CC_UL_LE:
+    case AMDILCC::IL_CC_UL_GE:
+    case AMDILCC::IL_CC_UL_EQ:
+    case AMDILCC::IL_CC_UL_NE:
+    case AMDILCC::IL_CC_UL_LT:
+    case AMDILCC::IL_CC_UL_GT:
+      {
+        const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+            &this->getTargetMachine())->getSubtargetImpl();
+        if (stm->device()->usesHardware(AMDILDeviceInfo::LongOps)) {
+          generateMachineInst(opCode, DST.getReg(), LHS.getReg(), RHS.getReg());
+        } else {
+          generateLongRelational(MI, opCode);
+        }
+      }
+      break;
+    case AMDILCC::COND_ERROR:
+      assert(0 && "Invalid CC code");
+      break;
+  };
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Class Implementation Begins
+//===----------------------------------------------------------------------===//
+  AMDILTargetLowering::AMDILTargetLowering(TargetMachine &TM)
+: TargetLowering(TM, new TargetLoweringObjectFileELF())
+{
+  int types[] =
+  {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::f32,
+    (int)MVT::f64,
+    (int)MVT::i64,
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+
+  int IntTypes[] =
+  {
+    (int)MVT::i8,
+    (int)MVT::i16,
+    (int)MVT::i32,
+    (int)MVT::i64
+  };
+
+  int FloatTypes[] =
+  {
+    (int)MVT::f32,
+    (int)MVT::f64
+  };
+
+  int VectorTypes[] =
+  {
+    (int)MVT::v2i8,
+    (int)MVT::v4i8,
+    (int)MVT::v2i16,
+    (int)MVT::v4i16,
+    (int)MVT::v4f32,
+    (int)MVT::v4i32,
+    (int)MVT::v2f32,
+    (int)MVT::v2i32,
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+  size_t numTypes = sizeof(types) / sizeof(*types);
+  size_t numFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
+  size_t numIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
+  size_t numVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
+
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  // These are the register classes that are currently supported.
+
+  addRegisterClass(MVT::i32, AMDIL::GPRI32RegisterClass);
+  addRegisterClass(MVT::f32, AMDIL::GPRF32RegisterClass);
+
+  if (stm->device()->isSupported(AMDILDeviceInfo::DoubleOps)) {
+    addRegisterClass(MVT::f64, AMDIL::GPRF64RegisterClass);
+    addRegisterClass(MVT::v2f64, AMDIL::GPRV2F64RegisterClass);
+  }
+  if (stm->device()->isSupported(AMDILDeviceInfo::ByteOps)) {
+    addRegisterClass(MVT::i8, AMDIL::GPRI8RegisterClass);
+    addRegisterClass(MVT::v2i8, AMDIL::GPRV2I8RegisterClass);
+    addRegisterClass(MVT::v4i8, AMDIL::GPRV4I8RegisterClass);
+    setOperationAction(ISD::Constant          , MVT::i8   , Legal);
+  }
+  if (stm->device()->isSupported(AMDILDeviceInfo::ShortOps)) {
+    addRegisterClass(MVT::i16, AMDIL::GPRI16RegisterClass);
+    addRegisterClass(MVT::v2i16, AMDIL::GPRV2I16RegisterClass);
+    addRegisterClass(MVT::v4i16, AMDIL::GPRV4I16RegisterClass);
+    setOperationAction(ISD::Constant          , MVT::i16  , Legal);
+  }
+  addRegisterClass(MVT::v2f32, AMDIL::GPRV2F32RegisterClass);
+  addRegisterClass(MVT::v4f32, AMDIL::GPRV4F32RegisterClass);
+  addRegisterClass(MVT::v2i32, AMDIL::GPRV2I32RegisterClass);
+  addRegisterClass(MVT::v4i32, AMDIL::GPRV4I32RegisterClass);
+  if (stm->device()->isSupported(AMDILDeviceInfo::LongOps)) {
+    addRegisterClass(MVT::i64, AMDIL::GPRI64RegisterClass);
+    addRegisterClass(MVT::v2i64, AMDIL::GPRV2I64RegisterClass);
+  }
+
+  for (unsigned int x  = 0; x < numTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+
+    // FIXME: SIGN_EXTEND_INREG is not meaningful for floating-point types.
+    // There is no native sext-in-reg instruction, so it is custom-lowered
+    // to shifts.
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    setOperationAction(ISD::FP_ROUND, VT, Expand);
+    setOperationAction(ISD::OR, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADD, VT, Custom);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Custom);
+    setOperationAction(ISD::BRCOND, VT, Custom);
+    setOperationAction(ISD::BR_CC, VT, Custom);
+    setOperationAction(ISD::BR_JT, VT, Expand);
+    setOperationAction(ISD::BRIND, VT, Expand);
+    // TODO: Implement custom UREM/SREM routines
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::UINT_TO_FP, VT, Custom);
+    setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+    setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+    setOperationAction(ISDBITCAST, VT, Custom);
+    setOperationAction(ISD::GlobalAddress, VT, Custom);
+    setOperationAction(ISD::JumpTable, VT, Custom);
+    setOperationAction(ISD::ConstantPool, VT, Custom);
+    setOperationAction(ISD::SELECT_CC, VT, Custom);
+    setOperationAction(ISD::SELECT, VT, Custom);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    if (VT != MVT::i64 && VT != MVT::v2i64) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+      setOperationAction(ISD::UDIV, VT, Custom);
+    }
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+  }
+  for (unsigned int x = 0; x < numFloatTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
+
+    // IL does not have these operations for floating point types
+    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
+    setOperationAction(ISD::FP_ROUND, VT, Custom);
+    // ISD::SETOLT etc. are condition codes, not node opcodes, so they
+    // must go through setCondCodeAction rather than setOperationAction.
+    setCondCodeAction(ISD::SETOLT, VT, Expand);
+    setCondCodeAction(ISD::SETOGE, VT, Expand);
+    setCondCodeAction(ISD::SETOGT, VT, Expand);
+    setCondCodeAction(ISD::SETOLE, VT, Expand);
+    setCondCodeAction(ISD::SETULT, VT, Expand);
+    setCondCodeAction(ISD::SETUGE, VT, Expand);
+    setCondCodeAction(ISD::SETUGT, VT, Expand);
+    setCondCodeAction(ISD::SETULE, VT, Expand);
+  }
+
+  for (unsigned int x = 0; x < numIntTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
+
+    // The GPU also has no divrem instruction, signed or unsigned
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::FP_ROUND, VT, Expand);
+
+    // GPU does not have [S|U]MUL_LOHI functions as a single instruction
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    // GPU doesn't have a rotl, rotr, or byteswap instruction
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::ROTL, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+
+    // The GPU has no bit-counting instructions (popcount, cttz, ctlz)
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  for (unsigned int x = 0; x < numVectorTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[x];
+
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+    setOperationAction(ISD::FP_ROUND, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::UDIVREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    // setOperationAction(ISD::VSETCC, VT, Expand);
+    setOperationAction(ISD::SETCC, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+    setOperationAction(ISD::SELECT, VT, Expand);
+  }
+  setOperationAction(ISD::FP_ROUND, MVT::Other, Expand);
+  if (stm->device()->isSupported(AMDILDeviceInfo::LongOps)) {
+    if (stm->calVersion() < CAL_VERSION_SC_139
+        || stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+      setOperationAction(ISD::MUL, MVT::i64, Custom);
+    }
+    setOperationAction(ISD::SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ADD, MVT::i64, Custom);
+    setOperationAction(ISD::MULHU, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    setOperationAction(ISD::SUB, MVT::v2i64, Expand);
+    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
+    setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
+    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Expand);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Expand);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+  }
+  if (stm->device()->isSupported(AMDILDeviceInfo::DoubleOps)) {
+    // We support loading/storing v2f64 but no operations on the type
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ConstantFP        , MVT::f64  , Legal);
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    // We want to expand vector conversions into their scalar
+    // counterparts.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+  }
+  // TODO: Fix the UDIV24 algorithm so it handles these types; that
+  // requires vector comparisons.
+  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SUBC, MVT::Other, Expand);
+  setOperationAction(ISD::ADDE, MVT::Other, Expand);
+  setOperationAction(ISD::ADDC, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC, MVT::Other, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::SETCC, MVT::Other, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+  setOperationAction(ISD::FDIV, MVT::f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::v2f32, Custom);
+  setOperationAction(ISD::FDIV, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::BUILD_VECTOR, MVT::Other, Custom);
+  // Use the default implementation.
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::ConstantFP        , MVT::f32    , Legal);
+  setOperationAction(ISD::Constant          , MVT::i32    , Legal);
+  setOperationAction(ISD::TRAP              , MVT::Other  , Legal);
+
+  setStackPointerRegisterToSaveRestore(AMDIL::SP);
+  setSchedulingPreference(Sched::RegPressure);
+  setPow2DivIsCheap(false);
+  setPrefLoopAlignment(16);
+  setSelectIsExpensive(true);
+  setJumpIsExpensive(true);
+  computeRegisterProperties();
+
+  maxStoresPerMemcpy  = 4096;
+  maxStoresPerMemmove = 4096;
+  maxStoresPerMemset  = 4096;
+
+}
+
+const char *
+AMDILTargetLowering::getTargetNodeName(unsigned Opcode) const
+{
+  switch (Opcode) {
+    default: return 0;
+    case AMDILISD::INTTOANY: return "AMDILISD::INTTOANY";
+    case AMDILISD::DP_TO_FP:  return "AMDILISD::DP_TO_FP";
+    case AMDILISD::FP_TO_DP:  return "AMDILISD::FP_TO_DP";
+    case AMDILISD::BITCONV: return "AMDILISD::BITCONV";
+    case AMDILISD::CMOV:  return "AMDILISD::CMOV";
+    case AMDILISD::CMOVLOG:  return "AMDILISD::CMOVLOG";
+    case AMDILISD::INEGATE:  return "AMDILISD::INEGATE";
+    case AMDILISD::MAD:  return "AMDILISD::MAD";
+    case AMDILISD::UMAD:  return "AMDILISD::UMAD";
+    case AMDILISD::CALL:  return "AMDILISD::CALL";
+    case AMDILISD::RET:   return "AMDILISD::RET";
+    case AMDILISD::IFFB_HI: return "AMDILISD::IFFB_HI";
+    case AMDILISD::IFFB_LO: return "AMDILISD::IFFB_LO";
+    case AMDILISD::ADD: return "AMDILISD::ADD";
+    case AMDILISD::UMUL: return "AMDILISD::UMUL";
+    case AMDILISD::AND: return "AMDILISD::AND";
+    case AMDILISD::OR: return "AMDILISD::OR";
+    case AMDILISD::NOT: return "AMDILISD::NOT";
+    case AMDILISD::XOR: return "AMDILISD::XOR";
+    case AMDILISD::DIV_INF: return "AMDILISD::DIV_INF";
+    case AMDILISD::SMAX: return "AMDILISD::SMAX";
+    case AMDILISD::PHIMOVE: return "AMDILISD::PHIMOVE";
+    case AMDILISD::MOVE: return "AMDILISD::MOVE";
+    case AMDILISD::VBUILD: return "AMDILISD::VBUILD";
+    case AMDILISD::VEXTRACT: return "AMDILISD::VEXTRACT";
+    case AMDILISD::VINSERT: return "AMDILISD::VINSERT";
+    case AMDILISD::VCONCAT: return "AMDILISD::VCONCAT";
+    case AMDILISD::LCREATE: return "AMDILISD::LCREATE";
+    case AMDILISD::LCOMPHI: return "AMDILISD::LCOMPHI";
+    case AMDILISD::LCOMPLO: return "AMDILISD::LCOMPLO";
+    case AMDILISD::DCREATE: return "AMDILISD::DCREATE";
+    case AMDILISD::DCOMPHI: return "AMDILISD::DCOMPHI";
+    case AMDILISD::DCOMPLO: return "AMDILISD::DCOMPLO";
+    case AMDILISD::LCREATE2: return "AMDILISD::LCREATE2";
+    case AMDILISD::LCOMPHI2: return "AMDILISD::LCOMPHI2";
+    case AMDILISD::LCOMPLO2: return "AMDILISD::LCOMPLO2";
+    case AMDILISD::DCREATE2: return "AMDILISD::DCREATE2";
+    case AMDILISD::DCOMPHI2: return "AMDILISD::DCOMPHI2";
+    case AMDILISD::DCOMPLO2: return "AMDILISD::DCOMPLO2";
+    case AMDILISD::CMP: return "AMDILISD::CMP";
+    case AMDILISD::IL_CC_I_LT: return "AMDILISD::IL_CC_I_LT";
+    case AMDILISD::IL_CC_I_LE: return "AMDILISD::IL_CC_I_LE";
+    case AMDILISD::IL_CC_I_GT: return "AMDILISD::IL_CC_I_GT";
+    case AMDILISD::IL_CC_I_GE: return "AMDILISD::IL_CC_I_GE";
+    case AMDILISD::IL_CC_I_EQ: return "AMDILISD::IL_CC_I_EQ";
+    case AMDILISD::IL_CC_I_NE: return "AMDILISD::IL_CC_I_NE";
+    case AMDILISD::RET_FLAG: return "AMDILISD::RET_FLAG";
+    case AMDILISD::BRANCH_COND: return "AMDILISD::BRANCH_COND";
+    case AMDILISD::LOOP_NZERO: return "AMDILISD::LOOP_NZERO";
+    case AMDILISD::LOOP_ZERO: return "AMDILISD::LOOP_ZERO";
+    case AMDILISD::LOOP_CMP: return "AMDILISD::LOOP_CMP";
+    case AMDILISD::ADDADDR: return "AMDILISD::ADDADDR";
+    case AMDILISD::ATOM_G_ADD: return "AMDILISD::ATOM_G_ADD";
+    case AMDILISD::ATOM_G_AND: return "AMDILISD::ATOM_G_AND";
+    case AMDILISD::ATOM_G_CMPXCHG: return "AMDILISD::ATOM_G_CMPXCHG";
+    case AMDILISD::ATOM_G_DEC: return "AMDILISD::ATOM_G_DEC";
+    case AMDILISD::ATOM_G_INC: return "AMDILISD::ATOM_G_INC";
+    case AMDILISD::ATOM_G_MAX: return "AMDILISD::ATOM_G_MAX";
+    case AMDILISD::ATOM_G_UMAX: return "AMDILISD::ATOM_G_UMAX";
+    case AMDILISD::ATOM_G_MIN: return "AMDILISD::ATOM_G_MIN";
+    case AMDILISD::ATOM_G_UMIN: return "AMDILISD::ATOM_G_UMIN";
+    case AMDILISD::ATOM_G_OR: return "AMDILISD::ATOM_G_OR";
+    case AMDILISD::ATOM_G_SUB: return "AMDILISD::ATOM_G_SUB";
+    case AMDILISD::ATOM_G_RSUB: return "AMDILISD::ATOM_G_RSUB";
+    case AMDILISD::ATOM_G_XCHG: return "AMDILISD::ATOM_G_XCHG";
+    case AMDILISD::ATOM_G_XOR: return "AMDILISD::ATOM_G_XOR";
+    case AMDILISD::ATOM_G_ADD_NORET: return "AMDILISD::ATOM_G_ADD_NORET";
+    case AMDILISD::ATOM_G_AND_NORET: return "AMDILISD::ATOM_G_AND_NORET";
+    case AMDILISD::ATOM_G_CMPXCHG_NORET: return "AMDILISD::ATOM_G_CMPXCHG_NORET";
+    case AMDILISD::ATOM_G_DEC_NORET: return "AMDILISD::ATOM_G_DEC_NORET";
+    case AMDILISD::ATOM_G_INC_NORET: return "AMDILISD::ATOM_G_INC_NORET";
+    case AMDILISD::ATOM_G_MAX_NORET: return "AMDILISD::ATOM_G_MAX_NORET";
+    case AMDILISD::ATOM_G_UMAX_NORET: return "AMDILISD::ATOM_G_UMAX_NORET";
+    case AMDILISD::ATOM_G_MIN_NORET: return "AMDILISD::ATOM_G_MIN_NORET";
+    case AMDILISD::ATOM_G_UMIN_NORET: return "AMDILISD::ATOM_G_UMIN_NORET";
+    case AMDILISD::ATOM_G_OR_NORET: return "AMDILISD::ATOM_G_OR_NORET";
+    case AMDILISD::ATOM_G_SUB_NORET: return "AMDILISD::ATOM_G_SUB_NORET";
+    case AMDILISD::ATOM_G_RSUB_NORET: return "AMDILISD::ATOM_G_RSUB_NORET";
+    case AMDILISD::ATOM_G_XCHG_NORET: return "AMDILISD::ATOM_G_XCHG_NORET";
+    case AMDILISD::ATOM_G_XOR_NORET: return "AMDILISD::ATOM_G_XOR_NORET";
+    case AMDILISD::ATOM_L_ADD: return "AMDILISD::ATOM_L_ADD";
+    case AMDILISD::ATOM_L_AND: return "AMDILISD::ATOM_L_AND";
+    case AMDILISD::ATOM_L_CMPXCHG: return "AMDILISD::ATOM_L_CMPXCHG";
+    case AMDILISD::ATOM_L_DEC: return "AMDILISD::ATOM_L_DEC";
+    case AMDILISD::ATOM_L_INC: return "AMDILISD::ATOM_L_INC";
+    case AMDILISD::ATOM_L_MAX: return "AMDILISD::ATOM_L_MAX";
+    case AMDILISD::ATOM_L_UMAX: return "AMDILISD::ATOM_L_UMAX";
+    case AMDILISD::ATOM_L_MIN: return "AMDILISD::ATOM_L_MIN";
+    case AMDILISD::ATOM_L_UMIN: return "AMDILISD::ATOM_L_UMIN";
+    case AMDILISD::ATOM_L_OR: return "AMDILISD::ATOM_L_OR";
+    case AMDILISD::ATOM_L_SUB: return "AMDILISD::ATOM_L_SUB";
+    case AMDILISD::ATOM_L_RSUB: return "AMDILISD::ATOM_L_RSUB";
+    case AMDILISD::ATOM_L_XCHG: return "AMDILISD::ATOM_L_XCHG";
+    case AMDILISD::ATOM_L_XOR: return "AMDILISD::ATOM_L_XOR";
+    case AMDILISD::ATOM_L_ADD_NORET: return "AMDILISD::ATOM_L_ADD_NORET";
+    case AMDILISD::ATOM_L_AND_NORET: return "AMDILISD::ATOM_L_AND_NORET";
+    case AMDILISD::ATOM_L_CMPXCHG_NORET: return "AMDILISD::ATOM_L_CMPXCHG_NORET";
+    case AMDILISD::ATOM_L_DEC_NORET: return "AMDILISD::ATOM_L_DEC_NORET";
+    case AMDILISD::ATOM_L_INC_NORET: return "AMDILISD::ATOM_L_INC_NORET";
+    case AMDILISD::ATOM_L_MAX_NORET: return "AMDILISD::ATOM_L_MAX_NORET";
+    case AMDILISD::ATOM_L_UMAX_NORET: return "AMDILISD::ATOM_L_UMAX_NORET";
+    case AMDILISD::ATOM_L_MIN_NORET: return "AMDILISD::ATOM_L_MIN_NORET";
+    case AMDILISD::ATOM_L_UMIN_NORET: return "AMDILISD::ATOM_L_UMIN_NORET";
+    case AMDILISD::ATOM_L_OR_NORET: return "AMDILISD::ATOM_L_OR_NORET";
+    case AMDILISD::ATOM_L_SUB_NORET: return "AMDILISD::ATOM_L_SUB_NORET";
+    case AMDILISD::ATOM_L_RSUB_NORET: return "AMDILISD::ATOM_L_RSUB_NORET";
+    case AMDILISD::ATOM_L_XCHG_NORET: return "AMDILISD::ATOM_L_XCHG_NORET";
+    case AMDILISD::ATOM_R_ADD: return "AMDILISD::ATOM_R_ADD";
+    case AMDILISD::ATOM_R_AND: return "AMDILISD::ATOM_R_AND";
+    case AMDILISD::ATOM_R_CMPXCHG: return "AMDILISD::ATOM_R_CMPXCHG";
+    case AMDILISD::ATOM_R_DEC: return "AMDILISD::ATOM_R_DEC";
+    case AMDILISD::ATOM_R_INC: return "AMDILISD::ATOM_R_INC";
+    case AMDILISD::ATOM_R_MAX: return "AMDILISD::ATOM_R_MAX";
+    case AMDILISD::ATOM_R_UMAX: return "AMDILISD::ATOM_R_UMAX";
+    case AMDILISD::ATOM_R_MIN: return "AMDILISD::ATOM_R_MIN";
+    case AMDILISD::ATOM_R_UMIN: return "AMDILISD::ATOM_R_UMIN";
+    case AMDILISD::ATOM_R_OR: return "AMDILISD::ATOM_R_OR";
+    case AMDILISD::ATOM_R_MSKOR: return "AMDILISD::ATOM_R_MSKOR";
+    case AMDILISD::ATOM_R_SUB: return "AMDILISD::ATOM_R_SUB";
+    case AMDILISD::ATOM_R_RSUB: return "AMDILISD::ATOM_R_RSUB";
+    case AMDILISD::ATOM_R_XCHG: return "AMDILISD::ATOM_R_XCHG";
+    case AMDILISD::ATOM_R_XOR: return "AMDILISD::ATOM_R_XOR";
+    case AMDILISD::ATOM_R_ADD_NORET: return "AMDILISD::ATOM_R_ADD_NORET";
+    case AMDILISD::ATOM_R_AND_NORET: return "AMDILISD::ATOM_R_AND_NORET";
+    case AMDILISD::ATOM_R_CMPXCHG_NORET: return "AMDILISD::ATOM_R_CMPXCHG_NORET";
+    case AMDILISD::ATOM_R_DEC_NORET: return "AMDILISD::ATOM_R_DEC_NORET";
+    case AMDILISD::ATOM_R_INC_NORET: return "AMDILISD::ATOM_R_INC_NORET";
+    case AMDILISD::ATOM_R_MAX_NORET: return "AMDILISD::ATOM_R_MAX_NORET";
+    case AMDILISD::ATOM_R_UMAX_NORET: return "AMDILISD::ATOM_R_UMAX_NORET";
+    case AMDILISD::ATOM_R_MIN_NORET: return "AMDILISD::ATOM_R_MIN_NORET";
+    case AMDILISD::ATOM_R_UMIN_NORET: return "AMDILISD::ATOM_R_UMIN_NORET";
+    case AMDILISD::ATOM_R_OR_NORET: return "AMDILISD::ATOM_R_OR_NORET";
+    case AMDILISD::ATOM_R_MSKOR_NORET: return "AMDILISD::ATOM_R_MSKOR_NORET";
+    case AMDILISD::ATOM_R_SUB_NORET: return "AMDILISD::ATOM_R_SUB_NORET";
+    case AMDILISD::ATOM_R_RSUB_NORET: return "AMDILISD::ATOM_R_RSUB_NORET";
+    case AMDILISD::ATOM_R_XCHG_NORET: return "AMDILISD::ATOM_R_XCHG_NORET";
+    case AMDILISD::ATOM_R_XOR_NORET: return "AMDILISD::ATOM_R_XOR_NORET";
+    case AMDILISD::APPEND_ALLOC: return "AMDILISD::APPEND_ALLOC";
+    case AMDILISD::APPEND_ALLOC_NORET: return "AMDILISD::APPEND_ALLOC_NORET";
+    case AMDILISD::APPEND_CONSUME: return "AMDILISD::APPEND_CONSUME";
+    case AMDILISD::APPEND_CONSUME_NORET: return "AMDILISD::APPEND_CONSUME_NORET";
+    case AMDILISD::IMAGE2D_READ: return "AMDILISD::IMAGE2D_READ";
+    case AMDILISD::IMAGE2D_WRITE: return "AMDILISD::IMAGE2D_WRITE";
+    case AMDILISD::IMAGE2D_INFO0: return "AMDILISD::IMAGE2D_INFO0";
+    case AMDILISD::IMAGE2D_INFO1: return "AMDILISD::IMAGE2D_INFO1";
+    case AMDILISD::IMAGE3D_READ: return "AMDILISD::IMAGE3D_READ";
+    case AMDILISD::IMAGE3D_WRITE: return "AMDILISD::IMAGE3D_WRITE";
+    case AMDILISD::IMAGE3D_INFO0: return "AMDILISD::IMAGE3D_INFO0";
+    case AMDILISD::IMAGE3D_INFO1: return "AMDILISD::IMAGE3D_INFO1";
+  }
+}
+bool
+AMDILTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+    const CallInst &I, unsigned Intrinsic) const
+{
+  if (Intrinsic <= AMDGPUIntrinsic::last_non_AMDIL_intrinsic 
+      || Intrinsic > AMDGPUIntrinsic::num_AMDIL_intrinsics) {
+    return false;
+  }
+  bool bitCastToInt = false;
+  unsigned IntNo;
+  bool isRet = true;
+  const AMDILSubtarget *STM = &this->getTargetMachine()
+    .getSubtarget<AMDILSubtarget>();
+  switch (Intrinsic) {
+    default: return false; // Don't custom lower most intrinsics.
+    case AMDGPUIntrinsic::AMDIL_atomic_add_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_gu32:
+             IntNo = AMDILISD::ATOM_G_ADD; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_add_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_ADD_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_add_lu32:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_li32:
+             IntNo = AMDILISD::ATOM_L_ADD; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_add_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_ADD_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_add_ru32:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_ri32:
+             IntNo = AMDILISD::ATOM_R_ADD; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_add_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_add_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_ADD_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_gu32:
+             IntNo = AMDILISD::ATOM_G_AND; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_AND_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_lu32:
+             IntNo = AMDILISD::ATOM_L_AND; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_AND_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_ru32:
+             IntNo = AMDILISD::ATOM_R_AND; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_and_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_and_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_AND_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gu32:
+             IntNo = AMDILISD::ATOM_G_CMPXCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_CMPXCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_lu32:
+             IntNo = AMDILISD::ATOM_L_CMPXCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_CMPXCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ru32:
+             IntNo = AMDILISD::ATOM_R_CMPXCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_CMPXCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_gu32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_G_DEC;
+             } else {
+               IntNo = AMDILISD::ATOM_G_SUB;
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_gu32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_G_DEC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_G_SUB_NORET; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_lu32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_L_DEC;
+             } else {
+               IntNo = AMDILISD::ATOM_L_SUB; 
+             } 
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_lu32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_L_DEC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_L_SUB_NORET; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_ru32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_R_DEC;
+             } else {
+               IntNo = AMDILISD::ATOM_R_SUB; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_dec_ru32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_R_DEC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_R_SUB_NORET; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_gu32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_G_INC;
+             } else {
+               IntNo = AMDILISD::ATOM_G_ADD; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_gu32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_G_INC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_G_ADD_NORET; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_lu32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_L_INC;
+             } else {
+               IntNo = AMDILISD::ATOM_L_ADD; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_lu32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_L_INC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_L_ADD_NORET; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_ru32:
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_R_INC;
+             } else {
+               IntNo = AMDILISD::ATOM_R_ADD; 
+             }
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_inc_ru32_noret:
+             isRet = false;
+             if (STM->calVersion() >= CAL_VERSION_SC_136) {
+               IntNo = AMDILISD::ATOM_R_INC_NORET;
+             } else {
+               IntNo = AMDILISD::ATOM_R_ADD_NORET; 
+             } 
+             break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_gi32:
+             IntNo = AMDILISD::ATOM_G_MAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_gu32:
+             IntNo = AMDILISD::ATOM_G_UMAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_gi32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_MAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_UMAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_li32:
+             IntNo = AMDILISD::ATOM_L_MAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_lu32:
+             IntNo = AMDILISD::ATOM_L_UMAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_li32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_MAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_UMAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_ri32:
+             IntNo = AMDILISD::ATOM_R_MAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_ru32:
+             IntNo = AMDILISD::ATOM_R_UMAX; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_ri32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_MAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_max_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_UMAX_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_gi32:
+             IntNo = AMDILISD::ATOM_G_MIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_gu32:
+             IntNo = AMDILISD::ATOM_G_UMIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_gi32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_MIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_UMIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_li32:
+             IntNo = AMDILISD::ATOM_L_MIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_lu32:
+             IntNo = AMDILISD::ATOM_L_UMIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_li32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_MIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_UMIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_ri32:
+             IntNo = AMDILISD::ATOM_R_MIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_ru32:
+             IntNo = AMDILISD::ATOM_R_UMIN; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_ri32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_MIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_min_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_UMIN_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_gu32:
+             IntNo = AMDILISD::ATOM_G_OR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_OR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_lu32:
+             IntNo = AMDILISD::ATOM_L_OR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_OR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_ru32:
+             IntNo = AMDILISD::ATOM_R_OR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_or_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_or_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_OR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_gu32:
+             IntNo = AMDILISD::ATOM_G_SUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_SUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_lu32:
+             IntNo = AMDILISD::ATOM_L_SUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_SUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_ru32:
+             IntNo = AMDILISD::ATOM_R_SUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_sub_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_SUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_gu32:
+             IntNo = AMDILISD::ATOM_G_RSUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_RSUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_lu32:
+             IntNo = AMDILISD::ATOM_L_RSUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_RSUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_ru32:
+             IntNo = AMDILISD::ATOM_R_RSUB; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_rsub_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_RSUB_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gf32:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gu32:
+             IntNo = AMDILISD::ATOM_G_XCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gf32_noret:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_XCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_lf32:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_lu32:
+             IntNo = AMDILISD::ATOM_L_XCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_lf32_noret:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_XCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_rf32:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_ru32:
+             IntNo = AMDILISD::ATOM_R_XCHG; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_rf32_noret:
+             bitCastToInt = true; // fall-through
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xchg_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_XCHG_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_gi32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_gu32:
+             IntNo = AMDILISD::ATOM_G_XOR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_gi32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_gu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_G_XOR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_li32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_lu32:
+             IntNo = AMDILISD::ATOM_L_XOR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_li32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_lu32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_L_XOR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_ri32:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_ru32:
+             IntNo = AMDILISD::ATOM_R_XOR; break;
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_ri32_noret:
+    case AMDGPUIntrinsic::AMDIL_atomic_xor_ru32_noret:
+             isRet = false;
+             IntNo = AMDILISD::ATOM_R_XOR_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_append_alloc_i32:
+             IntNo = AMDILISD::APPEND_ALLOC; break;
+    case AMDGPUIntrinsic::AMDIL_append_alloc_i32_noret:
+             isRet = false;
+             IntNo = AMDILISD::APPEND_ALLOC_NORET; break;
+    case AMDGPUIntrinsic::AMDIL_append_consume_i32:
+             IntNo = AMDILISD::APPEND_CONSUME; break;
+    case AMDGPUIntrinsic::AMDIL_append_consume_i32_noret:
+             isRet = false;
+             IntNo = AMDILISD::APPEND_CONSUME_NORET; break;
+  }
+  const AMDILSubtarget *stm = &this->getTargetMachine()
+    .getSubtarget<AMDILSubtarget>();
+  AMDILKernelManager *KM = const_cast<AMDILKernelManager*>(
+      stm->getKernelManager());
+  KM->setOutputInst();
+
+  Info.opc = IntNo;
+  Info.memVT = (bitCastToInt) ? MVT::f32 : MVT::i32;
+  Info.ptrVal = I.getOperand(0);
+  Info.offset = 0;
+  Info.align = 4;
+  Info.vol = true;
+  Info.readMem = isRet;
+  Info.writeMem = true;
+  return true;
+}
+// The backend supports 32-bit and 64-bit floating point immediates
+bool
+AMDILTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const
+{
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool
+AMDILTargetLowering::ShouldShrinkFPConstant(EVT VT) const
+{
+  if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
+    return false;
+  } else {
+    return true;
+  }
+}
+
+
+// computeMaskedBitsForTargetNode - Determine which bits of Op are known
+// to be zero or one. Op is expected to be a target-specific node. Used by
+// the DAG combiner.
+
+void
+AMDILTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const
+{
+  APInt KnownZero2;
+  APInt KnownOne2;
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
+  switch (Op.getOpcode()) {
+    default: break;
+    case AMDILISD::SELECT_CC:
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(1),
+                 KnownZero,
+                 KnownOne,
+                 Depth + 1
+                 );
+             DAG.ComputeMaskedBits(
+                 Op.getOperand(0),
+                 KnownZero2,
+                 KnownOne2,
+                 Depth + 1
+                 );
+             assert((KnownZero & KnownOne) == 0
+                 && "Bits known to be one AND zero?");
+             assert((KnownZero2 & KnownOne2) == 0
+                 && "Bits known to be one AND zero?");
+             // Only known if known in both the LHS and RHS
+             KnownOne &= KnownOne2;
+             KnownZero &= KnownZero2;
+             break;
+  }
+}
+
+// This function determines which calling convention should be used;
+// currently there is only one calling convention.
+CCAssignFn*
+AMDILTargetLowering::CCAssignFnForNode(unsigned int Op) const
+{
+  //uint64_t CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  return CC_AMDIL32;
+}
+
+// LowerCallResult - Lower the result values of an ISD::CALL into the
+// appropriate copies out of the physical registers.  This assumes that
+// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+// being lowered.  This returns an SDNode with the same number of values as the
+// ISD::CALL.
+SDValue
+AMDILTargetLowering::LowerCallResult(
+    SDValue Chain,
+    SDValue InFlag,
+    CallingConv::ID CallConv,
+    bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl,
+    SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const
+{
+  // Assign locations to each value returned by this call
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_AMDIL32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    EVT CopyVT = RVLocs[i].getValVT();
+    if (RVLocs[i].isRegLoc()) {
+      Chain = DAG.getCopyFromReg(
+          Chain,
+          dl,
+          RVLocs[i].getLocReg(),
+          CopyVT,
+          InFlag
+          ).getValue(1);
+      SDValue Val = Chain.getValue(0);
+      InFlag = Chain.getValue(2);
+      InVals.push_back(Val);
+    }
+  }
+
+  return Chain;
+
+}
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+MachineBasicBlock *
+AMDILTargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr *MI, MachineBasicBlock *BB) const
+{
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  switch (MI->getOpcode()) {
+    ExpandCaseToAllTypes(AMDIL::CMP);
+    generateCMPInstr(MI, BB, TII);
+    MI->eraseFromParent();
+    break;
+    default:
+    break;
+  }
+  return BB;
+}
+
+// Recursively assign SDNodeOrdering to any unordered nodes
+// This is necessary to maintain source ordering of instructions
+// under -O0 to avoid odd-looking "skipping around" issues.
+  static const SDValue
+Ordered( SelectionDAG &DAG, unsigned order, const SDValue New )
+{
+  if (order != 0 && DAG.GetOrdering( New.getNode() ) == 0) {
+    DAG.AssignOrdering( New.getNode(), order );
+    for (unsigned i = 0, e = New.getNumOperands(); i < e; ++i)
+      Ordered( DAG, order, New.getOperand(i) );
+  }
+  return New;
+}
+
+#define LOWER(A) \
+  case ISD:: A: \
+return Ordered( DAG, DAG.GetOrdering( Op.getNode() ), Lower##A(Op, DAG) )
+
+SDValue
+AMDILTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
+{
+  switch (Op.getOpcode()) {
+    default:
+      Op.getNode()->dump();
+      assert(0 && "Custom lowering code for this "
+          "instruction is not implemented yet!");
+      break;
+      LOWER(GlobalAddress);
+      LOWER(JumpTable);
+      LOWER(ConstantPool);
+      LOWER(ExternalSymbol);
+      LOWER(FP_TO_SINT);
+      LOWER(FP_TO_UINT);
+      LOWER(SINT_TO_FP);
+      LOWER(UINT_TO_FP);
+      LOWER(ADD);
+      LOWER(MUL);
+      LOWER(SUB);
+      LOWER(FDIV);
+      LOWER(SDIV);
+      LOWER(SREM);
+      LOWER(UDIV);
+      LOWER(UREM);
+      LOWER(BUILD_VECTOR);
+      LOWER(INSERT_VECTOR_ELT);
+      LOWER(EXTRACT_VECTOR_ELT);
+      LOWER(EXTRACT_SUBVECTOR);
+      LOWER(SCALAR_TO_VECTOR);
+      LOWER(CONCAT_VECTORS);
+      LOWER(AND);
+      LOWER(OR);
+      LOWER(SELECT);
+      LOWER(SELECT_CC);
+      LOWER(SETCC);
+      LOWER(SIGN_EXTEND_INREG);
+      LOWER(BITCAST);
+      LOWER(DYNAMIC_STACKALLOC);
+      LOWER(BRCOND);
+      LOWER(BR_CC);
+      LOWER(FP_ROUND);
+  }
+  return Op;
+}
+
+int
+AMDILTargetLowering::getVarArgsFrameOffset() const
+{
+  return VarArgsFrameOffset;
+}
+#undef LOWER
+
+SDValue
+AMDILTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue DST = Op;
+  const GlobalAddressSDNode *GADN = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *G = GADN->getGlobal();
+  const AMDILSubtarget *stm = &this->getTargetMachine()
+    .getSubtarget<AMDILSubtarget>();
+  const AMDILGlobalManager *GM = stm->getGlobalManager();
+  DebugLoc DL = Op.getDebugLoc();
+  int64_t base_offset = GADN->getOffset();
+  int32_t arrayoffset = GM->getArrayOffset(G->getName());
+  int32_t constoffset = GM->getConstOffset(G->getName());
+  if (arrayoffset != -1) {
+    DST = DAG.getConstant(arrayoffset, MVT::i32);
+    DST = DAG.getNode(ISD::ADD, DL, MVT::i32,
+        DST, DAG.getConstant(base_offset, MVT::i32));
+  } else if (constoffset != -1) {
+    if (GM->getConstHWBit(G->getName())) {
+      DST = DAG.getConstant(constoffset, MVT::i32);
+      DST = DAG.getNode(ISD::ADD, DL, MVT::i32,
+          DST, DAG.getConstant(base_offset, MVT::i32));
+    } else {
+      SDValue addr = DAG.getTargetGlobalAddress(G, DL, MVT::i32);
+      SDValue DPReg = DAG.getRegister(AMDIL::SDP, MVT::i32);
+      DPReg = DAG.getNode(ISD::ADD, DL, MVT::i32, DPReg,
+          DAG.getConstant(base_offset, MVT::i32));
+      DST = DAG.getNode(AMDILISD::ADDADDR, DL, MVT::i32, addr, DPReg);
+    }
+  } else {
+    const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+    if (!GV) {
+      DST = DAG.getTargetGlobalAddress(G, DL, MVT::i32);
+    } else {
+      if (GV->hasInitializer()) {
+        const Constant *C = dyn_cast<Constant>(GV->getInitializer());
+        if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
+          DST = DAG.getConstant(CI->getValue(), Op.getValueType());
+
+        } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(C)) {
+          DST = DAG.getConstantFP(CF->getValueAPF(),
+              Op.getValueType());
+        } else if (dyn_cast<ConstantAggregateZero>(C)) {
+          EVT VT = Op.getValueType();
+          if (VT.isInteger()) {
+            DST = DAG.getConstant(0, VT);
+          } else {
+            DST = DAG.getConstantFP(0, VT);
+          }
+        } else {
+          assert(!"lowering this type of Global Address "
+              "not implemented yet!");
+          C->dump();
+          DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+        }
+      } else {
+        DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
+      }
+    }
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const
+{
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32);
+  return Result;
+}
+SDValue
+AMDILTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
+{
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = Op.getValueType();
+  SDValue Result;
+  if (CP->isMachineConstantPoolEntry()) {
+    Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+        CP->getAlignment(), CP->getOffset(), CP->getTargetFlags());
+  } else {
+    Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+        CP->getAlignment(), CP->getOffset(), CP->getTargetFlags());
+  }
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const
+{
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, MVT::i32);
+  return Result;
+}
+/// LowerFORMAL_ARGUMENTS - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments placed on the stack.
+/// TODO: isVarArg, hasStructRet, isMemReg
+  SDValue
+AMDILTargetLowering::LowerFormalArguments(SDValue Chain,
+    CallingConv::ID CallConv,
+    bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl,
+    SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals)
+const
+{
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  AMDILMachineFunctionInfo *FuncInfo
+    = MF.getInfo<AMDILMachineFunctionInfo>();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  //const Function *Fn = MF.getFunction();
+  //MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CallingConv::ID CC = MF.getFunction()->getCallingConv();
+  //bool hasStructRet = MF.getFunction()->hasStructRetAttr();
+
+  CCState CCInfo(CC, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+
+  // When more calling conventions are added, they need to be chosen here
+  CCInfo.AnalyzeFormalArguments(Ins, CC_AMDIL32);
+  SDValue StackPtr;
+
+  //unsigned int FirstStackArgLoc = 0;
+
+  for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      const TargetRegisterClass *RC = getRegClassFromType(
+          RegVT.getSimpleVT().SimpleTy);
+
+      unsigned int Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      SDValue ArgValue = DAG.getCopyFromReg(
+          Chain,
+          dl,
+          Reg,
+          RegVT);
+      // If this is an 8 or 16-bit value, it is really passed
+      // promoted to 32 bits.  Insert an assert[sz]ext to capture
+      // this, then truncate to the right size.
+
+      if (VA.getLocInfo() == CCValAssign::SExt) {
+        ArgValue = DAG.getNode(
+            ISD::AssertSext,
+            dl,
+            RegVT,
+            ArgValue,
+            DAG.getValueType(VA.getValVT()));
+      } else if (VA.getLocInfo() == CCValAssign::ZExt) {
+        ArgValue = DAG.getNode(
+            ISD::AssertZext,
+            dl,
+            RegVT,
+            ArgValue,
+            DAG.getValueType(VA.getValVT()));
+      }
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        ArgValue = DAG.getNode(
+            ISD::TRUNCATE,
+            dl,
+            VA.getValVT(),
+            ArgValue);
+      }
+      // Add the value to the list of arguments
+      // to be passed in registers
+      InVals.push_back(ArgValue);
+      if (isVarArg) {
+        assert(0 && "Variable arguments are not yet supported");
+        // See MipsISelLowering.cpp for ideas on how to implement
+      }
+    } else if(VA.isMemLoc()) {
+      InVals.push_back(LowerMemArgument(Chain, CallConv, Ins,
+            dl, DAG, VA, MFI, i));
+    } else {
+      assert(0 && "found a Value Assign that is "
+          "neither a register or a memory location");
+    }
+  }
+  /*if (hasStructRet) {
+    assert(0 && "Has struct return is not yet implemented");
+  // See MipsISelLowering.cpp for ideas on how to implement
+  }*/
+
+  unsigned int StackSize = CCInfo.getNextStackOffset();
+  if (isVarArg) {
+    assert(0 && "Variable arguments are not yet supported");
+    // See X86/PPC/CellSPU ISelLowering.cpp for ideas on how to implement
+  }
+  // This needs to be changed to non-zero if the function needs
+  // to pop bytes on return
+  FuncInfo->setBytesToPopOnReturn(StackSize);
+  return Chain;
+}
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+    ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
+  assert(0 && "MemCopy does not exist yet");
+  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+
+  return DAG.getMemcpy(Chain,
+      Src.getDebugLoc(),
+      Dst, Src, SizeNode, Flags.getByValAlign(),
+      /*IsVol=*/false, /*AlwaysInline=*/true,
+      MachinePointerInfo(), MachinePointerInfo());
+}
+
+SDValue
+AMDILTargetLowering::LowerMemOpCallTo(SDValue Chain,
+    SDValue StackPtr, SDValue Arg,
+    DebugLoc dl, SelectionDAG &DAG,
+    const CCValAssign &VA,
+    ISD::ArgFlagsTy Flags) const
+{
+  unsigned int LocMemOffset = VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD,
+      dl,
+      getPointerTy(), StackPtr, PtrOff);
+  if (Flags.isByVal()) {
+    PtrOff = CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
+  } else {
+    PtrOff = DAG.getStore(Chain, dl, Arg, PtrOff,
+        MachinePointerInfo::getStack(LocMemOffset),
+        false, false, 0);
+  }
+  return PtrOff;
+}
+/// LowerCall - function arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isVarArg, isTailCall, hasStructRet
+SDValue
+AMDILTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+    CallingConv::ID CallConv, bool isVarArg, bool doesNotRet,
+    bool& isTailCall,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins,
+    DebugLoc dl, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals)
+const
+{
+  isTailCall = false;
+  MachineFunction& MF = DAG.getMachineFunction();
+  // FIXME: Do we need to handle fast calling conventions and tail call
+  // optimizations? See X86/PPC ISelLowering.
+  /*bool hasStructRet = (TheCall->getNumArgs())
+    ? TheCall->getArgFlags(0).device()->isSRet()
+    : false;*/
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Analyze operands of the call, assigning locations to each operand
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  // Analyze the calling operands; this needs to change
+  // if we have more than one calling convention
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+
+  unsigned int NumBytes = CCInfo.getNextStackOffset();
+  if (isTailCall) {
+    assert(isTailCall && "Tail Call not handled yet!");
+    // See X86/PPC ISelLowering
+  }
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SmallVector<std::pair<unsigned int, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+  //unsigned int FirstStacArgLoc = 0;
+  //int LastArgStackLoc = 0;
+
+  // Walk the register/memloc assignments, insert copies/loads
+  for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    //bool isByVal = Flags.isByVal(); // handle byval/bypointer registers
+    // Arguments start after the first 5 operands of ISD::CALL
+    SDValue Arg = OutVals[i];
+    //Promote the value if needed
+    switch(VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full:
+               break;
+      case CCValAssign::SExt:
+               Arg = DAG.getNode(ISD::SIGN_EXTEND,
+                   dl,
+                   VA.getLocVT(), Arg);
+               break;
+      case CCValAssign::ZExt:
+               Arg = DAG.getNode(ISD::ZERO_EXTEND,
+                   dl,
+                   VA.getLocVT(), Arg);
+               break;
+      case CCValAssign::AExt:
+               Arg = DAG.getNode(ISD::ANY_EXTEND,
+                   dl,
+                   VA.getLocVT(), Arg);
+               break;
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (VA.isMemLoc()) {
+      // Create the frame index object for this incoming parameter
+      int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+          VA.getLocMemOffset(), true);
+      SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+
+      // Emit an ISD::STORE which stores the
+      // parameter value to a stack location
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+            MachinePointerInfo::getFixedStack(FI),
+            false, false, 0));
+    } else {
+      assert(0 && "Not a Reg/Mem Loc, major error!");
+    }
+  }
+  if (!MemOpChains.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor,
+        dl,
+        MVT::Other,
+        &MemOpChains[0],
+        MemOpChains.size());
+  }
+  SDValue InFlag;
+  if (!isTailCall) {
+    for (unsigned int i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain,
+          dl,
+          RegsToPass[i].first,
+          RegsToPass[i].second,
+          InFlag);
+      InFlag = Chain.getValue(1);
+    }
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
+  // every direct call is) turn it into a TargetGlobalAddress/
+  // TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))  {
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy());
+  }
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+  }
+  else if (isTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1708
+  }
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVTGLUE);
+  SmallVector<SDValue, 8> Ops;
+
+  if (isTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1721
+  }
+  // If this is a direct call, pass the chain and the callee
+  if (Callee.getNode()) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+
+  if (isTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1739
+  }
+
+  // Add argument registers to the end of the list so that they are known
+  // live into the call
+  for (unsigned int i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Ops.push_back(DAG.getRegister(
+          RegsToPass[i].first,
+          RegsToPass[i].second.getValueType()));
+  }
+  if (InFlag.getNode()) {
+    Ops.push_back(InFlag);
+  }
+
+  // Emit Tail Call
+  if (isTailCall) {
+    assert(0 && "Tail calls are not handled yet");
+    // see X86 ISelLowering for ideas on implementation: 1762
+  }
+
+  Chain = DAG.getNode(AMDILISD::CALL,
+      dl,
+      NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node
+  Chain = DAG.getCALLSEQ_END(
+      Chain,
+      DAG.getIntPtrConstant(NumBytes, true),
+      DAG.getIntPtrConstant(0, true),
+      InFlag);
+  InFlag = Chain.getValue(1);
+  // Handle result values, copying them out of physregs into vregs that
+  // we return
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+      InVals);
+}
+static void checkMADType(
+    SDValue Op, const AMDILSubtarget *STM, bool& is24bitMAD, bool& is32bitMAD)
+{
+  bool globalLoadStore = false;
+  is24bitMAD = false;
+  is32bitMAD = false;
+  // FIXME: this early return disables the MAD analysis below;
+  // remove it to re-enable 24/32-bit MAD detection.
+  return;
+  assert(Op.getOpcode() == ISD::ADD && "The opcode must be an add in order "
+      "for this to work correctly!");
+  if (Op.getNode()->use_empty()) {
+    return;
+  }
+  for (SDNode::use_iterator nBegin = Op.getNode()->use_begin(),
+      nEnd = Op.getNode()->use_end(); nBegin != nEnd; ++nBegin) {
+    SDNode *ptr = *nBegin;
+    const LSBaseSDNode *lsNode = dyn_cast<LSBaseSDNode>(ptr);
+    // If we are not a LSBaseSDNode then we don't do this
+    // optimization.
+    // If we are a LSBaseSDNode, but the op is not the offset
+    // or base pointer, then we don't do this optimization
+    // (i.e. we are the value being stored)
+    if (!lsNode ||
+        (lsNode->writeMem() && lsNode->getOperand(1) == Op)) {
+      return;
+    }
+    const PointerType *PT =
+      dyn_cast<PointerType>(lsNode->getSrcValue()->getType());
+    if (!PT) {
+      return;
+    }
+    unsigned as = PT->getAddressSpace();
+    switch(as) {
+      default:
+        globalLoadStore = true;
+        // fall-through
+      case AMDILAS::PRIVATE_ADDRESS:
+        if (!STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) {
+          globalLoadStore = true;
+        }
+        break;
+      case AMDILAS::CONSTANT_ADDRESS:
+        if (!STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) {
+          globalLoadStore = true;
+        }
+        break;
+      case AMDILAS::LOCAL_ADDRESS:
+        if (!STM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) {
+          globalLoadStore = true;
+        }
+        break;
+      case AMDILAS::REGION_ADDRESS:
+        if (!STM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) {
+          globalLoadStore = true;
+        }
+        break;
+    }
+  }
+  if (globalLoadStore) {
+    is32bitMAD = true;
+  } else {
+    is24bitMAD = true;
+  }
+}
+
+SDValue
+AMDILTargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  const AMDILSubtarget *stm = &this->getTargetMachine()
+    .getSubtarget<AMDILSubtarget>();
+  bool isVec = OVT.isVector();
+  if (OVT.getScalarType() == MVT::i64) {
+    MVT INTTY = MVT::i32;
+    if (OVT == MVT::v2i64) {
+      INTTY = MVT::v2i32;
+    }
+    if (stm->device()->usesHardware(AMDILDeviceInfo::LongOps)
+        && INTTY == MVT::i32) {
+      DST = DAG.getNode(AMDILISD::ADD,
+          DL,
+          OVT,
+          LHS, RHS);
+    } else {
+      SDValue LHSLO, LHSHI, RHSLO, RHSHI, INTLO, INTHI;
+      // TODO: need to turn this into a bitcast of i64/v2i64 to v2i32/v4i32
+      LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, LHS);
+      RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, RHS);
+      LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, LHS);
+      RHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, RHS);
+      INTLO = DAG.getNode(ISD::ADD, DL, INTTY, LHSLO, RHSLO);
+      INTHI = DAG.getNode(ISD::ADD, DL, INTTY, LHSHI, RHSHI);
+      SDValue cmp;
+      cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+          DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32),
+          INTLO, RHSLO);
+      cmp = DAG.getNode(AMDILISD::INEGATE, DL, INTTY, cmp);
+      INTHI = DAG.getNode(ISD::ADD, DL, INTTY, INTHI, cmp);
+      DST = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, OVT,
+          INTLO, INTHI);
+    }
+  } else {
+    if (LHS.getOpcode() == ISD::FrameIndex ||
+        RHS.getOpcode() == ISD::FrameIndex) {
+      DST = DAG.getNode(AMDILISD::ADDADDR,
+          DL,
+          OVT,
+          LHS, RHS);
+    } else {
+      if (stm->device()->usesHardware(AMDILDeviceInfo::LocalMem)
+          && LHS.getNumOperands()
+          && RHS.getNumOperands()) {
+        bool is24bitMAD = false;
+        bool is32bitMAD = false;
+        const ConstantSDNode *LHSConstOpCode =
+          dyn_cast<ConstantSDNode>(LHS.getOperand(LHS.getNumOperands()-1));
+        const ConstantSDNode *RHSConstOpCode =
+          dyn_cast<ConstantSDNode>(RHS.getOperand(RHS.getNumOperands()-1));
+        if ((LHS.getOpcode() == ISD::SHL && LHSConstOpCode)
+            || (RHS.getOpcode() == ISD::SHL && RHSConstOpCode)
+            || LHS.getOpcode() == ISD::MUL
+            || RHS.getOpcode() == ISD::MUL) {
+          SDValue Op1, Op2, Op3;
+          // FIXME: Fix this so that it works for unsigned 24bit ops.
+          if (LHS.getOpcode() == ISD::MUL) {
+            Op1 = LHS.getOperand(0);
+            Op2 = LHS.getOperand(1);
+            Op3 = RHS;
+          } else if (RHS.getOpcode() == ISD::MUL) {
+            Op1 = RHS.getOperand(0);
+            Op2 = RHS.getOperand(1);
+            Op3 = LHS;
+          } else if (LHS.getOpcode() == ISD::SHL && LHSConstOpCode) {
+            Op1 = LHS.getOperand(0);
+            Op2 = DAG.getConstant(
+                1 << LHSConstOpCode->getZExtValue(), MVT::i32);
+            Op3 = RHS;
+          } else if (RHS.getOpcode() == ISD::SHL && RHSConstOpCode) {
+            Op1 = RHS.getOperand(0);
+            Op2 = DAG.getConstant(
+                1 << RHSConstOpCode->getZExtValue(), MVT::i32);
+            Op3 = LHS;
+          }
+          checkMADType(Op, stm, is24bitMAD, is32bitMAD);
+          // We can possibly do a MAD transform!
+          if (is24bitMAD && stm->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
+            uint32_t opcode = AMDGPUIntrinsic::AMDIL_mad24_i32;
+            SDVTList Tys = DAG.getVTList(OVT/*, MVT::Other*/);
+            DST = DAG.getNode(ISD::INTRINSIC_W_CHAIN,
+                DL, Tys, DAG.getEntryNode(), DAG.getConstant(opcode, MVT::i32),
+                Op1, Op2, Op3);
+          } else if(is32bitMAD) {
+            SDVTList Tys = DAG.getVTList(OVT/*, MVT::Other*/);
+            DST = DAG.getNode(ISD::INTRINSIC_W_CHAIN,
+                DL, Tys, DAG.getEntryNode(),
+                DAG.getConstant(
+                  AMDGPUIntrinsic::AMDIL_mad_i32, MVT::i32),
+                Op1, Op2, Op3);
+          }
+        }
+      }
+      DST = DAG.getNode(AMDILISD::ADD,
+          DL,
+          OVT,
+          LHS, RHS);
+    }
+  }
+  return DST;
+}
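The branch above splits a 64-bit add into two 32-bit adds and derives the carry from an unsigned compare of the low-word sum against one addend (CMP with SETULT, then INEGATE to turn the -1 mask into +1). A host-side C++ sketch of the same scheme; the helper name is ours, not from the backend:

```cpp
#include <cassert>
#include <cstdint>

// 64-bit add built from 32-bit halves, mirroring the LCOMPLO/LCOMPHI +
// CMP(SETULT) sequence above: if the low sum wrapped around, it is
// unsigned-less-than the addend's low half, so bump the high half by 1.
static uint64_t add64_via_halves(uint64_t lhs, uint64_t rhs) {
  uint32_t lo = (uint32_t)lhs + (uint32_t)rhs;
  uint32_t hi = (uint32_t)(lhs >> 32) + (uint32_t)(rhs >> 32);
  uint32_t carry = lo < (uint32_t)rhs ? 1u : 0u;  // SETULT(INTLO, RHSLO)
  return ((uint64_t)(hi + carry) << 32) | lo;
}
```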
+SDValue
+AMDILTargetLowering::genCLZuN(SDValue Op, SelectionDAG &DAG,
+    uint32_t bits) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT INTTY = Op.getValueType();
+  EVT FPTY;
+  if (INTTY.isVector()) {
+    FPTY = EVT(MVT::getVectorVT(MVT::f32,
+          INTTY.getVectorNumElements()));
+  } else {
+    FPTY = EVT(MVT::f32);
+  }
+  /* static inline uint
+     __clz_Nbit(uint x)
+     {
+     int xor = 0x3f800000U | x;
+     float tp = as_float(xor);
+     float t = tp + -1.0f;
+     uint tint = as_uint(t);
+     int cmp = (x != 0);
+     uint tsrc = tint >> 23;
+     uint tmask = tsrc & 0xffU;
+     uint cst = (103 + N)U - tmask;
+     return cmp ? cst : N;
+     }
+     */
+  assert(INTTY.getScalarType().getSimpleVT().SimpleTy == MVT::i32
+      && "genCLZuN only works on 32-bit types");
+  // uint x = Op
+  SDValue x = Op;
+  // xornode = 0x3f800000 | x
+  SDValue xornode = DAG.getNode(ISD::OR, DL, INTTY,
+      DAG.getConstant(0x3f800000, INTTY), x);
+  // float tp = as_float(xornode)
+  SDValue tp = DAG.getNode(ISDBITCAST, DL, FPTY, xornode);
+  // float t = tp + -1.0f
+  SDValue t = DAG.getNode(ISD::FADD, DL, FPTY, tp,
+      DAG.getConstantFP(-1.0f, FPTY));
+  // uint tint = as_uint(t)
+  SDValue tint = DAG.getNode(ISDBITCAST, DL, INTTY, t);
+  // int cmp = (x != 0)
+  SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+      DAG.getConstant(CondCCodeToCC(ISD::SETNE, MVT::i32), MVT::i32), x,
+      DAG.getConstant(0, INTTY));
+  // uint tsrc = tint >> 23
+  SDValue tsrc = DAG.getNode(ISD::SRL, DL, INTTY, tint,
+      DAG.getConstant(23, INTTY));
+  // uint tmask = tsrc & 0xFF
+  SDValue tmask = DAG.getNode(ISD::AND, DL, INTTY, tsrc,
+      DAG.getConstant(0xFFU, INTTY));
+  // uint cst = (103 + bits) - tmask
+  SDValue cst = DAG.getNode(ISD::SUB, DL, INTTY,
+      DAG.getConstant((103U + bits), INTTY), tmask);
+  // return cmp ? cst : N
+  cst = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp, cst,
+      DAG.getConstant(bits, INTTY));
+  return cst;
+}
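genCLZuN counts leading zeros of an N-bit value (N <= 23 so the value fits a float mantissa) by ORing it into the mantissa of 1.0f, subtracting 1.0f, and reading the resulting exponent, exactly as the pseudocode comment shows. A host-side sketch of the trick; the function name is illustrative:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// CLZ of an N-bit value via float normalization: 0x3f800000 | x is
// 1.0f + x * 2^-23; subtracting 1.0f leaves x * 2^-23, whose biased
// exponent is 104 + (bit position of x's highest set bit).
static uint32_t clz_nbit(uint32_t x, uint32_t bits) {
  uint32_t xorv = 0x3f800000u | x;
  float tp;
  std::memcpy(&tp, &xorv, sizeof tp);
  float t = tp + -1.0f;
  uint32_t tint;
  std::memcpy(&tint, &t, sizeof tint);
  uint32_t tmask = (tint >> 23) & 0xffu;
  return x != 0 ? (103u + bits) - tmask : bits;
}
```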
+
+SDValue
+AMDILTargetLowering::genCLZu32(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue DST = SDValue();
+  DebugLoc DL = Op.getDebugLoc();
+  EVT INTTY = Op.getValueType();
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  if (stm->device()->getGeneration() >= AMDILDeviceInfo::HD5XXX) {
+    //__clz_32bit(uint u)
+    //{
+    // int z = __amdil_ffb_hi(u) ;
+    // return z < 0 ? 32 : z;
+    // }
+    // uint u = op
+    SDValue u = Op;
+    // int z = __amdil_ffb_hi(u)
+    SDValue z = DAG.getNode(AMDILISD::IFFB_HI, DL, INTTY, u);
+    // int cmp = z < 0
+    SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+        z, DAG.getConstant(0, INTTY));
+    // return cmp ? 32 : z
+    DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp,
+        DAG.getConstant(32, INTTY), z);
+  } else if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    //  static inline uint
+    //__clz_32bit(uint x)
+    //{
+    //    uint zh = __clz_16bit(x >> 16);
+    //    uint zl = __clz_16bit(x & 0xffffU);
+    //   return zh == 16U ? 16U + zl : zh;
+    //}
+    // uint x = Op
+    SDValue x = Op;
+    // uint xs16 = x >> 16
+    SDValue xs16 = DAG.getNode(ISD::SRL, DL, INTTY, x,
+        DAG.getConstant(16, INTTY));
+    // uint zh = __clz_16bit(xs16)
+    SDValue zh = genCLZuN(xs16, DAG, 16);
+    // uint xa16 = x & 0xFFFF
+    SDValue xa16 = DAG.getNode(ISD::AND, DL, INTTY, x,
+        DAG.getConstant(0xFFFFU, INTTY));
+    // uint zl = __clz_16bit(xa16)
+    SDValue zl = genCLZuN(xa16, DAG, 16);
+    // uint cmp = zh == 16U
+    SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        zh, DAG.getConstant(16U, INTTY));
+    // uint zl16 = zl + 16
+    SDValue zl16 = DAG.getNode(ISD::ADD, DL, INTTY,
+        DAG.getConstant(16, INTTY), zl);
+    // return cmp ? zl16 : zh
+    DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY,
+        cmp, zl16, zh);
+  } else {
+    assert(0 && "Attempting to generate a CLZ function with an"
+        " unknown graphics card");
+  }
+  return DST;
+}
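On HD4XXX the 32-bit count is composed from two 16-bit counts exactly as the comment sketches: if the high half is all zeros (zh == 16), the low half's count is offset by 16. A minimal stand-in, using a loop-based 16-bit CLZ in place of the float trick:

```cpp
#include <cassert>
#include <cstdint>

// Loop-based 16-bit CLZ standing in for __clz_16bit / genCLZuN(..., 16).
static uint32_t clz16(uint32_t x) {
  for (uint32_t i = 0; i < 16; ++i)
    if (x & (1u << (15 - i))) return i;
  return 16;
}

// 32-bit CLZ from two 16-bit halves, as in the HD4XXX path above.
static uint32_t clz32(uint32_t x) {
  uint32_t zh = clz16(x >> 16);
  uint32_t zl = clz16(x & 0xffffu);
  return zh == 16u ? 16u + zl : zh;
}
```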
+SDValue
+AMDILTargetLowering::genCLZu64(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue DST = SDValue();
+  DebugLoc DL = Op.getDebugLoc();
+  EVT INTTY;
+  EVT LONGTY = Op.getValueType();
+  bool isVec = LONGTY.isVector();
+  if (isVec) {
+    INTTY = EVT(MVT::getVectorVT(MVT::i32, Op.getValueType()
+          .getVectorNumElements()));
+  } else {
+    INTTY = EVT(MVT::i32);
+  }
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  if (stm->device()->getGeneration() >= AMDILDeviceInfo::HD5XXX) {
+    // Evergreen:
+    // static inline uint
+    // __clz_u64(ulong x)
+    // {
+    //uint zhi = __clz_32bit((uint)(x >> 32));
+    //uint zlo = __clz_32bit((uint)(x & 0xffffffffUL));
+    //return zhi == 32U ? 32U + zlo : zhi;
+    //}
+    //ulong x = op
+    SDValue x = Op;
+    // uint xlo = x & 0xFFFFFFFF
+    SDValue xlo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, x);
+    // uint xhi = x >> 32
+    SDValue xhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, x);
+    // uint zhi = __clz_32bit(xhi)
+    SDValue zhi = genCLZu32(xhi, DAG);
+    // uint zlo = __clz_32bit(xlo)
+    SDValue zlo = genCLZu32(xlo, DAG);
+    // uint cmp = zhi == 32
+    SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        zhi, DAG.getConstant(32U, INTTY));
+    // uint zlop32 = 32 + zlo
+    SDValue zlop32 = DAG.getNode(AMDILISD::ADD, DL, INTTY,
+        DAG.getConstant(32U, INTTY), zlo);
+    // return cmp ? zlop32: zhi
+    DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp, zlop32, zhi);
+  } else if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // HD4XXX:
+    //  static inline uint
+    //__clz_64bit(ulong x)
+    //{
+    //uint zh = __clz_23bit((uint)(x >> 46)) - 5U;
+    //uint zm = __clz_23bit((uint)(x >> 23) & 0x7fffffU);
+    //uint zl = __clz_23bit((uint)x & 0x7fffffU);
+    //uint r = zh == 18U ? 18U + zm : zh;
+    //return zh + zm == 41U ? 41U + zl : r;
+    //}
+    //ulong x = Op
+    SDValue x = Op;
+    // ulong xs46 = x >> 46
+    SDValue xs46 = DAG.getNode(ISD::SRL, DL, LONGTY, x,
+        DAG.getConstant(46, LONGTY));
+    // uint ixs46 = (uint)xs46
+    SDValue ixs46 = DAG.getNode(ISD::TRUNCATE, DL, INTTY, xs46);
+    // ulong xs23 = x >> 23
+    SDValue xs23 = DAG.getNode(ISD::SRL, DL, LONGTY, x,
+        DAG.getConstant(23, LONGTY));
+    // uint ixs23 = (uint)xs23
+    SDValue ixs23 = DAG.getNode(ISD::TRUNCATE, DL, INTTY, xs23);
+    // uint xs23m23 = ixs23 & 0x7FFFFF
+    SDValue xs23m23 = DAG.getNode(ISD::AND, DL, INTTY, ixs23,
+        DAG.getConstant(0x7fffffU, INTTY));
+    // uint ix = (uint)x
+    SDValue ix = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, x);
+    // uint xm23 = ix & 0x7FFFFF
+    SDValue xm23 = DAG.getNode(ISD::AND, DL, INTTY, ix,
+        DAG.getConstant(0x7fffffU, INTTY));
+    // uint zh = __clz_23bit(ixs46)
+    SDValue zh = genCLZuN(ixs46, DAG, 23);
+    // uint zm = __clz_23bit(xs23m23)
+    SDValue zm = genCLZuN(xs23m23, DAG, 23);
+    // uint zl = __clz_23bit(xm23)
+    SDValue zl = genCLZuN(xm23, DAG, 23);
+    // uint zhm5 = zh - 5
+    SDValue zhm5 = DAG.getNode(ISD::ADD, DL, INTTY, zh,
+        DAG.getConstant(-5U, INTTY));
+    SDValue const18 = DAG.getConstant(18, INTTY);
+    SDValue const41 = DAG.getConstant(41, INTTY);
+    // uint cmp1 = zhm5 == 18
+    SDValue cmp1 = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        zhm5, const18);
+    // uint zhm5zm = zhm5 + zm
+    SDValue zhm5zm = DAG.getNode(ISD::ADD, DL, INTTY, zhm5, zm);
+    // uint cmp2 = zhm5zm == 41
+    SDValue cmp2 = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        zhm5zm, const41);
+    // uint zmp18 = zm + 18
+    SDValue zmp18 = DAG.getNode(ISD::ADD, DL, INTTY, zm, const18);
+    // uint zlp41 = zl + 41
+    SDValue zlp41 = DAG.getNode(ISD::ADD, DL, INTTY, zl, const41);
+    // uint r = cmp1 ? zmp18 : zhm5
+    SDValue r = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY,
+        cmp1, zmp18, zhm5);
+    // return cmp2 ? zlp41 : r
+    DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp2, zlp41, r);
+  } else {
+    assert(0 && "Attempting to generate a CLZ function with an"
+        " unknown graphics card");
+  }
+  return DST;
+}
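The HD4XXX 64-bit path splits the value into three 23-bit chunks; the top chunk holds only 18 significant bits, hence the -5 correction, and the selections chain so an all-zero upper chunk defers to the next one. A host-side sketch with a loop-based 23-bit CLZ standing in for genCLZuN:

```cpp
#include <cassert>
#include <cstdint>

// Loop-based 23-bit CLZ standing in for __clz_23bit / genCLZuN(..., 23).
static uint32_t clz23(uint32_t x) {
  for (uint32_t i = 0; i < 23; ++i)
    if (x & (1u << (22 - i))) return i;
  return 23;
}

// 64-bit CLZ from three 23-bit chunks, as in the HD4XXX path above.
static uint32_t clz64(uint64_t x) {
  uint32_t zh = clz23((uint32_t)(x >> 46)) - 5u;  // top chunk has 18 bits
  uint32_t zm = clz23((uint32_t)(x >> 23) & 0x7fffffu);
  uint32_t zl = clz23((uint32_t)x & 0x7fffffu);
  uint32_t r = zh == 18u ? 18u + zm : zh;
  return zh + zm == 41u ? 41u + zl : r;
}
```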
+SDValue
+AMDILTargetLowering::genf64toi64(SDValue RHS, SelectionDAG &DAG,
+    bool includeSign) const
+{
+  EVT INTVT;
+  EVT LONGVT;
+  SDValue DST;
+  DebugLoc DL = RHS.getDebugLoc();
+  EVT RHSVT = RHS.getValueType();
+  bool isVec = RHSVT.isVector();
+  if (isVec) {
+    LONGVT = EVT(MVT::getVectorVT(MVT::i64, RHSVT
+          .getVectorNumElements()));
+    INTVT = EVT(MVT::getVectorVT(MVT::i32, RHSVT
+          .getVectorNumElements()));
+  } else {
+    LONGVT = EVT(MVT::i64);
+    INTVT = EVT(MVT::i32);
+  }
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+    // unsigned version:
+    // uint uhi = (uint)(d * 0x1.0p-32);
+    // uint ulo = (uint)(mad((double)uhi, -0x1.0p+32, d));
+    // return as_ulong2((uint2)(ulo, uhi));
+    //
+    // signed version:
+    // double ad = fabs(d);
+    // long l = unsigned_version(ad);
+    // long nl = -l;
+    // return d == ad ? l : nl;
+    SDValue d = RHS;
+    if (includeSign) {
+      d = DAG.getNode(ISD::FABS, DL, RHSVT, d);
+    }
+    SDValue uhid = DAG.getNode(ISD::FMUL, DL, RHSVT, d, 
+        DAG.getConstantFP(0x2f800000, RHSVT));
+    SDValue uhi = DAG.getNode(ISD::FP_TO_UINT, DL, INTVT, uhid);
+    SDValue ulod = DAG.getNode(ISD::UINT_TO_FP, DL, RHSVT, uhi);
+    ulod = DAG.getNode(AMDILISD::MAD, DL, RHSVT, ulod, 
+        DAG.getConstantFP(0xcf800000, RHSVT), d);
+    SDValue ulo = DAG.getNode(ISD::FP_TO_UINT, DL, INTVT, ulod);
+    SDValue l = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, ulo, uhi);
+    if (includeSign) {
+      SDValue nl = DAG.getNode(AMDILISD::INEGATE, DL, LONGVT, l);
+      SDValue c = DAG.getNode(AMDILISD::CMP, DL, RHSVT,
+          DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::f64), MVT::i32),
+          RHS, d);
+      l = DAG.getNode(AMDILISD::CMOVLOG, DL, LONGVT, c, l, nl);
+    }
+    DST = l;
+  } else {
+    /*
+       __attribute__((always_inline)) long
+       cast_f64_to_i64(double d)
+       {
+    // Convert d in to 32-bit components
+    long x = as_long(d);
+    xhi = LCOMPHI(x);
+    xlo = LCOMPLO(x);
+
+    // Generate 'normalized' mantissa
+    mhi = xhi | 0x00100000; // hidden bit
+    mhi <<= 11;
+    temp = xlo >> (32 - 11);
+    mhi |= temp;
+    mlo = xlo << 11;
+
+    // Compute shift right count from exponent
+    e = (xhi >> (52-32)) & 0x7ff;
+    sr = 1023 + 63 - e;
+    srge64 = sr >= 64;
+    srge32 = sr >= 32;
+
+    // Compute result for 0 <= sr < 32
+    rhi0 = mhi >> (sr &31);
+    rlo0 = mlo >> (sr &31);
+    temp = mhi << (32 - sr);
+    temp |= rlo0;
+    rlo0 = sr ? temp : rlo0;
+
+    // Compute result for 32 <= sr
+    rhi1 = 0;
+    rlo1 = srge64 ? 0 : rhi0;
+
+    // Pick between the 2 results
+    rhi = srge32 ? rhi1 : rhi0;
+    rlo = srge32 ? rlo1 : rlo0;
+
+    // Optional saturate on overflow
+    srlt0 = sr < 0;
+    rhi = srlt0 ? MAXVALUE : rhi;
+    rlo = srlt0 ? MAXVALUE : rlo;
+
+    // Create long
+    res = LCREATE( rlo, rhi );
+
+    // Deal with sign bit (ignoring whether result is signed or unsigned value)
+    if (includeSign) {
+    sign = ((signed int) xhi) >> 31; // fill with sign bit
+    sign = LCREATE( sign, sign );
+    res += sign;
+    res ^= sign;
+    }
+
+    return res;
+    }
+    */
+    SDValue c11 = DAG.getConstant( 63 - 52, INTVT );
+    SDValue c32 = DAG.getConstant( 32, INTVT );
+
+    // Convert d in to 32-bit components
+    SDValue d = RHS;
+    SDValue x = DAG.getNode(ISDBITCAST, DL, LONGVT, d);
+    SDValue xhi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x );
+    SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x );
+
+    // Generate 'normalized' mantissa
+    SDValue mhi = DAG.getNode( ISD::OR, DL, INTVT,
+        xhi, DAG.getConstant( 0x00100000, INTVT ) );
+    mhi = DAG.getNode( ISD::SHL, DL, INTVT, mhi, c11 );
+    SDValue temp = DAG.getNode( ISD::SRL, DL, INTVT,
+        xlo, DAG.getConstant( 32 - (63 - 52), INTVT ) );
+    mhi = DAG.getNode( ISD::OR, DL, INTVT, mhi, temp );
+    SDValue mlo = DAG.getNode( ISD::SHL, DL, INTVT, xlo, c11 );
+
+    // Compute shift right count from exponent
+    SDValue e = DAG.getNode( ISD::SRL, DL, INTVT,
+        xhi, DAG.getConstant( 52-32, INTVT ) );
+    e = DAG.getNode( ISD::AND, DL, INTVT,
+        e, DAG.getConstant( 0x7ff, INTVT ) );
+    SDValue sr = DAG.getNode( ISD::SUB, DL, INTVT,
+        DAG.getConstant( 1023 + 63, INTVT ), e );
+    SDValue srge64 = DAG.getNode( AMDILISD::CMP, DL, INTVT,
+        DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32),
+        sr, DAG.getConstant(64, INTVT));
+    SDValue srge32 = DAG.getNode( AMDILISD::CMP, DL, INTVT,
+        DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32),
+        sr, DAG.getConstant(32, INTVT));
+
+    // Compute result for 0 <= sr < 32
+    SDValue rhi0 = DAG.getNode( ISD::SRL, DL, INTVT, mhi, sr );
+    SDValue rlo0 = DAG.getNode( ISD::SRL, DL, INTVT, mlo, sr );
+    temp = DAG.getNode( ISD::SUB, DL, INTVT, c32, sr );
+    temp = DAG.getNode( ISD::SHL, DL, INTVT, mhi, temp );
+    temp = DAG.getNode( ISD::OR,  DL, INTVT, rlo0, temp );
+    rlo0 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, sr, temp, rlo0 );
+
+    // Compute result for 32 <= sr
+    SDValue rhi1 = DAG.getConstant( 0, INTVT );
+    SDValue rlo1 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        srge64, rhi1, rhi0 );
+
+    // Pick between the 2 results
+    SDValue rhi = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        srge32, rhi1, rhi0 );
+    SDValue rlo = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        srge32, rlo1, rlo0 );
+
+    // Create long
+    SDValue res = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi );
+
+    // Deal with sign bit
+    if (includeSign) {
+      SDValue sign = DAG.getNode( ISD::SRA, DL, INTVT,
+          xhi, DAG.getConstant( 31, INTVT ) );
+      sign = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, sign, sign );
+      res = DAG.getNode( ISD::ADD, DL, LONGVT, res, sign );
+      res = DAG.getNode( ISD::XOR, DL, LONGVT, res, sign );
+    }
+    DST = res;
+  }
+  return DST;
+}
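On Evergreen-class hardware the unsigned f64-to-u64 conversion above peels off the high word with a multiply by 0x1.0p-32 and recovers the low word with a fused multiply-add. A host-side sketch, using std::fma as a stand-in for AMDILISD::MAD and assuming 0 <= d < 2^64:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

// f64 -> u64 via two f64 -> u32 conversions, as in the HD6XXX+ path:
// uhi = (uint)(d * 0x1.0p-32); ulo = (uint)fma((double)uhi, -0x1.0p+32, d).
static uint64_t f64_to_u64(double d) {
  uint32_t uhi = (uint32_t)(d * 0x1.0p-32);
  uint32_t ulo = (uint32_t)std::fma((double)uhi, -0x1.0p+32, d);
  return ((uint64_t)uhi << 32) | ulo;
}
```

Both conversions truncate toward zero, matching FP_TO_UINT semantics.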
+SDValue
+AMDILTargetLowering::genf64toi32(SDValue RHS, SelectionDAG &DAG,
+    bool includeSign) const
+{
+  EVT INTVT;
+  EVT LONGVT;
+  DebugLoc DL = RHS.getDebugLoc();
+  EVT RHSVT = RHS.getValueType();
+  bool isVec = RHSVT.isVector();
+  if (isVec) {
+    LONGVT = EVT(MVT::getVectorVT(MVT::i64,
+          RHSVT.getVectorNumElements()));
+    INTVT = EVT(MVT::getVectorVT(MVT::i32,
+          RHSVT.getVectorNumElements()));
+  } else {
+    LONGVT = EVT(MVT::i64);
+    INTVT = EVT(MVT::i32);
+  }
+  /*
+     __attribute__((always_inline)) int
+     cast_f64_to_[u|i]32(double d)
+     {
+  // Convert d in to 32-bit components
+  long x = as_long(d);
+  xhi = LCOMPHI(x);
+  xlo = LCOMPLO(x);
+
+  // Generate 'normalized' mantissa
+  mhi = xhi | 0x00100000; // hidden bit
+  mhi <<= 11;
+  temp = xlo >> (32 - 11);
+  mhi |= temp;
+
+  // Compute shift right count from exponent
+  e = (xhi >> (52-32)) & 0x7ff;
+  sr = 1023 + 31 - e;
+  srge32 = sr >= 32;
+
+  // Compute result for 0 <= sr < 32
+  res = mhi >> (sr &31);
+  res = srge32 ? 0 : res;
+
+  // Optional saturate on overflow
+  srlt0 = sr < 0;
+  res = srlt0 ? MAXVALUE : res;
+
+  // Deal with sign bit (ignoring whether result is signed or unsigned value)
+  if (includeSign) {
+  sign = ((signed int) xhi) >> 31; // fill with sign bit
+  res += sign;
+  res ^= sign;
+  }
+
+  return res;
+  }
+  */
+  SDValue c11 = DAG.getConstant( 63 - 52, INTVT );
+
+  // Convert d in to 32-bit components
+  SDValue d = RHS;
+  SDValue x = DAG.getNode(ISDBITCAST, DL, LONGVT, d);
+  SDValue xhi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x );
+  SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x );
+
+  // Generate 'normalized' mantissa
+  SDValue mhi = DAG.getNode( ISD::OR, DL, INTVT,
+      xhi, DAG.getConstant( 0x00100000, INTVT ) );
+  mhi = DAG.getNode( ISD::SHL, DL, INTVT, mhi, c11 );
+  SDValue temp = DAG.getNode( ISD::SRL, DL, INTVT,
+      xlo, DAG.getConstant( 32 - (63 - 52), INTVT ) );
+  mhi = DAG.getNode( ISD::OR, DL, INTVT, mhi, temp );
+
+  // Compute shift right count from exponent
+  SDValue e = DAG.getNode( ISD::SRL, DL, INTVT,
+      xhi, DAG.getConstant( 52-32, INTVT ) );
+  e = DAG.getNode( ISD::AND, DL, INTVT,
+      e, DAG.getConstant( 0x7ff, INTVT ) );
+  SDValue sr = DAG.getNode( ISD::SUB, DL, INTVT,
+      DAG.getConstant( 1023 + 31, INTVT ), e );
+  SDValue srge32 = DAG.getNode( AMDILISD::CMP, DL, INTVT,
+      DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32),
+      sr, DAG.getConstant(32, INTVT));
+
+  // Compute result for 0 <= sr < 32
+  SDValue res = DAG.getNode( ISD::SRL, DL, INTVT, mhi, sr );
+  res = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+      srge32, DAG.getConstant(0,INTVT), res );
+
+  // Deal with sign bit
+  if (includeSign) {
+    SDValue sign = DAG.getNode( ISD::SRA, DL, INTVT,
+        xhi, DAG.getConstant( 31, INTVT ) );
+    res = DAG.getNode( ISD::ADD, DL, INTVT, res, sign );
+    res = DAG.getNode( ISD::XOR, DL, INTVT, res, sign );
+  }
+  return res;
+}
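The pre-Evergreen path implements the cast_f64_to_[u|i]32 pseudocode in the comment: rebuild the normalized mantissa with its hidden bit in a 32-bit register, shift right by an exponent-derived count, and conditionally negate. A host-side C++ transcription of the same steps (ignoring the optional overflow saturation):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Shift-based double -> int32 truncation, following the pseudocode above.
static int32_t f64_to_i32(double d) {
  uint64_t x;
  std::memcpy(&x, &d, sizeof x);
  uint32_t xhi = (uint32_t)(x >> 32), xlo = (uint32_t)x;
  // Normalized mantissa with the hidden bit forced into bit 31.
  uint32_t mhi = ((xhi | 0x00100000u) << 11) | (xlo >> 21);
  uint32_t e = (xhi >> 20) & 0x7ffu;
  uint32_t sr = 1023u + 31u - e;                    // shift right count
  uint32_t res = sr >= 32u ? 0u : mhi >> (sr & 31u);
  uint32_t sign = (uint32_t)((int32_t)xhi >> 31);   // all ones if negative
  return (int32_t)((res + sign) ^ sign);            // two's-complement negate
}
```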
+SDValue
+AMDILTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue RHS = Op.getOperand(0);
+  EVT RHSVT = RHS.getValueType();
+  MVT RST = RHSVT.getScalarType().getSimpleVT();
+  EVT LHSVT = Op.getValueType();
+  MVT LST = LHSVT.getScalarType().getSimpleVT();
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue DST;
+  const AMDILTargetMachine*
+    amdtm = reinterpret_cast<const AMDILTargetMachine*>
+    (&this->getTargetMachine());
+  const AMDILSubtarget*
+    stm = dynamic_cast<const AMDILSubtarget*>(
+        amdtm->getSubtargetImpl());
+  if (RST == MVT::f64 && RHSVT.isVector()
+      && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
+    // We don't support vector 64-bit floating point conversions.
+    for (unsigned x = 0, y = RHSVT.getVectorNumElements(); x < y; ++x) {
+      SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32));
+      op = DAG.getNode(ISD::FP_TO_SINT, DL, LST, op);
+      if (!x) {
+        DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op);
+      } else {
+        DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT,
+            DST, op, DAG.getTargetConstant(x, MVT::i32));
+      }
+    }
+  } else {
+    if (RST == MVT::f64
+        && LST == MVT::i32) {
+      if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        DST = SDValue(Op.getNode(), 0);
+      } else {
+        DST = genf64toi32(RHS, DAG, true);
+      }
+    } else if (RST == MVT::f64
+        && LST == MVT::i64) {
+      DST = genf64toi64(RHS, DAG, true);
+    } else if (RST == MVT::f64
+        && (LST == MVT::i8 || LST == MVT::i16)) {
+      if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, SDValue(Op.getNode(), 0));
+      } else {
+        SDValue ToInt = genf64toi32(RHS, DAG, true);
+        DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, ToInt);
+      }
+
+    } else {
+      DST = SDValue(Op.getNode(), 0);
+    }
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue DST;
+  SDValue RHS = Op.getOperand(0);
+  EVT RHSVT = RHS.getValueType();
+  MVT RST = RHSVT.getScalarType().getSimpleVT();
+  EVT LHSVT = Op.getValueType();
+  MVT LST = LHSVT.getScalarType().getSimpleVT();
+  DebugLoc DL = Op.getDebugLoc();
+  const AMDILTargetMachine*
+    amdtm = reinterpret_cast<const AMDILTargetMachine*>
+    (&this->getTargetMachine());
+  const AMDILSubtarget*
+    stm = dynamic_cast<const AMDILSubtarget*>(
+        amdtm->getSubtargetImpl());
+  if (RST == MVT::f64 && RHSVT.isVector()
+      && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
+    // We don't support vector 64-bit floating point conversions.
+    for (unsigned x = 0, y = RHSVT.getVectorNumElements(); x < y; ++x) {
+      SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32));
+      op = DAG.getNode(ISD::FP_TO_UINT, DL, LST, op);
+      if (!x) {
+        DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op);
+      } else {
+        DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT,
+            DST, op, DAG.getTargetConstant(x, MVT::i32));
+      }
+
+    }
+  } else {
+    if (RST == MVT::f64
+        && LST == MVT::i32) {
+      if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        DST = SDValue(Op.getNode(), 0);
+      } else {
+        DST = genf64toi32(RHS, DAG, false);
+      }
+    } else if (RST == MVT::f64
+        && LST == MVT::i64) {
+      DST = genf64toi64(RHS, DAG, false);
+    } else if (RST == MVT::f64
+        && (LST == MVT::i8 || LST == MVT::i16)) {
+      if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, SDValue(Op.getNode(), 0));
+      } else {
+        SDValue ToInt = genf64toi32(RHS, DAG, false);
+        DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, ToInt);
+      }
+
+    } else {
+      DST = SDValue(Op.getNode(), 0);
+    }
+  }
+  return DST;
+}
+SDValue
+AMDILTargetLowering::genu32tof64(SDValue RHS, EVT LHSVT,
+    SelectionDAG &DAG) const
+{
+  EVT RHSVT = RHS.getValueType();
+  DebugLoc DL = RHS.getDebugLoc();
+  EVT INTVT;
+  EVT LONGVT;
+  bool isVec = RHSVT.isVector();
+  if (isVec) {
+    LONGVT = EVT(MVT::getVectorVT(MVT::i64,
+          RHSVT.getVectorNumElements()));
+    INTVT = EVT(MVT::getVectorVT(MVT::i32,
+          RHSVT.getVectorNumElements()));
+  } else {
+    LONGVT = EVT(MVT::i64);
+    INTVT = EVT(MVT::i32);
+  }
+  SDValue x = RHS;
+  const AMDILTargetMachine*
+    amdtm = reinterpret_cast<const AMDILTargetMachine*>
+    (&this->getTargetMachine());
+  const AMDILSubtarget*
+    stm = dynamic_cast<const AMDILSubtarget*>(
+        amdtm->getSubtargetImpl());
+  if (stm->calVersion() >= CAL_VERSION_SC_135) {
+    // unsigned x = RHS;
+    // ulong xd = (ulong)(0x4330_0000 << 32) | x;
+    // double d = as_double( xd );
+    // return d - 0x1.0p+52; // 0x1.0p+52 == 0x4330_0000_0000_0000
+    SDValue xd = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, x,
+        DAG.getConstant( 0x43300000, INTVT ) );
+    SDValue d = DAG.getNode( ISDBITCAST, DL, LHSVT, xd );
+    SDValue offsetd = DAG.getNode( ISDBITCAST, DL, LHSVT,
+        DAG.getConstant( 0x4330000000000000ULL, LONGVT ) );
+    return DAG.getNode( ISD::FSUB, DL, LHSVT, d, offsetd );
+  } else {
+    SDValue clz = genCLZu32(x, DAG);
+
+    // Compute the exponent. 1023 is the bias, 31-clz the actual power of 2
+    // Except for an input 0... which requires a 0 exponent
+    SDValue exp = DAG.getNode( ISD::SUB, DL, INTVT,
+        DAG.getConstant( (1023+31), INTVT), clz );
+    exp = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, x, exp, x );
+
+    // Normalize frac
+    SDValue rhi = DAG.getNode( ISD::SHL, DL, INTVT, x, clz );
+
+    // Eliminate hidden bit
+    rhi = DAG.getNode( ISD::AND, DL, INTVT,
+        rhi, DAG.getConstant( 0x7fffffff, INTVT ) );
+
+    // Pack exponent and frac
+    SDValue rlo = DAG.getNode( ISD::SHL, DL, INTVT,
+        rhi, DAG.getConstant( (32 - 11), INTVT ) );
+    rhi = DAG.getNode( ISD::SRL, DL, INTVT,
+        rhi, DAG.getConstant( 11, INTVT ) );
+    exp = DAG.getNode( ISD::SHL, DL, INTVT,
+        exp, DAG.getConstant( 20, INTVT ) );
+    rhi = DAG.getNode( ISD::OR, DL, INTVT, rhi, exp );
+
+    // Convert 2 x 32 in to 1 x 64, then to double precision float type
+    SDValue res = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi );
+    return DAG.getNode(ISDBITCAST, DL, LHSVT, res);
+  }
+}
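On CAL >= SC_135 the u32-to-f64 path packs the integer into the low mantissa bits of 0x1.0p+52 and subtracts that offset, giving an exact conversion with no conditional logic. Host-side sketch of the same bit pattern:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// uint -> double: (0x43300000 << 32) | x reinterprets as exactly
// 2^52 + x, so subtracting 2^52 leaves x converted to double.
static double u32_to_f64(uint32_t x) {
  uint64_t xd = (0x43300000ull << 32) | x;
  double d;
  std::memcpy(&d, &xd, sizeof d);
  return d - 0x1.0p+52;
}
```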
+SDValue
+AMDILTargetLowering::genu64tof64(SDValue RHS, EVT LHSVT,
+    SelectionDAG &DAG) const
+{
+  EVT RHSVT = RHS.getValueType();
+  DebugLoc DL = RHS.getDebugLoc();
+  EVT INTVT;
+  EVT LONGVT;
+  bool isVec = RHSVT.isVector();
+  if (isVec) {
+    INTVT = EVT(MVT::getVectorVT(MVT::i32,
+          RHSVT.getVectorNumElements()));
+  } else {
+    INTVT = EVT(MVT::i32);
+  }
+  LONGVT = RHSVT;
+  SDValue x = RHS;
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+    // double dhi = (double)(as_uint2(x).y);
+    // double dlo = (double)(as_uint2(x).x);
+    // return mad(dhi, 0x1.0p+32, dlo)
+    SDValue dhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x);
+    dhi = DAG.getNode(ISD::UINT_TO_FP, DL, LHSVT, dhi);
+    SDValue dlo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x);
+    dlo = DAG.getNode(ISD::UINT_TO_FP, DL, LHSVT, dlo);
+    return DAG.getNode(AMDILISD::MAD, DL, LHSVT, dhi,
+        DAG.getConstantFP(0x4f800000, LHSVT), dlo);
+  } else if (stm->calVersion() >= CAL_VERSION_SC_135) {
+    // double lo = as_double( as_ulong( 0x1.0p+52) | (u & 0xffff_ffffUL));
+    // double hi = as_double( as_ulong( 0x1.0p+84) | (u >> 32));
+    // return (hi - (0x1.0p+84 + 0x1.0p+52)) + lo;
+    SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x );  // x & 0xffff_ffffUL
+    SDValue xd = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, xlo, DAG.getConstant( 0x43300000, INTVT ) );
+    SDValue lo = DAG.getNode( ISDBITCAST, DL, LHSVT, xd );
+    SDValue xhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 :  AMDILISD::LCOMPHI, DL, INTVT, x ); // x >> 32
+    SDValue xe = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, xhi, DAG.getConstant( 0x45300000, INTVT ) );
+    SDValue hi = DAG.getNode( ISDBITCAST, DL, LHSVT, xe );
+    SDValue c = DAG.getNode( ISDBITCAST, DL, LHSVT,
+        DAG.getConstant( 0x4530000000100000ULL, LONGVT ) );
+    hi = DAG.getNode( ISD::FSUB, DL, LHSVT, hi, c );
+    return DAG.getNode( ISD::FADD, DL, LHSVT, hi, lo );
+
+  } else {
+    SDValue clz = genCLZu64(x, DAG);
+    SDValue xhi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x );
+    SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x );
+
+    // Compute the exponent. 1023 is the bias, 63-clz the actual power of 2
+    SDValue exp = DAG.getNode( ISD::SUB, DL, INTVT,
+        DAG.getConstant( (1023+63), INTVT), clz );
+    SDValue mash = DAG.getNode( ISD::OR, DL, INTVT, xhi, xlo );
+    exp = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        mash, exp, mash );  // exp = exp, or 0 if input was 0
+
+    // Normalize frac
+    SDValue clz31 = DAG.getNode( ISD::AND, DL, INTVT,
+        clz, DAG.getConstant( 31, INTVT ) );
+    SDValue rshift = DAG.getNode( ISD::SUB, DL, INTVT,
+        DAG.getConstant( 32, INTVT ), clz31 );
+    SDValue t1 = DAG.getNode( ISD::SHL, DL, INTVT, xhi, clz31 );
+    SDValue t2 = DAG.getNode( ISD::SRL, DL, INTVT, xlo, rshift );
+    t2 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, clz31, t2, t1 );
+    SDValue rhi1 = DAG.getNode( ISD::OR, DL, INTVT, t1, t2 );
+    SDValue rlo1 = DAG.getNode( ISD::SHL, DL, INTVT, xlo, clz31 );
+    SDValue rhi2 = DAG.getNode( ISD::SHL, DL, INTVT, xlo, clz31 );
+    SDValue rlo2 = DAG.getConstant( 0, INTVT );
+    SDValue clz32 = DAG.getNode( ISD::AND, DL, INTVT,
+        clz, DAG.getConstant( 32, INTVT ) );
+    SDValue rhi = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        clz32, rhi2, rhi1 );
+    SDValue rlo = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT,
+        clz32, rlo2, rlo1 );
+
+    // Eliminate hidden bit
+    rhi = DAG.getNode( ISD::AND, DL, INTVT,
+        rhi, DAG.getConstant( 0x7fffffff, INTVT ) );
+
+    // Save bits needed to round properly
+    SDValue round = DAG.getNode( ISD::AND, DL, INTVT,
+        rlo, DAG.getConstant( 0x7ff, INTVT ) );
+
+    // Pack exponent and frac
+    rlo = DAG.getNode( ISD::SRL, DL, INTVT,
+        rlo, DAG.getConstant( 11, INTVT ) );
+    SDValue temp = DAG.getNode( ISD::SHL, DL, INTVT,
+        rhi, DAG.getConstant( (32 - 11), INTVT ) );
+    rlo = DAG.getNode( ISD::OR, DL, INTVT, rlo, temp );
+    rhi = DAG.getNode( ISD::SRL, DL, INTVT,
+        rhi, DAG.getConstant( 11, INTVT ) );
+    exp = DAG.getNode( ISD::SHL, DL, INTVT,
+        exp, DAG.getConstant( 20, INTVT ) );
+    rhi = DAG.getNode( ISD::OR, DL, INTVT, rhi, exp );
+
+    // Compute rounding bit
+    SDValue even = DAG.getNode( ISD::AND, DL, INTVT,
+        rlo, DAG.getConstant( 1, INTVT ) );
+    SDValue grs = DAG.getNode( ISD::AND, DL, INTVT,
+        round, DAG.getConstant( 0x3ff, INTVT ) );
+    grs = DAG.getNode( AMDILISD::CMP, DL, INTVT,
+        DAG.getConstant( CondCCodeToCC( ISD::SETNE, MVT::i32), MVT::i32),
+        grs, DAG.getConstant( 0, INTVT ) ); // -1 if any GRS set, 0 if none
+    grs = DAG.getNode( ISD::OR, DL, INTVT, grs, even );
+    round = DAG.getNode( ISD::SRL, DL, INTVT,
+        round, DAG.getConstant( 10, INTVT ) );
+    round = DAG.getNode( ISD::AND, DL, INTVT, round, grs ); // 0 or 1
+
+    // Add rounding bit
+    SDValue lround = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT,
+        round, DAG.getConstant( 0, INTVT ) );
+    SDValue res = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi );
+    res = DAG.getNode( ISD::ADD, DL, LONGVT, res, lround );
+    return DAG.getNode(ISDBITCAST, DL, LHSVT, res);
+  }
+}
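The three comment-tagged steps above ("Save bits needed to round properly", "Compute rounding bit", "Add rounding bit") implement IEEE round-to-nearest-even over the 11 fraction bits that are shifted out while packing the double. A scalar sketch of that decision follows; the helper name is hypothetical, not part of the backend:

```cpp
#include <cassert>
#include <cstdint>

// Round-to-nearest-even over the low `dropBits` bits of `value`, mirroring
// the DAG sequence above: round up only when the round bit (the top dropped
// bit) is set AND (any lower "sticky" bit is set OR the kept value is odd).
uint64_t roundNearestEven(uint64_t value, unsigned dropBits) {
    uint64_t kept    = value >> dropBits;
    uint64_t dropped = value & ((1ULL << dropBits) - 1);
    uint64_t even    = kept & 1;                                   // result lsb
    uint64_t sticky  = (dropped & ((1ULL << (dropBits - 1)) - 1)) ? 1 : 0;
    uint64_t rbit    = dropped >> (dropBits - 1);                  // round bit
    return kept + (rbit & (sticky | even));
}
```

With `dropBits == 11` this matches the masks in the code: `0x7ff` saves the dropped bits, `0x3ff` isolates the sticky bits, and the shift by 10 extracts the round bit.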
+SDValue
+AMDILTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue RHS = Op.getOperand(0);
+  EVT RHSVT = RHS.getValueType();
+  MVT RST = RHSVT.getScalarType().getSimpleVT();
+  EVT LHSVT = Op.getValueType();
+  MVT LST = LHSVT.getScalarType().getSimpleVT();
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue DST;
+  const AMDILTargetMachine*
+    amdtm = reinterpret_cast<const AMDILTargetMachine*>
+    (&this->getTargetMachine());
+  const AMDILSubtarget*
+    stm = dynamic_cast<const AMDILSubtarget*>(
+        amdtm->getSubtargetImpl());
+  if (LST == MVT::f64 && LHSVT.isVector()
+      && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
+    // We don't support vector 64-bit floating point conversions; scalarize.
+    DST = Op;
+    for (unsigned x = 0, y = LHSVT.getVectorNumElements(); x < y; ++x) {
+      SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32));
+      op = DAG.getNode(ISD::UINT_TO_FP, DL, LST, op);
+      if (!x) {
+        DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op);
+      } else {
+        DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, DST,
+            op, DAG.getTargetConstant(x, MVT::i32));
+      }
+
+    }
+  } else {
+
+    if (RST == MVT::i32
+        && LST == MVT::f64) {
+      if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+        DST = SDValue(Op.getNode(), 0);
+      } else {
+        DST = genu32tof64(RHS, LHSVT, DAG);
+      }
+    } else if (RST == MVT::i64
+        && LST == MVT::f64) {
+      DST = genu64tof64(RHS, LHSVT, DAG);
+    } else {
+      DST = SDValue(Op.getNode(), 0);
+    }
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue RHS = Op.getOperand(0);
+  EVT RHSVT = RHS.getValueType();
+  MVT RST = RHSVT.getScalarType().getSimpleVT();
+  EVT INTVT;
+  EVT LONGVT;
+  SDValue DST;
+  bool isVec = RHSVT.isVector();
+  DebugLoc DL = Op.getDebugLoc();
+  EVT LHSVT = Op.getValueType();
+  MVT LST = LHSVT.getScalarType().getSimpleVT();
+  const AMDILTargetMachine*
+    amdtm = reinterpret_cast<const AMDILTargetMachine*>
+    (&this->getTargetMachine());
+  const AMDILSubtarget*
+    stm = dynamic_cast<const AMDILSubtarget*>(
+        amdtm->getSubtargetImpl());
+  if (LST == MVT::f64 && LHSVT.isVector()
+      && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
+    // We don't support vector 64-bit floating point conversions; scalarize.
+    for (unsigned x = 0, y = LHSVT.getVectorNumElements(); x < y; ++x) {
+      SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32));
+      op = DAG.getNode(ISD::SINT_TO_FP, DL, LST, op);
+      if (!x) {
+        DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op);
+      } else {
+        DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, DST,
+            op, DAG.getTargetConstant(x, MVT::i32));
+      }
+
+    }
+  } else {
+
+    if (isVec) {
+      LONGVT = EVT(MVT::getVectorVT(MVT::i64,
+            RHSVT.getVectorNumElements()));
+      INTVT = EVT(MVT::getVectorVT(MVT::i32,
+            RHSVT.getVectorNumElements()));
+    } else {
+      LONGVT = EVT(MVT::i64);
+      INTVT = EVT(MVT::i32);
+    }
+    if ((RST == MVT::i32 || RST == MVT::i64)
+        && LST == MVT::f64) {
+      if (RST == MVT::i32) {
+        if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+          DST = SDValue(Op.getNode(), 0);
+          return DST;
+        }
+      }
+      SDValue c31 = DAG.getConstant( 31, INTVT );
+      SDValue cSbit = DAG.getConstant( 0x80000000, INTVT );
+
+      SDValue S;      // Sign, as 0 or -1
+      SDValue Sbit;   // Sign bit, as one bit, MSB only.
+      if (RST == MVT::i32) {
+        Sbit = DAG.getNode( ISD::AND, DL, INTVT, RHS, cSbit );
+        S = DAG.getNode(ISD::SRA, DL, RHSVT, RHS, c31 );
+      } else { // 64-bit case... SRA of 64-bit values is slow
+        SDValue hi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, RHS );
+        Sbit = DAG.getNode( ISD::AND, DL, INTVT, hi, cSbit );
+        SDValue temp = DAG.getNode( ISD::SRA, DL, INTVT, hi, c31 );
+        S = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, RHSVT, temp, temp );
+      }
+
+      // get abs() of input value, given sign as S (0 or -1)
+      // SpI = RHS + S
+      SDValue SpI = DAG.getNode(ISD::ADD, DL, RHSVT, RHS, S);
+      // SpIxS = SpI ^ S
+      SDValue SpIxS = DAG.getNode(ISD::XOR, DL, RHSVT, SpI, S);
+
+      // Convert unsigned value to double precision
+      SDValue R;
+      if (RST == MVT::i32) {
+        // r = cast_u32_to_f64(SpIxS)
+        R = genu32tof64(SpIxS, LHSVT, DAG);
+      } else {
+        // r = cast_u64_to_f64(SpIxS)
+        R = genu64tof64(SpIxS, LHSVT, DAG);
+      }
+
+      // drop in the sign bit
+      SDValue t = DAG.getNode( AMDILISD::BITCONV, DL, LONGVT, R );
+      SDValue thi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, t );
+      SDValue tlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, t );
+      thi = DAG.getNode( ISD::OR, DL, INTVT, thi, Sbit );
+      t = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, tlo, thi );
+      DST = DAG.getNode( AMDILISD::BITCONV, DL, LHSVT, t );
+    } else {
+      DST = SDValue(Op.getNode(), 0);
+    }
+  }
+  return DST;
+}
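The sign handling in LowerSINT_TO_FP follows the branch-free pattern its comments describe: `S = x >> 31` yields 0 or -1, `(x + S) ^ S` is `|x|`, the unsigned magnitude is converted, and the saved sign bit is OR'd back into the result's MSB. A host-side sketch for the i32 case (function name hypothetical; unsigned arithmetic avoids signed overflow on INT32_MIN):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Signed i32 -> f64 via the abs-then-reinsert-sign expansion used above.
double sintToDouble(int32_t x) {
    uint32_t ux  = (uint32_t)x;
    uint32_t s   = (uint32_t)(x >> 31);          // 0 or 0xFFFFFFFF
    uint32_t mag = (ux + s) ^ s;                 // |x|, safe for INT32_MIN
    double d = (double)mag;                      // unsigned conversion path
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof bits);
    bits |= (uint64_t)(ux & 0x80000000u) << 32;  // drop in the sign bit
    std::memcpy(&d, &bits, sizeof d);
    return d;
}
```

The final OR into the high word is exactly what the `LCOMPHI`/`OR`/`LCREATE` trio does in the DAG version.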
+SDValue
+AMDILTargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  bool isVec = RHS.getValueType().isVector();
+  if (OVT.getScalarType() == MVT::i64) {
+    /*const AMDILTargetMachine*
+      amdtm = reinterpret_cast<const AMDILTargetMachine*>
+      (&this->getTargetMachine());
+      const AMDILSubtarget*
+      stm = dynamic_cast<const AMDILSubtarget*>(
+      amdtm->getSubtargetImpl());*/
+    MVT INTTY = MVT::i32;
+    if (OVT == MVT::v2i64) {
+      INTTY = MVT::v2i32;
+    }
+    SDValue LHSLO, LHSHI, RHSLO, RHSHI, INTLO, INTHI;
+    // TODO: need to turn this into a bitcast of i64/v2i64 to v2i32/v4i32
+    LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, LHS);
+    RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, RHS);
+    LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, LHS);
+    RHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, RHS);
+    INTLO = DAG.getNode(ISD::SUB, DL, INTTY, LHSLO, RHSLO);
+    INTHI = DAG.getNode(ISD::SUB, DL, INTTY, LHSHI, RHSHI);
+    //TODO: need to use IBORROW on HD5XXX and later hardware
+    SDValue cmp;
+    if (OVT == MVT::i64) {
+      cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+          DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32),
+          LHSLO, RHSLO);
+    } else {
+      SDValue cmplo;
+      SDValue cmphi;
+      SDValue LHSRLO = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, MVT::i32, LHSLO, DAG.getTargetConstant(0, MVT::i32));
+      SDValue LHSRHI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, MVT::i32, LHSLO, DAG.getTargetConstant(1, MVT::i32));
+      SDValue RHSRLO = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, MVT::i32, RHSLO, DAG.getTargetConstant(0, MVT::i32));
+      SDValue RHSRHI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, MVT::i32, RHSLO, DAG.getTargetConstant(1, MVT::i32));
+      cmplo = DAG.getNode(AMDILISD::CMP, DL, MVT::i32,
+          DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32),
+          LHSRLO, RHSRLO);
+      cmphi = DAG.getNode(AMDILISD::CMP, DL, MVT::i32,
+          DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32),
+          LHSRHI, RHSRHI);
+      cmp = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v2i32, cmplo);
+      cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i32,
+          cmp, cmphi, DAG.getTargetConstant(1, MVT::i32));
+    }
+    INTHI = DAG.getNode(ISD::ADD, DL, INTTY, INTHI, cmp);
+    DST = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, OVT,
+        INTLO, INTHI);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
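LowerSUB's 64-bit path computes the borrow with an unsigned less-than compare: AMDIL's CMP produces an all-ones (-1) mask when `LHSLO < RHSLO`, which is then *added* to the high-word difference. A standalone sketch of the same decomposition (function name hypothetical):

```cpp
#include <cassert>
#include <cstdint>

// 64-bit subtraction from 32-bit halves, as in the LowerSUB expansion:
// subtract low and high words separately, then add the SETULT result
// (0 or -1, i.e. all-ones) to the high word as the borrow.
uint64_t sub64Via32(uint64_t lhs, uint64_t rhs) {
    uint32_t llo = (uint32_t)lhs, lhi = (uint32_t)(lhs >> 32);
    uint32_t rlo = (uint32_t)rhs, rhi = (uint32_t)(rhs >> 32);
    uint32_t lo = llo - rlo;
    uint32_t hi = lhi - rhi;
    uint32_t borrow = (llo < rlo) ? 0xFFFFFFFFu : 0u; // CMP SETULT mask
    hi += borrow;                                     // ADD of the -1 borrow
    return ((uint64_t)hi << 32) | lo;
}
```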
+SDValue
+AMDILTargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::f64) {
+    DST = LowerFDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::f32) {
+    DST = LowerFDIV32(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSDIV32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16
+      || OVT.getScalarType() == MVT::i8) {
+    DST = LowerSDIV24(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerUDIV(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerUDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerUDIV32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16
+      || OVT.getScalarType() == MVT::i8) {
+    DST = LowerUDIV24(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSREM64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSREM32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16) {
+    DST = LowerSREM16(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i8) {
+    DST = LowerSREM8(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerUREM(SDValue Op, SelectionDAG &DAG) const
+{
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerUREM64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerUREM32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16) {
+    DST = LowerUREM16(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i8) {
+    DST = LowerUREM8(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  bool isVec = OVT.isVector();
+  if (OVT.getScalarType() != MVT::i64) {
+    DST = SDValue(Op.getNode(), 0);
+  } else {
+    assert(OVT.getScalarType() == MVT::i64 && "Only 64 bit mul should be lowered!");
+    // TODO: This needs to be turned into a tablegen pattern
+    SDValue LHS = Op.getOperand(0);
+    SDValue RHS = Op.getOperand(1);
+
+    MVT INTTY = MVT::i32;
+    if (OVT == MVT::v2i64) {
+      INTTY = MVT::v2i32;
+    }
+    // mul64(h1, l1, h0, l0)
+    SDValue LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO,
+        DL,
+        INTTY, LHS);
+    SDValue LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI,
+        DL,
+        INTTY, LHS);
+    SDValue RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO,
+        DL,
+        INTTY, RHS);
+    SDValue RHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI,
+        DL,
+        INTTY, RHS);
+    // MULLO_UINT_1 r1, h0, l1
+    SDValue RHILLO = DAG.getNode(AMDILISD::UMUL,
+        DL,
+        INTTY, RHSHI, LHSLO);
+    // MULLO_UINT_1 r2, h1, l0
+    SDValue RLOHHI = DAG.getNode(AMDILISD::UMUL,
+        DL,
+        INTTY, RHSLO, LHSHI);
+    // ADD_INT hr, r1, r2
+    SDValue ADDHI = DAG.getNode(ISD::ADD,
+        DL,
+        INTTY, RHILLO, RLOHHI);
+    // MULHI_UINT_1 r3, l1, l0
+    SDValue RLOLLO = DAG.getNode(ISD::MULHU,
+        DL,
+        INTTY, RHSLO, LHSLO);
+    // ADD_INT hr, hr, r3
+    SDValue HIGH = DAG.getNode(ISD::ADD,
+        DL,
+        INTTY, ADDHI, RLOLLO);
+    // MULLO_UINT_1 l3, l1, l0
+    SDValue LOW = DAG.getNode(AMDILISD::UMUL,
+        DL,
+        INTTY, LHSLO, RHSLO);
+    DST = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE,
+        DL,
+        OVT, LOW, HIGH);
+  }
+  return DST;
+}
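The instruction-named comments in LowerMUL correspond to the schoolbook split of a 64x64 multiply into 32-bit MULLO/MULHI pieces, keeping only the low 64 bits of the product. A host-side sketch of the identity (name hypothetical):

```cpp
#include <cassert>
#include <cstdint>

// Low 64 bits of a 64x64 multiply from 32-bit operations, mirroring the
// LowerMUL expansion: low = MULLO(l1,l0); high = h0*l1 + h1*l0 + MULHI(l1,l0).
uint64_t mul64Via32(uint64_t a, uint64_t b) {
    uint32_t alo = (uint32_t)a, ahi = (uint32_t)(a >> 32);
    uint32_t blo = (uint32_t)b, bhi = (uint32_t)(b >> 32);
    uint32_t low   = alo * blo;                               // MULLO_UINT
    uint32_t mulhi = (uint32_t)(((uint64_t)alo * blo) >> 32); // MULHI_UINT
    uint32_t high  = ahi * blo + alo * bhi + mulhi;           // cross terms
    return ((uint64_t)high << 32) | low;
}
```

The `ahi * bhi` term is dropped because it only contributes to bits 64 and above.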
+SDValue
+AMDILTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const
+{
+  EVT VT = Op.getValueType();
+  //printSDValue(Op, 1);
+  SDValue Nodes1;
+  SDValue second;
+  SDValue third;
+  SDValue fourth;
+  DebugLoc DL = Op.getDebugLoc();
+  Nodes1 = DAG.getNode(AMDILISD::VBUILD,
+      DL,
+      VT, Op.getOperand(0));
+  bool allEqual = true;
+  for (unsigned x = 1, y = Op.getNumOperands(); x < y; ++x) {
+    if (Op.getOperand(0) != Op.getOperand(x)) {
+      allEqual = false;
+      break;
+    }
+  }
+  if (allEqual) {
+    return Nodes1;
+  }
+  switch(Op.getNumOperands()) {
+    default:
+    case 1:
+      break;
+    case 4:
+      fourth = Op.getOperand(3);
+      if (fourth.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            fourth,
+            DAG.getConstant(7, MVT::i32));
+      }
+      // fall through
+    case 3:
+      third = Op.getOperand(2);
+      if (third.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            third,
+            DAG.getConstant(6, MVT::i32));
+      }
+      // fall through
+    case 2:
+      second = Op.getOperand(1);
+      if (second.getOpcode() != ISD::UNDEF) {
+        Nodes1 = DAG.getNode(
+            ISD::INSERT_VECTOR_ELT,
+            DL,
+            Op.getValueType(),
+            Nodes1,
+            second,
+            DAG.getConstant(5, MVT::i32));
+      }
+      break;
+  }
+  return Nodes1;
+}
+
+SDValue
+AMDILTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  const SDValue *ptr = NULL;
+  const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+  uint32_t swizzleNum = 0;
+  SDValue DST;
+  if (!VT.isVector()) {
+    SDValue Res = Op.getOperand(0);
+    return Res;
+  }
+
+  if (Op.getOperand(1).getOpcode() != ISD::UNDEF) {
+    ptr = &Op.getOperand(1);
+  } else {
+    ptr = &Op.getOperand(0);
+  }
+  if (CSDN) {
+    swizzleNum = (uint32_t)CSDN->getZExtValue();
+    uint32_t mask2 = 0x04030201 & ~(0xFF << (swizzleNum * 8));
+    uint32_t mask3 = 0x01010101 & (0xFF << (swizzleNum * 8));
+    DST = DAG.getNode(AMDILISD::VINSERT,
+        DL,
+        VT,
+        Op.getOperand(0),
+        *ptr,
+        DAG.getTargetConstant(mask2, MVT::i32),
+        DAG.getTargetConstant(mask3, MVT::i32));
+  } else {
+    uint32_t mask2 = 0x04030201 & ~(0xFF << (swizzleNum * 8));
+    uint32_t mask3 = 0x01010101 & (0xFF << (swizzleNum * 8));
+    SDValue res = DAG.getNode(AMDILISD::VINSERT,
+        DL, VT, Op.getOperand(0), *ptr,
+        DAG.getTargetConstant(mask2, MVT::i32),
+        DAG.getTargetConstant(mask3, MVT::i32));
+    for (uint32_t x = 1; x < VT.getVectorNumElements(); ++x) {
+      mask2 = 0x04030201 & ~(0xFF << (x * 8));
+      mask3 = 0x01010101 & (0xFF << (x * 8));
+      SDValue t = DAG.getNode(AMDILISD::VINSERT,
+          DL, VT, Op.getOperand(0), *ptr,
+          DAG.getTargetConstant(mask2, MVT::i32),
+          DAG.getTargetConstant(mask3, MVT::i32));
+      SDValue c = DAG.getNode(AMDILISD::CMP, DL, ptr->getValueType(),
+          DAG.getConstant(AMDILCC::IL_CC_I_EQ, MVT::i32),
+          Op.getOperand(2), DAG.getConstant(x, MVT::i32));
+      c = DAG.getNode(AMDILISD::VBUILD, DL, Op.getValueType(), c);
+      res = DAG.getNode(AMDILISD::CMOVLOG, DL, VT, c, t, res);
+    }
+    DST = res;
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  EVT VT = Op.getValueType();
+  //printSDValue(Op, 1);
+  const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  uint64_t swizzleNum = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Res;
+  if (!Op.getOperand(0).getValueType().isVector()) {
+    Res = Op.getOperand(0);
+    return Res;
+  }
+  if (CSDN) {
+    // Static vector extraction
+    swizzleNum = CSDN->getZExtValue() + 1;
+    Res = DAG.getNode(AMDILISD::VEXTRACT,
+        DL, VT,
+        Op.getOperand(0),
+        DAG.getTargetConstant(swizzleNum, MVT::i32));
+  } else {
+    SDValue Op1 = Op.getOperand(1);
+    uint32_t vecSize = 4;
+    SDValue Op0 = Op.getOperand(0);
+    SDValue res = DAG.getNode(AMDILISD::VEXTRACT,
+        DL, VT, Op0,
+        DAG.getTargetConstant(1, MVT::i32));
+    if (Op0.getValueType().isVector()) {
+      vecSize = Op0.getValueType().getVectorNumElements();
+    }
+    for (uint32_t x = 2; x <= vecSize; ++x) {
+      SDValue t = DAG.getNode(AMDILISD::VEXTRACT,
+          DL, VT, Op0,
+          DAG.getTargetConstant(x, MVT::i32));
+      SDValue c = DAG.getNode(AMDILISD::CMP,
+          DL, Op1.getValueType(),
+          DAG.getConstant(AMDILCC::IL_CC_I_EQ, MVT::i32),
+          Op1, DAG.getConstant(x, MVT::i32));
+      res = DAG.getNode(AMDILISD::CMOVLOG, DL,
+          VT, c, t, res);
+
+    }
+    Res = res;
+  }
+  return Res;
+}
+
+SDValue
+AMDILTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  uint32_t vecSize = Op.getValueType().getVectorNumElements();
+  SDValue src = Op.getOperand(0);
+  const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  uint64_t offset = 0;
+  EVT vecType = Op.getValueType().getVectorElementType();
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Result;
+  if (CSDN) {
+    offset = CSDN->getZExtValue();
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+        DL,vecType, src, DAG.getConstant(offset, MVT::i32));
+    Result = DAG.getNode(AMDILISD::VBUILD, DL,
+        Op.getValueType(), Result);
+    for (uint32_t x = 1; x < vecSize; ++x) {
+      SDValue elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, vecType,
+          src, DAG.getConstant(offset + x, MVT::i32));
+      if (elt.getOpcode() != ISD::UNDEF) {
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+            Op.getValueType(), Result, elt,
+            DAG.getConstant(x, MVT::i32));
+      }
+    }
+  } else {
+    SDValue idx = Op.getOperand(1);
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+        DL, vecType, src, idx);
+    Result = DAG.getNode(AMDILISD::VBUILD, DL,
+        Op.getValueType(), Result);
+    for (uint32_t x = 1; x < vecSize; ++x) {
+      idx = DAG.getNode(ISD::ADD, DL, vecType,
+          idx, DAG.getConstant(1, MVT::i32));
+      SDValue elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, vecType,
+          src, idx);
+      if (elt.getOpcode() != ISD::UNDEF) {
+        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+            Op.getValueType(), Result, elt, idx);
+      }
+    }
+  }
+  return Result;
+}
+SDValue
+AMDILTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  SDValue Res = DAG.getNode(AMDILISD::VBUILD,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0));
+  return Res;
+}
+SDValue
+AMDILTargetLowering::LowerAND(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue andOp;
+  andOp = DAG.getNode(
+      AMDILISD::AND,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0),
+      Op.getOperand(1));
+  return andOp;
+}
+SDValue
+AMDILTargetLowering::LowerOR(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue orOp;
+  orOp = DAG.getNode(AMDILISD::OR,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0),
+      Op.getOperand(1));
+  return orOp;
+}
+SDValue
+AMDILTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  Cond = getConversionNode(DAG, Cond, Op, true);
+  Cond = DAG.getNode(AMDILISD::CMOVLOG,
+      DL,
+      Op.getValueType(), Cond, LHS, RHS);
+  return Cond;
+}
+SDValue
+AMDILTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue TRUE = Op.getOperand(2);
+  SDValue FALSE = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  DebugLoc DL = Op.getDebugLoc();
+  bool skipCMov = false;
+  bool genINot = false;
+  EVT OVT = Op.getValueType();
+
+  // Check for possible elimination of cmov
+  if (TRUE.getValueType().getSimpleVT().SimpleTy == MVT::i32) {
+    const ConstantSDNode *trueConst
+      = dyn_cast<ConstantSDNode>( TRUE.getNode() );
+    const ConstantSDNode *falseConst
+      = dyn_cast<ConstantSDNode>( FALSE.getNode() );
+    if (trueConst && falseConst) {
+      // both possible result values are constants
+      if (trueConst->isAllOnesValue()
+          && falseConst->isNullValue()) { // and convenient constants
+        skipCMov = true;
+      }
+      else if (trueConst->isNullValue()
+          && falseConst->isAllOnesValue()) { // less convenient
+        skipCMov = true;
+        genINot = true;
+      }
+    }
+  }
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  unsigned int AMDILCC = CondCCodeToCC(
+      SetCCOpcode,
+      LHS.getValueType().getSimpleVT().SimpleTy);
+  assert((AMDILCC != AMDILCC::COND_ERROR) && "Invalid SetCC!");
+  Cond = DAG.getNode(
+      AMDILISD::CMP,
+      DL,
+      LHS.getValueType(),
+      DAG.getConstant(AMDILCC, MVT::i32),
+      LHS,
+      RHS);
+  Cond = getConversionNode(DAG, Cond, Op, true);
+  if (genINot) {
+    Cond = DAG.getNode(AMDILISD::NOT, DL, OVT, Cond);
+  }
+  if (!skipCMov) {
+    Cond = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, Cond, TRUE, FALSE);
+  }
+  return Cond;
+}
+SDValue
+AMDILTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Cond;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue CC  = Op.getOperand(2);
+  DebugLoc DL = Op.getDebugLoc();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  unsigned int AMDILCC = CondCCodeToCC(
+      SetCCOpcode,
+      LHS.getValueType().getSimpleVT().SimpleTy);
+  assert((AMDILCC != AMDILCC::COND_ERROR) && "Invalid SetCC!");
+  Cond = DAG.getNode(
+      AMDILISD::CMP,
+      DL,
+      LHS.getValueType(),
+      DAG.getConstant(AMDILCC, MVT::i32),
+      LHS,
+      RHS);
+  Cond = getConversionNode(DAG, Cond, Op, true);
+  Cond = DAG.getNode(
+      ISD::AND,
+      DL,
+      Cond.getValueType(),
+      DAG.getConstant(1, Cond.getValueType()),
+      Cond);
+  return Cond;
+}
+
+SDValue
+AMDILTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Data = Op.getOperand(0);
+  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
+  DebugLoc DL = Op.getDebugLoc();
+  EVT DVT = Data.getValueType();
+  EVT BVT = BaseType->getVT();
+  unsigned baseBits = BVT.getScalarType().getSizeInBits();
+  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
+  unsigned shiftBits = srcBits - baseBits;
+  if (srcBits < 32) {
+    // If the op is narrower than 32 bits, zero-extend it to 32 bits
+    // so the upper bits stay valid through the shifts below.
+    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
+    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
+    shiftBits = 32 - baseBits;
+    DVT = IVT;
+  }
+  SDValue Shift = DAG.getConstant(shiftBits, DVT);
+  // Shift left by 'Shift' bits.
+  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
+  // Signed shift Right by 'Shift' bits.
+  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
+  if (srcBits < 32) {
+    // Once the sign extension is done, the op needs to be converted to
+    // its original type.
+    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
+  }
+  return Data;
+}
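LowerSIGN_EXTEND_INREG implements in-register sign extension as a left shift followed by an arithmetic right shift by `srcBits - baseBits`. The scalar equivalent for a 32-bit register (helper name hypothetical):

```cpp
#include <cassert>
#include <cstdint>

// Sign-extend the low `baseBits` bits of a 32-bit value via the SHL/SRA
// pair that LowerSIGN_EXTEND_INREG emits.
int32_t sextInReg(int32_t data, unsigned baseBits) {
    unsigned shift = 32 - baseBits;
    // Shift left so the sign bit of the narrow value lands in bit 31,
    // then arithmetic-shift right to smear it across the upper bits.
    return (int32_t)((uint32_t)data << shift) >> shift;
}
```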
+EVT
+AMDILTargetLowering::genIntType(uint32_t size, uint32_t numEle) const
+{
+  int iSize = (size * numEle);
+  int vEle = (iSize >> ((size == 64) ? 6 : 5));
+  if (!vEle) {
+    vEle = 1;
+  }
+  if (size == 64) {
+    if (vEle == 1) {
+      return EVT(MVT::i64);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i64, vEle));
+    }
+  } else {
+    if (vEle == 1) {
+      return EVT(MVT::i32);
+    } else {
+      return EVT(MVT::getVectorVT(MVT::i32, vEle));
+    }
+  }
+}
+
+SDValue
+AMDILTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Src = Op.getOperand(0);
+  SDValue Dst = Op;
+  SDValue Res;
+  DebugLoc DL = Op.getDebugLoc();
+  EVT SrcVT = Src.getValueType();
+  EVT DstVT = Dst.getValueType();
+  // Let's bitcast the floating point types to an
+  // equivalent integer type before converting to vectors.
+  if (SrcVT.getScalarType().isFloatingPoint()) {
+    Src = DAG.getNode(AMDILISD::BITCONV, DL, genIntType(
+          SrcVT.getScalarType().getSimpleVT().getSizeInBits(),
+          SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1),
+        Src);
+    SrcVT = Src.getValueType();
+  }
+  uint32_t ScalarSrcSize = SrcVT.getScalarType()
+    .getSimpleVT().getSizeInBits();
+  uint32_t ScalarDstSize = DstVT.getScalarType()
+    .getSimpleVT().getSizeInBits();
+  uint32_t SrcNumEle = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+  uint32_t DstNumEle = DstVT.isVector() ? DstVT.getVectorNumElements() : 1;
+  bool isVec = SrcVT.isVector();
+  if (DstVT.getScalarType().isInteger() &&
+      (SrcVT.getScalarType().isInteger()
+       || SrcVT.getScalarType().isFloatingPoint())) {
+    if ((ScalarDstSize == 64 && SrcNumEle == 4 && ScalarSrcSize == 16)
+        || (ScalarSrcSize == 64
+          && DstNumEle == 4
+          && ScalarDstSize == 16)) {
+      // This is the problematic case when bitcasting i64 <-> <4 x i16>
+      // This approach is a little different as we cannot generate a
+      // <4 x i64> vector
+      // as that is illegal in our backend and we are already past
+      // the DAG legalizer.
+      // So, in this case, we will do the following conversion.
+      // Case 1:
+      // %dst = <4 x i16> %src bitconvert i64 ==>
+      // %tmp = <4 x i16> %src convert <4 x i32>
+      // %tmp = <4 x i32> %tmp and 0xFFFF
+      // %tmp = <4 x i32> %tmp shift_left <0, 16, 0, 16>
+      // %tmp = <4 x i32> %tmp or %tmp.xz %tmp.yw
+      // %dst = <2 x i32> %tmp bitcast i64
+      // case 2:
+      // %dst = i64 %src bitconvert <4 x i16> ==>
+      // %tmp = i64 %src bitcast <2 x i32>
+      // %tmp = <4 x i32> %tmp vinsert %tmp.xxyy
+      // %tmp = <4 x i32> %tmp shift_right <0, 16, 0, 16>
+      // %tmp = <4 x i32> %tmp and 0xFFFF
+      // %dst = <4 x i16> %tmp bitcast <4 x i32>
+      SDValue mask = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v4i32,
+          DAG.getConstant(0xFFFF, MVT::i32));
+      SDValue const16 = DAG.getConstant(16, MVT::i32);
+      if (ScalarDstSize == 64) {
+        // case 1
+        Op = DAG.getSExtOrTrunc(Src, DL, MVT::v4i32);
+        Op = DAG.getNode(ISD::AND, DL, Op.getValueType(), Op, mask);
+        SDValue x = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+            Op, DAG.getConstant(0, MVT::i32));
+        SDValue y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+            Op, DAG.getConstant(1, MVT::i32));
+        y = DAG.getNode(ISD::SHL, DL, MVT::i32, y, const16);
+        SDValue z = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+            Op, DAG.getConstant(2, MVT::i32));
+        SDValue w = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+            Op, DAG.getConstant(3, MVT::i32));
+        w = DAG.getNode(ISD::SHL, DL, MVT::i32, w, const16);
+        x = DAG.getNode(ISD::OR, DL, MVT::i32, x, y);
+        y = DAG.getNode(ISD::OR, DL, MVT::i32, z, w);
+        Res = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, MVT::i64, x, y);
+        return Res;
+      } else {
+        // case 2
+        SDValue lo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, MVT::i32, Src);
+        SDValue lor16
+          = DAG.getNode(ISD::SRL, DL, MVT::i32, lo, const16);
+        SDValue hi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, MVT::i32, Src);
+        SDValue hir16
+          = DAG.getNode(ISD::SRL, DL, MVT::i32, hi, const16);
+        SDValue resVec = DAG.getNode(AMDILISD::VBUILD, DL,
+            MVT::v4i32, lo);
+        SDValue idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL,
+            getPointerTy(), DAG.getConstant(1, MVT::i32));
+        resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+            resVec, lor16, idxVal);
+        idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL,
+            getPointerTy(), DAG.getConstant(2, MVT::i32));
+        resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+            resVec, hi, idxVal);
+        idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL,
+            getPointerTy(), DAG.getConstant(3, MVT::i32));
+        resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
+            resVec, hir16, idxVal);
+        resVec = DAG.getNode(ISD::AND, DL, MVT::v4i32, resVec, mask);
+        Res = DAG.getSExtOrTrunc(resVec, DL, MVT::v4i16);
+        return Res;
+      }
+    } else {
+      // There are four cases we need to worry about for bitcasts
+      // where the size of all
+      // source, intermediates and result is <= 128 bits, unlike
+      // the above case
+      // 1) Sub32bit bitcast 32bitAlign
+      // %dst = <4 x i8> bitcast i32
+      // (also <[2|4] x i16> to <[2|4] x i32>)
+      // 2) 32bitAlign bitcast Sub32bit
+      // %dst = i32 bitcast <4 x i8>
+      // 3) Sub32bit bitcast LargerSub32bit
+      // %dst = <2 x i8> bitcast i16
+      // (also <4 x i8> to <2 x i16>)
+      // 4) Sub32bit bitcast SmallerSub32bit
+      // %dst = i16 bitcast <2 x i8>
+      // (also <2 x i16> to <4 x i8>)
+      // This also only handles types that are powers of two
+      if ((ScalarDstSize & (ScalarDstSize - 1))
+          || (ScalarSrcSize & (ScalarSrcSize - 1))) {
+        // Non-power-of-two element sizes are not handled here.
+      } else if (ScalarDstSize >= 32 && ScalarSrcSize < 32) {
+        // case 1:
+        EVT IntTy = genIntType(ScalarDstSize, SrcNumEle);
+#if 0 // TODO: LLVM does not like this for some reason, cannot SignExt vectors
+        SDValue res = DAG.getSExtOrTrunc(Src, DL, IntTy);
+#else
+        SDValue res = DAG.getNode(AMDILISD::VBUILD, DL, IntTy,
+            DAG.getUNDEF(IntTy.getScalarType()));
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+              getPointerTy(), DAG.getConstant(x, MVT::i32));
+          SDValue temp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+              SrcVT.getScalarType(), Src,
+              DAG.getConstant(x, MVT::i32));
+          temp = DAG.getSExtOrTrunc(temp, DL, IntTy.getScalarType());
+          res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntTy,
+              res, temp, idx);
+        }
+#endif
+        SDValue mask = DAG.getNode(AMDILISD::VBUILD, DL, IntTy,
+            DAG.getConstant((1 << ScalarSrcSize) - 1, MVT::i32));
+        SDValue *newEle = new SDValue[SrcNumEle];
+        res = DAG.getNode(ISD::AND, DL, IntTy, res, mask);
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          newEle[x] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+              IntTy.getScalarType(), res,
+              DAG.getConstant(x, MVT::i32));
+        }
+        uint32_t Ratio = SrcNumEle / DstNumEle;
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          if (x % Ratio) {
+            newEle[x] = DAG.getNode(ISD::SHL, DL,
+                IntTy.getScalarType(), newEle[x],
+                DAG.getConstant(ScalarSrcSize * (x % Ratio),
+                  MVT::i32));
+          }
+        }
+        for (uint32_t x = 0; x < SrcNumEle; x += 2) {
+          newEle[x] = DAG.getNode(ISD::OR, DL,
+              IntTy.getScalarType(), newEle[x], newEle[x + 1]);
+        }
+        if (ScalarSrcSize == 8) {
+          for (uint32_t x = 0; x < SrcNumEle; x += 4) {
+            newEle[x] = DAG.getNode(ISD::OR, DL,
+                IntTy.getScalarType(), newEle[x], newEle[x + 2]);
+          }
+          if (DstNumEle == 1) {
+            Dst = newEle[0];
+          } else {
+            Dst = DAG.getNode(AMDILISD::VBUILD, DL, DstVT,
+                newEle[0]);
+            for (uint32_t x = 1; x < DstNumEle; ++x) {
+              SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                  getPointerTy(), DAG.getConstant(x, MVT::i32));
+              Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                  DstVT, Dst, newEle[x * 4], idx);
+            }
+          }
+        } else {
+          if (DstNumEle == 1) {
+            Dst = newEle[0];
+          } else {
+            Dst = DAG.getNode(AMDILISD::VBUILD, DL, DstVT,
+                newEle[0]);
+            for (uint32_t x = 1; x < DstNumEle; ++x) {
+              SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                  getPointerTy(), DAG.getConstant(x, MVT::i32));
+              Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
+                  DstVT, Dst, newEle[x * 2], idx);
+            }
+          }
+        }
+        delete [] newEle;
+        return Dst;
+      } else if (ScalarDstSize < 32 && ScalarSrcSize >= 32) {
+        // case 2:
+        EVT IntTy = genIntType(ScalarSrcSize, DstNumEle);
+        SDValue vec = DAG.getNode(AMDILISD::VBUILD, DL, IntTy,
+            DAG.getUNDEF(IntTy.getScalarType()));
+        uint32_t mult = (ScalarDstSize == 8) ? 4 : 2;
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          for (uint32_t y = 0; y < mult; ++y) {
+            SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+                getPointerTy(),
+                DAG.getConstant(x * mult + y, MVT::i32));
+            SDValue t;
+            if (SrcNumEle > 1) {
+              t = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+                  DL, SrcVT.getScalarType(), Src,
+                  DAG.getConstant(x, MVT::i32));
+            } else {
+              t = Src;
+            }
+            if (y != 0) {
+              t = DAG.getNode(ISD::SRL, DL, t.getValueType(),
+                  t, DAG.getConstant(y * ScalarDstSize,
+                    MVT::i32));
+            }
+            vec = DAG.getNode(ISD::INSERT_VECTOR_ELT,
+                DL, IntTy, vec, t, idx);
+          }
+        }
+        Dst = DAG.getSExtOrTrunc(vec, DL, DstVT);
+        return Dst;
+      } else if (ScalarDstSize == 16 && ScalarSrcSize == 8) {
+        // case 3:
+        SDValue *numEle = new SDValue[SrcNumEle];
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          numEle[x] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+              MVT::i8, Src, DAG.getConstant(x, MVT::i32));
+          numEle[x] = DAG.getSExtOrTrunc(numEle[x], DL, MVT::i16);
+          numEle[x] = DAG.getNode(ISD::AND, DL, MVT::i16, numEle[x],
+              DAG.getConstant(0xFF, MVT::i16));
+        }
+        for (uint32_t x = 1; x < SrcNumEle; x += 2) {
+          numEle[x] = DAG.getNode(ISD::SHL, DL, MVT::i16, numEle[x],
+              DAG.getConstant(8, MVT::i16));
+          numEle[x - 1] = DAG.getNode(ISD::OR, DL, MVT::i16,
+              numEle[x-1], numEle[x]);
+        }
+        if (DstNumEle > 1) {
+          // If we are not a scalar i16, the only other case is a
+          // v2i16: we can't have v8i8 at this point, so v4i16
+          // cannot be generated.
+          Dst = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v2i16,
+              numEle[0]);
+          SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+              getPointerTy(), DAG.getConstant(1, MVT::i32));
+          Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i16,
+              Dst, numEle[2], idx);
+        } else {
+          Dst = numEle[0];
+        }
+        delete [] numEle;
+        return Dst;
+      } else if (ScalarDstSize == 8 && ScalarSrcSize == 16) {
+        // case 4:
+        SDValue *numEle = new SDValue[DstNumEle];
+        for (uint32_t x = 0; x < SrcNumEle; ++x) {
+          numEle[x * 2] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+              MVT::i16, Src, DAG.getConstant(x, MVT::i32));
+          numEle[x * 2 + 1] = DAG.getNode(ISD::SRL, DL, MVT::i16,
+              numEle[x * 2], DAG.getConstant(8, MVT::i16));
+        }
+        MVT ty = (SrcNumEle == 1) ? MVT::v2i16 : MVT::v4i16;
+        Dst = DAG.getNode(AMDILISD::VBUILD, DL, ty, numEle[0]);
+        for (uint32_t x = 1; x < DstNumEle; ++x) {
+          SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL,
+              getPointerTy(), DAG.getConstant(x, MVT::i32));
+          Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ty,
+              Dst, numEle[x], idx);
+        }
+        delete [] numEle;
+        ty = (SrcNumEle == 1) ? MVT::v2i8 : MVT::v4i8;
+        Res = DAG.getSExtOrTrunc(Dst, DL, ty);
+        return Res;
+      }
+    }
+  } 
+  Res = DAG.getNode(AMDILISD::BITCONV,
+      Dst.getDebugLoc(),
+      Dst.getValueType(), Src);
+  return Res;
+}
+
+SDValue
+AMDILTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+    SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size = Op.getOperand(1);
+  unsigned int SPReg = AMDIL::SP;
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue SP = DAG.getCopyFromReg(Chain,
+      DL,
+      SPReg, MVT::i32);
+  SDValue NewSP = DAG.getNode(ISD::ADD,
+      DL,
+      MVT::i32, SP, Size);
+  Chain = DAG.getCopyToReg(SP.getValue(1),
+      DL,
+      SPReg, NewSP);
+  SDValue Ops[2] = {NewSP, Chain};
+  Chain = DAG.getMergeValues(Ops, 2, DL);
+  return Chain;
+}
+SDValue
+AMDILTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Jump  = Op.getOperand(2);
+  SDValue Result;
+  Result = DAG.getNode(
+      AMDILISD::BRANCH_COND,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Chain, Jump, Cond);
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Chain = Op.getOperand(0);
+  CondCodeSDNode *CCNode = cast<CondCodeSDNode>(Op.getOperand(1));
+  SDValue LHS   = Op.getOperand(2);
+  SDValue RHS   = Op.getOperand(3);
+  SDValue JumpT  = Op.getOperand(4);
+  SDValue CmpValue;
+  ISD::CondCode CC = CCNode->get();
+  SDValue Result;
+  unsigned int cmpOpcode = CondCCodeToCC(
+      CC,
+      LHS.getValueType().getSimpleVT().SimpleTy);
+  CmpValue = DAG.getNode(
+      AMDILISD::CMP,
+      Op.getDebugLoc(),
+      LHS.getValueType(),
+      DAG.getConstant(cmpOpcode, MVT::i32),
+      LHS, RHS);
+  Result = DAG.getNode(
+      AMDILISD::BRANCH_COND,
+      CmpValue.getDebugLoc(),
+      MVT::Other, Chain,
+      JumpT, CmpValue);
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Result = DAG.getNode(
+      AMDILISD::DP_TO_FP,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0),
+      Op.getOperand(1));
+  return Result;
+}
+
+SDValue
+AMDILTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
+{
+  SDValue Result = DAG.getNode(
+      AMDILISD::VCONCAT,
+      Op.getDebugLoc(),
+      Op.getValueType(),
+      Op.getOperand(0),
+      Op.getOperand(1));
+  return Result;
+}
+// LowerReturn - Lower an ISD::RET node.
+SDValue
+AMDILTargetLowering::LowerReturn(SDValue Chain,
+    CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    DebugLoc dl, SelectionDAG &DAG)
+const
+{
+  //MachineFunction& MF = DAG.getMachineFunction();
+  // CCValAssign - represent the assignment of the return value
+  // to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+
+  // Analyze return values of ISD::RET
+  CCInfo.AnalyzeReturn(Outs, RetCC_AMDIL32);
+  // If this is the first return lowered for this function, add
+  // the regs to the liveout set for the function
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) {
+    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) {
+      MRI.addLiveOut(RVLocs[i].getLocReg());
+    }
+  }
+  // FIXME: implement this when tail call is implemented
+  // Chain = GetPossiblePreceedingTailCall(Chain, AMDILISD::TAILCALL);
+  // both x86 and ppc implement this in ISelLowering
+
+  // Regular return here
+  SDValue Flag;
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain);
+  RetOps.push_back(DAG.getConstant(0/*getBytesToPopOnReturn()*/, MVT::i32));
+  for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign &VA = RVLocs[i];
+    SDValue ValToCopy = OutVals[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    // ISD::Ret => ret chain, (regnum1, val1), ...
+    // So i * 2 + 1 indexes only the regnums
+    Chain = DAG.getCopyToReg(Chain,
+        dl,
+        VA.getLocReg(),
+        ValToCopy,
+        Flag);
+    // Guarantee that all emitted copies are glued together,
+    // so the scheduler cannot reorder them
+    Flag = Chain.getValue(1);
+  }
+  /*if (MF.getFunction()->hasStructRetAttr()) {
+    assert(0 && "Struct returns are not yet implemented!");
+  // Both MIPS and X86 have this
+  }*/
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  Flag = DAG.getNode(AMDILISD::RET_FLAG,
+      dl,
+      MVT::Other, &RetOps[0], RetOps.size());
+  return Flag;
+}
+void
+AMDILTargetLowering::generateLongRelational(MachineInstr *MI,
+    unsigned int opCode) const
+{
+  MachineOperand DST = MI->getOperand(0);
+  MachineOperand LHS = MI->getOperand(2);
+  MachineOperand RHS = MI->getOperand(3);
+  unsigned int opi32Code = 0, si32Code = 0;
+  unsigned int simpleVT = MI->getDesc().OpInfo[0].RegClass;
+  uint32_t REGS[12];
+  // All the relationals can be generated with 6 temp registers
+  for (int x = 0; x < 12; ++x) {
+    REGS[x] = genVReg(simpleVT);
+  }
+  // Pull out the high and low components of each 64 bit register
+  generateMachineInst(AMDIL::LHI, REGS[0], LHS.getReg());
+  generateMachineInst(AMDIL::LLO, REGS[1], LHS.getReg());
+  generateMachineInst(AMDIL::LHI, REGS[2], RHS.getReg());
+  generateMachineInst(AMDIL::LLO, REGS[3], RHS.getReg());
+  // Determine the correct opcode that we should use
+  switch(opCode) {
+    default:
+      assert(!"comparison case not handled!");
+      break;
+    case AMDIL::LEQ:
+      si32Code = opi32Code = AMDIL::IEQ;
+      break;
+    case AMDIL::LNE:
+      si32Code = opi32Code = AMDIL::INE;
+      break;
+    case AMDIL::LLE:
+    case AMDIL::ULLE:
+    case AMDIL::LGE:
+    case AMDIL::ULGE:
+      if (opCode == AMDIL::LGE || opCode == AMDIL::ULGE) {
+        std::swap(REGS[0], REGS[2]);
+      } else {
+        std::swap(REGS[1], REGS[3]);
+      }
+      if (opCode == AMDIL::LLE || opCode == AMDIL::LGE) {
+        opi32Code = AMDIL::ILT;
+      } else {
+        opi32Code = AMDIL::ULT;
+      }
+      si32Code = AMDIL::UGE;
+      break;
+    case AMDIL::LGT:
+    case AMDIL::ULGT:
+      std::swap(REGS[0], REGS[2]);
+      std::swap(REGS[1], REGS[3]);
+      // fall through to the less-than cases with operands swapped
+    case AMDIL::LLT:
+    case AMDIL::ULLT:
+      if (opCode == AMDIL::LGT || opCode == AMDIL::LLT) {
+        opi32Code = AMDIL::ILT;
+      } else {
+        opi32Code = AMDIL::ULT;
+      }
+      si32Code = AMDIL::ULT;
+      break;
+  }
+  // Do the initial opcode on the high and low components.
+  // This leaves the following:
+  // REGS[4] = L_HI OP R_HI
+  // REGS[5] = L_LO OP R_LO
+  generateMachineInst(opi32Code, REGS[4], REGS[0], REGS[2]);
+  generateMachineInst(si32Code, REGS[5], REGS[1], REGS[3]);
+  switch(opi32Code) {
+    case AMDIL::IEQ:
+    case AMDIL::INE:
+      {
+        // Combine the results with AND for eq or OR for ne
+        uint32_t combineOp = (opi32Code == AMDIL::IEQ)
+          ? AMDIL::BINARY_AND_i32 : AMDIL::BINARY_OR_i32;
+        generateMachineInst(combineOp, REGS[11], REGS[4], REGS[5]);
+      }
+      break;
+    default:
+      // this finishes codegen for the following pattern
+      // REGS[4] || (REGS[5] && (L_HI == R_HI))
+      generateMachineInst(AMDIL::IEQ, REGS[9], REGS[0], REGS[2]);
+      generateMachineInst(AMDIL::BINARY_AND_i32, REGS[10], REGS[5],
+          REGS[9]);
+      generateMachineInst(AMDIL::BINARY_OR_i32, REGS[11], REGS[4],
+          REGS[10]);
+      break;
+  }
+  generateMachineInst(AMDIL::LCREATE, DST.getReg(), REGS[11], REGS[11]);
+}
+
+unsigned int
+AMDILTargetLowering::getFunctionAlignment(const Function *) const
+{
+  return 0;
+}
+
+void
+AMDILTargetLowering::setPrivateData(MachineBasicBlock *BB,
+    MachineBasicBlock::iterator &BBI,
+    DebugLoc *DL, const TargetInstrInfo *TII) const
+{
+  mBB = BB;
+  mBBI = BBI;
+  mDL = DL;
+  mTII = TII;
+}
+uint32_t
+AMDILTargetLowering::genVReg(uint32_t regType) const
+{
+  return mBB->getParent()->getRegInfo().createVirtualRegister(
+      getRegClassFromID(regType));
+}
+
+MachineInstrBuilder
+AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst) const
+{
+  return BuildMI(*mBB, mBBI, *mDL, mTII->get(opcode), dst);
+}
+
+MachineInstrBuilder
+AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst,
+    uint32_t src1) const
+{
+  return generateMachineInst(opcode, dst).addReg(src1);
+}
+
+MachineInstrBuilder
+AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst,
+    uint32_t src1, uint32_t src2) const
+{
+  return generateMachineInst(opcode, dst, src1).addReg(src2);
+}
+
+MachineInstrBuilder
+AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst,
+    uint32_t src1, uint32_t src2, uint32_t src3) const
+{
+  return generateMachineInst(opcode, dst, src1, src2).addReg(src3);
+}
+
+
+SDValue
+AMDILTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT INTTY;
+  MVT FLTTY;
+  if (!OVT.isVector()) {
+    INTTY = MVT::i32;
+    FLTTY = MVT::f32;
+  } else if (OVT.getVectorNumElements() == 2) {
+    INTTY = MVT::v2i32;
+    FLTTY = MVT::v2f32;
+  } else if (OVT.getVectorNumElements() == 4) {
+    INTTY = MVT::v4i32;
+    FLTTY = MVT::v4f32;
+  }
+  unsigned bitsize = OVT.getScalarType().getSizeInBits();
+  // char|short jq = ia ^ ib;
+  SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
+
+  // jq = jq >> (bitsize - 2)
+  jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); 
+
+  // jq = jq | 0x1
+  jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
+
+  // jq = (int)jq
+  jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
+
+  // int ia = (int)LHS;
+  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+  // int ib = (int)RHS;
+  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+  // float fa = (float)ia;
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+  // float fb = (float)ib;
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+  // float fq = native_divide(fa, fb);
+  SDValue fq = DAG.getNode(AMDILISD::DIV_INF, DL, FLTTY, fa, fb);
+
+  // fq = trunc(fq);
+  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+  // float fqneg = -fq;
+  SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
+
+  // float fr = mad(fqneg, fb, fa);
+  SDValue fr = DAG.getNode(AMDILISD::MAD, DL, FLTTY, fqneg, fb, fa);
+
+  // int iq = (int)fq;
+  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+
+  // fr = fabs(fr);
+  fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
+
+  // fb = fabs(fb);
+  fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
+
+  // int cv = fr >= fb;
+  SDValue cv;
+  if (INTTY == MVT::i32) {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  } else {
+    cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
+  }
+  // jq = (cv ? jq : 0);
+  jq = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, cv, jq, 
+      DAG.getConstant(0, OVT));
+  // dst = iq + jq;
+  iq = DAG.getSExtOrTrunc(iq, DL, OVT);
+  iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
+  return iq;
+}
+
+SDValue
+AMDILTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSDIV32 function generates the equivalent of the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r0, r0, r1
+  // ixor r10, r10, r11
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getNode(AMDILISD::CMP, DL, OVT,
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r0, DAG.getConstant(0, OVT));
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getNode(AMDILISD::CMP, DL, OVT, 
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r1, DAG.getConstant(0, OVT));
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r0, r0, r1
+  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // ixor r10, r10, r11
+  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDILTargetLowering::LowerUDIV24(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  MVT INTTY;
+  MVT FLTTY;
+  if (!OVT.isVector()) {
+    INTTY = MVT::i32;
+    FLTTY = MVT::f32;
+  } else if (OVT.getVectorNumElements() == 2) {
+    INTTY = MVT::v2i32;
+    FLTTY = MVT::v2f32;
+  } else if (OVT.getVectorNumElements() == 4) {
+    INTTY = MVT::v4i32;
+    FLTTY = MVT::v4f32;
+  }
+
+  // The LowerUDIV24 function implements the following CL.
+  // int ia = (int)LHS
+  // float fa = (float)ia
+  // int ib = (int)RHS
+  // float fb = (float)ib
+  // float fq = native_divide(fa, fb)
+  // fq = trunc(fq)
+  // float t = mad(fq, fb, fb)
+  // int iq = (int)fq - (t <= fa)
+  // return (type)iq
+
+  // int ia = (int)LHS
+  SDValue ia = DAG.getZExtOrTrunc(LHS, DL, INTTY);
+
+  // float fa = (float)ia
+  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
+
+  // int ib = (int)RHS
+  SDValue ib = DAG.getZExtOrTrunc(RHS, DL, INTTY);
+
+  // float fb = (float)ib
+  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
+
+  // float fq = native_divide(fa, fb)
+  SDValue fq = DAG.getNode(AMDILISD::DIV_INF, DL, FLTTY, fa, fb);
+
+  // fq = trunc(fq)
+  fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
+
+  // float t = mad(fq, fb, fb)
+  SDValue t = DAG.getNode(AMDILISD::MAD, DL, FLTTY, fq, fb, fb);
+
+  // int iq = (int)fq - (t <= fa)  // sub, not add: the GPU setcc returns 0 or -1
+  SDValue iq;
+  fq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
+  iq = DAG.getSetCC(DL, INTTY, t, fa, ISD::SETOLE);
+  iq = DAG.getNode(ISD::ADD, DL, INTTY, fq, iq);
+
+  // return (type)iq
+  iq = DAG.getZExtOrTrunc(iq, DL, OVT);
+  return iq;
+}
+
+SDValue
+AMDILTargetLowering::LowerUDIV32(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDILTargetLowering::LowerUDIV64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+SDValue
+AMDILTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i8) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i8) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i16) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i16) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates the equivalent of the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r20, r0, r1
+  // umul r20, r20, r1
+  // sub r0, r0, r20
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getNode(AMDILISD::CMP, DL, OVT,
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r0, DAG.getConstant(0, OVT));
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getNode(AMDILISD::CMP, DL, OVT, 
+      DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32),
+      r1, DAG.getConstant(0, OVT));
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // umul r20, r20, r1
+  r20 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r20, r1);
+
+  // sub r0, r0, r20
+  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); 
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDILTargetLowering::LowerUREM8(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i8) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i8) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerUREM8 function generates the equivalent of the following IL.
+  // mov r0, as_u32(LHS)
+  // mov r1, as_u32(RHS)
+  // and r10, r0, 0xFF
+  // and r11, r1, 0xFF
+  // cmov_logical r3, r11, r11, 0x1
+  // udiv r3, r10, r3
+  // cmov_logical r3, r11, r3, 0
+  // umul r3, r3, r11
+  // sub r3, r10, r3
+  // and as_u8(DST), r3, 0xFF
+
+  // mov r0, as_u32(LHS)
+  SDValue r0 = DAG.getSExtOrTrunc(LHS, DL, INTTY);
+
+  // mov r1, as_u32(RHS)
+  SDValue r1 = DAG.getSExtOrTrunc(RHS, DL, INTTY);
+
+  // and r10, r0, 0xFF
+  SDValue r10 = DAG.getNode(ISD::AND, DL, INTTY, r0, 
+      DAG.getConstant(0xFF, INTTY));
+
+  // and r11, r1, 0xFF
+  SDValue r11 = DAG.getNode(ISD::AND, DL, INTTY, r1, 
+      DAG.getConstant(0xFF, INTTY));
+
+  // cmov_logical r3, r11, r11, 0x1
+  SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, r11, r11,
+      DAG.getConstant(0x01, INTTY));
+
+  // udiv r3, r10, r3
+  r3 = DAG.getNode(ISD::UDIV, DL, INTTY, r10, r3);
+
+  // cmov_logical r3, r11, r3, 0
+  r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, r11, r3,
+      DAG.getConstant(0, INTTY));
+
+  // umul r3, r3, r11
+  r3 = DAG.getNode(AMDILISD::UMUL, DL, INTTY, r3, r11);
+
+  // sub r3, r10, r3
+  r3 = DAG.getNode(ISD::SUB, DL, INTTY, r10, r3);
+
+  // and as_u8(DST), r3, 0xFF
+  SDValue DST = DAG.getNode(ISD::AND, DL, INTTY, r3,
+      DAG.getConstant(0xFF, INTTY));
+  DST = DAG.getZExtOrTrunc(DST, DL, OVT);
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerUREM16(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i16) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i16) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerUREM16 function generates the equivalent of the following IL.
+  // mov r0, LHS
+  // mov r1, RHS
+  // DIV = LowerUDIV16(LHS, RHS)
+  // and r10, r0, 0xFFFF
+  // and r11, r1, 0xFFFF
+  // cmov_logical r3, r11, r11, 0x1
+  // udiv as_u16(r3), as_u32(r10), as_u32(r3)
+  // and r3, r3, 0xFFFF
+  // cmov_logical r3, r11, r3, 0
+  // umul r3, r3, r11
+  // sub r3, r10, r3
+  // and DST, r3, 0xFFFF
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // and r10, r0, 0xFFFF
+  SDValue r10 = DAG.getNode(ISD::AND, DL, OVT, r0, 
+      DAG.getConstant(0xFFFF, OVT));
+
+  // and r11, r1, 0xFFFF
+  SDValue r11 = DAG.getNode(ISD::AND, DL, OVT, r1, 
+      DAG.getConstant(0xFFFF, OVT));
+
+  // cmov_logical r3, r11, r11, 0x1
+  SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r11, r11,
+      DAG.getConstant(0x01, OVT));
+
+  // udiv as_u16(r3), as_u32(r10), as_u32(r3)
+  r10 = DAG.getZExtOrTrunc(r10, DL, INTTY);
+  r3 = DAG.getZExtOrTrunc(r3, DL, INTTY);
+  r3 = DAG.getNode(ISD::UDIV, DL, INTTY, r10, r3);
+  r3 = DAG.getZExtOrTrunc(r3, DL, OVT);
+  r10 = DAG.getZExtOrTrunc(r10, DL, OVT);
+
+  // and r3, r3, 0xFFFF
+  r3 = DAG.getNode(ISD::AND, DL, OVT, r3, 
+      DAG.getConstant(0xFFFF, OVT));
+
+  // cmov_logical r3, r11, r3, 0
+  r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r11, r3,
+      DAG.getConstant(0, OVT));
+  // umul r3, r3, r11
+  r3 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r3, r11);
+
+  // sub r3, r10, r3
+  r3 = DAG.getNode(ISD::SUB, DL, OVT, r10, r3);
+
+  // and DST, r3, 0xFFFF
+  SDValue DST = DAG.getNode(ISD::AND, DL, OVT, r3,
+      DAG.getConstant(0xFFFF, OVT));
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerUREM32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerUREM32 function generates the equivalent of the following IL.
+  // udiv r20, LHS, RHS
+  // umul r20, r20, RHS
+  // sub DST, LHS, r20
+
+  // udiv r20, LHS, RHS
+  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, LHS, RHS);
+
+  // umul r20, r20, RHS
+  r20 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r20, RHS);
+
+  // sub DST, LHS, r20
+  SDValue DST = DAG.getNode(ISD::SUB, DL, OVT, LHS, r20);
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerUREM64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
+
+
+SDValue
+AMDILTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const
+{
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2f32) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4f32) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue DST;
+  const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>(
+      &this->getTargetMachine())->getSubtargetImpl();
+  if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // TODO: This doesn't work for vector types yet
+    // The LowerFDIV32 function generates the equivalent of the
+    // following IL:
+    // mov r20, as_int(LHS)
+    // mov r21, as_int(RHS)
+    // and r30, r20, 0x7f800000
+    // and r31, r21, 0x7f800000
+    // and r32, r20, 0x807FFFFF
+    // and r33, r21, 0x807FFFFF
+    // ieq r40, r30, 0x7F800000
+    // ieq r41, r31, 0x7F800000
+    // ieq r42, r30, 0
+    // ieq r43, r31, 0
+    // and r50, r20, 0x80000000
+    // and r51, r21, 0x80000000
+    // ior r32, r32, 0x3f800000
+    // ior r33, r33, 0x3f800000
+    // cmov_logical r32, r42, r50, r32
+    // cmov_logical r33, r43, r51, r33
+    // cmov_logical r32, r40, r20, r32
+    // cmov_logical r33, r41, r21, r33
+    // ior r50, r40, r41
+    // ior r51, r42, r43
+    // ior r50, r50, r51
+    // inegate r52, r31
+    // iadd r30, r30, r52
+    // cmov_logical r30, r50, 0, r30
+    // div_zeroop(infinity) r21, 1.0, r33
+    // mul_ieee r20, r32, r21
+    // and r22, r20, 0x7FFFFFFF
+    // and r23, r20, 0x80000000
+    // ishr r60, r22, 0x00000017
+    // ishr r61, r30, 0x00000017
+    // iadd r20, r20, r30
+    // iadd r21, r22, r30
+    // iadd r60, r60, r61
+    // ige r42, 0, R60
+    // ior r41, r23, 0x7F800000
+    // ige r40, r60, 0x000000FF
+    // cmov_logical r40, r50, 0, r40
+    // cmov_logical r20, r42, r23, r20
+    // cmov_logical DST, r40, r41, r20
+    // as_float(DST)
+
+    // mov r20, as_int(LHS)
+    SDValue R20 = DAG.getNode(ISDBITCAST, DL, INTTY, LHS);
+
+    // mov r21, as_int(RHS)
+    SDValue R21 = DAG.getNode(ISDBITCAST, DL, INTTY, RHS);
+
+    // and r30, r20, 0x7f800000
+    SDValue R30 = DAG.getNode(ISD::AND, DL, INTTY, R20,
+        DAG.getConstant(0x7F800000, INTTY));
+
+    // and r31, r21, 0x7f800000
+    SDValue R31 = DAG.getNode(ISD::AND, DL, INTTY, R21,
+        DAG.getConstant(0x7f800000, INTTY));
+
+    // and r32, r20, 0x807FFFFF
+    SDValue R32 = DAG.getNode(ISD::AND, DL, INTTY, R20, 
+        DAG.getConstant(0x807FFFFF, INTTY));
+
+    // and r33, r21, 0x807FFFFF
+    SDValue R33 = DAG.getNode(ISD::AND, DL, INTTY, R21, 
+        DAG.getConstant(0x807FFFFF, INTTY));
+
+    // ieq r40, r30, 0x7F800000
+    SDValue R40 = DAG.getNode(AMDILISD::CMP, DL, INTTY, 
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        R30, DAG.getConstant(0x7F800000, INTTY));
+
+    // ieq r41, r31, 0x7F800000
+    SDValue R41 = DAG.getNode(AMDILISD::CMP, DL, INTTY, 
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        R31, DAG.getConstant(0x7F800000, INTTY));
+
+    // ieq r42, r30, 0
+    SDValue R42 = DAG.getNode(AMDILISD::CMP, DL, INTTY, 
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        R30, DAG.getConstant(0, INTTY));
+
+    // ieq r43, r31, 0
+    SDValue R43 = DAG.getNode(AMDILISD::CMP, DL, INTTY, 
+        DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32),
+        R31, DAG.getConstant(0, INTTY));
+
+    // and r50, r20, 0x80000000
+    SDValue R50 = DAG.getNode(ISD::AND, DL, INTTY, R20,
+        DAG.getConstant(0x80000000, INTTY));
+
+    // and r51, r21, 0x80000000
+    SDValue R51 = DAG.getNode(ISD::AND, DL, INTTY, R21,
+        DAG.getConstant(0x80000000, INTTY));
+
+    // ior r32, r32, 0x3f800000
+    R32 = DAG.getNode(ISD::OR, DL, INTTY, R32, 
+        DAG.getConstant(0x3F800000, INTTY));
+
+    // ior r33, r33, 0x3f800000
+    R33 = DAG.getNode(ISD::OR, DL, INTTY, R33, 
+        DAG.getConstant(0x3F800000, INTTY));
+
+    // cmov_logical r32, r42, r50, r32
+    R32 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R42, R50, R32);
+
+    // cmov_logical r33, r43, r51, r33
+    R33 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R43, R51, R33);
+
+    // cmov_logical r32, r40, r20, r32
+    R32 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R40, R20, R32);
+
+    // cmov_logical r33, r41, r21, r33
+    R33 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R41, R21, R33);
+
+    // ior r50, r40, r41
+    R50 = DAG.getNode(ISD::OR, DL, INTTY, R40, R41);
+
+    // ior r51, r42, r43
+    R51 = DAG.getNode(ISD::OR, DL, INTTY, R42, R43);
+
+    // ior r50, r50, r51
+    R50 = DAG.getNode(ISD::OR, DL, INTTY, R50, R51);
+
+    // inegate r52, r31
+    SDValue R52 = DAG.getNode(AMDILISD::INEGATE, DL, INTTY, R31);
+
+    // iadd r30, r30, r52
+    R30 = DAG.getNode(ISD::ADD, DL, INTTY, R30, R52);
+
+    // cmov_logical r30, r50, 0, r30
+    R30 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R50,
+        DAG.getConstant(0, INTTY), R30);
+
+    // div_zeroop(infinity) r21, 1.0, as_float(r33)
+    R33 = DAG.getNode(ISDBITCAST, DL, OVT, R33);
+    R21 = DAG.getNode(AMDILISD::DIV_INF, DL, OVT, 
+        DAG.getConstantFP(1.0f, OVT), R33);
+
+    // mul_ieee as_int(r20), as_float(r32), r21
+    R32 = DAG.getNode(ISDBITCAST, DL, OVT, R32);
+    R20 = DAG.getNode(ISD::FMUL, DL, OVT, R32, R21);
+    R20 = DAG.getNode(ISDBITCAST, DL, INTTY, R20);
+
+    // and r22, r20, 0x7FFFFFFF
+    SDValue R22 = DAG.getNode(ISD::AND, DL, INTTY, R20,
+        DAG.getConstant(0x7FFFFFFF, INTTY));
+
+    // and r23, r20, 0x80000000
+    SDValue R23 = DAG.getNode(ISD::AND, DL, INTTY, R20,
+        DAG.getConstant(0x80000000, INTTY));
+
+    // ishr r60, r22, 0x00000017
+    SDValue R60 = DAG.getNode(ISD::SRA, DL, INTTY, R22,
+        DAG.getConstant(0x00000017, INTTY));
+
+    // ishr r61, r30, 0x00000017
+    SDValue R61 = DAG.getNode(ISD::SRA, DL, INTTY, R30,
+        DAG.getConstant(0x00000017, INTTY));
+
+    // iadd r20, r20, r30
+    R20 = DAG.getNode(ISD::ADD, DL, INTTY, R20, R30);
+
+    // iadd r21, r22, r30
+    R21 = DAG.getNode(ISD::ADD, DL, INTTY, R22, R30);
+
+    // iadd r60, r60, r61
+    R60 = DAG.getNode(ISD::ADD, DL, INTTY, R60, R61);
+
+    // ige r42, 0, r60
+    R42 = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32),
+        DAG.getConstant(0, INTTY),
+        R60);
+
+    // ior r41, r23, 0x7F800000
+    R41 = DAG.getNode(ISD::OR, DL, INTTY, R23,
+        DAG.getConstant(0x7F800000, INTTY));
+
+    // ige r40, r60, 0x000000FF
+    R40 = DAG.getNode(AMDILISD::CMP, DL, INTTY,
+        DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32),
+        R60,
+        DAG.getConstant(0x000000FF, INTTY));
+
+    // cmov_logical r40, r50, 0, r40
+    R40 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R50,
+        DAG.getConstant(0, INTTY),
+        R40);
+
+    // cmov_logical r20, r42, r23, r20
+    R20 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R42, R23, R20);
+
+    // cmov_logical DST, r40, r41, r20
+    DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R40, R41, R20);
+
+    // as_float(DST)
+    DST = DAG.getNode(ISDBITCAST, DL, OVT, DST);
+  } else {
+    // The following sequence of DAG nodes produces the following IL:
+    // fabs r1, RHS
+    // lt r2, 0x1.0p+96f, r1
+    // cmov_logical r3, r2, 0x1.0p-23f, 1.0f
+    // mul_ieee r1, RHS, r3
+    // div_zeroop(infinity) r0, LHS, r1
+    // mul_ieee DST, r0, r3
+
+    // fabs r1, RHS
+    SDValue r1 = DAG.getNode(ISD::FABS, DL, OVT, RHS);   
+    // lt r2, 0x1.0p+96f, r1
+    SDValue r2 = DAG.getNode(AMDILISD::CMP, DL, OVT,
+        DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::f32), MVT::i32),
+        DAG.getConstant(0x6f800000, INTTY), r1);
+    // cmov_logical r3, r2, 0x1.0p-23f, 1.0f
+    SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r2,
+        DAG.getConstant(0x2f800000, INTTY),
+        DAG.getConstant(0x3f800000, INTTY));
+    // mul_ieee r1, RHS, r3
+    r1 = DAG.getNode(ISD::FMUL, DL, OVT, RHS, r3);
+    // div_zeroop(infinity) r0, LHS, r1
+    SDValue r0 = DAG.getNode(AMDILISD::DIV_INF, DL, OVT, LHS, r1);
+    // mul_ieee DST, r0, r3
+    DST = DAG.getNode(ISD::FMUL, DL, OVT, r0, r3);
+  }
+  return DST;
+}
+
+SDValue
+AMDILTargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const
+{
+  return SDValue(Op.getNode(), 0);
+}
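The `else` branch of LowerFDIV32 above pre-scales very large divisors (|RHS| > 0x1.0p+96) by 0x1.0p-23 before the reciprocal-based `div_zeroop`, then applies the same scale to the quotient, presumably to keep 1/RHS out of the flushed-denormal range. A rough Python sketch of that idea, rounding through float32 at each step (the `f32` helper and the function name are mine, not part of this code):

```python
import struct

def f32(x):
    # Round a Python double to the nearest float32, as mul_ieee/div would.
    return struct.unpack('<f', struct.pack('<f', x))[0]

def fdiv32_fast(lhs, rhs):
    r1 = abs(rhs)                     # fabs r1, RHS
    r2 = 2.0 ** 96 < r1               # lt r2, 0x1.0p+96f, r1
    r3 = 2.0 ** -23 if r2 else 1.0    # cmov_logical r3, r2, 0x1.0p-23f, 1.0f
    r1 = f32(rhs * r3)                # mul_ieee r1, RHS, r3
    # div_zeroop(infinity): a zero divisor produces infinity, not a fault.
    r0 = f32(lhs / r1) if r1 != 0.0 else float('inf')
    return f32(r0 * r3)               # mul_ieee DST, r0, r3
```

Since r3 is a power of two, the two extra multiplies are exact, so LHS/(RHS*r3)*r3 equals LHS/RHS.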
diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.h b/src/gallium/drivers/radeon/AMDILISelLowering.h
new file mode 100644 (file)
index 0000000..302f0cb
--- /dev/null
@@ -0,0 +1,527 @@
+//===-- AMDILISelLowering.h - AMDIL DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that AMDIL uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDIL_ISELLOWERING_H_
+#define AMDIL_ISELLOWERING_H_
+#include "AMDIL.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm
+{
+  namespace AMDILISD
+  {
+    enum
+    {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      INTTOANY,    // Dummy instruction that takes an int and
+      // converts the SDNode to any type
+      DP_TO_FP,    // Conversion from 64bit FP to 32bit FP
+      FP_TO_DP,    // Conversion from 32bit FP to 64bit FP
+      BITCONV,     // instruction that converts from any type to any type
+      CMOV,        // 32bit FP Conditional move instruction
+      CMOVLOG,     // 32bit FP Conditional move logical instruction
+      SELECT,      // 32bit FP select instruction
+      SETCC,       // 32bit FP set-on-condition instruction
+      ISGN,        // 32bit Int Sign instruction
+      INEGATE,     // 32bit Int Negation instruction
+      MAD,         // 32bit Fused Multiply Add instruction
+      ADD,         // 32/64 bit pseudo instruction
+      AND,         // 128 bit and instruction
+      OR,          // 128 bit or instruction
+      NOT,         // 128 bit not instruction
+      XOR,         // 128 bit xor instruction
+      MOVE,        // generic mov instruction
+      PHIMOVE,     // generic phi-node mov instruction
+      VBUILD,      // scalar to vector mov instruction
+      VEXTRACT,    // extract vector components
+      VINSERT,     // insert vector components
+      VCONCAT,     // concat a single vector to another vector
+      UMAD,        // 32bit UInt Fused Multiply Add instruction
+      CALL,        // Function call based on a single integer
+      RET,         // Return from a function call
+      SELECT_CC,   // Select the correct conditional instruction
+      BRCC,        // Select the correct branch instruction
+      CMPCC,       // Compare to GPR operands
+      CMPICC,      // Compare two GPR operands, set icc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags
+      SELECT_FCC,  // Select between two values using the current FCC flags
+      LCREATE,     // Create a 64bit integer from two 32 bit integers
+      LCOMPHI,     // Get the hi 32 bits from a 64 bit integer
+      LCOMPLO,     // Get the lo 32 bits from a 64 bit integer
+      DCREATE,     // Create a 64bit float from two 32 bit integers
+      DCOMPHI,     // Get the hi 32 bits from a 64 bit float
+      DCOMPLO,     // Get the lo 32 bits from a 64 bit float
+      LCREATE2,     // Create a 64bit integer from two 32 bit integers
+      LCOMPHI2,     // Get the hi 32 bits from a 64 bit integer
+      LCOMPLO2,     // Get the lo 32 bits from a 64 bit integer
+      DCREATE2,     // Create a 64bit float from two 32 bit integers
+      DCOMPHI2,     // Get the hi 32 bits from a 64 bit float
+      DCOMPLO2,     // Get the lo 32 bits from a 64 bit float
+      UMUL,        // 32bit unsigned multiplication
+      IFFB_HI,  // 32bit find first hi bit instruction
+      IFFB_LO,  // 32bit find first low bit instruction
+      DIV_INF,      // Divide with infinity returned on zero divisor
+      SMAX,        // Signed integer max
+      CMP,
+      IL_CC_I_GT,
+      IL_CC_I_LT,
+      IL_CC_I_GE,
+      IL_CC_I_LE,
+      IL_CC_I_EQ,
+      IL_CC_I_NE,
+      RET_FLAG,
+      BRANCH_COND,
+      LOOP_NZERO,
+      LOOP_ZERO,
+      LOOP_CMP,
+      ADDADDR,
+      // ATOMIC Operations
+      // Global Memory
+      ATOM_G_ADD = ISD::FIRST_TARGET_MEMORY_OPCODE,
+      ATOM_G_AND,
+      ATOM_G_CMPXCHG,
+      ATOM_G_DEC,
+      ATOM_G_INC,
+      ATOM_G_MAX,
+      ATOM_G_UMAX,
+      ATOM_G_MIN,
+      ATOM_G_UMIN,
+      ATOM_G_OR,
+      ATOM_G_SUB,
+      ATOM_G_RSUB,
+      ATOM_G_XCHG,
+      ATOM_G_XOR,
+      ATOM_G_ADD_NORET,
+      ATOM_G_AND_NORET,
+      ATOM_G_CMPXCHG_NORET,
+      ATOM_G_DEC_NORET,
+      ATOM_G_INC_NORET,
+      ATOM_G_MAX_NORET,
+      ATOM_G_UMAX_NORET,
+      ATOM_G_MIN_NORET,
+      ATOM_G_UMIN_NORET,
+      ATOM_G_OR_NORET,
+      ATOM_G_SUB_NORET,
+      ATOM_G_RSUB_NORET,
+      ATOM_G_XCHG_NORET,
+      ATOM_G_XOR_NORET,
+      // Local Memory
+      ATOM_L_ADD,
+      ATOM_L_AND,
+      ATOM_L_CMPXCHG,
+      ATOM_L_DEC,
+      ATOM_L_INC,
+      ATOM_L_MAX,
+      ATOM_L_UMAX,
+      ATOM_L_MIN,
+      ATOM_L_UMIN,
+      ATOM_L_OR,
+      ATOM_L_MSKOR,
+      ATOM_L_SUB,
+      ATOM_L_RSUB,
+      ATOM_L_XCHG,
+      ATOM_L_XOR,
+      ATOM_L_ADD_NORET,
+      ATOM_L_AND_NORET,
+      ATOM_L_CMPXCHG_NORET,
+      ATOM_L_DEC_NORET,
+      ATOM_L_INC_NORET,
+      ATOM_L_MAX_NORET,
+      ATOM_L_UMAX_NORET,
+      ATOM_L_MIN_NORET,
+      ATOM_L_UMIN_NORET,
+      ATOM_L_OR_NORET,
+      ATOM_L_MSKOR_NORET,
+      ATOM_L_SUB_NORET,
+      ATOM_L_RSUB_NORET,
+      ATOM_L_XCHG_NORET,
+      ATOM_L_XOR_NORET,
+      // Region Memory
+      ATOM_R_ADD,
+      ATOM_R_AND,
+      ATOM_R_CMPXCHG,
+      ATOM_R_DEC,
+      ATOM_R_INC,
+      ATOM_R_MAX,
+      ATOM_R_UMAX,
+      ATOM_R_MIN,
+      ATOM_R_UMIN,
+      ATOM_R_OR,
+      ATOM_R_MSKOR,
+      ATOM_R_SUB,
+      ATOM_R_RSUB,
+      ATOM_R_XCHG,
+      ATOM_R_XOR,
+      ATOM_R_ADD_NORET,
+      ATOM_R_AND_NORET,
+      ATOM_R_CMPXCHG_NORET,
+      ATOM_R_DEC_NORET,
+      ATOM_R_INC_NORET,
+      ATOM_R_MAX_NORET,
+      ATOM_R_UMAX_NORET,
+      ATOM_R_MIN_NORET,
+      ATOM_R_UMIN_NORET,
+      ATOM_R_OR_NORET,
+      ATOM_R_MSKOR_NORET,
+      ATOM_R_SUB_NORET,
+      ATOM_R_RSUB_NORET,
+      ATOM_R_XCHG_NORET,
+      ATOM_R_XOR_NORET,
+      // Append buffer
+      APPEND_ALLOC,
+      APPEND_ALLOC_NORET,
+      APPEND_CONSUME,
+      APPEND_CONSUME_NORET,
+      // 2D Images
+      IMAGE2D_READ,
+      IMAGE2D_WRITE,
+      IMAGE2D_INFO0,
+      IMAGE2D_INFO1,
+      // 3D Images
+      IMAGE3D_READ,
+      IMAGE3D_WRITE,
+      IMAGE3D_INFO0,
+      IMAGE3D_INFO1,
+
+      LAST_ISD_NUMBER
+    };
+  } // AMDILISD
+
+  class MachineBasicBlock;
+  class MachineInstr;
+  class DebugLoc;
+  class TargetInstrInfo;
+
+  class AMDILTargetLowering : public TargetLowering
+  {
+    private:
+      int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+    public:
+      AMDILTargetLowering(TargetMachine &TM);
+
+      virtual SDValue
+        LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+
+      int
+        getVarArgsFrameOffset() const;
+
+      /// computeMaskedBitsForTargetNode - Determine which of the bits
+      /// specified in Mask are known to be either zero or one and
+      /// return them in the KnownZero/KnownOne bitsets.
+      virtual void
+        computeMaskedBitsForTargetNode(
+            const SDValue Op,
+            APInt &KnownZero,
+            APInt &KnownOne,
+            const SelectionDAG &DAG,
+            unsigned Depth = 0
+            ) const;
+
+      virtual MachineBasicBlock*
+        EmitInstrWithCustomInserter(
+            MachineInstr *MI,
+            MachineBasicBlock *MBB) const;
+
+      virtual bool 
+        getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                  const CallInst &I, unsigned Intrinsic) const;
+      virtual const char*
+        getTargetNodeName(
+            unsigned Opcode
+            ) const;
+      // We want to mark f32/f64 floating point values as
+      // legal
+      bool
+        isFPImmLegal(const APFloat &Imm, EVT VT) const;
+      // We don't want to shrink f64/f32 constants because
+      // they both take up the same amount of space and
+      // we don't want to use a f2d instruction.
+      bool ShouldShrinkFPConstant(EVT VT) const;
+
+      /// getFunctionAlignment - Return the Log2 alignment of this
+      /// function.
+      virtual unsigned int
+        getFunctionAlignment(const Function *F) const;
+
+    private:
+      CCAssignFn*
+        CCAssignFnForNode(unsigned int CC) const;
+
+      SDValue LowerCallResult(SDValue Chain,
+          SDValue InFlag,
+          CallingConv::ID CallConv,
+          bool isVarArg,
+          const SmallVectorImpl<ISD::InputArg> &Ins,
+          DebugLoc dl,
+          SelectionDAG &DAG,
+          SmallVectorImpl<SDValue> &InVals) const;
+
+      SDValue LowerMemArgument(SDValue Chain,
+          CallingConv::ID CallConv,
+          const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+          DebugLoc dl, SelectionDAG &DAG,
+          const CCValAssign &VA,  MachineFrameInfo *MFI,
+          unsigned i) const;
+
+      SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
+          SDValue Arg,
+          DebugLoc dl, SelectionDAG &DAG,
+          const CCValAssign &VA,
+          ISD::ArgFlagsTy Flags) const;
+
+      virtual SDValue
+        LowerFormalArguments(SDValue Chain,
+            CallingConv::ID CallConv, bool isVarArg,
+            const SmallVectorImpl<ISD::InputArg> &Ins,
+            DebugLoc dl, SelectionDAG &DAG,
+            SmallVectorImpl<SDValue> &InVals) const;
+
+      virtual SDValue
+        LowerCall(SDValue Chain, SDValue Callee,
+            CallingConv::ID CallConv, bool isVarArg, bool doesNotRet,
+            bool &isTailCall,
+            const SmallVectorImpl<ISD::OutputArg> &Outs,
+            const SmallVectorImpl<SDValue> &OutVals,
+            const SmallVectorImpl<ISD::InputArg> &Ins,
+            DebugLoc dl, SelectionDAG &DAG,
+            SmallVectorImpl<SDValue> &InVals) const;
+
+      virtual SDValue
+        LowerReturn(SDValue Chain,
+            CallingConv::ID CallConv, bool isVarArg,
+            const SmallVectorImpl<ISD::OutputArg> &Outs,
+            const SmallVectorImpl<SDValue> &OutVals,
+            DebugLoc dl, SelectionDAG &DAG) const;
+
+      //+++--- Functions dealing with conversions between floating point
+      //and integer types ---+++//
+      SDValue
+        genCLZu64(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        genCLZuN(SDValue Op, SelectionDAG &DAG, uint32_t bits) const;
+      SDValue
+        genCLZu32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        genf64toi32(SDValue Op, SelectionDAG &DAG,
+            bool includeSign) const;
+
+      SDValue
+        genf64toi64(SDValue Op, SelectionDAG &DAG,
+            bool includeSign) const;
+
+      SDValue
+        genu32tof64(SDValue Op, EVT dblvt, SelectionDAG &DAG) const;
+
+      SDValue
+        genu64tof64(SDValue Op, EVT dblvt, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG& DAG) const;
+
+      SDValue
+        LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG& DAG) const;
+
+      SDValue
+        LowerINTRINSIC_VOID(SDValue Op, SelectionDAG& DAG) const;
+
+      SDValue
+        LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerADD(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSUB(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSREM(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM8(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM16(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSREM64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerUREM(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUREM8(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUREM16(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUREM32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUREM64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSDIV24(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSDIV32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerSDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerUDIV(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUDIV24(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUDIV32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerUDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerAND(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerOR(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+
+      EVT
+        genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
+
+      SDValue
+        LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
+
+      SDValue
+        LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+      SDValue
+        LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+      void
+        generateCMPInstr(MachineInstr*, MachineBasicBlock*,
+            const TargetInstrInfo&) const;
+      MachineOperand
+        convertToReg(MachineOperand) const;
+
+      // Private members used by the set of instruction generation
+      // functions. These are marked mutable because they are cached so
+      // that they don't have to be looked up every time the
+      // generateMachineInst/genVReg helpers are used, which keeps that
+      // code simpler and cleaner. The object itself doesn't change, as
+      // only these functions use these cached values.
+      mutable MachineBasicBlock *mBB;
+      mutable DebugLoc *mDL;
+      mutable const TargetInstrInfo *mTII;
+      mutable MachineBasicBlock::iterator mBBI;
+      void
+        setPrivateData(MachineBasicBlock *BB, 
+            MachineBasicBlock::iterator &BBI, 
+            DebugLoc *DL,
+          const TargetInstrInfo *TII) const;
+      uint32_t genVReg(uint32_t regType) const;
+      MachineInstrBuilder
+        generateMachineInst(uint32_t opcode,
+          uint32_t dst) const;
+      MachineInstrBuilder
+        generateMachineInst(uint32_t opcode,
+          uint32_t dst, uint32_t src1) const;
+      MachineInstrBuilder
+        generateMachineInst(uint32_t opcode,
+          uint32_t dst, uint32_t src1, uint32_t src2) const;
+      MachineInstrBuilder
+        generateMachineInst(uint32_t opcode,
+          uint32_t dst, uint32_t src1, uint32_t src2,
+          uint32_t src3) const;
+      uint32_t
+        addExtensionInstructions(
+          uint32_t reg, bool signedShift,
+          unsigned int simpleVT) const;
+      void
+        generateLongRelational(MachineInstr *MI,
+          unsigned int opCode) const;
+
+  }; // AMDILTargetLowering
+} // end namespace llvm
+
+#endif    // AMDIL_ISELLOWERING_H_
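Several node families declared above — LCREATE/LCOMPHI/LCOMPLO and their 64-bit-float D* counterparts — only split and join 64-bit values into 32-bit halves. A sketch of the underlying bit arithmetic (Python, illustrative only; the real nodes operate on SDValues):

```python
def lcreate(lo, hi):
    # LCREATE: build a 64-bit integer from two 32-bit halves.
    return ((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF)

def lcomphi(x):
    # LCOMPHI: get the hi 32 bits of a 64-bit integer.
    return (x >> 32) & 0xFFFFFFFF

def lcomplo(x):
    # LCOMPLO: get the lo 32 bits of a 64-bit integer.
    return x & 0xFFFFFFFF
```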
diff --git a/src/gallium/drivers/radeon/AMDILImageExpansion.cpp b/src/gallium/drivers/radeon/AMDILImageExpansion.cpp
new file mode 100644 (file)
index 0000000..e6fe37a
--- /dev/null
@@ -0,0 +1,171 @@
+//===-- AMDILImageExpansion.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// @file AMDILImageExpansion.cpp
+// @details Implementation of the image expansion class for image-capable devices
+//
+#include "AMDILIOExpansion.h"
+#include "AMDILKernelManager.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Value.h"
+
+using namespace llvm;
+
+AMDILImageExpansion::AMDILImageExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : AMDIL789IOExpansion(tm AMDIL_OPT_LEVEL_VAR)
+{
+}
+
+AMDILImageExpansion::~AMDILImageExpansion()
+{
+}
+void AMDILImageExpansion::expandInefficientImageLoad(
+    MachineBasicBlock *mBB, MachineInstr *MI)
+{
+#if 0
+  const llvm::StringRef &name = MI->getOperand(0).getGlobal()->getName();
+  const char *tReg1, *tReg2, *tReg3, *tReg4;
+  tReg1 = mASM->getRegisterName(MI->getOperand(1).getReg());
+  if (MI->getOperand(2).isReg()) {
+    tReg2 = mASM->getRegisterName(MI->getOperand(2).getReg());
+  } else {
+    tReg2 = mASM->getRegisterName(AMDIL::R1);
+    O << "\tmov " << tReg2 << ", l" << MI->getOperand(2).getImm() << "\n";
+  }
+  if (MI->getOperand(3).isReg()) {
+    tReg3 = mASM->getRegisterName(MI->getOperand(3).getReg());
+  } else {
+    tReg3 = mASM->getRegisterName(AMDIL::R2);
+    O << "\tmov " << tReg3 << ", l" << MI->getOperand(3).getImm() << "\n";
+  }
+  if (MI->getOperand(4).isReg()) {
+    tReg4 = mASM->getRegisterName(MI->getOperand(4).getReg());
+  } else {
+    tReg4 = mASM->getRegisterName(AMDIL::R3);
+    O << "\tmov " << tReg2 << ", l" << MI->getOperand(4).getImm() << "\n";
+  }
+  bool internalSampler = false;
+  //bool linear = true;
+  unsigned ImageCount = 3; // OPENCL_MAX_READ_IMAGES
+  unsigned SamplerCount = 3; // OPENCL_MAX_SAMPLERS
+  if (ImageCount - 1) {
+    O << "\tswitch " << mASM->getRegisterName(MI->getOperand(1).getReg())
+      << "\n";
+  }
+  for (unsigned rID = 0; rID < ImageCount; ++rID) {
+    if (ImageCount - 1) {
+      if (!rID) {
+        O << "\tdefault\n";
+      } else {
+        O << "\tcase " << rID << "\n";
+      }
+      O << "\tswitch " << mASM->getRegisterName(MI->getOperand(2).getReg())
+        << "\n";
+    }
+    for (unsigned sID = 0; sID < SamplerCount; ++sID) {
+      if (SamplerCount - 1) {
+        if (!sID) {
+          O << "\tdefault\n";
+        } else {
+          O << "\tcase " << sID << "\n";
+        }
+      }
+      if (internalSampler) {
+        // Check if sampler has normalized setting.
+        O << "\tand r0.x, " << tReg2 << ".x, l0.y\n"
+          << "\tif_logicalz r0.x\n"
+          << "\tflr " << tReg3 << ", " << tReg3 << "\n"
+          << "\tsample_resource(" << rID << ")_sampler("
+          << sID << ")_coordtype(unnormalized) "
+          << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n"
+          << "\telse\n"
+          << "\tiadd " << tReg1 << ".y, " << tReg1 << ".x, l0.y\n"
+          << "\titof " << tReg2 << ", cb1[" << tReg1 << ".x].xyz\n"
+          << "\tmul " << tReg3 << ", " << tReg3 << ", " << tReg2 << "\n"
+          << "\tflr " << tReg3 << ", " << tReg3 << "\n"
+          << "\tmul " << tReg3 << ", " << tReg3 << ", cb1[" 
+          << tReg1 << ".y].xyz\n"
+          << "\tsample_resource(" << rID << ")_sampler("
+          << sID << ")_coordtype(normalized) "
+          << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n"
+          << "\tendif\n";
+      } else {
+        O << "\tiadd " << tReg1 << ".y, " << tReg1 << ".x, l0.y\n"
+          // Check if sampler has normalized setting.
+          << "\tand r0, " << tReg2 << ".x, l0.y\n"
+          // Convert image dimensions to float.
+          << "\titof " << tReg4 << ", cb1[" << tReg1 << ".x].xyz\n"
+          // Move 1 into R0 if unnormalized, or the dimensions if normalized.
+          << "\tcmov_logical r0, r0, " << tReg4 << ", r1.1111\n"
+          // Make coordinates unnormalized.
+          << "\tmul " << tReg3 << ", r0, " << tReg3 << "\n"
+          // Get linear filtering if set.
+          << "\tand " << tReg4 << ", " << tReg2 << ".x, l6.x\n"
+          // Save unnormalized coordinates in R0.
+          << "\tmov r0, " << tReg3 << "\n"
+          // Floor the coordinates due to HW incompatibility with precision
+          // requirements.
+          << "\tflr " << tReg3 << ", " << tReg3 << "\n"
+          // Get original coordinates (without floor) if linear filtering.
+          << "\tcmov_logical " << tReg3 << ", " << tReg4 
+          << ".xxxx, r0, " << tReg3 << "\n"
+          // Normalize the coordinates by multiplying by 1/dimensions.
+          << "\tmul " << tReg3 << ", " << tReg3 << ", cb1[" 
+          << tReg1 << ".y].xyz\n"
+          << "\tsample_resource(" << rID << ")_sampler("
+          << sID << ")_coordtype(normalized) "
+          << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n";
+      }
+      if (SamplerCount - 1) {
+        O << "\tbreak\n";
+      }
+    }
+    if (SamplerCount - 1) {
+      O << "\tendswitch\n";
+    }
+    if (ImageCount - 1) {
+      O << "\tbreak\n";
+    }
+  }
+  if (ImageCount - 1) {
+    O << "\tendswitch\n";
+  }
+#endif
+}
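The disabled (`#if 0`) expansion above ends each non-internal-sampler case with a coordinate fix-up: scale by the image dimensions when the sampler uses normalized coordinates, floor unless linear filtering keeps the fractional part, then renormalize by the reciprocal dimensions held in cb1. A hedged Python sketch of that sequence (function and parameter names are illustrative, not driver API):

```python
import math

def prepare_coords(coords, dims, normalized, linear):
    # cmov_logical r0, r0, dims, 1: the scale is the image dimensions
    # for a normalized sampler, otherwise 1.
    scale = dims if normalized else [1.0] * len(coords)
    unnorm = [c * s for c, s in zip(coords, scale)]
    # flr: snap to texel origins, unless linear filtering wants the
    # original fractional coordinates back (the cmov_logical on tReg4).
    texel = unnorm if linear else [math.floor(c) for c in unnorm]
    # Renormalize with 1/dimensions (cb1[...].xyz) before sample_resource.
    return [c / d for c, d in zip(texel, dims)]
```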
+void
+AMDILImageExpansion::expandImageLoad(MachineBasicBlock *mBB, MachineInstr *MI)
+{
+  uint32_t imageID = getPointerID(MI);
+  MI->getOperand(1).ChangeToImmediate(imageID);
+  saveInst = true;
+}
+void
+AMDILImageExpansion::expandImageStore(MachineBasicBlock *mBB, MachineInstr *MI)
+{
+  uint32_t imageID = getPointerID(MI);
+  mKM->setOutputInst();
+  MI->getOperand(0).ChangeToImmediate(imageID);
+  saveInst = true;
+}
+void
+AMDILImageExpansion::expandImageParam(MachineBasicBlock *mBB, MachineInstr *MI)
+{
+  MachineBasicBlock::iterator I = *MI;
+  uint32_t ID = getPointerID(MI);
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CBLOAD),
+      MI->getOperand(0).getReg())
+      .addImm(ID)
+      .addImm(1);
+}
diff --git a/src/gallium/drivers/radeon/AMDILInliner.cpp b/src/gallium/drivers/radeon/AMDILInliner.cpp
new file mode 100644 (file)
index 0000000..9dad6ad
--- /dev/null
@@ -0,0 +1,271 @@
+//===-- AMDILInliner.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdilinline"
+#include "AMDIL.h"
+#include "AMDILCompilerErrors.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILSubtarget.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+namespace
+{
+  class LLVM_LIBRARY_VISIBILITY AMDILInlinePass : public FunctionPass
+  {
+    public:
+      TargetMachine &TM;
+      static char ID;
+      AMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+      ~AMDILInlinePass();
+      virtual const char* getPassName() const;
+      virtual bool runOnFunction(Function &F);
+      bool doInitialization(Module &M);
+      bool doFinalization(Module &M);
+      virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    private:
+      typedef DenseMap<const ArrayType*, SmallVector<AllocaInst*,
+              DEFAULT_VEC_SLOTS> > InlinedArrayAllocasTy;
+      bool
+        AMDILInlineCallIfPossible(CallSite CS,
+            const TargetData *TD,
+            InlinedArrayAllocasTy &InlinedArrayAllocas);
+
+      CodeGenOpt::Level OptLevel;
+  };
+  char AMDILInlinePass::ID = 0;
+} // anonymous namespace
+
+
+namespace llvm
+{
+  FunctionPass*
+    createAMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+    {
+      return new AMDILInlinePass(tm AMDIL_OPT_LEVEL_VAR);
+    }
+} // llvm namespace
+
+AMDILInlinePass::AMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+: FunctionPass(ID), TM(tm)
+{
+  OptLevel = tm.getOptLevel();
+}
+AMDILInlinePass::~AMDILInlinePass()
+{
+}
+
+
+bool
+AMDILInlinePass::AMDILInlineCallIfPossible(CallSite CS,
+    const TargetData *TD, InlinedArrayAllocasTy &InlinedArrayAllocas) {
+  Function *Callee = CS.getCalledFunction();
+  Function *Caller = CS.getCaller();
+
+  // Try to inline the function.  Get the list of static allocas that were
+  // inlined.
+  SmallVector<AllocaInst*, 16> StaticAllocas;
+  InlineFunctionInfo IFI;
+  if (!InlineFunction(CS, IFI))
+    return false;
+  DEBUG(errs() << "<amdilinline> function " << Caller->getName()
+      << ": inlined call to "<< Callee->getName() << "\n");
+
+  // If the inlined function had a higher stack protection level than the
+  // calling function, then bump up the caller's stack protection level.
+  if (Callee->hasFnAttr(Attribute::StackProtectReq))
+    Caller->addFnAttr(Attribute::StackProtectReq);
+  else if (Callee->hasFnAttr(Attribute::StackProtect) &&
+      !Caller->hasFnAttr(Attribute::StackProtectReq))
+    Caller->addFnAttr(Attribute::StackProtect);
+
+
+  // Look at all of the allocas that we inlined through this call site.  If we
+  // have already inlined other allocas through other calls into this function,
+  // then we know that they have disjoint lifetimes and that we can merge them.
+  //
+  // There are many heuristics possible for merging these allocas, and the
+  // different options have different tradeoffs.  One thing that we *really*
+  // don't want to hurt is SRoA: once inlining happens, often allocas are no
+  // longer address taken and so they can be promoted.
+  //
+  // Our "solution" for that is to only merge allocas whose outermost type is an
+  // array type.  These are usually not promoted because someone is using a
+  // variable index into them.  These are also often the most important ones to
+  // merge.
+  //
+  // A better solution would be to have real memory lifetime markers in the IR
+  // and not have the inliner do any merging of allocas at all.  This would
+  // allow the backend to do proper stack slot coloring of all allocas that
+  // *actually make it to the backend*, which is really what we want.
+  //
+  // Because we don't have this information, we do this simple and useful hack.
+  //
+  SmallPtrSet<AllocaInst*, 16> UsedAllocas;
+
+  // Loop over all the allocas we have so far and see if they can be merged with
+  // a previously inlined alloca.  If not, remember that we had it.
+
+  for (unsigned AllocaNo = 0,
+      e = IFI.StaticAllocas.size();
+      AllocaNo != e; ++AllocaNo) {
+
+    AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
+
+    // Don't bother trying to merge array allocations (they will usually be
+    // canonicalized to be an allocation *of* an array), or allocations whose
+    // type is not itself an array (because we're afraid of pessimizing SRoA).
+    const ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+    if (ATy == 0 || AI->isArrayAllocation())
+      continue;
+
+    // Get the list of all available allocas for this array type.
+    SmallVector<AllocaInst*, DEFAULT_VEC_SLOTS> &AllocasForType
+      = InlinedArrayAllocas[ATy];
+
+    // Loop over the allocas in AllocasForType to see if we can reuse one.  Note
+    // that we have to be careful not to reuse the same "available" alloca for
+    // multiple different allocas that we just inlined, we use the 'UsedAllocas'
+    // set to keep track of which "available" allocas are being used by this
+    // function.  Also, AllocasForType can be empty of course!
+    bool MergedAwayAlloca = false;
+    for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) {
+      AllocaInst *AvailableAlloca = AllocasForType[i];
+
+      // The available alloca has to be in the right function, not in some other
+      // function in this SCC.
+      if (AvailableAlloca->getParent() != AI->getParent())
+        continue;
+
+      // If the inlined function already uses this alloca then we can't reuse
+      // it.
+      if (!UsedAllocas.insert(AvailableAlloca))
+        continue;
+
+      // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
+      // success!
+      DEBUG(errs() << "    ***MERGED ALLOCA: " << *AI);
+
+      AI->replaceAllUsesWith(AvailableAlloca);
+      AI->eraseFromParent();
+      MergedAwayAlloca = true;
+      break;
+    }
+
+    // If we already nuked the alloca, we're done with it.
+    if (MergedAwayAlloca)
+      continue;
+
+    // If we were unable to merge away the alloca either because there are no
+    // allocas of the right type available or because we reused them all
+    // already, remember that this alloca came from an inlined function and mark
+    // it used so we don't reuse it for other allocas from this inline
+    // operation.
+    AllocasForType.push_back(AI);
+    UsedAllocas.insert(AI);
+  }
+
+  return true;
+}
+
+bool
+AMDILInlinePass::runOnFunction(Function &MF)
+{
+  Function *F = &MF;
+  const AMDILSubtarget &STM = TM.getSubtarget<AMDILSubtarget>();
+  if (STM.device()->isSupported(AMDILDeviceInfo::NoInline)) {
+    return false;
+  }
+  const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+  SmallVector<CallSite, 16> CallSites;
+  for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      CallSite CS = CallSite(cast<Value>(I));
+      // If this isn't a call, or it is a call to an intrinsic, it can
+      // never be inlined.
+      if (CS.getInstruction() == 0 || isa<IntrinsicInst>(I))
+        continue;
+
+      // If this is a direct call to an external function, we can never inline
+      // it.  If it is an indirect call, inlining may resolve it to be a
+      // direct call, so we keep it.
+      if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration())
+        continue;
+
+      // We don't want to inline if we are recursive.
+      if (CS.getCalledFunction() && CS.getCalledFunction()->getName() == MF.getName()) {
+        AMDILMachineFunctionInfo *MFI =
+          getAnalysis<MachineFunctionAnalysis>().getMF()
+          .getInfo<AMDILMachineFunctionInfo>();
+        MFI->addErrorMsg(amd::CompilerErrorMessage[RECURSIVE_FUNCTION]);
+        continue;
+      }
+
+      CallSites.push_back(CS);
+    }
+  }
+
+  InlinedArrayAllocasTy InlinedArrayAllocas;
+  bool Changed = false;
+  for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+    CallSite CS = CallSites[CSi];
+
+    Function *Callee = CS.getCalledFunction();
+
+    // We can only inline direct calls to non-declarations.
+    if (Callee == 0 || Callee->isDeclaration()) continue;
+
+    // Attempt to inline the function...
+    if (!AMDILInlineCallIfPossible(CS, TD, InlinedArrayAllocas))
+      continue;
+    Changed = true;
+  }
+  return Changed;
+}
+
+const char*
+AMDILInlinePass::getPassName() const
+{
+  return "AMDIL Inline Function Pass";
+}
+bool
+AMDILInlinePass::doInitialization(Module &M)
+{
+  return false;
+}
+
+bool
+AMDILInlinePass::doFinalization(Module &M)
+{
+  return false;
+}
+
+void
+AMDILInlinePass::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
new file mode 100644 (file)
index 0000000..fbc3e45
--- /dev/null
@@ -0,0 +1,709 @@
+//===- AMDILInstrInfo.cpp - AMDIL Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILInstrInfo.h"
+#include "AMDILUtilityFunctions.h"
+
+#define GET_INSTRINFO_CTOR
+#include "AMDILGenInstrInfo.inc"
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Instructions.h"
+
+using namespace llvm;
+
+AMDILInstrInfo::AMDILInstrInfo(AMDILTargetMachine &tm)
+  : AMDILGenInstrInfo(AMDIL::ADJCALLSTACKDOWN, AMDIL::ADJCALLSTACKUP),
+    RI(tm, *this),
+    TM(tm) {
+}
+
+const AMDILRegisterInfo &AMDILInstrInfo::getRegisterInfo() const {
+  return RI;
+}
+
+/// Return true if the instruction is a register to register move and leave the
+/// source and dest operands in the passed parameters.
+bool AMDILInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
+                                 unsigned int &DstReg, unsigned int &SrcSubIdx,
+                                 unsigned int &DstSubIdx) const {
+  // FIXME: we should look for:
+  //    add with 0
+  //assert(0 && "is Move Instruction has not been implemented yet!");
+  //return true;
+  if (!isMove(MI.getOpcode())) {
+    return false;
+  }
+  if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg()) {
+    return false;
+  }
+  SrcReg = MI.getOperand(1).getReg();
+  DstReg = MI.getOperand(0).getReg();
+  DstSubIdx = 0;
+  SrcSubIdx = 0;
+  return true;
+}
+
+bool AMDILInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                           unsigned &SrcReg, unsigned &DstReg,
+                                           unsigned &SubIdx) const {
+// TODO: Implement this function
+  return false;
+}
+
+unsigned AMDILInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+unsigned AMDILInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                                   int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+
+bool AMDILInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                          const MachineMemOperand *&MMO,
+                                          int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+unsigned AMDILInstrInfo::isStoreFromStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+unsigned AMDILInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                                    int &FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+bool AMDILInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
+                                           const MachineMemOperand *&MMO,
+                                           int &FrameIndex) const {
+// TODO: Implement this function
+  return false;
+}
+#if 0
+void
+AMDILInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              unsigned DestReg, unsigned SubIdx,
+                             const MachineInstr *Orig,
+                             const TargetRegisterInfo *TRI) const {
+// TODO: Implement this function
+}
+
+MachineInstr *AMDILInstrInfo::duplicate(MachineInstr *Orig,
+                                      MachineFunction &MF) const {
+// TODO: Implement this function
+  return NULL;
+}
+#endif
+MachineInstr *
+AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                      MachineBasicBlock::iterator &MBBI,
+                                      LiveVariables *LV) const {
+// TODO: Implement this function
+  return NULL;
+}
+#if 0
+MachineInstr *AMDILInstrInfo::commuteInstruction(MachineInstr *MI,
+                                                 bool NewMI = false) const {
+// TODO: Implement this function
+  return NULL;
+}
+bool
+AMDILInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                                     unsigned &SrcOpIdx2) const
+{
+// TODO: Implement this function
+}
+bool
+AMDILInstrInfo::produceSameValue(const MachineInstr *MI0,
+                                const MachineInstr *MI1) const
+{
+// TODO: Implement this function
+}
+#endif
+bool AMDILInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                                        MachineBasicBlock &MBB) const {
+  while (iter != MBB.end()) {
+    switch (iter->getOpcode()) {
+    default:
+      break;
+      ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND);
+    case AMDIL::BRANCH:
+      return true;
+    };
+    ++iter;
+  }
+  return false;
+}
+
+bool AMDILInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  bool retVal = true;
+  return retVal;
+  MachineBasicBlock::iterator iter = MBB.begin();
+  if (!getNextBranchInstr(iter, MBB)) {
+    retVal = false;
+  } else {
+    MachineInstr *firstBranch = iter;
+    if (!getNextBranchInstr(++iter, MBB)) {
+      if (firstBranch->getOpcode() == AMDIL::BRANCH) {
+        TBB = firstBranch->getOperand(0).getMBB();
+        firstBranch->eraseFromParent();
+        retVal = false;
+      } else {
+        TBB = firstBranch->getOperand(0).getMBB();
+        FBB = *(++MBB.succ_begin());
+        if (FBB == TBB) {
+          FBB = *(MBB.succ_begin());
+        }
+        Cond.push_back(firstBranch->getOperand(1));
+        retVal = false;
+      }
+    } else {
+      MachineInstr *secondBranch = iter;
+      if (!getNextBranchInstr(++iter, MBB)) {
+        if (secondBranch->getOpcode() == AMDIL::BRANCH) {
+          TBB = firstBranch->getOperand(0).getMBB();
+          Cond.push_back(firstBranch->getOperand(1));
+          FBB = secondBranch->getOperand(0).getMBB();
+          secondBranch->eraseFromParent();
+          retVal = false;
+        } else {
+          assert(0 && "Should not have two consecutive conditional branches");
+        }
+      } else {
+        MBB.getParent()->viewCFG();
+        assert(0 && "Should not have three branch instructions in"
+               " a single basic block");
+        retVal = false;
+      }
+    }
+  }
+  return retVal;
+}
+
+unsigned int AMDILInstrInfo::getBranchInstr(const MachineOperand &op) const {
+  const MachineInstr *MI = op.getParent();
+
+  switch (MI->getDesc().OpInfo->RegClass) {
+  default: // FIXME: fallthrough??
+  case AMDIL::GPRI8RegClassID:  return AMDIL::BRANCH_COND_i8;
+  case AMDIL::GPRI16RegClassID: return AMDIL::BRANCH_COND_i16;
+  case AMDIL::GPRI32RegClassID: return AMDIL::BRANCH_COND_i32;
+  case AMDIL::GPRI64RegClassID: return AMDIL::BRANCH_COND_i64;
+  case AMDIL::GPRF32RegClassID: return AMDIL::BRANCH_COND_f32;
+  case AMDIL::GPRF64RegClassID: return AMDIL::BRANCH_COND_f64;
+  };
+}
+
+unsigned int
+AMDILInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                             MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond,
+                             DebugLoc DL) const
+{
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  for (unsigned int x = 0; x < Cond.size(); ++x) {
+    Cond[x].getParent()->dump();
+  }
+  if (FBB == 0) {
+    if (Cond.empty()) {
+      BuildMI(&MBB, DL, get(AMDIL::BRANCH)).addMBB(TBB);
+    } else {
+      BuildMI(&MBB, DL, get(getBranchInstr(Cond[0])))
+        .addMBB(TBB).addReg(Cond[0].getReg());
+    }
+    return 1;
+  } else {
+    BuildMI(&MBB, DL, get(getBranchInstr(Cond[0])))
+      .addMBB(TBB).addReg(Cond[0].getReg());
+    BuildMI(&MBB, DL, get(AMDIL::BRANCH)).addMBB(FBB);
+  }
+  assert(0 && "Inserting two branches not supported");
+  return 0;
+}
+
+unsigned int AMDILInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) {
+    return 0;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  default:
+    return 0;
+    ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND);
+  case AMDIL::BRANCH:
+    I->eraseFromParent();
+    break;
+  }
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 1;
+  }
+  --I;
+  switch (I->getOpcode()) {
+    // FIXME: only one case??
+  default:
+    return 1;
+    ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND);
+    I->eraseFromParent();
+    break;
+  }
+  return 2;
+}
+
+static MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
+  MachineBasicBlock::iterator tmp = MBB->end();
+  if (!MBB->size()) {
+    return MBB->end();
+  }
+  while (--tmp) {
+    if (tmp->getOpcode() == AMDIL::ENDLOOP
+        || tmp->getOpcode() == AMDIL::ENDIF
+        || tmp->getOpcode() == AMDIL::ELSE) {
+      if (tmp == MBB->begin()) {
+        return tmp;
+      } else {
+        continue;
+      }
+    } else {
+      return ++tmp;
+    }
+  }
+  return MBB->end();
+}
+
+bool
+AMDILInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             unsigned DestReg, unsigned SrcReg,
+                             const TargetRegisterClass *DestRC,
+                             const TargetRegisterClass *SrcRC,
+                             DebugLoc DL) const {
+  // If we are adding to the end of a basic block we can safely assume that
+  // the move is caused by a PHI node, since all non-PHI move instructions
+  // have already been inserted into the basic block.  Therefore we call
+  // skipFlowControl to move the iterator before the flow control
+  // instructions and insert the move instruction there.
+  bool phi = (DestReg < 1025) || (SrcReg < 1025);
+  int movInst = phi ? getMoveInstFromID(DestRC->getID())
+                    : getPHIMoveInstFromID(DestRC->getID());
+
+  MachineBasicBlock::iterator iTemp = (I == MBB.end()) ? skipFlowControl(&MBB)
+                                                       : I;
+  if (DestRC != SrcRC) {
+    //int convInst;
+    size_t dSize = DestRC->getSize();
+    size_t sSize = SrcRC->getSize();
+    if (dSize > sSize) {
+      // Elements are going to get duplicated.
+      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
+    } else if (dSize == sSize) {
+      // Direct copy, conversions are not handled.
+      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
+    } else if (dSize < sSize) {
+      // Elements are going to get dropped.
+      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
+    }
+  } else {
+    BuildMI( MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
+  }
+  return true;
+}
+void
+AMDILInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI, DebugLoc DL,
+                            unsigned DestReg, unsigned SrcReg,
+                            bool KillSrc) const
+{
+  BuildMI(MBB, MI, DL, get(AMDIL::MOVE_v4i32), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+  return;
+#if 0
+  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
+               << " to " << RI.getName(DestReg) << '\n');
+  abort();
+#endif
+}
+void
+AMDILInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill,
+                                    int FrameIndex,
+                                    const TargetRegisterClass *RC,
+                                    const TargetRegisterInfo *TRI) const {
+  unsigned int Opc = 0;
+  // MachineInstr *curMI = MI;
+  MachineFunction &MF = *(MBB.getParent());
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  DebugLoc DL;
+  switch (RC->getID()) {
+  default:
+    Opc = AMDIL::PRIVATESTORE_v4i32;
+    break;
+  case AMDIL::GPRF32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_f32;
+    break;
+  case AMDIL::GPRF64RegClassID:
+    Opc = AMDIL::PRIVATESTORE_f64;
+    break;
+  case AMDIL::GPRI16RegClassID:
+    Opc = AMDIL::PRIVATESTORE_i16;
+    break;
+  case AMDIL::GPRI32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_i32;
+    break;
+  case AMDIL::GPRI8RegClassID:
+    Opc = AMDIL::PRIVATESTORE_i8;
+    break;
+  case AMDIL::GPRI64RegClassID:
+    Opc = AMDIL::PRIVATESTORE_i64;
+    break;
+  case AMDIL::GPRV2F32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2f32;
+    break;
+  case AMDIL::GPRV2F64RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2f64;
+    break;
+  case AMDIL::GPRV2I16RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2i16;
+    break;
+  case AMDIL::GPRV2I32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2i32;
+    break;
+  case AMDIL::GPRV2I8RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2i8;
+    break;
+  case AMDIL::GPRV2I64RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v2i64;
+    break;
+  case AMDIL::GPRV4F32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v4f32;
+    break;
+  case AMDIL::GPRV4I16RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v4i16;
+    break;
+  case AMDIL::GPRV4I32RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v4i32;
+    break;
+  case AMDIL::GPRV4I8RegClassID:
+    Opc = AMDIL::PRIVATESTORE_v4i8;
+    break;
+  }
+  if (MI != MBB.end()) {
+    DL = MI->getDebugLoc();
+  }
+  MachineMemOperand *MMO =
+    new MachineMemOperand(
+        MachinePointerInfo::getFixedStack(FrameIndex),
+        MachineMemOperand::MOStore,
+        MFI.getObjectSize(FrameIndex),
+        MFI.getObjectAlignment(FrameIndex));
+  MachineInstr *nMI = BuildMI(MBB, MI, DL, get(Opc))
+    .addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIndex)
+    .addMemOperand(MMO)
+    .addImm(0);
+  AMDILAS::InstrResEnc curRes;
+  curRes.bits.ResourceID
+    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+  setAsmPrinterFlags(nMI, curRes);
+}
+
+void
+AMDILInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned DestReg, int FrameIndex,
+                                     const TargetRegisterClass *RC,
+                                     const TargetRegisterInfo *TRI) const {
+  unsigned int Opc = 0;
+  MachineFunction &MF = *(MBB.getParent());
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  DebugLoc DL;
+  switch (RC->getID()) {
+  default:
+    Opc = AMDIL::PRIVATELOAD_v4i32;
+    break;
+  case AMDIL::GPRF32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_f32;
+    break;
+  case AMDIL::GPRF64RegClassID:
+    Opc = AMDIL::PRIVATELOAD_f64;
+    break;
+  case AMDIL::GPRI16RegClassID:
+    Opc = AMDIL::PRIVATELOAD_i16;
+    break;
+  case AMDIL::GPRI32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_i32;
+    break;
+  case AMDIL::GPRI8RegClassID:
+    Opc = AMDIL::PRIVATELOAD_i8;
+    break;
+  case AMDIL::GPRI64RegClassID:
+    Opc = AMDIL::PRIVATELOAD_i64;
+    break;
+  case AMDIL::GPRV2F32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2f32;
+    break;
+  case AMDIL::GPRV2F64RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2f64;
+    break;
+  case AMDIL::GPRV2I16RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2i16;
+    break;
+  case AMDIL::GPRV2I32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2i32;
+    break;
+  case AMDIL::GPRV2I8RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2i8;
+    break;
+  case AMDIL::GPRV2I64RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v2i64;
+    break;
+  case AMDIL::GPRV4F32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v4f32;
+    break;
+  case AMDIL::GPRV4I16RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v4i16;
+    break;
+  case AMDIL::GPRV4I32RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v4i32;
+    break;
+  case AMDIL::GPRV4I8RegClassID:
+    Opc = AMDIL::PRIVATELOAD_v4i8;
+    break;
+  }
+
+  MachineMemOperand *MMO =
+    new MachineMemOperand(
+        MachinePointerInfo::getFixedStack(FrameIndex),
+        MachineMemOperand::MOLoad,
+        MFI.getObjectSize(FrameIndex),
+        MFI.getObjectAlignment(FrameIndex));
+  if (MI != MBB.end()) {
+    DL = MI->getDebugLoc();
+  }
+  MachineInstr* nMI = BuildMI(MBB, MI, DL, get(Opc))
+    .addReg(DestReg, RegState::Define)
+    .addFrameIndex(FrameIndex)
+    .addMemOperand(MMO)
+    .addImm(0);
+  AMDILAS::InstrResEnc curRes;
+  curRes.bits.ResourceID
+    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+  setAsmPrinterFlags(nMI, curRes);
+
+}
+MachineInstr *
+AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const {
+// TODO: Implement this function
+  return 0;
+}
+MachineInstr*
+AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const {
+  // TODO: Implement this function
+  return 0;
+}
+bool
+AMDILInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                     const SmallVectorImpl<unsigned> &Ops) const
+{
+  // TODO: Implement this function
+  return false;
+}
+bool
+AMDILInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                 unsigned Reg, bool UnfoldLoad,
+                                 bool UnfoldStore,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool
+AMDILInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                    SmallVectorImpl<SDNode*> &NewNodes) const {
+  // TODO: Implement this function
+  return false;
+}
+
+unsigned
+AMDILInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                           bool UnfoldLoad, bool UnfoldStore,
+                                           unsigned *LoadRegIndex) const {
+  // TODO: Implement this function
+  return 0;
+}
+
+bool
+AMDILInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                        int64_t &Offset1,
+                                        int64_t &Offset2) const {
+  return false;
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) {
+    return false;
+  }
+  const MachineSDNode *mload1 = dyn_cast<MachineSDNode>(Load1);
+  const MachineSDNode *mload2 = dyn_cast<MachineSDNode>(Load2);
+  if (!mload1 || !mload2) {
+    return false;
+  }
+  if (mload1->memoperands_empty() ||
+      mload2->memoperands_empty()) {
+    return false;
+  }
+  MachineMemOperand *memOp1 = (*mload1->memoperands_begin());
+  MachineMemOperand *memOp2 = (*mload2->memoperands_begin());
+  const Value *mv1 = memOp1->getValue();
+  const Value *mv2 = memOp2->getValue();
+  if (!memOp1->isLoad() || !memOp2->isLoad()) {
+    return false;
+  }
+  if (getBasePointerValue(mv1) == getBasePointerValue(mv2)) {
+    if (isa<GetElementPtrInst>(mv1) && isa<GetElementPtrInst>(mv2)) {
+      const GetElementPtrInst *gep1 = dyn_cast<GetElementPtrInst>(mv1);
+      const GetElementPtrInst *gep2 = dyn_cast<GetElementPtrInst>(mv2);
+      if (!gep1 || !gep2) {
+        return false;
+      }
+      if (gep1->getNumOperands() != gep2->getNumOperands()) {
+        return false;
+      }
+      for (unsigned i = 0, e = gep1->getNumOperands() - 1; i < e; ++i) {
+        const Value *op1 = gep1->getOperand(i);
+        const Value *op2 = gep2->getOperand(i);
+        if (op1 != op2) {
+          // If any value except the last one is different, return false.
+          return false;
+        }
+      }
+      unsigned size = gep1->getNumOperands()-1;
+      if (!isa<ConstantInt>(gep1->getOperand(size))
+          || !isa<ConstantInt>(gep2->getOperand(size))) {
+        return false;
+      }
+      Offset1 = dyn_cast<ConstantInt>(gep1->getOperand(size))->getSExtValue();
+      Offset2 = dyn_cast<ConstantInt>(gep2->getOperand(size))->getSExtValue();
+      return true;
+    } else if (isa<Argument>(mv1) && isa<Argument>(mv2)) {
+      return false;
+    } else if (isa<GlobalValue>(mv1) && isa<GlobalValue>(mv2)) {
+      return false;
+    }
+  }
+  return false;
+}
+
+bool AMDILInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                             int64_t Offset1, int64_t Offset2,
+                                             unsigned NumLoads) const {
+  assert(Offset2 > Offset1
+         && "Second offset should be larger than first offset!");
+  // If fewer than 16 loads are in a row and their offsets lie within 16
+  // bytes of each other, schedule them together.
+  // TODO: Make the loads schedule near if it fits in a cacheline
+  return (NumLoads < 16 && (Offset2 - Offset1) < 16);
+}
+
+bool
+AMDILInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+  const {
+  // TODO: Implement this function
+  return true;
+}
+void AMDILInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI) const {
+  // TODO: Implement this function
+}
+
+bool AMDILInstrInfo::isPredicated(const MachineInstr *MI) const {
+  // TODO: Implement this function
+  return false;
+}
+#if 0
+bool AMDILInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  // TODO: Implement this function
+}
+
+bool AMDILInstrInfo::PredicateInstruction(MachineInstr *MI,
+        const SmallVectorImpl<MachineOperand> &Pred) const {
+    // TODO: Implement this function
+}
+#endif
+bool
+AMDILInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                                  const SmallVectorImpl<MachineOperand> &Pred2)
+  const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDILInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                      std::vector<MachineOperand> &Pred) const {
+  // TODO: Implement this function
+  return false;
+}
+
+bool AMDILInstrInfo::isPredicable(MachineInstr *MI) const {
+  // TODO: Implement this function
+  return MI->getDesc().isPredicable();
+}
+
+bool
+AMDILInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // TODO: Implement this function
+  return true;
+}
+
+unsigned AMDILInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  // TODO: Implement this function
+  return 0;
+}
+
+#if 0
+unsigned
+AMDILInstrInfo::GetFunctionSizeInBytes(const MachineFunction &MF) const {
+  // TODO: Implement this function
+  return 0;
+}
+
+unsigned AMDILInstrInfo::getInlineAsmLength(const char *Str,
+                                            const MCAsmInfo &MAI) const {
+  // TODO: Implement this function
+  return 0;
+}
+#endif
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.h b/src/gallium/drivers/radeon/AMDILInstrInfo.h
new file mode 100644 (file)
index 0000000..88dd4e9
--- /dev/null
@@ -0,0 +1,175 @@
+//===- AMDILInstrInfo.h - AMDIL Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILINSTRUCTIONINFO_H_
+#define AMDILINSTRUCTIONINFO_H_
+
+#include "AMDILRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define GET_INSTRINFO_HEADER
+#include "AMDILGenInstrInfo.inc"
+
+namespace llvm {
+  // AMDIL - This namespace holds all of the target specific flags that
+  // instruction info tracks.
+  //
+  //class AMDILTargetMachine;
+class AMDILInstrInfo : public AMDILGenInstrInfo {
+private:
+  const AMDILRegisterInfo RI;
+  AMDILTargetMachine &TM;
+  bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
+                          MachineBasicBlock &MBB) const;
+  unsigned int getBranchInstr(const MachineOperand &op) const;
+public:
+  explicit AMDILInstrInfo(AMDILTargetMachine &tm);
+
+  // getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  // such, whenever a client has an instance of instruction info, it should
+  // always be able to get register info as well (through this method).
+  const AMDILRegisterInfo &getRegisterInfo() const;
+
+  // Return true if the instruction is a register to register move and leave the
+  // source and dest operands in the passed parameters.
+  bool isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
+                   unsigned int &DstReg, unsigned int &SrcSubIdx,
+                   unsigned int &DstSubIdx) const;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
+                                     int &FrameIndex) const;
+  bool hasLoadFromStackSlot(const MachineInstr *MI,
+                            const MachineMemOperand *&MMO,
+                            int &FrameIndex) const;
+  unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  bool hasStoreFromStackSlot(const MachineInstr *MI,
+                             const MachineMemOperand *&MMO,
+                             int &FrameIndex) const;
+
+
+#if 0
+  void reMaterialize(MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo *TRI) const;
+  MachineInstr *duplicate(MachineInstr *Orig,
+                          MachineFunction &MF) const;
+#endif
+  MachineInstr *
+  convertToThreeAddress(MachineFunction::iterator &MFI,
+                        MachineBasicBlock::iterator &MBBI,
+                        LiveVariables *LV) const;
+#if 0
+  MachineInstr *commuteInstruction(MachineInstr *MI,
+                                   bool NewMI = false) const;
+  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const;
+  bool produceSameValue(const MachineInstr *MI0,
+                        const MachineInstr *MI1) const;
+
+#endif
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  unsigned
+  InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+               MachineBasicBlock *FBB,
+               const SmallVectorImpl<MachineOperand> &Cond,
+               DebugLoc DL) const;
+
+  bool copyRegToReg(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *DestRC,
+                    const TargetRegisterClass *SrcRC,
+                    DebugLoc DL) const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const;
+
+protected:
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const;
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr *LoadMI) const;
+public:
+  bool canFoldMemoryOperand(const MachineInstr *MI,
+                            const SmallVectorImpl<unsigned> &Ops) const;
+  bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                           SmallVectorImpl<MachineInstr *> &NewMIs) const;
+  bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                           SmallVectorImpl<SDNode *> &NewNodes) const;
+  unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex = 0) const;
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                               int64_t &Offset1, int64_t &Offset2) const;
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const;
+  bool isPredicated(const MachineInstr *MI) const;
+#if 0
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const;
+#endif
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const;
+  bool isPredicable(MachineInstr *MI) const;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+#if 0
+  unsigned GetFunctionSizeInBytes(const MachineFunction &MF) const;
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const;
+#endif
+  };
+
+}
+
+#endif // AMDILINSTRINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.td b/src/gallium/drivers/radeon/AMDILInstrInfo.td
new file mode 100644 (file)
index 0000000..7086e53
--- /dev/null
@@ -0,0 +1,115 @@
+//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file describes the AMDIL instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+// AMDIL Instruction Predicate Definitions
+// Predicate that is set to true if the hardware supports double precision
+// divide
+def HasHWDDiv                 : Predicate<"Subtarget.device()"
+                           "->getGeneration() > AMDILDeviceInfo::HD4XXX && "
+              "Subtarget.device()->usesHardware(AMDILDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware supports double, but not double
+// precision divide in hardware
+def HasSWDDiv             : Predicate<"Subtarget.device()"
+                           "->getGeneration() == AMDILDeviceInfo::HD4XXX &&"
+              "Subtarget.device()->usesHardware(AMDILDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware supports 24bit signed
+// math ops. Otherwise a software expansion to 32bit math ops is used instead.
+def HasHWSign24Bit          : Predicate<"Subtarget.device()"
+                            "->getGeneration() > AMDILDeviceInfo::HD5XXX">;
+
+// Predicates that are set to true if 64bit operations are supported in
+// hardware or emulated in software, respectively
+def HasHW64Bit              : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDILDeviceInfo::LongOps)">;
+def HasSW64Bit              : Predicate<"Subtarget.device()"
+                            "->usesSoftware(AMDILDeviceInfo::LongOps)">;
+
+// Predicate that is set to true if the timer register is supported
+def HasTmrRegister          : Predicate<"Subtarget.device()"
+                            "->isSupported(AMDILDeviceInfo::TmrReg)">;
+// Predicate that is true if the device is at least the Evergreen series
+def HasDeviceIDInst         : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDILDeviceInfo::HD5XXX">;
+
+// Predicate that is true if we have region address space.
+def hasRegionAS             : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDILDeviceInfo::RegionMem)">;
+
+// Predicate that is false if we don't have region address space.
+def noRegionAS             : Predicate<"!Subtarget.device()"
+                            "->isSupported(AMDILDeviceInfo::RegionMem)">;
+
+
+// Predicates that select between hardware and software support for
+// 64bit multiply in the IL
+def HasHW64Mul              : Predicate<"Subtarget.calVersion()" 
+                                          ">= CAL_VERSION_SC_139"
+                                          "&& Subtarget.device()"
+                                          "->getGeneration() >="
+                                          "AMDILDeviceInfo::HD5XXX">;
+def HasSW64Mul              : Predicate<"Subtarget.calVersion()" 
+                                          "< CAL_VERSION_SC_139">;
+// Predicates that select between hardware and software support for
+// 64bit divide/modulo in the IL
+def HasHW64DivMod           : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDILDeviceInfo::HW64BitDivMod)">;
+def HasSW64DivMod           : Predicate<"Subtarget.device()"
+                            "->usesSoftware(AMDILDeviceInfo::HW64BitDivMod)">;
+
+// Predicate that is set to true if 64bit pointers are used.
+def Has64BitPtr             : Predicate<"Subtarget.is64bit()">;
+def Has32BitPtr             : Predicate<"!Subtarget.is64bit()">;
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+include "AMDILOperands.td"
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Type Profiles
+//===--------------------------------------------------------------------===//
+include "AMDILProfiles.td"
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Nodes
+//===--------------------------------------------------------------------===//
+include "AMDILNodes.td"
+
+//===--------------------------------------------------------------------===//
+// Custom Pattern DAG Nodes
+//===--------------------------------------------------------------------===//
+include "AMDILPatterns.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction format classes
+//===----------------------------------------------------------------------===//
+include "AMDILFormats.td"
+
+//===--------------------------------------------------------------------===//
+// Multiclass Instruction formats
+//===--------------------------------------------------------------------===//
+include "AMDILMultiClass.td"
+
+//===--------------------------------------------------------------------===//
+// Intrinsics support
+//===--------------------------------------------------------------------===//
+include "AMDILIntrinsics.td"
+
+//===--------------------------------------------------------------------===//
+// Instructions support
+//===--------------------------------------------------------------------===//
+include "AMDILInstructions.td"
+
+//===--------------------------------------------------------------------===//
+// Instruction Pattern support - This must be the last include in the file
+// as it requires items defined in other files
+//===--------------------------------------------------------------------===//
+include "AMDILInstrPatterns.td"
+
diff --git a/src/gallium/drivers/radeon/AMDILInstrPatterns.td b/src/gallium/drivers/radeon/AMDILInstrPatterns.td
new file mode 100644 (file)
index 0000000..51181b2
--- /dev/null
@@ -0,0 +1,66 @@
+//===- AMDILInstrPatterns.td - AMDIL Target ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+// This file holds all the custom patterns that are used by the amdil backend
+//
+//===--------------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+// Custom patterns for conversion operations
+//===--------------------------------------------------------------------===//
+// Pattern to remap the integer 'or' node to IL_or
+def : Pat<(i32 (or GPRI32:$src0, GPRI32:$src1)),
+          (i32 (BINARY_OR_i32 GPRI32:$src0, GPRI32:$src1))>;
+// float ==> long patterns
+// unsigned: f32 -> i64
+def FTOUL : Pat<(i64 (fp_to_uint GPRF32:$src)),
+    (LCREATE (FTOU GPRF32:$src), (LOADCONST_i32 0))>;
+// signed: f32 -> i64
+def FTOL : Pat<(i64 (fp_to_sint GPRF32:$src)),
+    (LCREATE (FTOI GPRF32:$src), (LOADCONST_i32 0))>;
+// unsigned: i64 -> f32
+def ULTOF : Pat<(f32 (uint_to_fp GPRI64:$src)),
+    (UTOF (LLO GPRI64:$src))>;
+// signed: i64 -> f32
+def LTOF : Pat<(f32 (sint_to_fp GPRI64:$src)),
+    (ITOF (LLO GPRI64:$src))>;
+
+// integer subtraction
+// a - b ==> a + (-b)
+def SUB_i8 : Pat<(sub GPRI8:$src0, GPRI8:$src1),
+    (ADD_i8 GPRI8:$src0, (NEGATE_i8 GPRI8:$src1))>;
+def SUB_v2i8 : Pat<(sub GPRV2I8:$src0, GPRV2I8:$src1),
+    (ADD_v2i8 GPRV2I8:$src0, (NEGATE_v2i8 GPRV2I8:$src1))>;
+def SUB_v4i8 : Pat<(sub GPRV4I8:$src0, GPRV4I8:$src1),
+    (ADD_v4i8 GPRV4I8:$src0, (NEGATE_v4i8 GPRV4I8:$src1))>;
+def SUB_i16 : Pat<(sub GPRI16:$src0, GPRI16:$src1),
+    (ADD_i16 GPRI16:$src0, (NEGATE_i16 GPRI16:$src1))>;
+def SUB_v2i16 : Pat<(sub GPRV2I16:$src0, GPRV2I16:$src1),
+    (ADD_v2i16 GPRV2I16:$src0, (NEGATE_v2i16 GPRV2I16:$src1))>;
+def SUB_v4i16 : Pat<(sub GPRV4I16:$src0, GPRV4I16:$src1),
+    (ADD_v4i16 GPRV4I16:$src0, (NEGATE_v4i16 GPRV4I16:$src1))>;
+def SUB_i32 : Pat<(sub GPRI32:$src0, GPRI32:$src1),
+    (ADD_i32 GPRI32:$src0, (NEGATE_i32 GPRI32:$src1))>;
+def SUB_v2i32 : Pat<(sub GPRV2I32:$src0, GPRV2I32:$src1),
+    (ADD_v2i32 GPRV2I32:$src0, (NEGATE_v2i32 GPRV2I32:$src1))>;
+def SUB_v4i32 : Pat<(sub GPRV4I32:$src0, GPRV4I32:$src1),
+    (ADD_v4i32 GPRV4I32:$src0, (NEGATE_v4i32 GPRV4I32:$src1))>;
+// LLVM isn't lowering this correctly, so writing a pattern that
+// matches it instead.
+def : Pat<(build_vector (i32 imm:$src)),
+    (VCREATE_v4i32 (LOADCONST_i32 imm:$src))>;
+
+// Calls:
+def : Pat<(IL_call tglobaladdr:$dst),
+    (CALL tglobaladdr:$dst)>;
+def : Pat<(IL_call texternalsym:$dst),
+    (CALL texternalsym:$dst)>;
+def : Pat<(IL_call tconstpool:$dst),
+  (CALL tconstpool:$dst)>;
+
+include "AMDILConversions.td"
diff --git a/src/gallium/drivers/radeon/AMDILInstructions.td b/src/gallium/drivers/radeon/AMDILInstructions.td
new file mode 100644 (file)
index 0000000..f824a67
--- /dev/null
@@ -0,0 +1,2436 @@
+//===-- AMDILInstructions.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+// Operations in this file are generic to all data types
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+  defm LOADCONST  : ILConstant<"mov $dst, $val">;
+  defm MOVE       : UnaryOpMC<IL_OP_MOV, IL_mov>;
+  defm PHIMOVE    : UnaryOpMC<IL_OP_MOV, IL_phimov>;
+}
+defm BINARY_NOT : UnaryOpMC<IL_OP_I_NOT, IL_not>;
+defm BINARY_OR  : BinaryOpMC<IL_OP_I_OR, IL_or>;
+defm BINARY_AND : BinaryOpMC<IL_OP_AND, IL_and>;
+defm BINARY_XOR : BinaryOpMC<IL_OP_I_XOR, IL_xor>;
+defm AND        : BinaryOpMCInt<IL_OP_AND, and>;
+defm CMOV       : BinaryOpMC<IL_OP_CMOV, IL_cmov>;
+defm DIV_INF    : BinaryOpMC<IL_OP_DIV_INF, IL_div_inf>;
+defm SMAX       : BinaryOpMCInt<IL_OP_I_MAX, IL_smax>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder for 64bit
+// instructions
+defm CMOVLOG    : TernaryOpMC<IL_OP_CMOV_LOGICAL, IL_cmov_logical>;
+// This opcode has a custom swizzle pattern in the Swizzle Encoder and 
+// should never be selected in ISel. It should only be generated in the
+// I/O expansion code. These are different from the CMOVLOG instruction
+// in that the src0 argument uses a custom swizzle for the Y/Z/W
+// vector channel respectively instead of the default channel.
+def CMOVLOG_Y_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst),
+    (ins GPRI32:$src0, GPRI32:$src1, GPRI32:$src2),
+    !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"),
+    []>;
+def CMOVLOG_Z_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst),
+    (ins GPRI32:$src0, GPRI32:$src1, GPRI32:$src2),
+    !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"),
+    []>;
+def CMOVLOG_W_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst),
+    (ins GPRI32:$src0, GPRI32:$src1 ,GPRI32:$src2),
+    !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"),
+    []>;
+defm SELECTBIN  : TernaryOpMCScalar<IL_OP_CMOV_LOGICAL, select>;
+//===---------------------------------------------------------------------===//
+// Signed 8bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+def INTTOANY_i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins GPRI32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRI8:$dst, (IL_inttoany GPRI32:$src0))]>;
+//===---------------------------------------------------------------------===//
+// Signed 16bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+def INTTOANY_i16: OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins GPRI32:$src0),
+    !strconcat(IL_OP_MOV.Text," $dst, $src0"), 
+    [(set GPRI16:$dst, (IL_inttoany GPRI32:$src0))]>;
+//===---------------------------------------------------------------------===//
+// Signed 32bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+defm NEGATE     : UnaryOpMCi32<IL_OP_I_NEGATE, IL_inegate>;
+defm SMUL       : BinaryOpMCi32<IL_OP_I_MUL, mul>;
+defm SMULHI     : BinaryOpMCi32<IL_OP_I_MUL_HIGH, mulhs>;
+defm SHL        : BinaryOpMCi32Const<IL_OP_I_SHL, shl>;
+defm SHR        : BinaryOpMCi32Const<IL_OP_I_SHR, sra>;
+defm SHLVEC     : BinaryOpMCi32<IL_OP_I_SHL, shl>;
+defm SHRVEC     : BinaryOpMCi32<IL_OP_I_SHR, sra>;
+defm ADD        : BinaryOpMCi32<IL_OP_I_ADD, add>;
+defm CUSTOM_XOR : BinaryOpMCInt<IL_OP_I_XOR, xor>;
+// Get rid of the addri via tablegen instead of a custom lowered instruction
+defm CUSTOM_ADD : BinaryOpMCi32<IL_OP_I_ADD, IL_add>;
+defm EADD   : BinaryOpMCi32<IL_OP_I_ADD, adde>;
+def INTTOANY_i32: OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins GPRI32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"), 
+    [(set GPRI32:$dst, (IL_inttoany GPRI32:$src0))]>;
+// Integer offsets for addressing
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def ADDir       : TwoInOneOut<IL_OP_I_ADD, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$offset),
+          !strconcat(IL_OP_I_ADD.Text, " $dst, $ptr, $offset"),
+          [(set GPRI32:$dst,
+        (IL_addaddrri ADDR:$ptr,
+          (i32 GPRI32:$offset)))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def ADDri       : TwoInOneOut<IL_OP_I_ADD, (outs GPRI32:$dst),
+      (ins GPRI32:$offset,  MEMI32:$ptr),
+          !strconcat(IL_OP_I_ADD.Text, " $dst, $offset, $ptr"),
+          [(set GPRI32:$dst,
+        (IL_addaddrir
+          (i32 GPRI32:$offset), ADDR:$ptr))]>;
+
+defm IFFB_HI    : UnaryOpMCi32<IL_OP_I_FFB_HI, IL_ffb_hi>;
+defm IFFB_LO    : UnaryOpMCi32<IL_OP_I_FFB_LO, IL_ffb_lo>;
+let mayLoad = 0, mayStore = 0 in {
+defm ABS : UnaryIntrinsicInt<IL_OP_ABS, int_AMDIL_abs>;
+defm BITCOUNT : UnaryIntrinsicInt<IL_OP_IBIT_COUNT, int_AMDIL_bit_count_i32>;
+defm FFB_LO : UnaryIntrinsicInt<IL_OP_I_FFB_LO, int_AMDIL_bit_find_first_lo>;
+defm FFB_HI : UnaryIntrinsicInt<IL_OP_I_FFB_HI, int_AMDIL_bit_find_first_hi>;
+defm FFB_SGN : UnaryIntrinsicInt<IL_OP_I_FFB_SGN,
+        int_AMDIL_bit_find_first_sgn>;
+defm IMULHI  : BinaryIntrinsicInt<IL_OP_I_MUL_HIGH, int_AMDIL_mulhi_i32>;
+let Predicates = [HasHWSign24Bit] in {
+defm IMUL24 : BinaryIntrinsicInt<IL_OP_I_MUL24, int_AMDIL_mul24_i32>;
+defm IMULHI24 : BinaryIntrinsicInt<IL_OP_I_MULHI24, int_AMDIL_mulhi24_i32>;
+defm IMAD24  : TernaryIntrinsicInt<IL_OP_I_MAD24, int_AMDIL_mad24_i32>;
+}
+defm CARRY  : BinaryIntrinsicInt<IL_OP_I_CARRY, int_AMDIL_carry_i32>;
+defm BORROW  : BinaryIntrinsicInt<IL_OP_I_BORROW, int_AMDIL_borrow_i32>;
+defm IMIN  : BinaryIntrinsicInt<IL_OP_I_MIN, int_AMDIL_min_i32>;
+defm IMAX  : BinaryIntrinsicInt<IL_OP_I_MAX, int_AMDIL_max_i32>;
+defm CMOV_LOG  : TernaryIntrinsicInt<IL_OP_CMOV_LOGICAL,
+          int_AMDIL_cmov_logical>;
+defm IBIT_EXTRACT : TernaryIntrinsicInt<IL_OP_IBIT_EXTRACT,
+          int_AMDIL_bit_extract_i32>;
+defm IMAD  : TernaryIntrinsicInt<IL_OP_I_MAD, int_AMDIL_mad_i32>;
+defm SAD  : TernaryIntrinsicInt<IL_OP_SAD, int_AMDIL_media_sad>;
+defm SADHI  : TernaryIntrinsicInt<IL_OP_SAD_HI,
+          int_AMDIL_media_sad_hi>;
+}
+def SAD4_i32  : ThreeInOneOut<IL_OP_SAD4, (outs GPRI32:$dst),
+      (ins GPRV4I32:$src, GPRV4I32:$src1, GPRI32:$src2),
+      !strconcat(IL_OP_SAD4.Text, " $dst, $src, $src1, $src2"),
+      [(set GPRI32:$dst,
+      (int_AMDIL_media_sad4 GPRV4I32:$src, GPRV4I32:$src1,
+      GPRI32:$src2))]>;
+def FTOV4U8_i32 : OneInOneOut<IL_OP_F2U4, (outs GPRI32:$dst),
+      (ins GPRV4F32:$src),
+      !strconcat(IL_OP_F2U4.Text, " $dst, $src"),
+      [(set GPRI32:$dst,
+      (int_AMDIL_media_convert_f2v4u8 GPRV4F32:$src))]>;
+//===---------------------------------------------------------------------===//
+// Unsigned 32bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+defm UMUL       : BinaryOpMCi32<IL_OP_U_MUL, IL_umul>;
+defm UMULHI     : BinaryOpMCi32<IL_OP_U_MUL_HIGH, mulhu>;
+defm USHR       : BinaryOpMCi32Const<IL_OP_U_SHR, srl>;
+defm USHRVEC    : BinaryOpMCi32<IL_OP_U_SHR, srl>;
+defm UDIV       : BinaryOpMCi32<IL_OP_U_DIV, udiv>;
+defm NATIVE_UDIV  : BinaryIntrinsicInt<IL_OP_U_DIV, int_AMDIL_udiv>;
+let mayLoad=0, mayStore=0 in {
+defm UBIT_REVERSE : UnaryIntrinsicInt<IL_OP_UBIT_REVERSE,
+        int_AMDIL_bit_reverse_u32>;
+defm UMULHI_INT : BinaryIntrinsicInt<IL_OP_U_MUL_HIGH, int_AMDIL_mulhi_u32>;
+defm UMULHI24   : BinaryIntrinsicInt<IL_OP_U_MULHI24, int_AMDIL_mulhi24_u32>;
+defm UMUL24     : BinaryIntrinsicInt<IL_OP_U_MUL24, int_AMDIL_mul24_u32>;
+defm UMIN  : BinaryIntrinsicInt<IL_OP_U_MIN, int_AMDIL_min_u32>;
+defm UMAX  : BinaryIntrinsicInt<IL_OP_U_MAX, int_AMDIL_max_u32>;
+defm UBIT_EXTRACT : TernaryIntrinsicInt<IL_OP_UBIT_EXTRACT,
+          int_AMDIL_bit_extract_u32>;
+defm UBIT_INSERT : QuaternaryIntrinsicInt<IL_OP_UBIT_INSERT,
+          int_AMDIL_bit_insert_u32>;
+defm BFI : TernaryIntrinsicInt<IL_OP_BFI, int_AMDIL_bfi>;
+defm BFM : BinaryIntrinsicInt<IL_OP_BFM, int_AMDIL_bfm>;
+defm UMAD  : TernaryIntrinsicInt<IL_OP_U_MAD, int_AMDIL_mad_u32>;
+defm UMAD24  : TernaryIntrinsicInt<IL_OP_U_MAD24, int_AMDIL_mad24_u32>;
+defm U4LERP  : TernaryIntrinsicInt<IL_OP_U4_LERP,
+          int_AMDIL_media_lerp_u4>;
+defm BITALIGN : TernaryIntrinsicInt<IL_OP_BIT_ALIGN, int_AMDIL_media_bitalign>;
+defm BYTEALIGN : TernaryIntrinsicInt<IL_OP_BYTE_ALIGN, int_AMDIL_media_bytealign>;
+}
+//===---------------------------------------------------------------------===//
+// Signed 64bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LNEGATE     : OneInOneOut<IL_OP_MOV,  (outs GPRI64:$dst), (ins GPRI64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRI64:$dst, (IL_inegate GPRI64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LNEGATE_v2i64: OneInOneOut<IL_OP_MOV,  (outs GPRV2I64:$dst),
+                (ins GPRV2I64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRV2I64:$dst, (IL_inegate GPRV2I64:$src))]>;
+let Predicates = [HasHW64Bit] in {
+def LADD        : TwoInOneOut<IL_OP_I64_ADD, (outs GPRI64:$dst),
+                  (ins GPRI64:$src1, GPRI64:$src2),
+                  !strconcat(IL_OP_I64_ADD.Text, " $dst, $src1, $src2"),
+                [(set GPRI64:$dst, (IL_add GPRI64:$src1, GPRI64:$src2))]>;
+defm IMIN64 : BinaryIntrinsicLong<IL_OP_I64_MIN, int_AMDIL_min_i32>;
+defm UMIN64 : BinaryIntrinsicLong<IL_OP_U64_MIN, int_AMDIL_min_u32>;
+defm IMAX64 : BinaryIntrinsicLong<IL_OP_I64_MAX, int_AMDIL_max_i32>;
+defm UMAX64 : BinaryIntrinsicLong<IL_OP_U64_MAX, int_AMDIL_max_u32>;
+}
+let Predicates = [HasHW64Bit] in {
+def LSHR        : TwoInOneOut<IL_OP_I64_SHR, (outs GPRI64:$dst),
+                  (ins GPRI64:$src1, GPRI32:$src2),
+                  !strconcat(IL_OP_I64_SHR.Text, " $dst, $src1, $src2"),
+                [(set GPRI64:$dst, (sra GPRI64:$src1, GPRI32:$src2))]>;
+def LSHL       : TwoInOneOut<IL_OP_I64_SHL, (outs GPRI64:$dst),
+                  (ins GPRI64:$src1, GPRI32:$src2),
+                  !strconcat(IL_OP_I64_SHL.Text, " $dst, $src1, $src2"),
+                [(set GPRI64:$dst, (shl GPRI64:$src1, GPRI32:$src2))]>;
+}
+
+
+//===---------------------------------------------------------------------===//
+// Unsigned 64bit integer math instructions start here
+//===---------------------------------------------------------------------===//
+let Predicates = [HasTmrRegister] in {
+  def Tmr : ILFormat<IL_OP_MOV, (outs GPRI64:$tmr),
+      (ins), !strconcat(IL_OP_MOV.Text, " $tmr, Tmr"),
+      [(set GPRI64:$tmr, (int_AMDIL_get_cycle_count))]>;
+}
+let Predicates = [HasDeviceIDInst] in {
+def CU_ID : ILFormat<IL_OP_CU_ID, (outs GPRI32:$id), (ins),
+    !strconcat(IL_OP_CU_ID.Text, " $id"),
+    [(set GPRI32:$id, (int_AMDIL_compute_unit_id))]>;
+def WAVE_ID : ILFormat<IL_OP_WAVE_ID, (outs GPRI32:$id), (ins),
+    !strconcat(IL_OP_WAVE_ID.Text, " $id"),
+    [(set GPRI32:$id, (int_AMDIL_wavefront_id))]>;
+}
+let Predicates = [HasHW64Bit] in {
+def LUSHR        : TwoInOneOut<IL_OP_U64_SHR, (outs GPRI64:$dst),
+                  (ins GPRI64:$src1, GPRI32:$src2),
+                  !strconcat(IL_OP_U64_SHR.Text, " $dst, $src1, $src2"),
+                [(set GPRI64:$dst, (srl GPRI64:$src1, GPRI32:$src2))]>;
+}
+
+
+//===---------------------------------------------------------------------===//
+// Generic Float Instructions
+//===---------------------------------------------------------------------===//
+let hasIEEEFlag = 1 in {
+defm MUL_IEEE  : BinaryOpMCFloat<IL_OP_MUL_IEEE, IL_OP_D_MUL, fmul>;
+}
+defm ADD  : BinaryOpMCFloat<IL_OP_ADD, IL_OP_D_ADD, fadd>;
+//===---------------------------------------------------------------------===//
+// float math instructions start here
+//===---------------------------------------------------------------------===//
+let mayLoad=0, mayStore=0 in {
+defm ABS : UnaryIntrinsicFloat<IL_OP_ABS, int_AMDIL_fabs>;
+defm FRAC : UnaryIntrinsicFloat<IL_OP_FRC, int_AMDIL_fraction>;
+defm PIREDUCE : UnaryIntrinsicFloat<IL_OP_PI_REDUCE, int_AMDIL_pireduce>;
+defm ROUND_NEAREST : UnaryIntrinsicFloat<IL_OP_ROUND_NEAR,
+          int_AMDIL_round_nearest>;
+defm ROUND_NEGINF : UnaryIntrinsicFloat<IL_OP_ROUND_NEG_INF,
+          int_AMDIL_round_neginf>;
+defm ROUND_POSINF : UnaryIntrinsicFloat<IL_OP_ROUND_POS_INF,
+          int_AMDIL_round_posinf>;
+defm ROUND_ZERO : UnaryIntrinsicFloat<IL_OP_ROUND_ZERO,
+          int_AMDIL_round_zero>;
+defm ACOS : UnaryIntrinsicFloatScalar<IL_OP_ACOS, int_AMDIL_acos>;
+defm ATAN : UnaryIntrinsicFloatScalar<IL_OP_ATAN, int_AMDIL_atan>;
+defm ASIN : UnaryIntrinsicFloatScalar<IL_OP_ASIN, int_AMDIL_asin>;
+defm TAN : UnaryIntrinsicFloatScalar<IL_OP_TAN, int_AMDIL_tan>;
+defm SIN : UnaryIntrinsicFloatScalar<IL_OP_SIN, int_AMDIL_sin>;
+defm COS : UnaryIntrinsicFloatScalar<IL_OP_COS, int_AMDIL_cos>;
+defm SQRT : UnaryIntrinsicFloatScalar<IL_OP_SQRT, int_AMDIL_sqrt>;
+defm EXP : UnaryIntrinsicFloatScalar<IL_OP_EXP, int_AMDIL_exp>;
+defm EXPVEC : UnaryIntrinsicFloat<IL_OP_EXP_VEC, int_AMDIL_exp_vec>;
+defm SQRTVEC : UnaryIntrinsicFloat<IL_OP_SQRT_VEC, int_AMDIL_sqrt_vec>;
+defm COSVEC : UnaryIntrinsicFloat<IL_OP_COS_VEC, int_AMDIL_cos_vec>;
+defm SINVEC : UnaryIntrinsicFloat<IL_OP_SIN_VEC, int_AMDIL_sin_vec>;
+defm LOGVEC : UnaryIntrinsicFloat<IL_OP_LOG_VEC, int_AMDIL_log_vec>;
+defm RSQVEC : UnaryIntrinsicFloat<IL_OP_RSQ_VEC, int_AMDIL_rsq_vec>;
+defm EXN : UnaryIntrinsicFloatScalar<IL_OP_EXN, int_AMDIL_exn>;
+defm SIGN : UnaryIntrinsicFloat<IL_OP_SGN, int_AMDIL_sign>;
+defm LENGTH : UnaryIntrinsicFloat<IL_OP_LEN, int_AMDIL_length>;
+defm POW : BinaryIntrinsicFloat<IL_OP_POW, int_AMDIL_pow>;
+}
+
+let hasIEEEFlag = 1 in {
+  let mayLoad = 0, mayStore=0 in {
+defm MIN  : BinaryIntrinsicFloat<IL_OP_MIN, int_AMDIL_min>;
+defm MAX  : BinaryIntrinsicFloat<IL_OP_MAX, int_AMDIL_max>;
+defm MAD  : TernaryIntrinsicFloat<IL_OP_MAD, int_AMDIL_mad>;
+  }
+defm MOD  : BinaryOpMCf32<IL_OP_MOD, frem>;
+}
+let hasZeroOpFlag = 1 in {
+  let mayLoad = 0, mayStore=0 in {
+defm LN  : UnaryIntrinsicFloatScalar<IL_OP_LN, int_AMDIL_ln>;
+defm LOG : UnaryIntrinsicFloatScalar<IL_OP_LOG, int_AMDIL_log>;
+defm RSQ : UnaryIntrinsicFloatScalar<IL_OP_RSQ, int_AMDIL_rsq>;
+defm DIV  : BinaryIntrinsicFloat<IL_OP_DIV, int_AMDIL_div>;
+  }
+}
+let mayLoad = 0, mayStore = 0 in {
+defm CLAMP : TernaryIntrinsicFloat<IL_OP_CLAMP, int_AMDIL_clamp>;
+defm FMA  : TernaryIntrinsicFloat<IL_OP_FMA, int_AMDIL_fma>;
+defm LERP  : TernaryIntrinsicFloat<IL_OP_LERP, int_AMDIL_lerp>;
+}
+defm SUB  : BinaryOpMCf32<IL_OP_SUB, fsub>;
+defm FABS  : UnaryOpMCf32<IL_OP_ABS, fabs>;
+defm FMAD  : TernaryOpMCf32<IL_OP_MAD, IL_mad>;
+defm NEAR : UnaryOpMCf32<IL_OP_ROUND_NEAR, fnearbyint>;
+defm RND_Z : UnaryOpMCf32<IL_OP_ROUND_ZERO, ftrunc>;
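Editorial note (not part of the patch): NEAR lowers `fnearbyint` and RND_Z lowers `ftrunc`, i.e. round-to-nearest (typically half-to-even under the default rounding mode) versus round-toward-zero. A minimal sketch of the difference, using Python's built-ins as stand-ins:

```python
import math

# fnearbyint-style rounding: nearest integer, ties to even under the
# default rounding mode (Python's round() models this)
assert round(2.5) == 2   # ties go to the even integer, not up
assert round(3.5) == 4

# ftrunc-style rounding: toward zero, regardless of sign
assert math.trunc(-1.7) == -1   # not -2 (which floor would give)
assert math.floor(-1.7) == -2
```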
+
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def NEG_f32         : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst),
+  (ins GPRF32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRF32:$dst, (fneg GPRF32:$src0))]>;
+def INTTOANY_f32    : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst),
+  (ins GPRI32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRF32:$dst, (IL_inttoany GPRI32:$src0))]>;
+let hasIEEEFlag = 1 in {
+def DP2ADD_f32 : ThreeInOneOut<IL_OP_DP2_ADD, (outs GPRF32:$dst),
+    (ins GPRV2F32:$src0, GPRV2F32:$src1, GPRF32:$src2),
+    !strconcat(IL_OP_DP2_ADD.Text, " $dst, $src0, $src1, $src2"),
+    [(set GPRF32:$dst,
+    (int_AMDIL_dp2_add GPRV2F32:$src0,
+    GPRV2F32:$src1, GPRF32:$src2))]>;
+def DP2_f32 : TwoInOneOut<IL_OP_DP2, (outs GPRF32:$dst),
+    (ins GPRV2F32:$src0, GPRV2F32:$src1),
+    !strconcat(IL_OP_DP2.Text, " $dst, $src0, $src1"),
+    [(set GPRF32:$dst,
+    (int_AMDIL_dp2 GPRV2F32:$src0, GPRV2F32:$src1))]>;
+def DP3_f32 : TwoInOneOut<IL_OP_DP3, (outs GPRF32:$dst),
+    (ins GPRV4F32:$src0, GPRV4F32:$src1),
+    !strconcat(IL_OP_DP3.Text, " $dst, $src0, $src1"),
+    [(set GPRF32:$dst,
+    (int_AMDIL_dp3 GPRV4F32:$src0, GPRV4F32:$src1))]>;
+def DP4_f32 : TwoInOneOut<IL_OP_DP4, (outs GPRF32:$dst),
+    (ins GPRV4F32:$src0, GPRV4F32:$src1),
+    !strconcat(IL_OP_DP4.Text, " $dst, $src0, $src1"),
+    [(set GPRF32:$dst,
+    (int_AMDIL_dp4 GPRV4F32:$src0, GPRV4F32:$src1))]>;
+}
+defm UNPACK_B0 : IntrConvertI32TOF32<IL_OP_UNPACK_0, int_AMDIL_media_unpack_byte_0>;
+defm UNPACK_B1 : IntrConvertI32TOF32<IL_OP_UNPACK_1, int_AMDIL_media_unpack_byte_1>;
+defm UNPACK_B2 : IntrConvertI32TOF32<IL_OP_UNPACK_2, int_AMDIL_media_unpack_byte_2>;
+defm UNPACK_B3 : IntrConvertI32TOF32<IL_OP_UNPACK_3, int_AMDIL_media_unpack_byte_3>;
+defm FTOI_FLR  : IntrConvertF32TOI32<IL_OP_FTOI_FLR, int_AMDIL_convert_f32_i32_flr>;
+defm FTOI_RPI  : IntrConvertF32TOI32<IL_OP_FTOI_RPI, int_AMDIL_convert_f32_i32_rpi>;
+defm HTOF      : IntrConvertF16TOF32<IL_OP_F16_TO_F32, int_AMDIL_convert_f16_f32>;
+defm FTOH      : IntrConvertF32TOF16<IL_OP_F32_TO_F16, int_AMDIL_convert_f32_f16>;
+defm FTOH_NEAR     : IntrConvertF32TOF16<IL_OP_F32_TO_F16_NEAR, int_AMDIL_convert_f32_f16_near>;
+defm FTOH_NEG_INF  : IntrConvertF32TOF16<IL_OP_F32_TO_F16_NEG_INF, int_AMDIL_convert_f32_f16_neg_inf>;
+defm FTOH_PLUS_INF : IntrConvertF32TOF16<IL_OP_F32_TO_F16_PLUS_INF, int_AMDIL_convert_f32_f16_plus_inf>;
+//===---------------------------------------------------------------------===//
+// float math instructions end here
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// float2 math instructions start here
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def NEG_v2f32       : OneInOneOut<IL_OP_MOV, (outs GPRV2F32:$dst),
+  (ins GPRV2F32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRV2F32:$dst, (fneg GPRV2F32:$src0))]>;
+//===---------------------------------------------------------------------===//
+// float2 math instructions end here
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// float4 math instructions start here
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def NEG_v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst),
+  (ins GPRV4F32:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRV4F32:$dst, (fneg GPRV4F32:$src0))]>;
+//===---------------------------------------------------------------------===//
+// float4 math instructions end here
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// double math instructions start here
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def SUB_f64        : TwoInOneOut<IL_OP_D_ADD, (outs GPRF64:$dst),
+  (ins GPRF64:$src0, GPRF64:$src1),
+     !strconcat(IL_OP_D_ADD.Text, " $dst, $src0, $src1"),
+     [(set GPRF64:$dst, (fsub GPRF64:$src0, GPRF64:$src1))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def SUB_v2f64       : TwoInOneOut<IL_OP_D_ADD, (outs GPRV2F64:$dst),
+  (ins GPRV2F64:$src0, GPRV2F64:$src1),
+     !strconcat(IL_OP_D_ADD.Text, " $dst, $src0, $src1"),
+     [(set GPRV2F64:$dst, (fsub GPRV2F64:$src0, GPRV2F64:$src1))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def NEG_f64       : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst),
+  (ins GPRF64:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRF64:$dst, (fneg GPRF64:$src0))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def NEG_v2f64       : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst),
+  (ins GPRV2F64:$src0),
+    !strconcat(IL_OP_MOV.Text, " $dst, $src0"),
+    [(set GPRV2F64:$dst, (fneg GPRV2F64:$src0))]>;
+let mayLoad = 0, mayStore = 0 in {
+defm MIN  : BinaryIntrinsicDouble<IL_OP_D_MIN, int_AMDIL_min>;
+defm MAX  : BinaryIntrinsicDouble<IL_OP_D_MAX, int_AMDIL_max>;
+defm DIV  : BinaryIntrinsicDouble<IL_OP_D_DIV, int_AMDIL_div>;
+defm MAD  : TernaryIntrinsicDouble<IL_OP_D_MAD, int_AMDIL_mad>;
+defm DFMA : TernaryIntrinsicDouble<IL_OP_D_MAD, int_AMDIL_fma>;
+defm FRAC : UnaryIntrinsicDouble<IL_OP_D_FRC, int_AMDIL_fraction>;
+defm SQRT : UnaryIntrinsicDouble<IL_OP_D_SQRT, int_AMDIL_sqrt>;
+defm RSQ  : UnaryIntrinsicDoubleScalar<IL_OP_D_RSQ, int_AMDIL_rsq>;
+defm RCP  : UnaryIntrinsicDoubleScalar<IL_OP_D_RCP, int_AMDIL_drcp>;
+defm DMAD : TernaryOpMCf64<IL_OP_D_MAD, IL_mad>;
+}
+def FREXP_f64 : OneInOneOut<IL_OP_D_FREXP, (outs GPRV2I64:$dst),
+      (ins GPRF64:$src),
+      !strconcat(IL_OP_D_FREXP.Text," $dst, $src"),
+      [(set GPRV2I64:$dst,
+      (int_AMDIL_frexp_f64 GPRF64:$src))]>;
+def LDEXP_f64 : TwoInOneOut<IL_OP_D_LDEXP, (outs GPRF64:$dst),
+      (ins GPRF64:$src, GPRI32:$src1),
+      !strconcat(IL_OP_D_LDEXP.Text, " $dst, $src, $src1"),
+      [(set GPRF64:$dst,
+      (int_AMDIL_ldexp GPRF64:$src, GPRI32:$src1))]>;
+def LDEXP_v2f64 : TwoInOneOut<IL_OP_D_LDEXP, (outs GPRV2F64:$dst),
+      (ins GPRV2F64:$src, GPRV2I32:$src1),
+      !strconcat(IL_OP_D_LDEXP.Text, " $dst, $src, $src1"),
+      [(set GPRV2F64:$dst,
+      (int_AMDIL_ldexp GPRV2F64:$src, GPRV2I32:$src1))]>;
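Editorial note (not part of the patch): the FREXP/LDEXP pair above follows the usual C math-library semantics, though the hardware FREXP here packs its result into a v2i64 rather than returning two scalars. The scalar analogue, as a sketch:

```python
import math

# ldexp(x, n) computes x * 2**n; frexp is its inverse, splitting x into a
# mantissa in [0.5, 1) and an integer exponent
assert math.ldexp(0.75, 4) == 12.0
mantissa, exponent = math.frexp(12.0)
assert (mantissa, exponent) == (0.75, 4)
```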
+//===---------------------------------------------------------------------===//
+// double math instructions end here
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+// Various Macros
+//===---------------------------------------------------------------------===//
+def MACRO__sdiv_i8   : BinaryMacro< GPRI8, GPRI8, GPRI8, sdiv>;
+def MACRO__sdiv_i16  : BinaryMacro<GPRI16, GPRI16, GPRI16, sdiv>;
+def MACRO__sdiv_i32  : BinaryMacro<GPRI32, GPRI32, GPRI32, sdiv>;
+def MACRO__udiv_i8   : BinaryMacro< GPRI8, GPRI8, GPRI8, udiv>;
+def MACRO__udiv_i16  : BinaryMacro<GPRI16, GPRI16, GPRI16, udiv>;
+def MACRO__udiv_i32  : BinaryMacro<GPRI32, GPRI32, GPRI32, udiv>;
+def MACRO__smod_i8   : BinaryMacro< GPRI8, GPRI8, GPRI8, srem>;
+def MACRO__smod_i16  : BinaryMacro<GPRI16, GPRI16, GPRI16, srem>;
+def MACRO__smod_i32  : BinaryMacro<GPRI32, GPRI32, GPRI32, srem>;
+def MACRO__umod_i8   : BinaryMacro< GPRI8, GPRI8, GPRI8, urem>;
+def MACRO__umod_i16  : BinaryMacro<GPRI16, GPRI16, GPRI16, urem>;
+def MACRO__umod_i32  : BinaryMacro<GPRI32, GPRI32, GPRI32, urem>;
+let Predicates = [HasSWDDiv] in {
+  def MACRO__ddiv_f64: BinaryMacro<GPRF64, GPRF64, GPRF64, fdiv>;
+}
+let Predicates = [HasHWDDiv] in {
+  def MACRO__ddiv_f64_fma: BinaryMacro<GPRF64, GPRF64, GPRF64, fdiv>;
+}
+def MACRO__ftol_i64  : UnaryMacro<GPRI64, GPRF32, fp_to_sint>;
+def MACRO__ftoul_i64 : UnaryMacro<GPRI64, GPRF32, fp_to_uint>;
+def MACRO__ultof_f32 : UnaryMacro<GPRF32, GPRI64, uint_to_fp>;
+def MACRO__ltof_f32  : UnaryMacro<GPRF32, GPRI64, sint_to_fp>;
+let Predicates = [HasSW64Mul] in {
+def MACRO__mul_i64   : BinaryMacro<GPRI64, GPRI64, GPRI64, mul>;
+def MACRO__mul_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I64, mul>;
+}
+let Predicates = [HasSW64DivMod] in {
+def MACRO__sdiv_i64  : BinaryMacro<GPRI64, GPRI64, GPRI64, sdiv>;
+def MACRO__udiv_i64  : BinaryMacro<GPRI64, GPRI64, GPRI64, udiv>;
+def MACRO__smod_i64  : BinaryMacro<GPRI64, GPRI64, GPRI64, srem>;
+def MACRO__umod_i64  : BinaryMacro<GPRI64, GPRI64, GPRI64, urem>;
+}
+let Predicates = [HasHW64DivMod] in {
+  defm SDIV : BinaryOpMCi64<IL_OP_I64_DIV, sdiv>;
+  defm UDIV : BinaryOpMCi64<IL_OP_U64_DIV, udiv>;
+  defm SMOD : BinaryOpMCi64<IL_OP_I64_MOD, srem>;
+  defm UMOD : BinaryOpMCi64<IL_OP_U64_MOD, urem>;
+}
+let Predicates = [HasHW64Mul] in {
+  defm SMUL       : BinaryOpMCi64<IL_OP_I64_MUL, mul>;
+  defm UMUL       : BinaryOpMCi64<IL_OP_U64_MUL, IL_umul>;
+}
+def MACRO__shr_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, srl>;
+def MACRO__shl_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, shl>;
+def MACRO__sra_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, sra>;
+
+let Predicates = [HasSW64Bit] in {
+def MACRO__shr_i64   : BinaryMacro<GPRI64, GPRI64, GPRI32, srl>;
+def MACRO__shl_i64   : BinaryMacro<GPRI64, GPRI64, GPRI32, shl>;
+def MACRO__sra_i64   : BinaryMacro<GPRI64, GPRI64, GPRI32, sra>;
+}
+//===---------------------------------------------------------------------===//
+// Comparison Instructions
+//===---------------------------------------------------------------------===//
+let usesCustomInserter = 1 in {
+    defm CMP : Compare<"Pseudo comparison instr">;
+}
+//===---------------------------------------------------------------------===//
+// 32-bit floating point operations
+//===---------------------------------------------------------------------===//
+def FEQ         : TwoInOneOut<IL_OP_EQ, (outs GPRF32:$dst),
+        (ins GPRF32:$lhs, GPRF32:$rhs),
+        !strconcat(IL_OP_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def FGE         : TwoInOneOut<IL_OP_GE, (outs GPRF32:$dst),
+        (ins GPRF32:$lhs, GPRF32:$rhs),
+        !strconcat(IL_OP_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def FLT         : TwoInOneOut<IL_OP_LT, (outs GPRF32:$dst),
+        (ins GPRF32:$lhs, GPRF32:$rhs),
+        !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def FLT_v2f32 : TwoInOneOut<IL_OP_LT, (outs GPRV2F32:$dst),
+        (ins GPRV2F32:$lhs, GPRV2F32:$rhs),
+        !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def FLT_v4f32 : TwoInOneOut<IL_OP_LT, (outs GPRV4F32:$dst),
+        (ins GPRV4F32:$lhs, GPRV4F32:$rhs),
+        !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def FNE         : TwoInOneOut<IL_OP_NE, (outs GPRF32:$dst),
+        (ins GPRF32:$lhs, GPRF32:$rhs),
+        !strconcat(IL_OP_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+
+//===---------------------------------------------------------------------===//
+//TODO: need to correctly define comparison instructions
+//===---------------------------------------------------------------------===//
+def DEQ        : TwoInOneOut<IL_OP_D_EQ, (outs GPRF64:$dst),
+        (ins GPRF64:$lhs, GPRF64:$rhs),
+        !strconcat(IL_OP_D_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def DEQ_v2f64        : TwoInOneOut<IL_OP_D_EQ, (outs GPRV2F64:$dst),
+        (ins GPRV2F64:$lhs, GPRV2F64:$rhs),
+        !strconcat(IL_OP_D_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def DGE        : TwoInOneOut<IL_OP_D_GE, (outs GPRF64:$dst),
+        (ins GPRF64:$lhs, GPRF64:$rhs),
+        !strconcat(IL_OP_D_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def DLT        : TwoInOneOut<IL_OP_D_LT, (outs GPRF64:$dst),
+        (ins GPRF64:$lhs, GPRF64:$rhs),
+        !strconcat(IL_OP_D_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def DNE        : TwoInOneOut<IL_OP_D_NE, (outs GPRF64:$dst),
+        (ins GPRF64:$lhs, GPRF64:$rhs),
+        !strconcat(IL_OP_D_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+
+//===---------------------------------------------------------------------===//
+//TODO: need to correctly define comparison instructions
+//===---------------------------------------------------------------------===//
+def IEQ        : TwoInOneOut<IL_OP_I_EQ, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def IEQ_v2i32        : TwoInOneOut<IL_OP_I_EQ, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def IEQ_v4i32        : TwoInOneOut<IL_OP_I_EQ, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def IGE        : TwoInOneOut<IL_OP_I_GE, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def IGE_v2i32        : TwoInOneOut<IL_OP_I_GE, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def IGE_v4i32        : TwoInOneOut<IL_OP_I_GE, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ILT        : TwoInOneOut<IL_OP_I_LT, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ILT_v2i32        : TwoInOneOut<IL_OP_I_LT, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ILT_v4i32        : TwoInOneOut<IL_OP_I_LT, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def INE        : TwoInOneOut<IL_OP_I_NE, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def INE_v2i32        : TwoInOneOut<IL_OP_I_NE, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def INE_v4i32        : TwoInOneOut<IL_OP_I_NE, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+let Predicates = [HasHW64Bit] in {
+def LEQ        : TwoInOneOut<IL_OP_I64_EQ, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def LGE        : TwoInOneOut<IL_OP_I64_GE, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def LLE        : TwoInOneOut<IL_OP_I64_GE, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_GE.Text, " $dst, $rhs, $lhs")
+        , []>;
+def LGT        : TwoInOneOut<IL_OP_I64_LT, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_LT.Text, " $dst, $rhs, $lhs")
+        , []>;
+def LLT        : TwoInOneOut<IL_OP_I64_LT, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def LNE        : TwoInOneOut<IL_OP_I64_NE, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_I64_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+}
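Editorial note (not part of the patch): LLE and LGT above carry no pattern of their own; they reuse IL_OP_I64_GE and IL_OP_I64_LT with the `$rhs, $lhs` operand order in the assembly string. The identities they rely on, as bare assertions:

```python
# a <= b is equivalent to b >= a, and a > b is equivalent to b < a,
# which is why LE/GT can be encoded with the GE/LT opcodes and
# swapped source operands
vals = range(-3, 4)
assert all((a <= b) == (b >= a) for a in vals for b in vals)
assert all((a > b) == (b < a) for a in vals for b in vals)
```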
+
+//===---------------------------------------------------------------------===//
+// Unsigned Integer Operations
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+//TODO: need to correctly define comparison instructions
+//===---------------------------------------------------------------------===//
+def UEQ        : TwoInOneOut<IL_OP_I_EQ, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UEQ_v2i32        : TwoInOneOut<IL_OP_I_EQ, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UEQ_v4i32        : TwoInOneOut<IL_OP_I_EQ, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ULE        : TwoInOneOut<IL_OP_U_GE, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $rhs, $lhs")
+        , []>;
+def ULE_v2i32        : TwoInOneOut<IL_OP_U_GE, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $rhs, $lhs")
+        , []>;
+def ULE_v4i32        : TwoInOneOut<IL_OP_U_GE, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $rhs, $lhs")
+        , []>;
+def UGT        : TwoInOneOut<IL_OP_U_LT, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $rhs, $lhs")
+        , []>;
+def UGT_v2i32        : TwoInOneOut<IL_OP_U_LT, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $rhs, $lhs")
+        , []>;
+def UGT_v4i32        : TwoInOneOut<IL_OP_U_LT, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $rhs, $lhs")
+        , []>;
+def UGE        : TwoInOneOut<IL_OP_U_GE, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UGE_v2i32        : TwoInOneOut<IL_OP_U_GE, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UGE_v4i32        : TwoInOneOut<IL_OP_U_GE, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ULT        : TwoInOneOut<IL_OP_U_LT, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ULT_v2i32        : TwoInOneOut<IL_OP_U_LT, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ULT_v4i32        : TwoInOneOut<IL_OP_U_LT, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UNE        : TwoInOneOut<IL_OP_I_NE, (outs GPRI32:$dst),
+        (ins GPRI32:$lhs, GPRI32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UNE_v2i32        : TwoInOneOut<IL_OP_I_NE, (outs GPRV2I32:$dst),
+        (ins GPRV2I32:$lhs, GPRV2I32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def UNE_v4i32        : TwoInOneOut<IL_OP_I_NE, (outs GPRV4I32:$dst),
+        (ins GPRV4I32:$lhs, GPRV4I32:$rhs),
+        !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs")
+        , []>;
+let Predicates = [HasHW64Bit] in {
+def ULLE        : TwoInOneOut<IL_OP_U64_GE, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_U64_GE.Text, " $dst, $rhs, $lhs")
+        , []>;
+def ULGT        : TwoInOneOut<IL_OP_U64_LT, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_U64_LT.Text, " $dst, $rhs, $lhs")
+        , []>;
+def ULGE        : TwoInOneOut<IL_OP_U64_GE, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_U64_GE.Text, " $dst, $lhs, $rhs")
+        , []>;
+def ULLT        : TwoInOneOut<IL_OP_U64_LT, (outs GPRI64:$dst),
+        (ins GPRI64:$lhs, GPRI64:$rhs),
+        !strconcat(IL_OP_U64_LT.Text, " $dst, $lhs, $rhs")
+        , []>;
+}
+//===---------------------------------------------------------------------===//
+// Scalar ==> Scalar conversion functions
+//===---------------------------------------------------------------------===//
+// f32 ==> f64
+def FTOD        : UnaryOp<IL_OP_F_2_D,         fextend,     GPRF64, GPRF32>;
+// f64 ==> f32
+def DTOF        : UnaryOp<IL_OP_D_2_F,         IL_d2f,     GPRF32, GPRF64>;
+// f32 ==> i32 signed
+def FTOI        : UnaryOp<IL_OP_FTOI,          fp_to_sint, GPRI32, GPRF32>;
+def FTOI_v2i32  : UnaryOp<IL_OP_FTOI,          fp_to_sint, GPRV2I32, GPRV2F32>;
+def FTOI_v4i32  : UnaryOp<IL_OP_FTOI,          fp_to_sint, GPRV4I32, GPRV4F32>;
+// i32 ==> f32 signed
+def ITOF        : UnaryOp<IL_OP_ITOF,          sint_to_fp, GPRF32, GPRI32>;
+def ITOF_v2f32  : UnaryOp<IL_OP_ITOF,          sint_to_fp, GPRV2F32, GPRV2I32>;
+def ITOF_v4f32  : UnaryOp<IL_OP_ITOF,          sint_to_fp, GPRV4F32, GPRV4I32>;
+// f32 ==> i32 unsigned
+def FTOU        : UnaryOp<IL_OP_FTOU,          fp_to_uint, GPRI32, GPRF32>;
+def FTOU_v2i32  : UnaryOp<IL_OP_FTOU,          fp_to_uint, GPRV2I32, GPRV2F32>;
+def FTOU_v4i32  : UnaryOp<IL_OP_FTOU,          fp_to_uint, GPRV4I32, GPRV4F32>;
+// i32 ==> f32 unsigned
+def UTOF        : UnaryOp<IL_OP_UTOF,          uint_to_fp, GPRF32, GPRI32>;
+def UTOF_v2f32  : UnaryOp<IL_OP_UTOF,          uint_to_fp, GPRV2F32, GPRV2I32>;
+def UTOF_v4f32  : UnaryOp<IL_OP_UTOF,          uint_to_fp, GPRV4F32, GPRV4I32>;
+// Get upper 32 bits of f64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DHI         : OneInOneOut<IL_OP_MOV,  (outs GPRI32:$dst), 
+                (ins GPRF64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRI32:$dst, (IL_dcomphi GPRF64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DHI_v2f64   : OneInOneOut<IL_OP_MOV,  (outs GPRV2I32:$dst),
+                (ins GPRV2F64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRV2I32:$dst, (IL_dcomphi2 GPRV2F64:$src))]>;
+// Get lower 32 bits of f64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DLO         : OneInOneOut<IL_OP_MOV,  (outs GPRI32:$dst), 
+                (ins GPRF64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRI32:$dst, (IL_dcomplo GPRF64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DLO_v2f64   : OneInOneOut<IL_OP_MOV,  (outs GPRV2I32:$dst),
+                (ins GPRV2F64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRV2I32:$dst, (IL_dcomplo2 GPRV2F64:$src))]>;
+// Convert two 32-bit integers into an f64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DCREATE     : TwoInOneOut<IL_OP_I_ADD, (outs GPRF64:$dst),
+                (ins GPRI32:$src0, GPRI32:$src1),
+                !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"),
+                [(set GPRF64:$dst, (IL_dcreate GPRI32:$src0, GPRI32:$src1))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def DCREATE_v2f64 : TwoInOneOut<IL_OP_I_ADD, (outs GPRV2F64:$dst),
+                (ins GPRV2I32:$src0, GPRV2I32:$src1),
+                !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"),
+                [(set GPRV2F64:$dst,
+                    (IL_dcreate2 GPRV2I32:$src0, GPRV2I32:$src1))]>;
+// Get upper 32 bits of i64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LHI         : OneInOneOut<IL_OP_MOV,  (outs GPRI32:$dst), 
+                (ins GPRI64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRI32:$dst, (IL_lcomphi GPRI64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LHI_v2i64         : OneInOneOut<IL_OP_MOV,  (outs GPRV2I32:$dst),
+                (ins GPRV2I64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRV2I32:$dst, (IL_lcomphi2 GPRV2I64:$src))]>;
+// Get lower 32 bits of i64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LLO         : OneInOneOut<IL_OP_MOV,  (outs GPRI32:$dst), 
+                (ins GPRI64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRI32:$dst, (IL_lcomplo GPRI64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LLO_v2i64         : OneInOneOut<IL_OP_MOV,  (outs GPRV2I32:$dst), 
+                (ins GPRV2I64:$src),
+                !strconcat(IL_OP_MOV.Text, " $dst, $src"),
+                [(set GPRV2I32:$dst, (IL_lcomplo2 GPRV2I64:$src))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def HILO_BITOR_v4i16 : TwoInOneOut<IL_OP_I_OR, (outs GPRI32:$dst), 
+                (ins GPRI32:$src, GPRI32:$src2),
+                !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def HILO_BITOR_v2i32 : TwoInOneOut<IL_OP_I_OR, (outs GPRI32:$dst), 
+                (ins GPRI32:$src, GPRI32:$src2),
+                !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def HILO_BITOR_v2i64 : TwoInOneOut<IL_OP_I_OR, (outs GPRI64:$dst), 
+                (ins GPRI64:$src, GPRI64:$src2),
+                !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>;
+// Convert two 32-bit integers into an i64
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LCREATE     : TwoInOneOut<IL_OP_I_ADD, (outs GPRI64:$dst), 
+                (ins GPRI32:$src0, GPRI32:$src1),
+                !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"),
+                [(set GPRI64:$dst, (IL_lcreate GPRI32:$src0, GPRI32:$src1))]>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+def LCREATE_v2i64     : TwoInOneOut<IL_OP_I_ADD, (outs GPRV2I64:$dst), 
+                (ins GPRV2I32:$src0, GPRV2I32:$src1),
+                !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"),
+                [(set GPRV2I64:$dst, 
+                    (IL_lcreate2 GPRV2I32:$src0, GPRV2I32:$src1))]>;
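Editorial note (not part of the patch): LCREATE packs two 32-bit words into a 64-bit integer and DCREATE yields the same bit pattern reinterpreted as an f64 (LHI/LLO and DHI/DLO extract the halves back out). A sketch of the packing, assuming for illustration that `src0` is the low word and `src1` the high word (the operand order is an assumption, not confirmed by the patch):

```python
import struct

def lcreate(lo, hi):
    # pack two 32-bit words into one 64-bit integer; which operand is the
    # low word is an assumption made for illustration
    return ((hi & 0xFFFFFFFF) << 32) | (lo & 0xFFFFFFFF)

def dcreate(lo, hi):
    # reinterpret the packed 64-bit pattern as an IEEE-754 double
    return struct.unpack('<d', struct.pack('<Q', lcreate(lo, hi)))[0]

assert lcreate(0xDEADBEEF, 0x1) == 0x1DEADBEEF
assert dcreate(0, 0x3FF00000) == 1.0   # 0x3FF0000000000000 is 1.0
```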
+//===---------------------------------------------------------------------===//
+// Scalar ==> Vector conversion functions
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+defm VCREATE          : UnaryOpMCVec<IL_OP_MOV, IL_vbuild>;
+
+//===---------------------------------------------------------------------===//
+// Vector ==> Scalar conversion functions
+//===---------------------------------------------------------------------===//
+
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+defm VEXTRACT         : VectorExtract<IL_vextract>;
+
+//===---------------------------------------------------------------------===//
+// Vector ==> Vector conversion functions
+//===---------------------------------------------------------------------===//
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+defm VINSERT          : VectorInsert<IL_vinsert>;
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder
+defm VCONCAT      : VectorConcat<IL_vconcat>;
+
+//===---------------------------------------------------------------------===//
+// Bit conversion functions
+//===---------------------------------------------------------------------===//
+defm IL_ASCHAR  : BitConversion<IL_OP_MOV, GPRI8, IL_bitconv>;
+defm IL_ASSHORT : BitConversion<IL_OP_MOV, GPRI16, IL_bitconv>;
+defm IL_ASINT   : BitConversion<IL_OP_MOV, GPRI32, IL_bitconv>;
+defm IL_ASFLOAT : BitConversion<IL_OP_MOV, GPRF32, IL_bitconv>;
+defm IL_ASDOUBLE : BitConversion<IL_OP_MOV, GPRF64, IL_bitconv>;
+defm IL_ASLONG  : BitConversion<IL_OP_MOV, GPRI64, IL_bitconv>;
+defm IL_ASV2CHAR  : BitConversion<IL_OP_MOV, GPRV2I8, IL_bitconv>;
+defm IL_ASV2SHORT : BitConversion<IL_OP_MOV, GPRV2I16, IL_bitconv>;
+defm IL_ASV2INT   : BitConversion<IL_OP_MOV, GPRV2I32, IL_bitconv>;
+defm IL_ASV2FLOAT : BitConversion<IL_OP_MOV, GPRV2F32, IL_bitconv>;
+defm IL_ASV2DOUBLE : BitConversion<IL_OP_MOV, GPRV2F64, IL_bitconv>;
+defm IL_ASV2LONG  : BitConversion<IL_OP_MOV, GPRV2I64, IL_bitconv>;
+defm IL_ASV4CHAR  : BitConversion<IL_OP_MOV, GPRV4I8, IL_bitconv>;
+defm IL_ASV4SHORT : BitConversion<IL_OP_MOV, GPRV4I16, IL_bitconv>;
+defm IL_ASV4INT   : BitConversion<IL_OP_MOV, GPRV4I32, IL_bitconv>;
+defm IL_ASV4FLOAT : BitConversion<IL_OP_MOV, GPRV4F32, IL_bitconv>;
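Editorial note (not part of the patch): the IL_AS* definitions above are mov-based bitcasts; they reinterpret the register bits as another type without any numeric conversion. A sketch of the f32-to-i32 case:

```python
import struct

def as_int(f):
    # reinterpret an f32 bit pattern as i32 -- no value conversion,
    # analogous to the IL_ASINT bitcast above
    return struct.unpack('<i', struct.pack('<f', f))[0]

assert as_int(1.0) == 0x3F800000
assert as_int(-2.0) == -0x40000000   # sign bit set in the reinterpreted i32
```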
+
+let Predicates = [Has32BitPtr] in {
+  let isCodeGenOnly=1 in {
+    //===---------------------------------------------------------------------===//
+    // Store Memory Operations
+    //===---------------------------------------------------------------------===//
+    defm GLOBALTRUNCSTORE   : GTRUNCSTORE<"!global trunc store">;
+    defm GLOBALSTORE        : STORE<"!global store"         , global_store>;
+    defm LOCALTRUNCSTORE    : LTRUNCSTORE<"!local trunc store">;
+    defm LOCALSTORE         : STORE<"!local store"          , local_store>;
+    defm PRIVATETRUNCSTORE  : PTRUNCSTORE<"!private trunc store">;
+    defm PRIVATESTORE       : STORE<"!private store"        , private_store>;
+    defm REGIONTRUNCSTORE   : RTRUNCSTORE<"!region trunc store">;
+    defm REGIONSTORE        : STORE<"!region hw store"      , region_store>;
+
+    //===---------------------------------------------------------------------===//
+    // Load Memory Operations
+    //===---------------------------------------------------------------------===//
+    defm GLOBALLOAD         : LOAD<"!global load"            , global_load>;
+    defm GLOBALZEXTLOAD     : LOAD<"!global zext load"       , global_zext_load>;
+    defm GLOBALSEXTLOAD     : LOAD<"!global sext load"       , global_sext_load>;
+    defm GLOBALAEXTLOAD     : LOAD<"!global aext load"       , global_aext_load>;
+    defm PRIVATELOAD        : LOAD<"!private load"           , private_load>;
+    defm PRIVATEZEXTLOAD    : LOAD<"!private zext load"      , private_zext_load>;
+    defm PRIVATESEXTLOAD    : LOAD<"!private sext load"      , private_sext_load>;
+    defm PRIVATEAEXTLOAD    : LOAD<"!private aext load"      , private_aext_load>;
+    defm CPOOLLOAD          : LOAD<"!constant pool load"     , cp_load>;
+    defm CPOOLZEXTLOAD      : LOAD<"!constant pool zext load", cp_zext_load>;
+    defm CPOOLSEXTLOAD      : LOAD<"!constant pool sext load", cp_sext_load>;
+    defm CPOOLAEXTLOAD      : LOAD<"!constant pool aext load", cp_aext_load>;
+    defm CONSTANTLOAD       : LOAD<"!constant load"          , constant_load>;
+    defm CONSTANTZEXTLOAD   : LOAD<"!constant zext load"     , constant_zext_load>;
+    defm CONSTANTSEXTLOAD   : LOAD<"!constant sext load"     , constant_sext_load>;
+    defm CONSTANTAEXTLOAD   : LOAD<"!constant aext load"     , constant_aext_load>;
+    defm LOCALLOAD          : LOAD<"!local load"             , local_load>;
+    defm LOCALZEXTLOAD      : LOAD<"!local zext load"        , local_zext_load>;
+    defm LOCALSEXTLOAD      : LOAD<"!local sext load"        , local_sext_load>;
+    defm LOCALAEXTLOAD      : LOAD<"!local aext load"        , local_aext_load>;
+    defm REGIONLOAD         : LOAD<"!region load"            , region_load>;
+    defm REGIONZEXTLOAD     : LOAD<"!region zext load"       , region_zext_load>;
+    defm REGIONSEXTLOAD     : LOAD<"!region sext load"       , region_sext_load>;
+    defm REGIONAEXTLOAD     : LOAD<"!region aext load"       , region_aext_load>;
+  }
+
+
+  //===---------------------------------------------------------------------===//
+  // IO Expansion Load/Store Instructions
+  //===---------------------------------------------------------------------===//
+  let mayLoad = 1 in {
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHLOAD : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " $dst, x$id[$addy]"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def CBLOAD : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " $dst, cb$id[$addy]"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD_Y : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD_Z : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD_W : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC_v2i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC_v4i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_i8 : TwoInOneOut<IL_OP_LDS_LOAD_BYTE, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_BYTE.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_u8 : TwoInOneOut<IL_OP_LDS_LOAD_UBYTE, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_UBYTE.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_i16 : TwoInOneOut<IL_OP_LDS_LOAD_SHORT, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_SHORT.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_u16 : TwoInOneOut<IL_OP_LDS_LOAD_USHORT, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_USHORT.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_Y : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_Z : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD_W : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_i8 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(byte) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_i16 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(short) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+  }
+  let mayStore = 1 in {
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV4I32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy], $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_X : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].x___, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_Y : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy]._y__, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_Z : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].__z_, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_W : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].___w, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_XY : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV2I32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].xy__, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE_ZW : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV2I32:$data, i32imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].__zw, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE_Y : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE_Z : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE_W : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRI32:$mem),
+        (ins GPRI32:$addy, GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC_v2i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV2I32:$mem),
+        (ins GPRI32:$addy, GPRV2I32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC_v4i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV4I32:$mem),
+        (ins GPRI32:$addy, GPRV4I32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE_i8 : TwoInOneOut<IL_OP_LDS_STORE_BYTE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE_BYTE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE_i16 : TwoInOneOut<IL_OP_LDS_STORE_SHORT, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE_SHORT.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE_Y : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE_Z : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE_W : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_i8 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI8:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(byte) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_i16 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI16:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(short) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text,
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE_i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRI32:$mem),
+        (ins GPRI32:$addy, GPRI32:$src, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV2I32:$mem),
+        (ins GPRI32:$addy, GPRV2I32:$src, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV4I32:$mem),
+        (ins GPRI32:$addy, GPRV4I32:$src, i32imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+  }
+}
+let Predicates = [Has64BitPtr] in {
+  let isCodeGenOnly=1 in {
+    //===----------------------------------------------------------------------===//
+    // Store Memory Operations
+    //===----------------------------------------------------------------------===//
+    defm GLOBALTRUNCSTORE64 : GTRUNCSTORE64<"!global trunc store">;
+    defm GLOBALSTORE64      : STORE64<"!global store"         , global_store>;
+    defm LOCALTRUNCSTORE64  : LTRUNCSTORE64<"!local trunc store">;
+    defm LOCALSTORE64       : STORE64<"!local store"          , local_store>;
+    defm PRIVATETRUNCSTORE64 : PTRUNCSTORE64<"!private trunc store">;
+    defm PRIVATESTORE64     : STORE64<"!private store"        , private_store>;
+    defm REGIONTRUNCSTORE64 : RTRUNCSTORE64<"!region trunc store">;
+    defm REGIONSTORE64      : STORE64<"!region hw store"      , region_store>;
+
+
+    //===---------------------------------------------------------------------===//
+    // Load Memory Operations
+    //===---------------------------------------------------------------------===//
+    defm GLOBALLOAD64       : LOAD64<"!global load"            , global_load>;
+    defm GLOBALZEXTLOAD64   : LOAD64<"!global zext load"       , global_zext_load>;
+    defm GLOBALSEXTLOAD64   : LOAD64<"!global sext load"       , global_sext_load>;
+    defm GLOBALAEXTLOAD64   : LOAD64<"!global aext load"       , global_aext_load>;
+    defm PRIVATELOAD64      : LOAD64<"!private load"           , private_load>;
+    defm PRIVATEZEXTLOAD64  : LOAD64<"!private zext load"      , private_zext_load>;
+    defm PRIVATESEXTLOAD64  : LOAD64<"!private sext load"      , private_sext_load>;
+    defm PRIVATEAEXTLOAD64  : LOAD64<"!private aext load"      , private_aext_load>;
+    defm CPOOLLOAD64        : LOAD64<"!constant pool load"     , cp_load>;
+    defm CPOOLZEXTLOAD64    : LOAD64<"!constant pool zext load", cp_zext_load>;
+    defm CPOOLSEXTLOAD64    : LOAD64<"!constant pool sext load", cp_sext_load>;
+    defm CPOOLAEXTLOAD64    : LOAD64<"!constant pool aext load", cp_aext_load>;
+    defm CONSTANTLOAD64     : LOAD64<"!constant load"          , constant_load>;
+    defm CONSTANTZEXTLOAD64 : LOAD64<"!constant zext load"     , constant_zext_load>;
+    defm CONSTANTSEXTLOAD64 : LOAD64<"!constant sext load"     , constant_sext_load>;
+    defm CONSTANTAEXTLOAD64 : LOAD64<"!constant aext load"     , constant_aext_load>;
+    defm LOCALLOAD64        : LOAD64<"!local load"             , local_load>;
+    defm LOCALZEXTLOAD64    : LOAD64<"!local zext load"        , local_zext_load>;
+    defm LOCALSEXTLOAD64    : LOAD64<"!local sext load"        , local_sext_load>;
+    defm LOCALAEXTLOAD64    : LOAD64<"!local aext load"        , local_aext_load>;
+    defm REGIONLOAD64       : LOAD64<"!region load"            , region_load>;
+    defm REGIONZEXTLOAD64   : LOAD64<"!region zext load"       , region_zext_load>;
+    defm REGIONSEXTLOAD64   : LOAD64<"!region sext load"       , region_sext_load>;
+    defm REGIONAEXTLOAD64   : LOAD64<"!region aext load"       , region_aext_load>;
+  }
+
+
+  //===---------------------------------------------------------------------===//
+  // IO Expansion Load/Store Instructions
+  //===---------------------------------------------------------------------===//
+  let mayLoad = 1 in {
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHLOAD64 : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " $dst, x$id[$addy]"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def CBLOAD64 : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " $dst, cb$id[$addy]"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD64 : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD64_Y : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD64_Z : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSLOAD64_W : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC64 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC64_v2i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOADVEC64_v4i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64 : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_i8 : TwoInOneOut<IL_OP_LDS_LOAD_BYTE, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_BYTE.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_u8 : TwoInOneOut<IL_OP_LDS_LOAD_UBYTE, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_UBYTE.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_i16 : TwoInOneOut<IL_OP_LDS_LOAD_SHORT, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_SHORT.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_u16 : TwoInOneOut<IL_OP_LDS_LOAD_USHORT, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD_USHORT.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_Y : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_Z : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSLOAD64_W : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_i8 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(byte) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_i16 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(short) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENALOAD64_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD64_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOAD64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED64_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWLOADCACHED64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst),
+        (ins GPRI32:$addy, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>;
+  }
+  let mayStore = 1 in {
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64 : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV4I32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy], $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_X : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].x___, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_Y : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy]._y__, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_Z : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].__z_, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_W : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRI32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].___w, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_XY : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV2I32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].xy__, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def SCRATCHSTORE64_ZW : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy),
+        (ins GPRV2I32:$data, i64imm:$id),
+        !strconcat(IL_OP_MOV.Text, " x$id[$addy].__zw, $data"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE64 : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE64_Y : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE64_Z : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def GDSSTORE64_W : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC64 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRI32:$mem),
+        (ins GPRI32:$addy, GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC64_v2i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV2I32:$mem),
+        (ins GPRI32:$addy, GPRV2I32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTOREVEC64_v4i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV4I32:$mem),
+        (ins GPRI32:$addy, GPRV4I32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64 : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64_i8 : TwoInOneOut<IL_OP_LDS_STORE_BYTE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE_BYTE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64_i16 : TwoInOneOut<IL_OP_LDS_STORE_SHORT, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE_SHORT.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64_Y : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64_Z : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def LDSSTORE64_W : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_i8 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI8:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(byte) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_i16 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI16:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(short) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVARENASTORE64_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy),
+        (ins GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_ARENA_UAV_STORE.Text, 
+            "_id($id)_size(dword) $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE64_i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRI32:$mem),
+        (ins GPRI32:$addy, GPRI32:$src, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV2I32:$mem),
+        (ins GPRI32:$addy, GPRV2I32:$src, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+    // This opcode has custom swizzle patterns for some of the arguments.
+    def UAVRAWSTORE64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV4I32:$mem),
+        (ins GPRI32:$addy, GPRV4I32:$src, i64imm:$id),
+        !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>;
+  }
+}
+//===---------------------------------------------------------------------===//
+// Custom inserter for branches and returns; this will eventually become a
+// separate pass.
+//===---------------------------------------------------------------------===//
+let isTerminator = 1 in {
+  def BRANCH : ILFormat<IL_PSEUDO_INST, (outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond>;
+}
+//===---------------------------------------------------------------------===//
+// return instructions
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def RETURN          : ILFormat<IL_OP_RET,(outs), (ins variable_ops),
+      IL_OP_RET.Text, [(IL_retflag)]>;
+}
+//===---------------------------------------------------------------------===//
+// Lower and raise the stack by x amount
+//===---------------------------------------------------------------------===//
+def ADJCALLSTACKDOWN : ILFormat<IL_PSEUDO_INST, (outs), (ins i32imm:$amt),
+    "; begin of call sequence $amt",
+    [(IL_callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : ILFormat<IL_PSEUDO_INST, (outs), (ins i32imm:$amt1,
+    i32imm:$amt2),
+    "; end of call sequence $amt1 $amt2",
+    [(IL_callseq_end timm:$amt1, timm:$amt2)]>;
+
+//===---------------------------------------------------------------------===//
+// Handle a function call
+//===---------------------------------------------------------------------===//
+let isCall = 1,
+    Defs = [
+    R110, R111,
+    R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124,
+    R125, R126, R127,
+    R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140,
+    R141, R142, R143,
+    R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156,
+    R157, R158, R159,
+    R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172,
+    R173, R174, R175,
+    R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188,
+    R189, R190, R191,
+    R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204,
+    R205, R206, R207,
+    R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220,
+    R221, R222, R223,
+    R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236,
+    R237, R238, R239,
+    R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252,
+    R253, R254, R255
+    ],
+    Uses = [
+    R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15,
+    R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+    R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47,
+    R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63,
+    R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79,
+    R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95,
+    R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109
+    ]
+    in {
+      def CALL : UnaryOpNoRet<IL_OP_CALL, (outs),
+          (ins calltarget:$dst, variable_ops),
+          !strconcat(IL_OP_CALL.Text, " $dst"), []>;
+    }
+
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+let isTerminator = 1 in {
+  def SWITCH      : ILFormat<IL_OP_SWITCH, (outs), (ins GPRI32:$src),
+      !strconcat(IL_OP_SWITCH.Text, " $src"), []>;
+  def CASE        : ILFormat<IL_OP_CASE, (outs), (ins GPRI32:$src),
+      !strconcat(IL_OP_CASE.Text, " $src"), []>;
+  def BREAK       : ILFormat<IL_OP_BREAK, (outs), (ins),
+      IL_OP_BREAK.Text, []>;
+  def CONTINUE    : ILFormat<IL_OP_CONTINUE, (outs), (ins),
+      IL_OP_CONTINUE.Text, []>;
+  def DEFAULT     : ILFormat<IL_OP_DEFAULT, (outs), (ins),
+      IL_OP_DEFAULT.Text, []>;
+  def ELSE        : ILFormat<IL_OP_ELSE, (outs), (ins),
+      IL_OP_ELSE.Text, []>;
+  def ENDSWITCH   : ILFormat<IL_OP_ENDSWITCH, (outs), (ins),
+      IL_OP_ENDSWITCH.Text, []>;
+  def ENDMAIN     : ILFormat<IL_OP_ENDMAIN, (outs), (ins),
+      IL_OP_ENDMAIN.Text, []>;
+  def END         : ILFormat<IL_OP_END, (outs), (ins),
+      IL_OP_END.Text, []>;
+  def ENDFUNC     : ILFormat<IL_OP_ENDFUNC, (outs), (ins),
+      IL_OP_ENDFUNC.Text, []>;
+  def ENDIF       : ILFormat<IL_OP_ENDIF, (outs), (ins),
+      IL_OP_ENDIF.Text, []>;
+  def WHILELOOP   : ILFormat<IL_OP_WHILE, (outs), (ins),
+      IL_OP_WHILE.Text, []>;
+  def ENDLOOP     : ILFormat<IL_OP_ENDLOOP, (outs), (ins),
+      IL_OP_ENDLOOP.Text, []>;
+  def FUNC        : ILFormat<IL_OP_FUNC, (outs), (ins),
+      IL_OP_FUNC.Text, []>;
+  def RETDYN      : ILFormat<IL_OP_RET_DYN, (outs), (ins),
+      IL_OP_RET_DYN.Text, []>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm IF_LOGICALNZ  : BranchInstr<IL_OP_IF_LOGICALNZ>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm IF_LOGICALZ   : BranchInstr<IL_OP_IF_LOGICALZ>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm BREAK_LOGICALNZ : BranchInstr<IL_OP_BREAK_LOGICALNZ>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm BREAK_LOGICALZ : BranchInstr<IL_OP_BREAK_LOGICALZ>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm CONTINUE_LOGICALNZ : BranchInstr<IL_OP_CONTINUE_LOGICALNZ>;
+  // This opcode has a custom swizzle pattern encoded in the Swizzle Encoder.
+  defm CONTINUE_LOGICALZ : BranchInstr<IL_OP_CONTINUE_LOGICALZ>;
+  defm IFC         : BranchInstr2<IL_OP_IFC>;
+  defm BREAKC      : BranchInstr2<IL_OP_BREAKC>;
+  defm CONTINUEC   : BranchInstr2<IL_OP_CONTINUEC>;
+}
+let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def TRAP : ILFormat<IL_OP_NOP, (outs), (ins),
+      IL_OP_NOP.Text, [(trap)]>;
+}
+
+//===---------------------------------------------------------------------===//
+//----------------- Work Item Functions - OpenCL 6.11.1 ---------------------//
+//===---------------------------------------------------------------------===//
+let isCall = 1, isAsCheapAsAMove = 1 in {
+  def GET_WORK_DIM : ILFormat<IL_OP_MOV, (outs GPRI32:$dst), (ins),
+      !strconcat(IL_OP_MOV.Text, " $dst, cb0[0].w"),
+      [(set GPRI32:$dst, (int_AMDIL_get_work_dim))]>;
+
+  def GET_GLOBAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1021.xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_global_id))]>;
+
+  def GET_LOCAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_local_id))]>;
+
+  def GET_GROUP_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1023.xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_group_id))]>;
+
+  def GET_GLOBAL_SIZE : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[0].xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_global_size))]>;
+
+  def GET_LOCAL_SIZE : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[1].xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_local_size))]>;
+
+  def GET_NUM_GROUPS : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[2].xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_num_groups))]>;
+
+  def GET_GLOBAL_OFFSET : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+      (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[9].xyz0"),
+      [(set GPRV4I32:$dst, (int_AMDIL_get_global_offset))]>;
+
+  let Predicates = [Has64BitPtr] in {
+    def GET_PRINTF_OFFSET_i64: ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
+        (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].zw"),
+        [(set GPRI32:$dst, (int_AMDIL_get_printf_offset))]>;
+    def GET_PRINTF_SIZE_i64 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
+        (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[9].x0"),
+        [(set GPRI32:$dst, (int_AMDIL_get_printf_size))]>;
+  }
+  let Predicates = [Has32BitPtr] in {
+    def GET_PRINTF_OFFSET_i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
+        (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].y0"),
+        [(set GPRI32:$dst, (int_AMDIL_get_printf_offset))]>;
+    def GET_PRINTF_SIZE_i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
+        (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].z0"),
+        [(set GPRI32:$dst, (int_AMDIL_get_printf_size))]>;
+  }
+}
+//===---------------------------------------------------------------------===//
+//------------- Synchronization Functions - OpenCL 6.11.9 -------------------//
+//===---------------------------------------------------------------------===//
+let isCall = 1 in {
+
+  def FENCE : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag),
+      "fence_lds_memory_gds",
+      [(int_AMDIL_fence GPRI32:$flag)]>;
+
+  def FENCE_LOCAL : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag),
+      "fence_lds",
+      [(int_AMDIL_fence_local GPRI32:$flag)]>;
+
+  def FENCE_GLOBAL : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag),
+      "fence_memory",
+      [(int_AMDIL_fence_global GPRI32:$flag)]>;
+
+  def FENCE_REGION : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag),
+      "fence_gds",
+      [(int_AMDIL_fence_region GPRI32:$flag)]>;
+
+  def FENCE_READ_ONLY : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_lds_gds_memory_mem_read_only",
+      [(int_AMDIL_fence_read_only GPRI32:$flag)]>;
+
+  def FENCE_READ_ONLY_LOCAL : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_lds_mem_read_only",
+      [(int_AMDIL_fence_read_only_local GPRI32:$flag)]>;
+
+  def FENCE_READ_ONLY_GLOBAL : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_mem_read_only",
+      [(int_AMDIL_fence_read_only_global GPRI32:$flag)]>;
+
+  def FENCE_READ_ONLY_REGION : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_gds_mem_read_only",
+      [(int_AMDIL_fence_read_only_region GPRI32:$flag)]>;
+
+  def FENCE_WRITE_ONLY : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_lds_gds_memory_mem_write_only",
+      [(int_AMDIL_fence_write_only GPRI32:$flag)]>;
+
+  def FENCE_WRITE_ONLY_LOCAL : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_lds_mem_write_only",
+      [(int_AMDIL_fence_write_only_local GPRI32:$flag)]>;
+
+  def FENCE_WRITE_ONLY_GLOBAL : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_mem_write_only",
+      [(int_AMDIL_fence_write_only_global GPRI32:$flag)]>;
+
+  def FENCE_WRITE_ONLY_REGION : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs),
+      (ins GPRI32:$flag),
+      "fence_gds_mem_write_only",
+      [(int_AMDIL_fence_write_only_region GPRI32:$flag)]>;
+}
+let isReturn = 1 in {
+  def EARLY_EXIT : UnaryOpNoRet<IL_OP_RET_LOGICALNZ, (outs),
+      (ins GPRI32:$flag),
+      !strconcat(IL_OP_RET_LOGICALNZ.Text, " $flag"),
+      [(int_AMDIL_early_exit GPRI32:$flag)]>;
+}
+def MEDIA_UNPACK_0 : OneInOneOut<IL_OP_UNPACK_0, (outs GPRV4F32:$dst),
+    (ins GPRV4I32:$src),
+    !strconcat(IL_OP_UNPACK_0.Text, " $dst, $src"),
+    [(set GPRV4F32:$dst,
+        (v4f32 (int_AMDIL_media_unpack_byte_0 GPRV4I32:$src)))]>;
+def MEDIA_UNPACK_1 : OneInOneOut<IL_OP_UNPACK_1, (outs GPRV4F32:$dst),
+    (ins GPRV4I32:$src),
+    !strconcat(IL_OP_UNPACK_1.Text, " $dst, $src"),
+    [(set GPRV4F32:$dst,
+        (v4f32 (int_AMDIL_media_unpack_byte_1 GPRV4I32:$src)))]>;
+def MEDIA_UNPACK_2 : OneInOneOut<IL_OP_UNPACK_2, (outs GPRV4F32:$dst),
+    (ins GPRV4I32:$src),
+    !strconcat(IL_OP_UNPACK_2.Text, " $dst, $src"),
+    [(set GPRV4F32:$dst,
+        (v4f32 (int_AMDIL_media_unpack_byte_2 GPRV4I32:$src)))]>;
+def MEDIA_UNPACK_3 : OneInOneOut<IL_OP_UNPACK_3, (outs GPRV4F32:$dst),
+    (ins GPRV4I32:$src),
+    !strconcat(IL_OP_UNPACK_3.Text, " $dst, $src"),
+    [(set GPRV4F32:$dst,
+        (v4f32 (int_AMDIL_media_unpack_byte_3 GPRV4I32:$src)))]>;
+let Predicates = [Has32BitPtr] in {
+// All of the image functions
+def IMAGE1D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1DA_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_array_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2DA_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_array_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE3D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image3d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1DA_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_array_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2DA_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_array_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE3D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image3d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_info0 ADDR:$ptr))]>;
+def IMAGE1D_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_info1 ADDR:$ptr))]>;
+def IMAGE1DA_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info0 ADDR:$ptr))]>;
+def IMAGE1DA_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info1 ADDR:$ptr))]>;
+def IMAGE2D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_info0 ADDR:$ptr))]>;
+def IMAGE2D_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_info1 ADDR:$ptr))]>;
+def IMAGE2DA_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info0 ADDR:$ptr))]>;
+def IMAGE2DA_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info1 ADDR:$ptr))]>;
+def IMAGE3D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image3d_info0 ADDR:$ptr))]>;
+def IMAGE3D_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image3d_info1 ADDR:$ptr))]>;
+def IMAGE1D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image1d_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE1DA_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image1d_array_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE2D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image2d_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE2DA_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image2d_array_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE3D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI32:$ptr, GPRV4I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image3d_write ADDR:$ptr, GPRV4I32:$addy, GPRV4I32:$data)]>;
+let hasSideEffects = 1, isNotDuplicable = 1 in {
+  // All of the noret atomic functions
+  def ATOM_G_ADD_NORET     : BinAtomNoRet<IL_OP_UAV_ADD,
+      "_id($id)", atom_g_add_noret>;
+  def ATOM_G_AND_NORET     : BinAtomNoRet<IL_OP_UAV_AND,
+      "_id($id)", atom_g_and_noret>;
+  def ATOM_G_MAX_NORET     : BinAtomNoRet<IL_OP_UAV_MAX,
+      "_id($id)", atom_g_max_noret>;
+  def ATOM_G_MIN_NORET     : BinAtomNoRet<IL_OP_UAV_MIN,
+      "_id($id)", atom_g_min_noret>;
+  def ATOM_G_UMAX_NORET    : BinAtomNoRet<IL_OP_UAV_UMAX,
+      "_id($id)", atom_g_umax_noret>;
+  def ATOM_G_UMIN_NORET    : BinAtomNoRet<IL_OP_UAV_UMIN,
+      "_id($id)", atom_g_umin_noret>;
+  def ATOM_G_OR_NORET      : BinAtomNoRet<IL_OP_UAV_OR,
+      "_id($id)", atom_g_or_noret>;
+  def ATOM_G_RSUB_NORET    : BinAtomNoRet<IL_OP_UAV_RSUB,
+      "_id($id)", atom_g_rsub_noret>;
+  def ATOM_G_SUB_NORET     : BinAtomNoRet<IL_OP_UAV_SUB,
+      "_id($id)", atom_g_sub_noret>;
+  def ATOM_G_XOR_NORET     : BinAtomNoRet<IL_OP_UAV_XOR,
+      "_id($id)", atom_g_xor_noret>;
+  def ATOM_G_INC_NORET     : BinAtomNoRet<IL_OP_UAV_INC,
+      "_id($id)", atom_g_inc_noret>;
+  def ATOM_G_DEC_NORET     : BinAtomNoRet<IL_OP_UAV_DEC,
+      "_id($id)", atom_g_dec_noret>;
+  def ATOM_G_CMPXCHG_NORET    : CmpXChgNoRet<IL_OP_UAV_CMP,
+      "_id($id)", atom_g_cmpxchg_noret>;
+  def ATOM_A_ADD_NORET     : BinAtomNoRet<IL_OP_UAV_ADD,
+      "_id($id)_arena", atom_g_add_noret>;
+  def ATOM_A_AND_NORET     : BinAtomNoRet<IL_OP_UAV_AND,
+      "_id($id)_arena", atom_g_and_noret>;
+  def ATOM_A_MAX_NORET     : BinAtomNoRet<IL_OP_UAV_MAX,
+      "_id($id)_arena", atom_g_max_noret>;
+  def ATOM_A_MIN_NORET     : BinAtomNoRet<IL_OP_UAV_MIN,
+      "_id($id)_arena", atom_g_min_noret>;
+  def ATOM_A_UMAX_NORET    : BinAtomNoRet<IL_OP_UAV_UMAX,
+      "_id($id)_arena", atom_g_umax_noret>;
+  def ATOM_A_UMIN_NORET    : BinAtomNoRet<IL_OP_UAV_UMIN,
+      "_id($id)_arena", atom_g_umin_noret>;
+  def ATOM_A_OR_NORET      : BinAtomNoRet<IL_OP_UAV_OR,
+      "_id($id)_arena", atom_g_or_noret>;
+  def ATOM_A_RSUB_NORET    : BinAtomNoRet<IL_OP_UAV_RSUB,
+      "_id($id)_arena", atom_g_rsub_noret>;
+  def ATOM_A_SUB_NORET     : BinAtomNoRet<IL_OP_UAV_SUB,
+      "_id($id)_arena", atom_g_sub_noret>;
+  def ATOM_A_XOR_NORET     : BinAtomNoRet<IL_OP_UAV_XOR,
+      "_id($id)_arena", atom_g_xor_noret>;
+  def ATOM_A_INC_NORET     : BinAtomNoRet<IL_OP_UAV_INC,
+      "_id($id)_arena", atom_g_inc_noret>;
+  def ATOM_A_DEC_NORET     : BinAtomNoRet<IL_OP_UAV_DEC,
+      "_id($id)_arena", atom_g_dec_noret>;
+  def ATOM_A_CMPXCHG_NORET    : CmpXChgNoRet<IL_OP_UAV_CMP,
+      "_id($id)_arena", atom_g_cmpxchg_noret>;
+  def ATOM_L_ADD_NORET     : BinAtomNoRet<IL_OP_LDS_ADD,
+      "_resource($id)", atom_l_add_noret>;
+  def ATOM_L_AND_NORET     : BinAtomNoRet<IL_OP_LDS_AND,
+      "_resource($id)", atom_l_and_noret>;
+  def ATOM_L_MAX_NORET     : BinAtomNoRet<IL_OP_LDS_MAX,
+      "_resource($id)", atom_l_max_noret>;
+  def ATOM_L_MIN_NORET     : BinAtomNoRet<IL_OP_LDS_MIN,
+      "_resource($id)", atom_l_min_noret>;
+  def ATOM_L_UMAX_NORET    : BinAtomNoRet<IL_OP_LDS_UMAX,
+      "_resource($id)", atom_l_umax_noret>;
+  def ATOM_L_UMIN_NORET    : BinAtomNoRet<IL_OP_LDS_UMIN,
+      "_resource($id)", atom_l_umin_noret>;
+  def ATOM_L_MSKOR_NORET   : TriAtomNoRet<IL_OP_LDS_MSKOR,
+      "_resource($id)", atom_l_mskor_noret>;
+  def ATOM_L_OR_NORET      : BinAtomNoRet<IL_OP_LDS_OR,
+      "_resource($id)", atom_l_or_noret>;
+  def ATOM_L_RSUB_NORET    : BinAtomNoRet<IL_OP_LDS_RSUB,
+      "_resource($id)", atom_l_rsub_noret>;
+  def ATOM_L_SUB_NORET     : BinAtomNoRet<IL_OP_LDS_SUB,
+      "_resource($id)", atom_l_sub_noret>;
+  def ATOM_L_XOR_NORET     : BinAtomNoRet<IL_OP_LDS_XOR,
+      "_resource($id)", atom_l_xor_noret>;
+  def ATOM_L_INC_NORET     : BinAtomNoRet<IL_OP_LDS_INC,
+      "_resource($id)", atom_l_inc_noret>;
+  def ATOM_L_DEC_NORET     : BinAtomNoRet<IL_OP_LDS_DEC,
+      "_resource($id)", atom_l_dec_noret>;
+  def ATOM_L_CMPXCHG_NORET    : TriAtomNoRet<IL_OP_LDS_CMP,
+      "_resource($id)", atom_l_cmpxchg_noret>;
+  def ATOM_R_ADD_NORET     : BinAtomNoRet<IL_OP_GDS_ADD,
+      "_resource($id)", atom_r_add_noret>;
+  def ATOM_R_AND_NORET     : BinAtomNoRet<IL_OP_GDS_AND,
+      "_resource($id)", atom_r_and_noret>;
+  def ATOM_R_MAX_NORET     : BinAtomNoRet<IL_OP_GDS_MAX,
+      "_resource($id)", atom_r_max_noret>;
+  def ATOM_R_MIN_NORET     : BinAtomNoRet<IL_OP_GDS_MIN,
+      "_resource($id)", atom_r_min_noret>;
+  def ATOM_R_UMAX_NORET    : BinAtomNoRet<IL_OP_GDS_UMAX,
+      "_resource($id)", atom_r_umax_noret>;
+  def ATOM_R_UMIN_NORET    : BinAtomNoRet<IL_OP_GDS_UMIN,
+      "_resource($id)", atom_r_umin_noret>;
+  def ATOM_R_MSKOR_NORET   : TriAtomNoRet<IL_OP_GDS_MSKOR,
+      "_resource($id)", atom_r_mskor_noret>;
+  def ATOM_R_OR_NORET      : BinAtomNoRet<IL_OP_GDS_OR,
+      "_resource($id)", atom_r_or_noret>;
+  def ATOM_R_RSUB_NORET    : BinAtomNoRet<IL_OP_GDS_RSUB,
+      "_resource($id)", atom_r_rsub_noret>;
+  def ATOM_R_SUB_NORET     : BinAtomNoRet<IL_OP_GDS_SUB,
+      "_resource($id)", atom_r_sub_noret>;
+  def ATOM_R_XOR_NORET     : BinAtomNoRet<IL_OP_GDS_XOR,
+      "_resource($id)", atom_r_xor_noret>;
+  def ATOM_R_INC_NORET     : BinAtomNoRet<IL_OP_GDS_INC,
+      "_resource($id)", atom_r_inc_noret>;
+  def ATOM_R_DEC_NORET     : BinAtomNoRet<IL_OP_GDS_DEC,
+      "_resource($id)", atom_r_dec_noret>;
+  def ATOM_R_CMPXCHG_NORET    : CmpXChgNoRet<IL_OP_GDS_CMP,
+      "_resource($id)", atom_r_cmpxchg_noret>;
+  def APPEND_ALLOC_NORET : AppendNoRet<IL_OP_APPEND_BUF_ALLOC,
+      "_id($id)", append_alloc_noret>;
+  def APPEND_CONSUME_NORET : AppendNoRet<IL_OP_APPEND_BUF_CONSUME,
+      "_id($id)", append_consume_noret>;
+  // All of the atomic functions that return
+  def ATOM_G_ADD     : BinAtom<IL_OP_UAV_READ_ADD, 
+      "_id($id)", atom_g_add>;
+  def ATOM_G_AND     : BinAtom<IL_OP_UAV_READ_AND, 
+      "_id($id)", atom_g_and>;
+  def ATOM_G_MAX     : BinAtom<IL_OP_UAV_READ_MAX, 
+      "_id($id)", atom_g_max>;
+  def ATOM_G_MIN     : BinAtom<IL_OP_UAV_READ_MIN, 
+      "_id($id)", atom_g_min>;
+  def ATOM_G_UMAX    : BinAtom<IL_OP_UAV_READ_UMAX, 
+      "_id($id)", atom_g_umax>;
+  def ATOM_G_UMIN    : BinAtom<IL_OP_UAV_READ_UMIN, 
+      "_id($id)", atom_g_umin>;
+  def ATOM_G_OR      : BinAtom<IL_OP_UAV_READ_OR, 
+      "_id($id)", atom_g_or>;
+  def ATOM_G_RSUB    : BinAtom<IL_OP_UAV_READ_RSUB, 
+      "_id($id)", atom_g_rsub>;
+  def ATOM_G_SUB     : BinAtom<IL_OP_UAV_READ_SUB, 
+      "_id($id)", atom_g_sub>;
+  def ATOM_G_XOR     : BinAtom<IL_OP_UAV_READ_XOR, 
+      "_id($id)", atom_g_xor>;
+  def ATOM_G_INC     : BinAtom<IL_OP_UAV_READ_INC, 
+      "_id($id)", atom_g_inc>;
+  def ATOM_G_DEC     : BinAtom<IL_OP_UAV_READ_DEC, 
+      "_id($id)", atom_g_dec>;
+  def ATOM_G_XCHG    : BinAtom<IL_OP_UAV_READ_XCHG, 
+      "_id($id)", atom_g_xchg>;
+  def ATOM_G_CMPXCHG : CmpXChg<IL_OP_UAV_READ_CMPXCHG, 
+      "_id($id)", atom_g_cmpxchg>;
+  // Arena atomic accesses
+  def ATOM_A_ADD     : BinAtom<IL_OP_UAV_READ_ADD, 
+      "_id($id)_arena", atom_g_add>;
+  def ATOM_A_AND     : BinAtom<IL_OP_UAV_READ_AND, 
+      "_id($id)_arena", atom_g_and>;
+  def ATOM_A_MAX     : BinAtom<IL_OP_UAV_READ_MAX, 
+      "_id($id)_arena", atom_g_max>;
+  def ATOM_A_MIN     : BinAtom<IL_OP_UAV_READ_MIN, 
+      "_id($id)_arena", atom_g_min>;
+  def ATOM_A_UMAX    : BinAtom<IL_OP_UAV_READ_UMAX, 
+      "_id($id)_arena", atom_g_umax>;
+  def ATOM_A_UMIN    : BinAtom<IL_OP_UAV_READ_UMIN, 
+      "_id($id)_arena", atom_g_umin>;
+  def ATOM_A_OR      : BinAtom<IL_OP_UAV_READ_OR, 
+      "_id($id)_arena", atom_g_or>;
+  def ATOM_A_RSUB    : BinAtom<IL_OP_UAV_READ_RSUB, 
+      "_id($id)_arena", atom_g_rsub>;
+  def ATOM_A_SUB     : BinAtom<IL_OP_UAV_READ_SUB, 
+      "_id($id)_arena", atom_g_sub>;
+  def ATOM_A_XOR     : BinAtom<IL_OP_UAV_READ_XOR, 
+      "_id($id)_arena", atom_g_xor>;
+  def ATOM_A_INC     : BinAtom<IL_OP_UAV_READ_INC, 
+      "_id($id)_arena", atom_g_inc>;
+  def ATOM_A_DEC     : BinAtom<IL_OP_UAV_READ_DEC, 
+      "_id($id)_arena", atom_g_dec>;
+  def ATOM_A_XCHG    : BinAtom<IL_OP_UAV_READ_XCHG, 
+      "_id($id)_arena", atom_g_xchg>;
+  def ATOM_A_CMPXCHG : CmpXChg<IL_OP_UAV_READ_CMPXCHG, 
+      "_id($id)_arena", atom_g_cmpxchg>;
+  def ATOM_L_ADD     : BinAtom<IL_OP_LDS_READ_ADD, 
+      "_resource($id)", atom_l_add>;
+  def ATOM_L_AND     : BinAtom<IL_OP_LDS_READ_AND, 
+      "_resource($id)", atom_l_and>;
+  def ATOM_L_MAX     : BinAtom<IL_OP_LDS_READ_MAX, 
+      "_resource($id)", atom_l_max>;
+  def ATOM_L_MIN     : BinAtom<IL_OP_LDS_READ_MIN, 
+      "_resource($id)", atom_l_min>;
+  def ATOM_L_UMAX    : BinAtom<IL_OP_LDS_READ_UMAX, 
+      "_resource($id)", atom_l_umax>;
+  def ATOM_L_UMIN    : BinAtom<IL_OP_LDS_READ_UMIN, 
+      "_resource($id)", atom_l_umin>;
+  def ATOM_L_OR      : BinAtom<IL_OP_LDS_READ_OR, 
+      "_resource($id)", atom_l_or>;
+  def ATOM_L_MSKOR   : TriAtom<IL_OP_LDS_READ_MSKOR, 
+      "_resource($id)", atom_l_mskor>;
+  def ATOM_L_RSUB    : BinAtom<IL_OP_LDS_READ_RSUB, 
+      "_resource($id)", atom_l_rsub>;
+  def ATOM_L_SUB     : BinAtom<IL_OP_LDS_READ_SUB, 
+      "_resource($id)", atom_l_sub>;
+  def ATOM_L_XOR     : BinAtom<IL_OP_LDS_READ_XOR, 
+      "_resource($id)", atom_l_xor>;
+  def ATOM_L_INC     : BinAtom<IL_OP_LDS_READ_INC, 
+      "_resource($id)", atom_l_inc>;
+  def ATOM_L_DEC     : BinAtom<IL_OP_LDS_READ_DEC, 
+      "_resource($id)", atom_l_dec>;
+  def ATOM_L_XCHG    : BinAtom<IL_OP_LDS_READ_XCHG, 
+      "_resource($id)", atom_l_xchg>;
+  def ATOM_L_CMPXCHG : TriAtom<IL_OP_LDS_READ_CMPXCHG, 
+      "_resource($id)", atom_l_cmpxchg>;
+  def ATOM_R_ADD     : BinAtom<IL_OP_GDS_READ_ADD, 
+      "_resource($id)", atom_r_add>;
+  def ATOM_R_AND     : BinAtom<IL_OP_GDS_READ_AND, 
+      "_resource($id)", atom_r_and>;
+  def ATOM_R_MAX     : BinAtom<IL_OP_GDS_READ_MAX, 
+      "_resource($id)", atom_r_max>;
+  def ATOM_R_MIN     : BinAtom<IL_OP_GDS_READ_MIN, 
+      "_resource($id)", atom_r_min>;
+  def ATOM_R_UMAX    : BinAtom<IL_OP_GDS_READ_UMAX, 
+      "_resource($id)", atom_r_umax>;
+  def ATOM_R_UMIN    : BinAtom<IL_OP_GDS_READ_UMIN, 
+      "_resource($id)", atom_r_umin>;
+  def ATOM_R_OR      : BinAtom<IL_OP_GDS_READ_OR, 
+      "_resource($id)", atom_r_or>;
+  def ATOM_R_MSKOR   : TriAtom<IL_OP_GDS_READ_MSKOR, 
+      "_resource($id)", atom_r_mskor>;
+  def ATOM_R_RSUB    : BinAtom<IL_OP_GDS_READ_RSUB, 
+      "_resource($id)", atom_r_rsub>;
+  def ATOM_R_SUB     : BinAtom<IL_OP_GDS_READ_SUB, 
+      "_resource($id)", atom_r_sub>;
+  def ATOM_R_XOR     : BinAtom<IL_OP_GDS_READ_XOR, 
+      "_resource($id)", atom_r_xor>;
+  def ATOM_R_INC     : BinAtom<IL_OP_GDS_READ_INC, 
+      "_resource($id)", atom_r_inc>;
+  def ATOM_R_DEC     : BinAtom<IL_OP_GDS_READ_DEC, 
+      "_resource($id)", atom_r_dec>;
+  def ATOM_R_XCHG    : BinAtom<IL_OP_GDS_READ_XCHG, 
+      "_resource($id)", atom_r_xchg>;
+  def ATOM_R_CMPXCHG : CmpXChg<IL_OP_GDS_READ_CMPXCHG, 
+      "_resource($id)", atom_r_cmpxchg>;
+  def APPEND_ALLOC : Append<IL_OP_APPEND_BUF_ALLOC,
+      "_id($id)", append_alloc>;
+  def APPEND_CONSUME : Append<IL_OP_APPEND_BUF_CONSUME,
+      "_id($id)", append_consume>;
+}
+}
+let Predicates = [Has64BitPtr] in {
+// All of the image functions
+def IMAGE1D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1DA64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_array_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2DA64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_array_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE3D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image3d_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1DA64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image1d_array_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE2DA64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image2d_array_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE3D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy),
+    !strconcat(IL_OP_SAMPLE.Text, 
+        "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"),
+    [(set GPRV4I32:$dst,
+        (int_AMDIL_image3d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>;
+def IMAGE1D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_info0 ADDR64:$ptr))]>;
+def IMAGE1D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_info1 ADDR64:$ptr))]>;
+def IMAGE1DA64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info0 ADDR64:$ptr))]>;
+def IMAGE1DA64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info1 ADDR64:$ptr))]>;
+def IMAGE2DA64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info0 ADDR64:$ptr))]>;
+def IMAGE2DA64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info1 ADDR64:$ptr))]>;
+def IMAGE2D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_info0 ADDR64:$ptr))]>;
+def IMAGE2D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image2d_info1 ADDR64:$ptr))]>;
+def IMAGE3D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image3d_info0 ADDR64:$ptr))]>;
+def IMAGE3D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins MEMI64:$ptr),
+    !strconcat(IL_OP_MOV.Text, " $dst, $ptr"),
+    [(set GPRV4I32:$dst, (int_AMDIL_image3d_info1 ADDR64:$ptr))]>;
+def IMAGE1D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image1d_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE1DA64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image1d_array_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE2D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image2d_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE2DA64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image2d_array_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>;
+def IMAGE3D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs),
+    (ins MEMI64:$ptr, GPRV4I32:$addy, GPRV4I32:$data),
+    !strconcat(IL_OP_UAV_STORE.Text,
+        "_id($ptr) $addy, $data"),
+    [(int_AMDIL_image3d_write ADDR64:$ptr, GPRV4I32:$addy, GPRV4I32:$data)]>;
+let hasSideEffects = 1 in {
+  // All of the noret atomic functions
+  def ATOM_G64_ADD_NORET     : BinAtomNoRet64<IL_OP_UAV_ADD,
+      "_id($id)", atom_g_add_noret>;
+  def ATOM_G64_AND_NORET     : BinAtomNoRet64<IL_OP_UAV_AND,
+      "_id($id)", atom_g_and_noret>;
+  def ATOM_G64_MAX_NORET     : BinAtomNoRet64<IL_OP_UAV_MAX,
+      "_id($id)", atom_g_max_noret>;
+  def ATOM_G64_MIN_NORET     : BinAtomNoRet64<IL_OP_UAV_MIN,
+      "_id($id)", atom_g_min_noret>;
+  def ATOM_G64_UMAX_NORET    : BinAtomNoRet64<IL_OP_UAV_UMAX,
+      "_id($id)", atom_g_umax_noret>;
+  def ATOM_G64_UMIN_NORET    : BinAtomNoRet64<IL_OP_UAV_UMIN,
+      "_id($id)", atom_g_umin_noret>;
+  def ATOM_G64_OR_NORET      : BinAtomNoRet64<IL_OP_UAV_OR,
+      "_id($id)", atom_g_or_noret>;
+  def ATOM_G64_RSUB_NORET    : BinAtomNoRet64<IL_OP_UAV_RSUB,
+      "_id($id)", atom_g_rsub_noret>;
+  def ATOM_G64_SUB_NORET     : BinAtomNoRet64<IL_OP_UAV_SUB,
+      "_id($id)", atom_g_sub_noret>;
+  def ATOM_G64_XOR_NORET     : BinAtomNoRet64<IL_OP_UAV_XOR,
+      "_id($id)", atom_g_xor_noret>;
+  def ATOM_G64_INC_NORET     : BinAtomNoRet64<IL_OP_UAV_INC,
+      "_id($id)", atom_g_inc_noret>;
+  def ATOM_G64_DEC_NORET     : BinAtomNoRet64<IL_OP_UAV_DEC,
+      "_id($id)", atom_g_dec_noret>;
+  def ATOM_G64_CMPXCHG_NORET    : CmpXChgNoRet64<IL_OP_UAV_CMP,
+      "_id($id)", atom_g_cmpxchg_noret>;
+  def ATOM_A64_ADD_NORET     : BinAtomNoRet64<IL_OP_UAV_ADD,
+      "_id($id)_arena", atom_g_add_noret>;
+  def ATOM_A64_AND_NORET     : BinAtomNoRet64<IL_OP_UAV_AND,
+      "_id($id)_arena", atom_g_and_noret>;
+  def ATOM_A64_MAX_NORET     : BinAtomNoRet64<IL_OP_UAV_MAX,
+      "_id($id)_arena", atom_g_max_noret>;
+  def ATOM_A64_MIN_NORET     : BinAtomNoRet64<IL_OP_UAV_MIN,
+      "_id($id)_arena", atom_g_min_noret>;
+  def ATOM_A64_UMAX_NORET    : BinAtomNoRet64<IL_OP_UAV_UMAX,
+      "_id($id)_arena", atom_g_umax_noret>;
+  def ATOM_A64_UMIN_NORET    : BinAtomNoRet64<IL_OP_UAV_UMIN,
+      "_id($id)_arena", atom_g_umin_noret>;
+  def ATOM_A64_OR_NORET      : BinAtomNoRet64<IL_OP_UAV_OR,
+      "_id($id)_arena", atom_g_or_noret>;
+  def ATOM_A64_RSUB_NORET    : BinAtomNoRet64<IL_OP_UAV_RSUB,
+      "_id($id)_arena", atom_g_rsub_noret>;
+  def ATOM_A64_SUB_NORET     : BinAtomNoRet64<IL_OP_UAV_SUB,
+      "_id($id)_arena", atom_g_sub_noret>;
+  def ATOM_A64_XOR_NORET     : BinAtomNoRet64<IL_OP_UAV_XOR,
+      "_id($id)_arena", atom_g_xor_noret>;
+  def ATOM_A64_INC_NORET     : BinAtomNoRet64<IL_OP_UAV_INC,
+      "_id($id)_arena", atom_g_inc_noret>;
+  def ATOM_A64_DEC_NORET     : BinAtomNoRet64<IL_OP_UAV_DEC,
+      "_id($id)_arena", atom_g_dec_noret>;
+  def ATOM_A64_CMPXCHG_NORET    : CmpXChgNoRet64<IL_OP_UAV_CMP,
+      "_id($id)_arena", atom_g_cmpxchg_noret>;
+  def ATOM_L64_ADD_NORET     : BinAtomNoRet64<IL_OP_LDS_ADD,
+      "_resource($id)", atom_l_add_noret>;
+  def ATOM_L64_AND_NORET     : BinAtomNoRet64<IL_OP_LDS_AND,
+      "_resource($id)", atom_l_and_noret>;
+  def ATOM_L64_MAX_NORET     : BinAtomNoRet64<IL_OP_LDS_MAX,
+      "_resource($id)", atom_l_max_noret>;
+  def ATOM_L64_MIN_NORET     : BinAtomNoRet64<IL_OP_LDS_MIN,
+      "_resource($id)", atom_l_min_noret>;
+  def ATOM_L64_UMAX_NORET    : BinAtomNoRet64<IL_OP_LDS_UMAX,
+      "_resource($id)", atom_l_umax_noret>;
+  def ATOM_L64_UMIN_NORET    : BinAtomNoRet64<IL_OP_LDS_UMIN,
+      "_resource($id)", atom_l_umin_noret>;
+  def ATOM_L64_MSKOR_NORET   : TriAtomNoRet64<IL_OP_LDS_MSKOR,
+      "_resource($id)", atom_l_mskor_noret>;
+  def ATOM_L64_OR_NORET      : BinAtomNoRet64<IL_OP_LDS_OR,
+      "_resource($id)", atom_l_or_noret>;
+  def ATOM_L64_RSUB_NORET    : BinAtomNoRet64<IL_OP_LDS_RSUB,
+      "_resource($id)", atom_l_rsub_noret>;
+  def ATOM_L64_SUB_NORET     : BinAtomNoRet64<IL_OP_LDS_SUB,
+      "_resource($id)", atom_l_sub_noret>;
+  def ATOM_L64_XOR_NORET     : BinAtomNoRet64<IL_OP_LDS_XOR,
+      "_resource($id)", atom_l_xor_noret>;
+  def ATOM_L64_INC_NORET     : BinAtomNoRet64<IL_OP_LDS_INC,
+      "_resource($id)", atom_l_inc_noret>;
+  def ATOM_L64_DEC_NORET     : BinAtomNoRet64<IL_OP_LDS_DEC,
+      "_resource($id)", atom_l_dec_noret>;
+  def ATOM_L64_CMPXCHG_NORET    : TriAtomNoRet64<IL_OP_LDS_CMP,
+      "_resource($id)", atom_l_cmpxchg_noret>;
+  def ATOM_R64_ADD_NORET     : BinAtomNoRet64<IL_OP_GDS_ADD,
+      "_resource($id)", atom_r_add_noret>;
+  def ATOM_R64_AND_NORET     : BinAtomNoRet64<IL_OP_GDS_AND,
+      "_resource($id)", atom_r_and_noret>;
+  def ATOM_R64_MAX_NORET     : BinAtomNoRet64<IL_OP_GDS_MAX,
+      "_resource($id)", atom_r_max_noret>;
+  def ATOM_R64_MIN_NORET     : BinAtomNoRet64<IL_OP_GDS_MIN,
+      "_resource($id)", atom_r_min_noret>;
+  def ATOM_R64_UMAX_NORET    : BinAtomNoRet64<IL_OP_GDS_UMAX,
+      "_resource($id)", atom_r_umax_noret>;
+  def ATOM_R64_UMIN_NORET    : BinAtomNoRet64<IL_OP_GDS_UMIN,
+      "_resource($id)", atom_r_umin_noret>;
+  def ATOM_R64_MSKOR_NORET   : TriAtomNoRet64<IL_OP_GDS_MSKOR,
+      "_resource($id)", atom_r_mskor_noret>;
+  def ATOM_R64_OR_NORET      : BinAtomNoRet64<IL_OP_GDS_OR,
+      "_resource($id)", atom_r_or_noret>;
+  def ATOM_R64_RSUB_NORET    : BinAtomNoRet64<IL_OP_GDS_RSUB,
+      "_resource($id)", atom_r_rsub_noret>;
+  def ATOM_R64_SUB_NORET     : BinAtomNoRet64<IL_OP_GDS_SUB,
+      "_resource($id)", atom_r_sub_noret>;
+  def ATOM_R64_XOR_NORET     : BinAtomNoRet64<IL_OP_GDS_XOR,
+      "_resource($id)", atom_r_xor_noret>;
+  def ATOM_R64_INC_NORET     : BinAtomNoRet64<IL_OP_GDS_INC,
+      "_resource($id)", atom_r_inc_noret>;
+  def ATOM_R64_DEC_NORET     : BinAtomNoRet64<IL_OP_GDS_DEC,
+      "_resource($id)", atom_r_dec_noret>;
+  def ATOM_R64_CMPXCHG_NORET    : CmpXChgNoRet64<IL_OP_GDS_CMP,
+      "_resource($id)", atom_r_cmpxchg_noret>;
+  def APPEND_ALLOC64_NORET : AppendNoRet64<IL_OP_APPEND_BUF_ALLOC,
+      "_id($id)", append_alloc_noret>;
+  def APPEND_CONSUME64_NORET : AppendNoRet64<IL_OP_APPEND_BUF_CONSUME,
+      "_id($id)", append_consume_noret>;
+  // All of the atomic functions that return
+  def ATOM_G64_ADD     : BinAtom64<IL_OP_UAV_READ_ADD, 
+      "_id($id)", atom_g_add>;
+  def ATOM_G64_AND     : BinAtom64<IL_OP_UAV_READ_AND, 
+      "_id($id)", atom_g_and>;
+  def ATOM_G64_MAX     : BinAtom64<IL_OP_UAV_READ_MAX, 
+      "_id($id)", atom_g_max>;
+  def ATOM_G64_MIN     : BinAtom64<IL_OP_UAV_READ_MIN, 
+      "_id($id)", atom_g_min>;
+  def ATOM_G64_UMAX    : BinAtom64<IL_OP_UAV_READ_UMAX, 
+      "_id($id)", atom_g_umax>;
+  def ATOM_G64_UMIN    : BinAtom64<IL_OP_UAV_READ_UMIN, 
+      "_id($id)", atom_g_umin>;
+  def ATOM_G64_OR      : BinAtom64<IL_OP_UAV_READ_OR, 
+      "_id($id)", atom_g_or>;
+  def ATOM_G64_RSUB    : BinAtom64<IL_OP_UAV_READ_RSUB, 
+      "_id($id)", atom_g_rsub>;
+  def ATOM_G64_SUB     : BinAtom64<IL_OP_UAV_READ_SUB, 
+      "_id($id)", atom_g_sub>;
+  def ATOM_G64_XOR     : BinAtom64<IL_OP_UAV_READ_XOR, 
+      "_id($id)", atom_g_xor>;
+  def ATOM_G64_INC     : BinAtom64<IL_OP_UAV_READ_INC, 
+      "_id($id)", atom_g_inc>;
+  def ATOM_G64_DEC     : BinAtom64<IL_OP_UAV_READ_DEC, 
+      "_id($id)", atom_g_dec>;
+  def ATOM_G64_XCHG    : BinAtom64<IL_OP_UAV_READ_XCHG, 
+      "_id($id)", atom_g_xchg>;
+  def ATOM_G64_CMPXCHG : CmpXChg64<IL_OP_UAV_READ_CMPXCHG, 
+      "_id($id)", atom_g_cmpxchg>;
+  // Arena atomic accesses
+  def ATOM_A64_ADD     : BinAtom64<IL_OP_UAV_READ_ADD, 
+      "_id($id)_arena", atom_g_add>;
+  def ATOM_A64_AND     : BinAtom64<IL_OP_UAV_READ_AND, 
+      "_id($id)_arena", atom_g_and>;
+  def ATOM_A64_MAX     : BinAtom64<IL_OP_UAV_READ_MAX, 
+      "_id($id)_arena", atom_g_max>;
+  def ATOM_A64_MIN     : BinAtom64<IL_OP_UAV_READ_MIN, 
+      "_id($id)_arena", atom_g_min>;
+  def ATOM_A64_UMAX    : BinAtom64<IL_OP_UAV_READ_UMAX, 
+      "_id($id)_arena", atom_g_umax>;
+  def ATOM_A64_UMIN    : BinAtom64<IL_OP_UAV_READ_UMIN, 
+      "_id($id)_arena", atom_g_umin>;
+  def ATOM_A64_OR      : BinAtom64<IL_OP_UAV_READ_OR, 
+      "_id($id)_arena", atom_g_or>;
+  def ATOM_A64_RSUB    : BinAtom64<IL_OP_UAV_READ_RSUB, 
+      "_id($id)_arena", atom_g_rsub>;
+  def ATOM_A64_SUB     : BinAtom64<IL_OP_UAV_READ_SUB, 
+      "_id($id)_arena", atom_g_sub>;
+  def ATOM_A64_XOR     : BinAtom64<IL_OP_UAV_READ_XOR, 
+      "_id($id)_arena", atom_g_xor>;
+  def ATOM_A64_INC     : BinAtom64<IL_OP_UAV_READ_INC, 
+      "_id($id)_arena", atom_g_inc>;
+  def ATOM_A64_DEC     : BinAtom64<IL_OP_UAV_READ_DEC, 
+      "_id($id)_arena", atom_g_dec>;
+  def ATOM_A64_XCHG    : BinAtom64<IL_OP_UAV_READ_XCHG, 
+      "_id($id)_arena", atom_g_xchg>;
+  def ATOM_A64_CMPXCHG : CmpXChg64<IL_OP_UAV_READ_CMPXCHG, 
+      "_id($id)_arena", atom_g_cmpxchg>;
+  def ATOM_L64_ADD     : BinAtom64<IL_OP_LDS_READ_ADD, 
+      "_resource($id)", atom_l_add>;
+  def ATOM_L64_AND     : BinAtom64<IL_OP_LDS_READ_AND, 
+      "_resource($id)", atom_l_and>;
+  def ATOM_L64_MAX     : BinAtom64<IL_OP_LDS_READ_MAX, 
+      "_resource($id)", atom_l_max>;
+  def ATOM_L64_MIN     : BinAtom64<IL_OP_LDS_READ_MIN, 
+      "_resource($id)", atom_l_min>;
+  def ATOM_L64_UMAX    : BinAtom64<IL_OP_LDS_READ_UMAX, 
+      "_resource($id)", atom_l_umax>;
+  def ATOM_L64_UMIN    : BinAtom64<IL_OP_LDS_READ_UMIN, 
+      "_resource($id)", atom_l_umin>;
+  def ATOM_L64_OR      : BinAtom64<IL_OP_LDS_READ_OR, 
+      "_resource($id)", atom_l_or>;
+  def ATOM_L64_MSKOR   : TriAtom64<IL_OP_LDS_READ_MSKOR, 
+      "_resource($id)", atom_l_mskor>;
+  def ATOM_L64_RSUB    : BinAtom64<IL_OP_LDS_READ_RSUB, 
+      "_resource($id)", atom_l_rsub>;
+  def ATOM_L64_SUB     : BinAtom64<IL_OP_LDS_READ_SUB, 
+      "_resource($id)", atom_l_sub>;
+  def ATOM_L64_XOR     : BinAtom64<IL_OP_LDS_READ_XOR, 
+      "_resource($id)", atom_l_xor>;
+  def ATOM_L64_INC     : BinAtom64<IL_OP_LDS_READ_INC, 
+      "_resource($id)", atom_l_inc>;
+  def ATOM_L64_DEC     : BinAtom64<IL_OP_LDS_READ_DEC, 
+      "_resource($id)", atom_l_dec>;
+  def ATOM_L64_XCHG    : BinAtom64<IL_OP_LDS_READ_XCHG, 
+      "_resource($id)", atom_l_xchg>;
+  def ATOM_L64_CMPXCHG : TriAtom64<IL_OP_LDS_READ_CMPXCHG, 
+      "_resource($id)", atom_l_cmpxchg>;
+  def ATOM_R64_ADD     : BinAtom64<IL_OP_GDS_READ_ADD, 
+      "_resource($id)", atom_r_add>;
+  def ATOM_R64_AND     : BinAtom64<IL_OP_GDS_READ_AND, 
+      "_resource($id)", atom_r_and>;
+  def ATOM_R64_MAX     : BinAtom64<IL_OP_GDS_READ_MAX, 
+      "_resource($id)", atom_r_max>;
+  def ATOM_R64_MIN     : BinAtom64<IL_OP_GDS_READ_MIN, 
+      "_resource($id)", atom_r_min>;
+  def ATOM_R64_UMAX    : BinAtom64<IL_OP_GDS_READ_UMAX, 
+      "_resource($id)", atom_r_umax>;
+  def ATOM_R64_UMIN    : BinAtom64<IL_OP_GDS_READ_UMIN, 
+      "_resource($id)", atom_r_umin>;
+  def ATOM_R64_OR      : BinAtom64<IL_OP_GDS_READ_OR, 
+      "_resource($id)", atom_r_or>;
+  def ATOM_R64_MSKOR   : TriAtom64<IL_OP_GDS_READ_MSKOR, 
+      "_resource($id)", atom_r_mskor>;
+  def ATOM_R64_RSUB    : BinAtom64<IL_OP_GDS_READ_RSUB, 
+      "_resource($id)", atom_r_rsub>;
+  def ATOM_R64_SUB     : BinAtom64<IL_OP_GDS_READ_SUB, 
+      "_resource($id)", atom_r_sub>;
+  def ATOM_R64_XOR     : BinAtom64<IL_OP_GDS_READ_XOR, 
+      "_resource($id)", atom_r_xor>;
+  def ATOM_R64_INC     : BinAtom64<IL_OP_GDS_READ_INC, 
+      "_resource($id)", atom_r_inc>;
+  def ATOM_R64_DEC     : BinAtom64<IL_OP_GDS_READ_DEC, 
+      "_resource($id)", atom_r_dec>;
+  def ATOM_R64_XCHG    : BinAtom64<IL_OP_GDS_READ_XCHG, 
+      "_resource($id)", atom_r_xchg>;
+  def ATOM_R64_CMPXCHG : CmpXChg64<IL_OP_GDS_READ_CMPXCHG, 
+      "_resource($id)", atom_r_cmpxchg>;
+  def APPEND_ALLOC64 : Append64<IL_OP_APPEND_BUF_ALLOC,
+      "_id($id)", append_alloc>;
+  def APPEND_CONSUME64 : Append64<IL_OP_APPEND_BUF_CONSUME,
+      "_id($id)", append_consume>;
+}
+}
+/*
+def SEMAPHORE_INIT : BinaryOpNoRet<IL_OP_SEMAPHORE_INIT, (outs),
+    (ins MEMI32:$ptr, i32imm:$val),
+    !strconcat(IL_OP_SEMAPHORE_INIT.Text, "_id($ptr)_value($val)"),
+    [(int_AMDIL_semaphore_init ADDR:$ptr, timm:$val)]>;
+
+def SEMAPHORE_WAIT : UnaryOpNoRet<IL_OP_SEMAPHORE_WAIT, (outs), 
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_SEMAPHORE_WAIT.Text, "_id($ptr)"),
+    [(int_AMDIL_semaphore_wait ADDR:$ptr)]>;
+
+def SEMAPHORE_SIGNAL : UnaryOpNoRet<IL_OP_SEMAPHORE_SIGNAL, (outs), 
+    (ins MEMI32:$ptr),
+    !strconcat(IL_OP_SEMAPHORE_SIGNAL.Text, "_id($ptr)"),
+    [(int_AMDIL_semaphore_signal ADDR:$ptr)]>;
+*/
diff --git a/src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp
new file mode 100644 (file)
index 0000000..75729ac
--- /dev/null
@@ -0,0 +1,190 @@
+//===- AMDILIntrinsicInfo.cpp - AMDIL Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDILIntrinsicInfo.h"
+#include "AMDIL.h"
+#include "AMDILTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+
+using namespace llvm;
+
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "AMDILGenIntrinsics.inc"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+AMDILIntrinsicInfo::AMDILIntrinsicInfo(AMDILTargetMachine *tm) 
+  : TargetIntrinsicInfo(), mTM(tm)
+{
+}
+
+std::string 
+AMDILIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
+    unsigned int numTys) const 
+{
+  static const char* const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  //assert(!isOverloaded(IntrID)
+  //&& "AMDIL Intrinsics are not overloaded");
+  if (IntrID < Intrinsic::num_intrinsics) {
+    return "";
+  }
+  assert(IntrID < AMDGPUIntrinsic::num_AMDIL_intrinsics
+      && "Invalid intrinsic ID");
+
+  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+  return Result;
+}
+
+static bool
+checkTruncation(const char *Name, unsigned int& Len)
+{
+  const char *ptr = Name + (Len - 1);
+  while(ptr != Name && *ptr != '_') {
+    --ptr;
+  }
+  // We don't want to truncate atomic instruction names, but we do
+  // want checkTruncation to return true for them so that the atomic
+  // names can still be translated if needed.
+  if (!strncmp(Name, "__atom", 6)) {
+    return true;
+  }
+  if (strstr(ptr, "i32")
+      || strstr(ptr, "u32")
+      || strstr(ptr, "i64")
+      || strstr(ptr, "u64")
+      || strstr(ptr, "f32")
+      || strstr(ptr, "f64")
+      || strstr(ptr, "i16")
+      || strstr(ptr, "u16")
+      || strstr(ptr, "i8")
+      || strstr(ptr, "u8")) {
+    Len = (unsigned int)(ptr - Name);
+    return true;
+  }
+  return false;
+}
+
+// We don't want to support both the OpenCL 1.0 atomics
+// and the 1.1 atomics with different names, so we translate
+// the 1.0 atomics to the 1.1 naming here if needed.
+static char*
+atomTranslateIfNeeded(const char *Name, unsigned int Len) 
+{
+  char *buffer = NULL;
+  if (strncmp(Name, "__atom_", 7))  {
+    // Not an OpenCL 1.0 "__atom_" name; just copy it unchanged.
+    buffer = new char[Len + 1];
+    memcpy(buffer, Name, Len);
+  } else {
+    buffer = new char[Len + 3];
+    memcpy(buffer, "__atomic_", 9);
+    memcpy(buffer + 9, Name + 7, Len - 7);
+    Len += 2;
+  }
+  buffer[Len] = '\0';
+  return buffer;
+}
+
+unsigned int
+AMDILIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const 
+{
+#define GET_FUNCTION_RECOGNIZER
+#include "AMDILGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  AMDGPUIntrinsic::ID IntrinsicID
+    = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic;
+  if (checkTruncation(Name, Len)) {
+    char *buffer = atomTranslateIfNeeded(Name, Len);
+    IntrinsicID = getIntrinsicForGCCBuiltin("AMDIL", buffer);
+    delete [] buffer;
+  } else {
+    IntrinsicID = getIntrinsicForGCCBuiltin("AMDIL", Name);
+  }
+  if (!isValidIntrinsic(IntrinsicID)) {
+    return 0;
+  }
+  if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
+    return IntrinsicID;
+  }
+  return 0;
+}
+
+bool 
+AMDILIntrinsicInfo::isOverloaded(unsigned id) const 
+{
+  // Overload Table
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+/// This defines the "getAttributes(ID id)" method.
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+Function*
+AMDILIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+    Type **Tys,
+    unsigned numTys) const 
+{
+  assert(!isOverloaded(IntrID) && "AMDIL intrinsics are not overloaded");
+  AttrListPtr AList = getAttributes((AMDGPUIntrinsic::ID) IntrID);
+  LLVMContext& Context = M->getContext();
+  unsigned int id = IntrID;
+  Type *ResultTy = NULL;
+  std::vector<Type*> ArgTys;
+  bool IsVarArg = false;
+
+#define GET_INTRINSIC_GENERATOR
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_GENERATOR
+  // We need to add the resource ID argument for atomics.
+  if (id >= AMDGPUIntrinsic::AMDIL_atomic_add_gi32
+        && id <= AMDGPUIntrinsic::AMDIL_atomic_xor_ru32_noret) {
+    ArgTys.push_back(IntegerType::get(Context, 32));
+  }
+
+  return cast<Function>(M->getOrInsertFunction(getName(IntrID),
+        FunctionType::get(ResultTy, ArgTys, IsVarArg),
+        AList));
+}
+
+/// Because the code generator has to support different SC versions,
+/// this function checks that the intrinsic being used is actually
+/// valid. When it is not valid, the function call is not translated
+/// into an intrinsic and the fallback software-emulated path should
+/// pick up the result.
+bool
+AMDILIntrinsicInfo::isValidIntrinsic(unsigned int IntrID) const
+{
+  const AMDILSubtarget *stm = mTM->getSubtargetImpl();
+  switch (IntrID) {
+    default:
+      return true;
+    case AMDGPUIntrinsic::AMDIL_convert_f32_i32_rpi:
+    case AMDGPUIntrinsic::AMDIL_convert_f32_i32_flr:
+    case AMDGPUIntrinsic::AMDIL_convert_f32_f16_near:
+    case AMDGPUIntrinsic::AMDIL_convert_f32_f16_neg_inf:
+    case AMDGPUIntrinsic::AMDIL_convert_f32_f16_plus_inf:
+        return stm->calVersion() >= CAL_VERSION_SC_139;
+  };
+}
diff --git a/src/gallium/drivers/radeon/AMDILIntrinsicInfo.h b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.h
new file mode 100644 (file)
index 0000000..513c6f0
--- /dev/null
@@ -0,0 +1,49 @@
+//===- AMDILIntrinsicInfo.h - AMDIL Intrinsic Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//   Interface for the AMDIL Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef _AMDIL_INTRINSICS_H_
+#define _AMDIL_INTRINSICS_H_
+
+#include "llvm/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+  class AMDILTargetMachine;
+  namespace AMDGPUIntrinsic {
+    enum ID {
+      last_non_AMDIL_intrinsic = Intrinsic::num_intrinsics - 1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+      , num_AMDIL_intrinsics
+    };
+
+  }
+
+
+  class AMDILIntrinsicInfo : public TargetIntrinsicInfo {
+    AMDILTargetMachine *mTM;
+    public:
+      AMDILIntrinsicInfo(AMDILTargetMachine *tm);
+      std::string getName(unsigned int IntrId, Type **Tys = 0,
+          unsigned int numTys = 0) const;
+      unsigned int lookupName(const char *Name, unsigned int Len) const;
+      bool isOverloaded(unsigned int IID) const;
+      Function *getDeclaration(Module *M, unsigned int ID,
+          Type **Tys = 0,
+          unsigned int numTys = 0) const;
+      bool isValidIntrinsic(unsigned int) const;
+  }; // AMDILIntrinsicInfo
+}
+
+#endif // _AMDIL_INTRINSICS_H_
+
diff --git a/src/gallium/drivers/radeon/AMDILIntrinsics.td b/src/gallium/drivers/radeon/AMDILIntrinsics.td
new file mode 100644 (file)
index 0000000..ef361f4
--- /dev/null
@@ -0,0 +1,705 @@
+//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines all of the AMDIL-specific intrinsics.
+//
+//===---------------------------------------------------------------===//
+
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+//------------- Synchronization Functions - OpenCL 6.11.9 --------------------//
+  def int_AMDIL_fence   : GCCBuiltin<"mem_fence">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_global   : GCCBuiltin<"mem_fence_global">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_local   : GCCBuiltin<"mem_fence_local">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_region   : GCCBuiltin<"mem_fence_region">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_read_only   : GCCBuiltin<"read_mem_fence">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_read_only_global   : GCCBuiltin<"read_mem_fence_global">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_read_only_local   : GCCBuiltin<"read_mem_fence_local">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_read_only_region : GCCBuiltin<"read_mem_fence_region">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_write_only   : GCCBuiltin<"write_mem_fence">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_write_only_global   : GCCBuiltin<"write_mem_fence_global">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_write_only_local   : GCCBuiltin<"write_mem_fence_local">,
+        UnaryIntNoRetInt;
+  def int_AMDIL_fence_write_only_region : GCCBuiltin<"write_mem_fence_region">,
+        UnaryIntNoRetInt;
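+
+  // The fence defs above map the OpenCL mem_fence / read_mem_fence /
+  // write_mem_fence builtins (OpenCL 1.x, section 6.11.9), plus the
+  // _global/_local/_region address-space-specific variants, onto AMDIL
+  // fence instructions.  UnaryIntNoRetInt presumably gives each one a
+  // single i32 argument (the cl_mem_fence_flags value) and no result.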
+
+  def int_AMDIL_early_exit : GCCBuiltin<"__amdil_early_exit">,
+        UnaryIntNoRetInt;
+
+  def int_AMDIL_cmov_logical  : GCCBuiltin<"__amdil_cmov_logical">,
+          TernaryIntInt;
+  def int_AMDIL_fabs : GCCBuiltin<"__amdil_fabs">, UnaryIntFloat;
+  def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt;
+
+  def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">,
+          TernaryIntInt;
+  def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">,
+          TernaryIntInt;
+  def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">,
+          UnaryIntInt;
+  def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">,
+          UnaryIntInt;
+  def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">,
+          UnaryIntInt;
+  def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">,
+                    TernaryIntInt;
+  def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">,
+                    TernaryIntInt;
+  def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">,
+                    QuaternaryIntInt;
+  def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">,
+      TernaryIntInt;
+  def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">,
+      BinaryIntInt;
+  def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">,
+          TernaryIntInt;
+  def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">,
+          TernaryIntInt;
+  def int_AMDIL_mad     : GCCBuiltin<"__amdil_mad">,
+          TernaryIntFloat;
+  def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
+          BinaryIntInt;
+  def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
+          BinaryIntInt;
+  def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
+          BinaryIntInt;
+  def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
+          BinaryIntInt;
+  def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">,
+          TernaryIntInt;
+  def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">,
+          TernaryIntInt;
+  def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">,
+          BinaryIntInt;
+  def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">,
+          BinaryIntInt;
+  def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">,
+          BinaryIntInt;
+  def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">,
+          BinaryIntInt;
+  def int_AMDIL_min     : GCCBuiltin<"__amdil_min">,
+          BinaryIntFloat;
+  def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">,
+          BinaryIntInt;
+  def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">,
+          BinaryIntInt;
+  def int_AMDIL_max     : GCCBuiltin<"__amdil_max">,
+          BinaryIntFloat;
+  def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">,
+          TernaryIntInt;
+  def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">,
+          TernaryIntInt;
+  def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">,
+          TernaryIntInt;
+  def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">,
+          UnaryIntFloat;
+  def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">,
+          TernaryIntFloat;
+  def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">,
+          UnaryIntFloat;
+  def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">,
+          UnaryIntFloat;
+  def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">,
+          UnaryIntFloat;
+  def int_AMDIL_round_posinf : GCCBuiltin<"__amdil_round_posinf">,
+          UnaryIntFloat;
+  def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">,
+          UnaryIntFloat;
+  def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">,
+          UnaryIntFloat;
+  def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">,
+          UnaryIntFloat;
+  def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">,
+          UnaryIntFloat;
+  def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">,
+          UnaryIntFloat;
+  def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">,
+          UnaryIntFloat;
+  def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">,
+          UnaryIntFloat;
+  def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat;
+  def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat;
+  def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt;
+  def int_AMDIL_sqrt : GCCBuiltin<"__amdil_sqrt">,
+          UnaryIntFloat;
+  def int_AMDIL_sqrt_vec : GCCBuiltin<"__amdil_sqrt_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">,
+          UnaryIntFloat;
+  def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">,
+          UnaryIntFloat;
+  def int_AMDIL_log : GCCBuiltin<"__amdil_log">,
+          UnaryIntFloat;
+  def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">,
+          UnaryIntFloat;
+  def int_AMDIL_sign : GCCBuiltin<"__amdil_sign">,
+          UnaryIntFloat;
+  def int_AMDIL_fma : GCCBuiltin<"__amdil_fma">,
+          TernaryIntFloat;
+  def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">,
+          UnaryIntFloat;
+  def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">,
+          UnaryIntFloat;
+  def int_AMDIL_length : GCCBuiltin<"__amdil_length">,
+          UnaryIntFloat;
+  def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">,
+          TernaryIntFloat;
+  def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">,
+      Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty,
+           llvm_v4i32_ty, llvm_i32_ty], []>;
+
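+  // Defs written with an explicit Intrinsic<> signature, like
+  // int_AMDIL_media_sad4 above, expand to an LLVM intrinsic whose name is
+  // the def name with underscores turned into dots, e.g. (illustrative):
+  //
+  //   declare i32 @llvm.AMDIL.media.sad4(<4 x i32>, <4 x i32>, i32)
+  //
+  // GCCBuiltin<"__amdil_sad4"> then lets the frontend lower calls to that
+  // builtin directly onto the intrinsic.
+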
+  def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">,
+        Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>;
+  def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">,
+      Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>;
+  def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">,
+      Intrinsic<[llvm_double_ty], [llvm_double_ty], []>;
+  def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">,
+      ConvertIntITOF;
+  def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">,
+      ConvertIntFTOI;
+  def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">,
+      ConvertIntFTOI;
+  def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">,
+      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>;
+  def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">,
+      ConvertIntITOF;
+  def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">,
+      ConvertIntITOF;
+  def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">,
+        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
+          llvm_v2f32_ty, llvm_float_ty], []>;
+  def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">,
+        Intrinsic<[llvm_float_ty], [llvm_v2f32_ty,
+          llvm_v2f32_ty], []>;
+  def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">,
+        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
+          llvm_v4f32_ty], []>;
+  def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">,
+        Intrinsic<[llvm_float_ty], [llvm_v4f32_ty,
+          llvm_v4f32_ty], []>;
+//===---------------------- Image functions begin ------------------------===//
+  def int_AMDIL_image1d_write : GCCBuiltin<"__amdil_image1d_write">,
+      Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_read_norm  : GCCBuiltin<"__amdil_image1d_read_norm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_read_unnorm  : GCCBuiltin<"__amdil_image1d_read_unnorm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_info0 : GCCBuiltin<"__amdil_image1d_info0">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image1d_info1 : GCCBuiltin<"__amdil_image1d_info1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image1d_array_write : GCCBuiltin<"__amdil_image1d_array_write">,
+      Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_array_read_norm  : GCCBuiltin<"__amdil_image1d_array_read_norm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_array_read_unnorm  : GCCBuiltin<"__amdil_image1d_array_read_unnorm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image1d_array_info0 : GCCBuiltin<"__amdil_image1d_array_info0">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image1d_array_info1 : GCCBuiltin<"__amdil_image1d_array_info1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image2d_write : GCCBuiltin<"__amdil_image2d_write">,
+      Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_read_norm  : GCCBuiltin<"__amdil_image2d_read_norm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_read_unnorm  : GCCBuiltin<"__amdil_image2d_read_unnorm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_info0 : GCCBuiltin<"__amdil_image2d_info0">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image2d_info1 : GCCBuiltin<"__amdil_image2d_info1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image2d_array_write : GCCBuiltin<"__amdil_image2d_array_write">,
+      Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_array_read_norm  : GCCBuiltin<"__amdil_image2d_array_read_norm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_array_read_unnorm  : GCCBuiltin<"__amdil_image2d_array_read_unnorm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image2d_array_info0 : GCCBuiltin<"__amdil_image2d_array_info0">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image2d_array_info1 : GCCBuiltin<"__amdil_image2d_array_info1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image3d_write : GCCBuiltin<"__amdil_image3d_write">,
+         Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image3d_read_norm  : GCCBuiltin<"__amdil_image3d_read_norm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image3d_read_unnorm  : GCCBuiltin<"__amdil_image3d_read_unnorm">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_image3d_info0 : GCCBuiltin<"__amdil_image3d_info0">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+  def int_AMDIL_image3d_info1 : GCCBuiltin<"__amdil_image3d_info1">,
+      Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>;
+
+//===---------------------- Image functions end --------------------------===//
+
+  def int_AMDIL_append_alloc_i32 : GCCBuiltin<"__amdil_append_alloc">,
+      Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_AMDIL_append_consume_i32 : GCCBuiltin<"__amdil_append_consume">,
+      Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_AMDIL_append_alloc_i32_noret : GCCBuiltin<"__amdil_append_alloc_noret">,
+      Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>;
+  def int_AMDIL_append_consume_i32_noret : GCCBuiltin<"__amdil_append_consume_noret">,
+      Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>;
+
+  def int_AMDIL_get_global_id : GCCBuiltin<"__amdil_get_global_id_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_local_id : GCCBuiltin<"__amdil_get_local_id_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_group_id : GCCBuiltin<"__amdil_get_group_id_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_num_groups : GCCBuiltin<"__amdil_get_num_groups_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_local_size : GCCBuiltin<"__amdil_get_local_size_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_global_size : GCCBuiltin<"__amdil_get_global_size_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_global_offset : GCCBuiltin<"__amdil_get_global_offset_int">,
+      Intrinsic<[llvm_v4i32_ty], [], []>;
+  def int_AMDIL_get_work_dim : GCCBuiltin<"get_work_dim">,
+      Intrinsic<[llvm_i32_ty], [], []>;
+  def int_AMDIL_get_printf_offset : GCCBuiltin<"__amdil_get_printf_offset">,
+      Intrinsic<[llvm_i32_ty], []>;
+  def int_AMDIL_get_printf_size : GCCBuiltin<"__amdil_get_printf_size">,
+      Intrinsic<[llvm_i32_ty], []>;
+
+/// Intrinsics for atomic instructions with no return value
+/// Signed 32 bit integer atomics for global address space
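+/// Naming convention (inferred from the builtin names): the suffix encodes
+/// address space and element type (_gi32/_gu32 = global signed/unsigned i32,
+/// _li32/_lu32 = local, _ri32/_ru32 = region), and _noret variants discard
+/// the old value.  The exact operand signatures come from the
+/// BinaryAtomicInt* and TernaryAtomicInt* classes defined elsewhere in this
+/// patch.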
+def int_AMDIL_atomic_add_gi32_noret : GCCBuiltin<"__atomic_add_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_gi32_noret : GCCBuiltin<"__atomic_sub_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_gi32_noret : GCCBuiltin<"__atomic_rsub_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_gi32_noret : GCCBuiltin<"__atomic_xchg_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_gi32_noret : GCCBuiltin<"__atomic_inc_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_gi32_noret : GCCBuiltin<"__atomic_dec_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_gi32_noret : GCCBuiltin<"__atomic_cmpxchg_gi32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_gi32_noret : GCCBuiltin<"__atomic_min_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_gi32_noret : GCCBuiltin<"__atomic_max_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_gi32_noret : GCCBuiltin<"__atomic_and_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_gi32_noret : GCCBuiltin<"__atomic_or_gi32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_gi32_noret : GCCBuiltin<"__atomic_xor_gi32_noret">,
+    BinaryAtomicIntNoRet;
+
+
+
+/// Unsigned 32 bit integer atomics for global address space
+def int_AMDIL_atomic_add_gu32_noret : GCCBuiltin<"__atomic_add_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_gu32_noret : GCCBuiltin<"__atomic_sub_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_gu32_noret : GCCBuiltin<"__atomic_rsub_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_gu32_noret : GCCBuiltin<"__atomic_xchg_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_gu32_noret : GCCBuiltin<"__atomic_inc_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_gu32_noret : GCCBuiltin<"__atomic_dec_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_gu32_noret : GCCBuiltin<"__atomic_cmpxchg_gu32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_gu32_noret : GCCBuiltin<"__atomic_min_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_gu32_noret : GCCBuiltin<"__atomic_max_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_gu32_noret : GCCBuiltin<"__atomic_and_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_gu32_noret : GCCBuiltin<"__atomic_or_gu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_gu32_noret : GCCBuiltin<"__atomic_xor_gu32_noret">,
+    BinaryAtomicIntNoRet;
+
+
+/// Intrinsics for atomic instructions with a return value
+/// Signed 32 bit integer atomics for global address space
+def int_AMDIL_atomic_add_gi32 : GCCBuiltin<"__atomic_add_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_gi32 : GCCBuiltin<"__atomic_sub_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_gi32 : GCCBuiltin<"__atomic_rsub_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_gi32 : GCCBuiltin<"__atomic_xchg_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_gi32 : GCCBuiltin<"__atomic_inc_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_gi32 : GCCBuiltin<"__atomic_dec_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_gi32 : GCCBuiltin<"__atomic_cmpxchg_gi32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_gi32 : GCCBuiltin<"__atomic_min_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_gi32 : GCCBuiltin<"__atomic_max_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_gi32 : GCCBuiltin<"__atomic_and_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_gi32 : GCCBuiltin<"__atomic_or_gi32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xor_gi32 : GCCBuiltin<"__atomic_xor_gi32">,
+    BinaryAtomicInt;
+
+/// 32 bit float atomics required by OpenCL
+def int_AMDIL_atomic_xchg_gf32 : GCCBuiltin<"__atomic_xchg_gf32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_gf32_noret : GCCBuiltin<"__atomic_xchg_gf32_noret">,
+    BinaryAtomicIntNoRet;
+
+/// Unsigned 32 bit integer atomics for global address space
+def int_AMDIL_atomic_add_gu32 : GCCBuiltin<"__atomic_add_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_gu32 : GCCBuiltin<"__atomic_sub_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_gu32 : GCCBuiltin<"__atomic_rsub_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_gu32 : GCCBuiltin<"__atomic_xchg_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_gu32 : GCCBuiltin<"__atomic_inc_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_gu32 : GCCBuiltin<"__atomic_dec_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_gu32 : GCCBuiltin<"__atomic_cmpxchg_gu32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_gu32 : GCCBuiltin<"__atomic_min_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_gu32 : GCCBuiltin<"__atomic_max_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_gu32 : GCCBuiltin<"__atomic_and_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_gu32 : GCCBuiltin<"__atomic_or_gu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xor_gu32 : GCCBuiltin<"__atomic_xor_gu32">,
+    BinaryAtomicInt;
+
+
+/// Intrinsics for atomic instructions with no return value
+/// Signed 32 bit integer atomics for local address space
+def int_AMDIL_atomic_add_li32_noret : GCCBuiltin<"__atomic_add_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_li32_noret : GCCBuiltin<"__atomic_sub_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_li32_noret : GCCBuiltin<"__atomic_rsub_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_li32_noret : GCCBuiltin<"__atomic_xchg_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_li32_noret : GCCBuiltin<"__atomic_inc_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_li32_noret : GCCBuiltin<"__atomic_dec_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_li32_noret : GCCBuiltin<"__atomic_cmpxchg_li32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_li32_noret : GCCBuiltin<"__atomic_min_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_li32_noret : GCCBuiltin<"__atomic_max_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_li32_noret : GCCBuiltin<"__atomic_and_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_li32_noret : GCCBuiltin<"__atomic_or_li32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_mskor_li32_noret : GCCBuiltin<"__atomic_mskor_li32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_li32_noret : GCCBuiltin<"__atomic_xor_li32_noret">,
+    BinaryAtomicIntNoRet;
+
+/// Signed 32 bit integer atomics for region address space
+def int_AMDIL_atomic_add_ri32_noret : GCCBuiltin<"__atomic_add_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_ri32_noret : GCCBuiltin<"__atomic_sub_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_ri32_noret : GCCBuiltin<"__atomic_rsub_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_ri32_noret : GCCBuiltin<"__atomic_xchg_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_ri32_noret : GCCBuiltin<"__atomic_inc_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_ri32_noret : GCCBuiltin<"__atomic_dec_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_ri32_noret : GCCBuiltin<"__atomic_cmpxchg_ri32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_ri32_noret : GCCBuiltin<"__atomic_min_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_ri32_noret : GCCBuiltin<"__atomic_max_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_ri32_noret : GCCBuiltin<"__atomic_and_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_ri32_noret : GCCBuiltin<"__atomic_or_ri32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_mskor_ri32_noret : GCCBuiltin<"__atomic_mskor_ri32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_ri32_noret : GCCBuiltin<"__atomic_xor_ri32_noret">,
+    BinaryAtomicIntNoRet;
+
+
+
+/// Unsigned 32 bit integer atomics for local address space
+def int_AMDIL_atomic_add_lu32_noret : GCCBuiltin<"__atomic_add_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_lu32_noret : GCCBuiltin<"__atomic_sub_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_lu32_noret : GCCBuiltin<"__atomic_rsub_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_lu32_noret : GCCBuiltin<"__atomic_xchg_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_lu32_noret : GCCBuiltin<"__atomic_inc_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_lu32_noret : GCCBuiltin<"__atomic_dec_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_lu32_noret : GCCBuiltin<"__atomic_cmpxchg_lu32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_lu32_noret : GCCBuiltin<"__atomic_min_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_lu32_noret : GCCBuiltin<"__atomic_max_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_lu32_noret : GCCBuiltin<"__atomic_and_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_lu32_noret : GCCBuiltin<"__atomic_or_lu32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_mskor_lu32_noret : GCCBuiltin<"__atomic_mskor_lu32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_lu32_noret : GCCBuiltin<"__atomic_xor_lu32_noret">,
+    BinaryAtomicIntNoRet;
+
+/// Unsigned 32 bit integer atomics for region address space
+def int_AMDIL_atomic_add_ru32_noret : GCCBuiltin<"__atomic_add_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_sub_ru32_noret : GCCBuiltin<"__atomic_sub_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_rsub_ru32_noret : GCCBuiltin<"__atomic_rsub_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_ru32_noret : GCCBuiltin<"__atomic_xchg_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_inc_ru32_noret : GCCBuiltin<"__atomic_inc_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_dec_ru32_noret : GCCBuiltin<"__atomic_dec_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_cmpxchg_ru32_noret : GCCBuiltin<"__atomic_cmpxchg_ru32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_min_ru32_noret : GCCBuiltin<"__atomic_min_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_max_ru32_noret : GCCBuiltin<"__atomic_max_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_and_ru32_noret : GCCBuiltin<"__atomic_and_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_or_ru32_noret : GCCBuiltin<"__atomic_or_ru32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_mskor_ru32_noret : GCCBuiltin<"__atomic_mskor_ru32_noret">,
+    TernaryAtomicIntNoRet;
+def int_AMDIL_atomic_xor_ru32_noret : GCCBuiltin<"__atomic_xor_ru32_noret">,
+    BinaryAtomicIntNoRet;
+
+def int_AMDIL_get_cycle_count : GCCBuiltin<"__amdil_get_cycle_count">,
+    VoidIntLong;
+
+def int_AMDIL_compute_unit_id : GCCBuiltin<"__amdil_compute_unit_id">,
+    VoidIntInt;
+
+def int_AMDIL_wavefront_id : GCCBuiltin<"__amdil_wavefront_id">,
+    VoidIntInt;
+
+
+/// Intrinsics for atomic instructions with a return value
+/// Signed 32 bit integer atomics for local address space
+def int_AMDIL_atomic_add_li32 : GCCBuiltin<"__atomic_add_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_li32 : GCCBuiltin<"__atomic_sub_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_li32 : GCCBuiltin<"__atomic_rsub_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_li32 : GCCBuiltin<"__atomic_xchg_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_li32 : GCCBuiltin<"__atomic_inc_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_li32 : GCCBuiltin<"__atomic_dec_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_li32 : GCCBuiltin<"__atomic_cmpxchg_li32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_li32 : GCCBuiltin<"__atomic_min_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_li32 : GCCBuiltin<"__atomic_max_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_li32 : GCCBuiltin<"__atomic_and_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_li32 : GCCBuiltin<"__atomic_or_li32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_mskor_li32 : GCCBuiltin<"__atomic_mskor_li32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_xor_li32 : GCCBuiltin<"__atomic_xor_li32">,
+    BinaryAtomicInt;
+
+/// Signed 32 bit integer atomics for region address space
+def int_AMDIL_atomic_add_ri32 : GCCBuiltin<"__atomic_add_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_ri32 : GCCBuiltin<"__atomic_sub_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_ri32 : GCCBuiltin<"__atomic_rsub_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_ri32 : GCCBuiltin<"__atomic_xchg_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_ri32 : GCCBuiltin<"__atomic_inc_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_ri32 : GCCBuiltin<"__atomic_dec_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_ri32 : GCCBuiltin<"__atomic_cmpxchg_ri32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_ri32 : GCCBuiltin<"__atomic_min_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_ri32 : GCCBuiltin<"__atomic_max_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_ri32 : GCCBuiltin<"__atomic_and_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_ri32 : GCCBuiltin<"__atomic_or_ri32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_mskor_ri32 : GCCBuiltin<"__atomic_mskor_ri32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_xor_ri32 : GCCBuiltin<"__atomic_xor_ri32">,
+    BinaryAtomicInt;
+
+/// 32 bit float atomics required by OpenCL
+def int_AMDIL_atomic_xchg_lf32 : GCCBuiltin<"__atomic_xchg_lf32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_lf32_noret : GCCBuiltin<"__atomic_xchg_lf32_noret">,
+    BinaryAtomicIntNoRet;
+def int_AMDIL_atomic_xchg_rf32 : GCCBuiltin<"__atomic_xchg_rf32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_rf32_noret : GCCBuiltin<"__atomic_xchg_rf32_noret">,
+    BinaryAtomicIntNoRet;
+
+/// Unsigned 32 bit integer atomics for local address space
+def int_AMDIL_atomic_add_lu32 : GCCBuiltin<"__atomic_add_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_lu32 : GCCBuiltin<"__atomic_sub_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_lu32 : GCCBuiltin<"__atomic_rsub_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_lu32 : GCCBuiltin<"__atomic_xchg_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_lu32 : GCCBuiltin<"__atomic_inc_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_lu32 : GCCBuiltin<"__atomic_dec_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_lu32 : GCCBuiltin<"__atomic_cmpxchg_lu32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_lu32 : GCCBuiltin<"__atomic_min_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_lu32 : GCCBuiltin<"__atomic_max_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_lu32 : GCCBuiltin<"__atomic_and_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_lu32 : GCCBuiltin<"__atomic_or_lu32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_mskor_lu32 : GCCBuiltin<"__atomic_mskor_lu32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_xor_lu32 : GCCBuiltin<"__atomic_xor_lu32">,
+    BinaryAtomicInt;
+
+/// Unsigned 32 bit integer atomics for region address space
+def int_AMDIL_atomic_add_ru32 : GCCBuiltin<"__atomic_add_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_sub_ru32 : GCCBuiltin<"__atomic_sub_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_rsub_ru32 : GCCBuiltin<"__atomic_rsub_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_xchg_ru32 : GCCBuiltin<"__atomic_xchg_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_inc_ru32 : GCCBuiltin<"__atomic_inc_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_dec_ru32 : GCCBuiltin<"__atomic_dec_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_cmpxchg_ru32 : GCCBuiltin<"__atomic_cmpxchg_ru32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_min_ru32 : GCCBuiltin<"__atomic_min_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_max_ru32 : GCCBuiltin<"__atomic_max_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_and_ru32 : GCCBuiltin<"__atomic_and_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_or_ru32 : GCCBuiltin<"__atomic_or_ru32">,
+    BinaryAtomicInt;
+def int_AMDIL_atomic_mskor_ru32 : GCCBuiltin<"__atomic_mskor_ru32">,
+    TernaryAtomicInt;
+def int_AMDIL_atomic_xor_ru32 : GCCBuiltin<"__atomic_xor_ru32">,
+    BinaryAtomicInt;
+
+/// Semaphore signal/wait/init
+def int_AMDIL_semaphore_init : GCCBuiltin<"__amdil_semaphore_init">,
+    Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_AMDIL_semaphore_wait : GCCBuiltin<"__amdil_semaphore_wait">,
+    Intrinsic<[], [llvm_ptr_ty]>;
+def int_AMDIL_semaphore_signal : GCCBuiltin<"__amdil_semaphore_signal">,
+    Intrinsic<[], [llvm_ptr_ty]>;
+def int_AMDIL_semaphore_size   : GCCBuiltin<"__amdil_max_semaphore_size">,
+    Intrinsic<[llvm_i32_ty], []>;
+}
diff --git a/src/gallium/drivers/radeon/AMDILKernel.h b/src/gallium/drivers/radeon/AMDILKernel.h
new file mode 100644 (file)
index 0000000..ce7ea04
--- /dev/null
@@ -0,0 +1,84 @@
+//===------------- AMDILKernel.h - AMDIL Kernel Class ----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// Definition of an AMDILKernel object and the various subclasses
+// that are used.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDIL_KERNEL_H_
+#define _AMDIL_KERNEL_H_
+#include "AMDIL.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Constant.h"
+#include "llvm/Value.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+  class AMDILTargetMachine;
+  /// structure that holds information for a single local/region address array
+  typedef struct _AMDILArrayMemRec {
+    uint32_t vecSize; // size of each vector
+    uint32_t offset;  // offset into the memory section
+    bool isHW;        // flag to specify if HW is used or SW is used
+    bool isRegion;    // flag to specify if GDS is used or not
+  } AMDILArrayMem;
+
+  /// structure that holds information about a constant address
+  /// space pointer that is a kernel argument
+  typedef struct _AMDILConstPtrRec {
+    const llvm::Value *base;
+    uint32_t size;
+    uint32_t offset;
+    uint32_t cbNum; // value of 0 means that it does not use hw CB
+    bool isArray;
+    bool isArgument;
+    bool usesHardware;
+    std::string name;
+  } AMDILConstPtr;
+  /// Structure that holds information for all local/region address
+  /// arrays in the kernel
+  typedef struct _AMDILLocalArgRec {
+    llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS> local;
+    std::string name; // Kernel Name
+  } AMDILLocalArg;
+
+  /// Structure that holds information for each kernel argument
+  typedef struct _AMDILkernelArgRec {
+    uint32_t reqGroupSize[3];
+    uint32_t reqRegionSize[3];
+    llvm::SmallVector<uint32_t, DEFAULT_VEC_SLOTS> argInfo;
+    bool mHasRWG;
+    bool mHasRWR;
+  } AMDILKernelAttr;
+
+  /// Structure that holds information for each kernel
+  class AMDILKernel {
+    public:
+      AMDILKernel() {}
+      uint32_t curSize;
+      uint32_t curRSize;
+      uint32_t curHWSize;
+      uint32_t curHWRSize;
+      uint32_t constSize;
+      bool mKernel;
+      std::string mName;
+      AMDILKernelAttr *sgv;
+      AMDILLocalArg *lvgv;
+      llvm::SmallVector<struct _AMDILConstPtrRec, DEFAULT_VEC_SLOTS> constPtr;
+      uint32_t constSizes[HW_MAX_NUM_CB];
+      llvm::SmallSet<uint32_t, OPENCL_MAX_READ_IMAGES> readOnly;
+      llvm::SmallSet<uint32_t, OPENCL_MAX_WRITE_IMAGES> writeOnly;
+      llvm::SmallVector<std::pair<uint32_t, const llvm::Constant *>,
+        DEFAULT_VEC_SLOTS> CPOffsets;
+      typedef llvm::SmallVector<struct _AMDILConstPtrRec, DEFAULT_VEC_SLOTS>::iterator constptr_iterator;
+      typedef llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS>::iterator arraymem_iterator;
+  }; // AMDILKernel
+} // end llvm namespace
+#endif // _AMDIL_KERNEL_H_
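Several places in this patch round byte sizes up to whole 16-byte constant-buffer slots: struct kernel arguments in processArgMetadata() use `(bytesize + 15) & ~15` followed by `>> 4`, and printCopyStructPrivate() uses the same rounding for stack copies. A minimal standalone sketch of that slot math, assuming nothing beyond the arithmetic itself; the function name here is ours, not part of the driver:

```cpp
#include <cassert>
#include <cstdint>

// Illustrative only: mirrors the slot computation used for struct kernel
// arguments, where each argument is rounded up to whole 16-byte
// constant-buffer slots and always occupies at least one slot.
static uint32_t slotsForStruct(uint32_t bytesize) {
  uint32_t reserved = (bytesize + 15) & ~15u; // round up to a 16-byte boundary
  uint32_t numSlots = reserved >> 4;          // 16 bytes per slot
  return numSlots ? numSlots : 1;             // reserve at least one slot
}
```

The `& ~15u` mask clears the low four bits after the `+ 15` bias, which is the usual branch-free way to round up to a power-of-two boundary.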
diff --git a/src/gallium/drivers/radeon/AMDILKernelManager.cpp b/src/gallium/drivers/radeon/AMDILKernelManager.cpp
new file mode 100644 (file)
index 0000000..4df81ff
--- /dev/null
@@ -0,0 +1,1356 @@
+//===-- AMDILKernelManager.cpp - AMDIL kernel metadata manager -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILKernelManager.h"
+
+#include "AMDILAlgorithms.tpp"
+#ifdef UPSTREAM_LLVM
+#include "AMDILAsmPrinter.h"
+#endif
+#include "AMDILCompilerErrors.h"
+#include "AMDILDeviceInfo.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILModuleInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+#define NUM_EXTRA_SLOTS_PER_IMAGE 1
+
+static bool errorPrint(const char *ptr, llvm::raw_ostream &O) {
+  if (ptr[0] == 'E') {
+    O << ";error:" << ptr << "\n";
+  } else {
+    O << ";warning:" << ptr << "\n";
+  }
+  return false;
+}
+
+#if 0
+static bool
+samplerPrint(StringMap<SamplerInfo>::iterator &data, llvm::raw_ostream &O) {
+  O << ";sampler:" << (*data).second.name << ":" << (*data).second.idx
+    << ":" << ((*data).second.val == (uint32_t)-1 ? 0 : 1) 
+    << ":" << ((*data).second.val != (uint32_t)-1 ? (*data).second.val : 0)
+    << "\n";
+  return false;
+}
+#endif
+
+static bool arenaPrint(uint32_t val, llvm::raw_ostream &O) {
+  if (val >= ARENA_SEGMENT_RESERVED_UAVS) {
+    O << "dcl_arena_uav_id(" << val << ")\n";
+  }
+  return false;
+}
+
+static bool uavPrint(uint32_t val, llvm::raw_ostream &O) {
+  if (val < 8 || val == 11){
+    O << "dcl_raw_uav_id(" << val << ")\n";
+  }
+  return false;
+}
+
+static bool uavPrintSI(uint32_t val, llvm::raw_ostream &O) {
+  O << "dcl_typeless_uav_id(" << val << ")_stride(4)_length(4)_access(read_write)\n";
+  return false;
+}
+
+static bool
+printfPrint(std::pair<const std::string, PrintfInfo *> &data, llvm::raw_ostream &O) {
+  O << ";printf_fmt:" << data.second->getPrintfID();
+  // Number of operands
+  O << ":" << data.second->getNumOperands();
+  // Size of each operand
+  for (size_t i = 0, e = data.second->getNumOperands(); i < e; ++i) {
+    O << ":" << (data.second->getOperandID(i) >> 3);
+  }
+  const char *ptr = data.first.c_str();
+  uint32_t size = data.first.size() - 1; // drop the trailing character
+  // The format string size
+  O << ":" << size << ":";
+  for (size_t i = 0; i < size; ++i) {
+    if (ptr[i] == '\r') {
+      O << "\\r";
+    } else if (ptr[i] == '\n') {
+      O << "\\n";
+    } else {
+      O << ptr[i];
+    }
+  }
+  O << ";\n";
+  return false;
+}
+
+
+void AMDILKernelManager::updatePtrArg(Function::const_arg_iterator Ip,
+                                      int numWriteImages, int raw_uav_buffer,
+                                      int counter, bool isKernel,
+                                      const Function *F) {
+  assert(F && "Cannot pass a NULL Pointer to F!");
+  assert(Ip->getType()->isPointerTy() &&
+         "Argument must be a pointer to be passed into this function!\n");
+  std::string ptrArg(";pointer:");
+  const char *symTab = "NoSymTab";
+  uint32_t ptrID = getUAVID(Ip);
+  const PointerType *PT = cast<PointerType>(Ip->getType());
+  uint32_t Align = 4;
+  const char *MemType = "uav";
+  if (PT->getElementType()->isSized()) {
+    Align = NextPowerOf2((uint32_t)mTM->getTargetData()->
+                            getTypeAllocSize(PT->getElementType()));
+  }
+  ptrArg += Ip->getName().str() + ":" + getTypeName(PT, symTab) + ":1:1:" +
+            itostr(counter * 16) + ":";
+  switch (PT->getAddressSpace()) {
+  case AMDILAS::ADDRESS_NONE:
+    // No address space qualifier!
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+    assert(0 && "No address space qualifier!");
+    break;
+  case AMDILAS::GLOBAL_ADDRESS:
+    if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) {
+      if (ptrID >= ARENA_SEGMENT_RESERVED_UAVS) {
+        ptrID = 8;
+      }
+    }
+    mMFI->uav_insert(ptrID);
+    break;
+  case AMDILAS::CONSTANT_ADDRESS: {
+    if (isKernel && mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)){
+      const kernel t = mGM->getKernel(F->getName());
+      if (mGM->usesHWConstant(t, Ip->getName())) {
+        MemType = "hc\0";
+        ptrID = mGM->getConstPtrCB(t, Ip->getName());
+      } else {
+        MemType = "c\0";
+        mMFI->uav_insert(ptrID);
+      }
+    } else {
+      MemType = "c\0";
+      mMFI->uav_insert(ptrID);
+    }
+    break; 
+  }
+  default:
+  case AMDILAS::PRIVATE_ADDRESS:
+    if (mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) {
+      MemType = (mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) 
+        ? "up\0" : "hp\0";
+    } else {
+      MemType = "p\0";
+      mMFI->uav_insert(ptrID);
+    }
+    break;
+  case AMDILAS::REGION_ADDRESS:
+    mMFI->setUsesRegion();
+    if (mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) {
+      MemType = "hr\0";
+      ptrID = 0;
+    } else {
+      MemType = "r\0";
+      mMFI->uav_insert(ptrID);
+    }
+    break;
+  case AMDILAS::LOCAL_ADDRESS:
+    mMFI->setUsesLocal();
+    if (mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) {
+      MemType = "hl\0";
+      ptrID = 1;
+    } else {
+      MemType = "l\0";
+      mMFI->uav_insert(ptrID);
+    }
+    break;
+  };
+  ptrArg += std::string(MemType) + ":";
+  ptrArg += itostr(ptrID) + ":";
+  ptrArg += itostr(Align);
+  mMFI->addMetadata(ptrArg, true);
+}
+
+AMDILKernelManager::AMDILKernelManager(AMDILTargetMachine *TM,
+                                       AMDILGlobalManager *GM)
+{
+  mTM = TM;
+  mSTM = mTM->getSubtargetImpl();
+  mGM = GM;
+  clear();
+}
+
+AMDILKernelManager::~AMDILKernelManager() {
+  clear();
+}
+
+void 
+AMDILKernelManager::setMF(MachineFunction *MF)
+{
+  mMF = MF;
+  mMFI = MF->getInfo<AMDILMachineFunctionInfo>();
+}
+
+void AMDILKernelManager::clear() {
+  mUniqueID = 0;
+  mIsKernel = false;
+  mWasKernel = false;
+  mHasImageWrite = false;
+  mHasOutputInst = false;
+}
+
+bool AMDILKernelManager::useCompilerWrite(const MachineInstr *MI) {
+  return (MI->getOpcode() == AMDIL::RETURN && wasKernel() && !mHasImageWrite
+          && !mHasOutputInst);
+}
+
+void AMDILKernelManager::processArgMetadata(llvm::raw_ostream &O,
+                                            uint32_t buf,
+                                            bool isKernel) 
+{
+  const Function *F = mMF->getFunction();
+  const char * symTab = "NoSymTab";
+  Function::const_arg_iterator Ip = F->arg_begin();
+  Function::const_arg_iterator Ep = F->arg_end();
+  
+  if (F->hasStructRetAttr()) {
+    assert(Ip != Ep && "Invalid struct return function!");
+    mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+    ++Ip;
+  }
+  uint32_t mCBSize = 0;
+  int raw_uav_buffer = mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+  bool MultiUAV = mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV);
+  bool ArenaSegment =
+    mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment);
+  int numWriteImages =
+    mSTM->getGlobalManager()->getNumWriteImages(F->getName());
+  if (numWriteImages == OPENCL_MAX_WRITE_IMAGES || MultiUAV || ArenaSegment) {
+    if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+      raw_uav_buffer = mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+    }
+  }
+  uint32_t CounterNum = 0;
+  uint32_t ROArg = 0;
+  uint32_t WOArg = 0;
+  uint32_t NumArg = 0;
+  while (Ip != Ep) {
+    Type *cType = Ip->getType();
+    if (cType->isIntOrIntVectorTy() || cType->isFPOrFPVectorTy()) {
+      std::string argMeta(";value:");
+      argMeta += Ip->getName().str() + ":" + getTypeName(cType, symTab) + ":";
+      int bitsize = cType->getPrimitiveSizeInBits();
+      int numEle = 1;
+      if (cType->getTypeID() == Type::VectorTyID) {
+        numEle = cast<VectorType>(cType)->getNumElements();
+      }
+      argMeta += itostr(numEle) + ":1:" + itostr(mCBSize << 4);
+      mMFI->addMetadata(argMeta, true);
+
+      // FIXME: simplify
+      if ((bitsize / numEle) < 32) {
+        bitsize = numEle >> 2;
+      } else {
+        bitsize >>= 7;
+      }
+      if (!bitsize) {
+        bitsize = 1;
+      }
+
+      mCBSize += bitsize;
+      ++NumArg;
+    } else if (const PointerType *PT = dyn_cast<PointerType>(cType)) {
+      Type *CT = PT->getElementType();
+      const StructType *ST = dyn_cast<StructType>(CT);
+      if (ST && ST->isOpaque()) {
+        StringRef name = ST->getName();
+        bool i1d  = name.equals( "struct._image1d_t" );
+        bool i1da = name.equals( "struct._image1d_array_t" );
+        bool i1db = name.equals( "struct._image1d_buffer_t" );
+        bool i2d  = name.equals( "struct._image2d_t" );
+        bool i2da = name.equals( "struct._image2d_array_t" );
+        bool i3d  = name.equals( "struct._image3d_t" );
+        bool c32  = name.equals( "struct._counter32_t" );
+        bool c64  = name.equals( "struct._counter64_t" );
+        if (i1d || i1da || i1db || i2d || i2da || i3d) {
+          if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) {
+            std::string imageArg(";image:");
+            imageArg += Ip->getName().str() + ":";
+            if (i1d)       imageArg += "1D:";
+            else if (i1da) imageArg += "1DA:";
+            else if (i1db) imageArg += "1DB:";
+            else if (i2d)  imageArg += "2D:";
+            else if (i2da) imageArg += "2DA:";
+            else if (i3d)  imageArg += "3D:";
+
+            if (isKernel) {
+              if (mGM->isReadOnlyImage (mMF->getFunction()->getName(),
+                                        (ROArg + WOArg))) {
+                imageArg += "RO:" + itostr(ROArg);
+                O << "dcl_resource_id(" << ROArg << ")_type(";
+                if (i1d)       O << "1d";
+                else if (i1da) O << "1darray";
+                else if (i1db) O << "buffer";
+                else if (i2d)  O << "2d";
+                else if (i2da) O << "2darray";
+                else if (i3d)  O << "3d";
+                O << ")_fmtx(unknown)_fmty(unknown)"
+                  << "_fmtz(unknown)_fmtw(unknown)\n";
+                ++ROArg;
+              } else if (mGM->isWriteOnlyImage(mMF->getFunction()->getName(),
+                                               (ROArg + WOArg))) {
+                uint32_t offset = 0;
+                offset += WOArg;
+                imageArg += "WO:" + itostr(offset & 0x7);
+                O << "dcl_uav_id(" << ((offset) & 0x7) << ")_type(";
+                if (i1d)       O << "1d";
+                else if (i1da) O << "1darray";
+                else if (i1db) O << "buffer";
+                else if (i2d)  O << "2d";
+                else if (i2da) O << "2darray";
+                else if (i3d)  O << "3d";
+                O << ")_fmtx(uint)\n";
+                ++WOArg;
+              } else {
+                imageArg += "RW:" + itostr(ROArg + WOArg);
+              }
+            }
+            imageArg += ":1:" + itostr(mCBSize * 16);
+            mMFI->addMetadata(imageArg, true);
+            mMFI->addi32Literal(mCBSize);
+            mCBSize += NUM_EXTRA_SLOTS_PER_IMAGE + 1;
+            ++NumArg;
+          } else {
+            mMFI->addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]);
+            ++NumArg;
+          }
+        } else if (c32 || c64) {
+          std::string counterArg(";counter:");
+          counterArg += Ip->getName().str() + ":"
+            + itostr(c32 ? 32 : 64) + ":"
+            + itostr(CounterNum++) + ":1:" + itostr(mCBSize * 16);
+          mMFI->addMetadata(counterArg, true);
+          ++NumArg;
+          ++mCBSize;
+        } else {
+          updatePtrArg(Ip, numWriteImages, raw_uav_buffer, mCBSize, isKernel,
+                       F);
+          ++NumArg;
+          ++mCBSize;
+        }
+      } else if (CT->getTypeID() == Type::StructTyID
+                 && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+        const TargetData *td = mTM->getTargetData();
+        const StructLayout *sl = td->getStructLayout(dyn_cast<StructType>(CT));
+        int bytesize = sl->getSizeInBytes();
+        int reservedsize = (bytesize + 15) & ~15;
+        int numSlots = reservedsize >> 4;
+        if (!numSlots) {
+          numSlots = 1;
+        }
+        std::string structArg(";value:");
+        structArg += Ip->getName().str() + ":struct:"
+          + itostr(bytesize) + ":1:" + itostr(mCBSize * 16);
+        mMFI->addMetadata(structArg, true);
+        mCBSize += numSlots;
+        ++NumArg;
+      } else if (CT->isIntOrIntVectorTy()
+                 || CT->isFPOrFPVectorTy()
+                 || CT->getTypeID() == Type::ArrayTyID
+                 || CT->getTypeID() == Type::PointerTyID
+                 || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+        updatePtrArg(Ip, numWriteImages, raw_uav_buffer, mCBSize, isKernel, F);
+        ++NumArg;
+        ++mCBSize;
+      } else {
+        assert(0 && "Cannot process current pointer argument");
+        mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+        ++NumArg;
+      }
+    } else {
+      assert(0 && "Cannot process current kernel argument");
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+      ++NumArg;
+    }
+    ++Ip;
+  }
+}
+
+void AMDILKernelManager::printHeader(AMDILAsmPrinter *AsmPrinter,
+                                     llvm::raw_ostream &O,
+                                     const std::string &name) {
+#ifdef UPSTREAM_LLVM
+  mName = name;
+  std::string kernelName;
+  kernelName = name;
+  int kernelId = mGM->getOrCreateFunctionID(kernelName);
+  O << "func " << kernelId << " ; " << kernelName << "\n";
+  if (mSTM->is64bit()) {
+    O << "mov " << AsmPrinter->getRegisterName(AMDIL::SDP) << ", cb0[8].xy\n";
+  } else {
+    O << "mov " << AsmPrinter->getRegisterName(AMDIL::SDP) << ", cb0[8].x\n";
+  }
+  O << "mov " << AsmPrinter->getRegisterName(AMDIL::SP) << ", l1.0\n";
+#endif
+}
+
+void AMDILKernelManager::printGroupSize(llvm::raw_ostream& O) {
+  // The HD4XXX generation of hardware does not support a 3D launch, so we need
+  // to use dcl_num_thread_per_group to specify the launch size. If the launch
+  // size is specified via a kernel attribute, we print it here. Otherwise we
+  // use the default size.
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    if (mGM->hasRWG(mName) 
+        || !mMFI->usesLocal()) {
+      // if the user has specified what the required workgroup size is then we
+      // need to compile for that size and that size only.  Otherwise we compile
+      // for the max workgroup size that is passed in as an option to the
+      // backend.
+      O << "dcl_num_thread_per_group ";
+      O << mGM->getLocal(mName, 0) << ", ";
+      O << mGM->getLocal(mName, 1) << ", ";
+      O << mGM->getLocal(mName, 2) << "        \n";
+    } else {
+      // If the kernel uses local memory, then the kernel is being
+      // compiled in single wavefront mode. So we have to generate code slightly
+      // different.
+      O << "dcl_num_thread_per_group "
+        << mSTM->device()->getWavefrontSize()
+        << ", 1, 1       \n";
+    }
+  } else {
+    // Otherwise we generate for devices that support 3D launch natively.  If
+    // the reqd_workgroup_size attribute was specified, then we can specify the
+    // exact launch dimensions.
+    if (mGM->hasRWG(mName)) {
+      O << "dcl_num_thread_per_group ";
+      O << mGM->getLocal(mName, 0) << ", ";
+      O << mGM->getLocal(mName, 1) << ", ";
+      O << mGM->getLocal(mName, 2) << "        \n";
+    } else {
+      // Otherwise we specify the largest workgroup size that can be launched.
+      O << "dcl_max_thread_per_group " << mGM->getLocal(mName, 3) << " \n";
+    }
+  }
+  // Now that we have specified the workgroup size, lets declare the local
+  // memory size. If we are using hardware and we know the value at compile
+  // time, then we need to declare the correct value. Otherwise we should just
+  // declare the maximum size.
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) {
+    size_t kernelLocalSize = (mGM->getHWLocalSize(mName) + 3) & ~3;
+    if (kernelLocalSize > mSTM->device()->getMaxLDSSize()) {
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_LOCAL_RESOURCES]);
+    }
+    // If there is a local pointer as a kernel argument, we don't know the size
+    // at compile time, so we reserve all of the space.
+    if (mMFI->usesLocal() && (mMFI->hasLocalArg() || !kernelLocalSize)) {
+      O << "dcl_lds_id(" << DEFAULT_LDS_ID << ") "
+        << mSTM->device()->getMaxLDSSize() << "\n";
+      mMFI->setUsesMem(AMDILDevice::LDS_ID);
+    } else if (kernelLocalSize) {
+      // We know the size, so lets declare it correctly.
+      O << "dcl_lds_id(" << DEFAULT_LDS_ID << ") "
+        << kernelLocalSize << "\n";
+      mMFI->setUsesMem(AMDILDevice::LDS_ID);
+    }
+  }
+  // If the device supports the region memory extension, which maps to our
+  // hardware GDS memory, then lets declare it so we can use it later on.
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) {
+    size_t kernelGDSSize = (mGM->getHWRegionSize(mName) + 3) & ~3;
+    if (kernelGDSSize > mSTM->device()->getMaxGDSSize()) {
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_REGION_RESOURCES]);
+    }
+    // If there is a region pointer as a kernel argument, we don't know the size
+    // at compile time, so we reserve all of the space.
+    if (mMFI->usesRegion() && (mMFI->hasRegionArg() || !kernelGDSSize)) {
+      O << "dcl_gds_id(" << DEFAULT_GDS_ID <<
+        ") " << mSTM->device()->getMaxGDSSize() << "\n";
+      mMFI->setUsesMem(AMDILDevice::GDS_ID);
+    } else if (kernelGDSSize) {
+      // We know the size, so lets declare it.
+      O << "dcl_gds_id(" << DEFAULT_GDS_ID <<
+        ") " << kernelGDSSize << "\n";
+      mMFI->setUsesMem(AMDILDevice::GDS_ID);
+    }
+  }
+}
+
+void
+AMDILKernelManager::printDecls(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O) {
+  // If we are an HD4XXX generation device, then we only support a single UAV
+  // surface, so we declare it and leave.
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    O << "dcl_raw_uav_id(" 
+      << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID)
+      << ")\n";
+    mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID);
+    getIntrinsicSetup(AsmPrinter, O);
+    return;
+  }
+  // If we are supporting multiple UAVs via the MultiUAV capability, then we
+  // need to print out the declarations here. MultiUAV conflicts with write
+  // images, so they only use 8 - NumWriteImages UAVs. Therefore only pointers
+  // with IDs < 8 will get printed.
+  if (mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+    binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), uavPrint, O);
+    mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID);
+  }
+  // If arena segments are supported, then we should emit them now.  Arena
+  // segments are similar to MultiUAV, except ArenaSegments are virtual and up
+  // to 1024 of them can coexist. These are more compiler hints for CAL and thus
+  // cannot overlap in any form.  Each ID maps to a separate piece of memory and
+  // CAL determines whether the load/stores should go to the fast path/slow path
+  // based on the usage and instruction.
+  if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) {
+    binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), arenaPrint, O);
+  }
+  // Now that we have printed out all of the arena and multi UAV declarations,
+  // we must print out the default raw UAV id. This always exists on HD5XXX
+  // and HD6XXX hardware. The reason is that the hardware supports 12 UAV's and
+  // 11 are taken up by MultiUAV/Write Images and Arena.  However, if we do not
+  // have UAV 11 as the raw UAV and there are 8 write images, we must revert
+  // everything to the arena and not print out the default raw uav id.
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD5XXX
+      || mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX) {
+    if ((mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) < 11 &&
+         mSTM->getGlobalManager()->getNumWriteImages(mName)
+         != OPENCL_MAX_WRITE_IMAGES
+         && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV))
+        || mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) {
+      if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID)
+          && mMFI->uav_count(mSTM->device()->
+              getResourceID(AMDILDevice::RAW_UAV_ID))) {
+        O << "dcl_raw_uav_id("
+          << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+        O << ")\n";
+        mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID);
+      }
+    }
+    // If we have not printed out the arena ID yet, then do so here.
+    if (!mMFI->usesMem(AMDILDevice::ARENA_UAV_ID)
+        && mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaUAV)) {
+      O << "dcl_arena_uav_id("
+        << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID) << ")\n";
+      mMFI->setUsesMem(AMDILDevice::ARENA_UAV_ID);
+    }
+  } else if (mSTM->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) {
+    binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), uavPrintSI, O);
+    mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID);
+  }
+  getIntrinsicSetup(AsmPrinter, O);
+}
+
+void AMDILKernelManager::getIntrinsicSetup(AMDILAsmPrinter *AsmPrinter,
+                                           llvm::raw_ostream &O)
+{
+  O << "mov r0.z, vThreadGrpIdFlat.x\n"
+    << "mov r1022.xyz0, vTidInGrp.xyz\n";
+  if (mSTM->device()->getGeneration() > AMDILDeviceInfo::HD4XXX) {
+    O << "mov r1023.xyz0, vThreadGrpId.xyz\n";
+  } else {
+    O << "imul r0.w, cb0[2].x, cb0[2].y\n"
+      // Calculates the local id.
+      // Calculates the group id.
+      << "umod r1023.x, r0.z, cb0[2].x\n"
+      << "udiv r1023.y, r0.z, cb0[2].x\n"
+      << "umod r1023.y, r1023.y, cb0[2].y\n"
+      << "udiv r1023.z, r0.z, r0.w\n";
+  }
+  // Calculates the global id.
+  if (mGM->hasRWG(mName) && 0) {
+    // Anytime we declare a literal, we need to reserve it, if it is not emitted
+    // in emitLiterals.
+    mMFI->addReservedLiterals(1);
+    O << "dcl_literal l" << mMFI->getNumLiterals() + 1 << ", ";
+    O << mGM->getLocal(mName, 0) << ", ";
+    O << mGM->getLocal(mName, 1) << ", ";
+    O << mGM->getLocal(mName, 2) << ", ";
+    O << "0\n";
+    O << "imad r1021.xyz0, r1023.xyz, l" << mMFI->getNumLiterals() + 1 << ".xyz, r1022.xyz\n";
+    mMFI->addReservedLiterals(1);
+  } else {
+    O << "imad r1021.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz\n";
+  }
+
+  // Add the global/group offset for multi-launch support.
+  O << "iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0\n"
+    << "iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0\n"
+    // moves the flat group id.
+    << "mov r1023.w, r0.z\n";
+#ifdef UPSTREAM_LLVM
+  if (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) {
+    if (mSTM->is64bit()) {
+      O << "umul " << AsmPrinter->getRegisterName(AMDIL::T2) 
+        << ".x0, r1023.w, cb0[4].z\n"
+        << "i64add " << AsmPrinter->getRegisterName(AMDIL::T2)
+        << ".xy, " << AsmPrinter->getRegisterName(AMDIL::T2)
+        << ".xy, cb0[4].xy\n";
+
+    } else {
+      O << "imad " << AsmPrinter->getRegisterName(AMDIL::T2)
+        << ".x, r1023.w, cb0[4].y, cb0[4].x\n";
+    }
+  }
+  // Shift the flat group id to be in bytes instead of dwords.
+  O << "ishl r1023.w, r1023.w, l0.z\n";
+  if (mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem)) {
+    if (mSTM->is64bit()) {
+      O << "umul " << AsmPrinter->getRegisterName(AMDIL::T1) 
+        << ".x0, vAbsTidFlat.x, cb0[3].z\n"
+        << "i64add " << AsmPrinter->getRegisterName(AMDIL::T1)
+        << ".xy, " << AsmPrinter->getRegisterName(AMDIL::T1)
+        << ".xy, cb0[3].xy\n";
+
+    } else {
+      O << "imad " << AsmPrinter->getRegisterName(AMDIL::T1)
+        << ".x, vAbsTidFlat.x, cb0[3].y, cb0[3].x\n";
+    }
+  } else {
+    O << "mov " << AsmPrinter->getRegisterName(AMDIL::T1) << ".x, l0.0\n";
+  }
+#endif
+  if (mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
+    O << "udiv r1024.xyz, r1021.xyz, cb0[10].xyz\n";
+    if (mGM->hasRWR(mName) && 0) {
+      // Anytime we declare a literal, we need to reserve it, if it is not emitted
+      // in emitLiterals.
+      mMFI->addReservedLiterals(1);
+      O << "dcl_literal l" << mMFI->getNumLiterals() + 1 << ", ";
+      O << mGM->getLocal(mName, 0) << ", ";
+      O << mGM->getLocal(mName, 1) << ", ";
+      O << mGM->getLocal(mName, 2) << ", ";
+      O << "0\n";
+      O << "imad r1025.xyz0, r1023.xyz, l" << mMFI->getNumLiterals() + 1 << ".xyz, r1022.xyz\n";
+      mMFI->addReservedLiterals(1);
+    } else {
+      O << "imad r1025.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz\n";
+    }
+  }
+}
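On HD4XXX parts, getIntrinsicSetup() above emits umod/udiv instructions to rebuild the 3D group id from the flat group id, then an imad to form the global id as group id * group size + local id. A standalone sketch of the same arithmetic in plain C++, for reference only; the struct and function names are ours, not part of the driver:

```cpp
#include <cassert>
#include <cstdint>

struct Vec3 { uint32_t x, y, z; };

// Mirrors the umod/udiv sequence that fills r1023.xyz from the flat
// group id in r0.z, given the group counts in cb0[2].xy.
static Vec3 groupIdFromFlat(uint32_t flat, uint32_t dimX, uint32_t dimY) {
  Vec3 g;
  g.x = flat % dimX;            // umod r1023.x, r0.z, cb0[2].x
  g.y = (flat / dimX) % dimY;   // udiv then umod into r1023.y
  g.z = flat / (dimX * dimY);   // udiv r1023.z, r0.z, r0.w
  return g;
}

// Mirrors "imad r1021.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz".
static Vec3 globalId(Vec3 group, Vec3 groupSize, Vec3 local) {
  return { group.x * groupSize.x + local.x,
           group.y * groupSize.y + local.y,
           group.z * groupSize.z + local.z };
}
```

The multi-launch offsets added afterwards (cb0[6] and cb0[7]) would simply be further component-wise additions on these results.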
+
+void AMDILKernelManager::printFooter(llvm::raw_ostream &O) {
+  O << "ret\n";
+  O << "endfunc ; " << mName << "\n";
+}
+
+void
+AMDILKernelManager::printMetaData(llvm::raw_ostream &O, uint32_t id, bool kernel) {
+  if (kernel) {
+    int kernelId = mGM->getOrCreateFunctionID(mName);
+    mMFI->addCalledFunc(id);
+    mUniqueID = kernelId;
+    mIsKernel = true;
+  }
+  printKernelArgs(O);
+  if (kernel) {
+    mIsKernel = false;
+    mMFI->eraseCalledFunc(id);
+    mUniqueID = id;
+  }
+}
+
+void AMDILKernelManager::setKernel(bool kernel) {
+  mIsKernel = kernel;
+  if (kernel) {
+    mWasKernel = mIsKernel;
+  }
+}
+
+void AMDILKernelManager::setID(uint32_t id)
+{
+  mUniqueID = id;
+}
+
+void AMDILKernelManager::setName(const std::string &name) {
+  mName = name;
+}
+
+bool AMDILKernelManager::isKernel() {
+  return mIsKernel;
+}
+
+bool AMDILKernelManager::wasKernel() {
+  return mWasKernel;
+}
+
+void AMDILKernelManager::setImageWrite() {
+  mHasImageWrite = true;
+}
+
+void AMDILKernelManager::setOutputInst() {
+  mHasOutputInst = true;
+}
+
+void AMDILKernelManager::printConstantToRegMapping(
+       AMDILAsmPrinter *RegNames,
+       uint32_t &LII,
+       llvm::raw_ostream &O,
+       uint32_t &Counter,
+       uint32_t Buffer,
+       uint32_t n,
+       const char *lit,
+       uint32_t fcall,
+       bool isImage,
+       bool isHWCB)
+{
+#ifdef UPSTREAM_LLVM
+  // TODO: This needs to be enabled or SC will never statically index into the
+  // CB when a pointer is used.
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem) && isHWCB) {
+    const char *name = RegNames->getRegisterName(LII);
+    O << "mov " << name << ", l5.x\n";
+    ++LII;
+    Counter++;
+    return;
+  }
+  for (uint32_t x = 0; x < n; ++x) {
+    const char *name = RegNames->getRegisterName(LII);
+    if (isImage) {
+      O << "mov " << name << ", l" << mMFI->getIntLits(Counter++) << "\n";
+    } else {
+      O << "mov " << name << ", cb" <<Buffer<< "[" <<Counter++<< "]\n";
+    }
+    switch(fcall) {
+    case 1093:
+      O << "ishr " << name << ", " << name << ".xxyy, l3.0y0y\n"
+        "ishl " << name << ", " << name << ", l3.y\n"
+        "ishr " << name << ", " << name << ", l3.y\n";
+      break;
+    case 1092:
+      O << "ishr " << name << ", " << name << ".xx, l3.0y\n"
+        "ishl " << name << ", " << name << ", l3.y\n"
+        "ishr " << name << ", " << name << ", l3.y\n";
+      break;
+    case 1091:
+      O << "ishr " << name << ", " << name << ".xxxx, l3.0zyx\n"
+        "ishl " << name << ", " << name << ", l3.x\n"
+        "ishr " << name << ", " << name << ", l3.x\n";
+      break;
+    case 1090:
+      O << "ishr " << name << ", " << name << ".xx, l3.0z\n"
+        "ishl " << name << ".xy__, " << name << ".xy, l3.x\n"
+        "ishr " << name << ".xy__, " << name << ".xy, l3.x\n";
+      break;
+    default:
+      break;
+    };
+    if (lit) {
+      O << "ishl " << name << ", " << name
+        << ", " << lit << "\n";
+      O << "ishr " << name << ", " << name
+        << ", " << lit << "\n";
+    }
+    if (isImage) {
+      Counter += NUM_EXTRA_SLOTS_PER_IMAGE;
+    }
+    ++LII;
+  }
+#endif
+}
+
+void
+AMDILKernelManager::printCopyStructPrivate(const StructType *ST,
+                                           llvm::raw_ostream &O,
+                                           size_t stackSize,
+                                           uint32_t Buffer,
+                                           uint32_t mLitIdx,
+                                           uint32_t &Counter)
+{
+  size_t n = ((stackSize + 15) & ~15) >> 4;
+  for (size_t x = 0; x < n; ++x) {
+    O << "mov r2, cb" << Buffer << "[" << Counter++ << "]\n";
+    O << "mov r1.x, r0.x\n";
+    if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+      if (mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) {
+        O << "ishr r1.x, r1.x, l0.x\n";
+        O << "mov x" << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID)
+          << "[r1.x], r2\n";
+      } else {
+        O << "uav_raw_store_id("
+          << mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID)
+          << ") mem0, r1.x, r2\n";
+      }
+    } else {
+      O << "uav_raw_store_id("
+        << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID)
+        << ") mem0, r1.x, r2\n";
+    }
+    O << "iadd r0.x, r0.x, l" << mLitIdx << ".z\n";
+  }
+}
+
+void AMDILKernelManager::printKernelArgs(llvm::raw_ostream &O) {
+  std::string version(";version:");
+  version += itostr(AMDIL_MAJOR_VERSION) + ":"
+    + itostr(AMDIL_MINOR_VERSION) + ":" + itostr(AMDIL_REVISION_NUMBER);
+  O << ";ARGSTART:" << mName << "\n";
+  if (mIsKernel) {
+    O << version << "\n";
+    O << ";device:" << mSTM->getDeviceName() << "\n";
+  }
+  O << ";uniqueid:" << mUniqueID << "\n";
+
+  size_t local = mGM->getLocalSize(mName);
+  size_t hwlocal = ((mGM->getHWLocalSize(mName) + 3) & (~0x3));
+  size_t region = mGM->getRegionSize(mName);
+  size_t hwregion = ((mGM->getHWRegionSize(mName) + 3) & (~0x3));
+  bool usehwlocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+  bool usehwprivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+  bool usehwregion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+  bool useuavprivate = mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV);
+  if (mIsKernel) {
+    O << ";memory:"
+      << (usehwprivate ? (useuavprivate ? "uav" : "hw") : "")
+      << "private:" << ((mMFI->getStackSize() + 15) & ~0xF) << "\n";
+  }
+  if (mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
+    O << ";memory:" << ((usehwregion) ? "hw" : "") << "region:"
+      << ((usehwregion) ? hwregion : hwregion + region) << "\n";
+  }
+  O << ";memory:" << ((usehwlocal) ? "hw" : "") << "local:"
+    << ((usehwlocal) ? hwlocal : hwlocal + local) << "\n";
+  
+  if (mIsKernel) {
+    if (mGM->hasRWG(mName)) {
+      O << ";cws:" << mGM->getLocal(mName, 0) << ":";
+      O << mGM->getLocal(mName, 1) << ":";
+      O << mGM->getLocal(mName, 2) << "\n";
+    }
+    if (mGM->hasRWR(mName)) {
+      O << ";crs:" << mGM->getRegion(mName, 0) << ":";
+      O << mGM->getRegion(mName, 1) << ":";
+      O << mGM->getRegion(mName, 2) << "\n";
+    }
+  }
+  if (mIsKernel) {
+    for (std::vector<std::string>::iterator ib = mMFI->kernel_md_begin(),
+           ie = mMFI->kernel_md_end(); ib != ie; ++ib) {
+      O << (*ib) << "\n";
+    }
+  }
+  for (std::set<std::string>::iterator ib = mMFI->func_md_begin(),
+         ie = mMFI->func_md_end(); ib != ie; ++ib) {
+    O << (*ib) << "\n";
+  }
+  if (!mMFI->func_empty()) {
+    O << ";function:" << mMFI->func_size();
+    binaryForEach(mMFI->func_begin(), mMFI->func_end(), commaPrint, O);
+    O << "\n";
+  }
+
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::MacroDB)
+      && !mMFI->intr_empty()) {
+    O << ";intrinsic:" << mMFI->intr_size();
+    binaryForEach(mMFI->intr_begin(), mMFI->intr_end(), commaPrint, O);
+    O << "\n";
+  }
+
+  if (!mIsKernel) {
+    binaryForEach(mMFI->printf_begin(), mMFI->printf_end(), printfPrint, O);
+    mMF->getMMI().getObjFileInfo<AMDILModuleInfo>().add_printf_offset(
+        mMFI->printf_size());
+  } else {
+    for (StringMap<SamplerInfo>::iterator
+        smb = mMFI->sampler_begin(),
+        sme = mMFI->sampler_end(); smb != sme; ++smb) {
+      O << ";sampler:" << (*smb).second.name << ":" << (*smb).second.idx
+        << ":" << ((*smb).second.val == (uint32_t)-1 ? 0 : 1)
+        << ":" << ((*smb).second.val != (uint32_t)-1 ? (*smb).second.val : 0)
+        << "\n";
+    }
+  }
+  if (mSTM->is64bit()) {
+    O << ";memory:64bitABI\n";
+  }
+
+  if (!mMFI->errors_empty()) {
+    binaryForEach(mMFI->errors_begin(), mMFI->errors_end(), errorPrint, O);
+  }
+  // This has to come last
+  if (mIsKernel 
+      && mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    if (mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) >
+        mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+      if (mMFI->uav_size() == 1) {
+        if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
+            && *(mMFI->uav_begin()) >= ARENA_SEGMENT_RESERVED_UAVS) {
+          O << ";uavid:"
+            << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+          O << "\n";
+        } else {
+          O << ";uavid:" << *(mMFI->uav_begin()) << "\n";
+        }
+      } else if (mMFI->uav_count(mSTM->device()->
+            getResourceID(AMDILDevice::RAW_UAV_ID))) {
+        O << ";uavid:"
+          << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+        O << "\n";
+      } else {
+        O << ";uavid:"
+          << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+        O << "\n";
+      }
+    } else if (mSTM->getGlobalManager()->getNumWriteImages(mName) !=
+        OPENCL_MAX_WRITE_IMAGES
+        && !mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
+        && mMFI->uav_count(mSTM->device()->
+          getResourceID(AMDILDevice::RAW_UAV_ID))) {
+      O << ";uavid:"
+        << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) << "\n";
+    } else if (mMFI->uav_size() == 1) {
+      O << ";uavid:" << *(mMFI->uav_begin()) << "\n";
+    } else {
+      O << ";uavid:"
+        << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+      O << "\n";
+    }
+  }
+  O << ";ARGEND:" << mName << "\n";
+}
+
+void AMDILKernelManager::printArgCopies(llvm::raw_ostream &O,
+    AMDILAsmPrinter *RegNames)
+{
+  Function::const_arg_iterator I = mMF->getFunction()->arg_begin();
+  Function::const_arg_iterator Ie = mMF->getFunction()->arg_end();
+  uint32_t Counter = 0;
+
+  if (mMFI->getArgSize()) {
+    O << "dcl_cb cb1";
+    O << "[" << (mMFI->getArgSize() >> 4) << "]\n";
+    mMFI->setUsesMem(AMDILDevice::CONSTANT_ID);
+  }
+  const Function *F = mMF->getFunction();
+  // Get the stack size
+  uint32_t stackSize = mMFI->getStackSize();
+  uint32_t privateSize = mMFI->getScratchSize();
+  uint32_t stackOffset = (privateSize + 15) & (~0xF);
+  if (stackSize 
+      && mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) {
+    // TODO: If the size is too large, we need to fall back to software emulated
+    // instead of using the hardware capability.
+    int size = (((stackSize + 15) & (~0xF)) >> 4);
+    if (size > 4096) {
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_PRIVATE_RESOURCES]);
+    }
+    if (size) {
+      // For any stack variables, we need to declare the literals for them so
+      // that we can use them when we copy our data to the stack.
+      mMFI->addReservedLiterals(1);
+      // Anytime we declare a literal, we need to reserve it, if it is not
+      // emitted in emitLiterals.
+#ifdef UPSTREAM_LLVM
+      O << "dcl_literal l" << mMFI->getNumLiterals() << ", " << stackSize
+        << ", " << privateSize << ", 16, "
+        << ((stackSize == privateSize) ? 0 : stackOffset) << "\n"
+        << "iadd r0.x, " << RegNames->getRegisterName(AMDIL::T1) << ".x, l"
+        << mMFI->getNumLiterals() << ".w\n";
+      if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+        O << "dcl_indexed_temp_array x"
+          << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID) << "["
+          << size << "]\n";
+      } else {
+        O << "dcl_typeless_uav_id("
+          << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID)
+          << ")_stride(4)_length(" << (size << 4) << ")_access(private)\n";
+      }
+      O << "mov " << RegNames->getRegisterName(AMDIL::FP)
+        << ".x, l" << mMFI->getNumLiterals() << ".0\n";
+#endif
+      mMFI->setUsesMem(AMDILDevice::SCRATCH_ID);
+    }
+  }
+  I = mMF->getFunction()->arg_begin();
+  int32_t count = 0;
+  // uint32_t Image = 0;
+  bool displaced1 = false;
+  bool displaced2 = false;
+  uint32_t curReg = AMDIL::R1;
+  // TODO: We don't handle arguments that were pushed onto the stack!
+  for (; I != Ie; ++I) {
+    Type *curType = I->getType();
+    unsigned int Buffer = 1;
+    O << "; Kernel arg setup: " << I->getName() << "\n";
+    if (curType->isIntegerTy() || curType->isFloatingPointTy()) {
+      switch (curType->getPrimitiveSizeInBits()) {
+        default:
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1);
+          break;
+        case 16:
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1,
+              "l3.y" );
+          break;
+        case 8:
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1, "l3.x" );
+          break;
+      }
+#ifdef UPSTREAM_LLVM
+    } else if (const VectorType *VT = dyn_cast<VectorType>(curType)) {
+      Type *ET = VT->getElementType();
+      int numEle = VT->getNumElements();
+      switch (ET->getPrimitiveSizeInBits()) {
+        default:
+          if (numEle == 3) {
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".x, cb" << Buffer << "[" << Counter << "].x\n";
+            curReg++;
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".x, cb" << Buffer << "[" << Counter << "].y\n";
+            curReg++;
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".x, cb" << Buffer << "[" << Counter << "].z\n";
+            curReg++;
+            Counter++;
+          } else {
+            printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer,
+                (numEle+2) >> 2);
+          }
+          break;
+        case 64:
+          if (numEle == 3) {
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".xy, cb" << Buffer << "[" << Counter << "].xy\n";
+            curReg++;
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".xy, cb" << Buffer << "[" << Counter++ << "].zw\n";
+            curReg++;
+            O << "mov " << RegNames->getRegisterName(curReg);
+            O << ".xy, cb" << Buffer << "[" << Counter << "].xy\n";
+            curReg++;
+            Counter++;
+          } else {
+            printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer,
+                (numEle) >> 1);
+          }
+          break;
+        case 16: {
+          switch (numEle) {
+          default:
+            printConstantToRegMapping(RegNames, curReg, O, Counter,
+                Buffer, (numEle + 2) >> 2, "l3.y", 1093);
+            if (numEle == 3) {
+              O << "mov " << RegNames->getRegisterName(curReg) << ".x, ";
+              O << RegNames->getRegisterName(curReg) << ".y\n";
+              ++curReg;
+              O << "mov " << RegNames->getRegisterName(curReg) << ".x, ";
+              O << RegNames->getRegisterName(curReg) << ".z\n";
+              ++curReg;
+            }
+            break;
+          case 2:
+            printConstantToRegMapping(RegNames, curReg, O, Counter,
+                Buffer, 1, "l3.y", 1092);
+            break;
+          }
+          break;
+        }
+        case 8: {
+          switch (numEle) {
+          default:
+            printConstantToRegMapping(RegNames, curReg, O, Counter,
+                Buffer, (numEle + 2) >> 2, "l3.x", 1091);
+            if (numEle == 3) {
+              O << "mov " << RegNames->getRegisterName(curReg) << ".x, ";
+              O << RegNames->getRegisterName(curReg) << ".y\n";
+              ++curReg;
+              O << "mov " << RegNames->getRegisterName(curReg) << ".x, ";
+              O << RegNames->getRegisterName(curReg) << ".z\n";
+              ++curReg;
+            }
+            break;
+          case 2:
+            printConstantToRegMapping(RegNames, curReg, O, Counter,
+                Buffer, 1, "l3.x", 1090);
+            break;
+          }
+          break;
+        }
+      }
+#endif
+    } else if (const PointerType *PT = dyn_cast<PointerType>(curType)) {
+      Type *CT = PT->getElementType();
+      const StructType *ST = dyn_cast<StructType>(CT);
+      if (ST && ST->isOpaque()) {
+        bool i1d  = ST->getName() == "struct._image1d_t";
+        bool i1da = ST->getName() == "struct._image1d_array_t";
+        bool i1db = ST->getName() == "struct._image1d_buffer_t";
+        bool i2d  = ST->getName() == "struct._image2d_t";
+        bool i2da = ST->getName() == "struct._image2d_array_t";
+        bool i3d  = ST->getName() == "struct._image3d_t";
+        bool is_image = i1d || i1da || i1db || i2d || i2da || i3d;
+        if (is_image) {
+          if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) {
+            printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer,
+                1, NULL, 0, is_image);
+          } else {
+            mMFI->addErrorMsg(
+                amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]);
+            ++curReg;
+          }
+        } else {
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1);
+        }
+      } else if (CT->isStructTy()
+          && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+        StructType *ST = dyn_cast<StructType>(CT);
+        bool i1d  = ST->getName() == "struct._image1d_t";
+        bool i1da = ST->getName() == "struct._image1d_array_t";
+        bool i1db = ST->getName() == "struct._image1d_buffer_t";
+        bool i2d  = ST->getName() == "struct._image2d_t";
+        bool i2da = ST->getName() == "struct._image2d_array_t";
+        bool i3d  = ST->getName() == "struct._image3d_t";
+        bool is_image = i1d || i1da || i1db || i2d || i2da || i3d;
+        if (is_image) {
+          if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) {
+            printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer,
+                1, NULL, 0, is_image);
+          } else {
+            mMFI->addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]);
+            ++curReg;
+          }
+        } else {
+          if (count) {
+            // Anytime we declare a literal, we need to reserve it, if it
+            // is not emitted in emitLiterals.
+            mMFI->addReservedLiterals(1);
+            O << "dcl_literal l" << mMFI->getNumLiterals() << ", "
+              << -stackSize << ", " << stackSize << ", 16, "
+              << stackOffset << "\n";
+          }
+          ++count;
+          size_t structSize;
+          structSize = (getTypeSize(ST) + 15) & ~15;
+          stackOffset += structSize;
+#ifdef UPSTREAM_LLVM
+          O << "mov " << RegNames->getRegisterName((curReg)) << ", l"
+            << mMFI->getNumLiterals()<< ".w\n";
+          if (!displaced1) {
+            O << "mov r1011, r1\n";
+            displaced1 = true;
+          }
+          if (!displaced2 && strcmp(RegNames->getRegisterName(curReg), "r1")) {
+            O << "mov r1010, r2\n";
+            displaced2 = true;
+          }
+#endif
+          printCopyStructPrivate(ST, O, structSize, Buffer, mMFI->getNumLiterals(),
+              Counter);
+          ++curReg;
+        }
+      } else if (CT->isIntOrIntVectorTy()
+          || CT->isFPOrFPVectorTy()
+          || CT->isArrayTy()
+          || CT->isPointerTy()
+          || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+        if (PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) {
+          const kernel& krnl = mGM->getKernel(F->getName());
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer,
+              1, NULL, 0, false, 
+              mGM->usesHWConstant(krnl, I->getName()));
+        } else if (PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) {
+          // TODO: If we are region address space, the first region pointer, no
+          // array pointers exist, and hardware RegionMem is enabled then we can
+          // zero out register as the initial offset is zero.
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1);
+        } else if (PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) {
+          // TODO: If we are local address space, the first local pointer, no
+          // array pointers exist, and hardware LocalMem is enabled then we can
+          // zero out register as the initial offset is zero.
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1);
+        } else {
+          printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1);
+        }
+      } else {
+        assert(0 && "Current type is not supported!");
+        mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+        ++curReg;
+      }
+    } else {
+      assert(0 && "Current type is not supported!");
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+      ++curReg;
+    }
+  }
+  if (displaced1) {
+    O << "mov r1, r1011\n";
+  }
+  if (displaced2) {
+    O << "mov r2, r1010\n";
+  }
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    const kernel& krnl = mGM->getKernel(F->getName());
+    uint32_t constNum = 0;
+    for (uint32_t x = 0; x < mSTM->device()->getMaxNumCBs(); ++x) {
+      if (krnl.constSizes[x]) {
+        O << "dcl_cb cb" << x + CB_BASE_OFFSET;
+        O << "[" << (((krnl.constSizes[x] + 15) & ~15) >> 4) << "]\n";
+        ++constNum;
+        mMFI->setUsesMem(AMDILDevice::CONSTANT_ID);
+      }
+    }
+    // TODO: If we run out of constant resources, we need to push some of the
+    // constant pointers to the software emulated section.
+    if (constNum > mSTM->device()->getMaxNumCBs()) {
+      assert(0 && "Max constant buffer limit passed!");
+      mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_CONSTANT_RESOURCES]);
+    }
+  }
+}
+
+const char *
+AMDILKernelManager::getTypeName(const Type *ptr, const char *symTab)
+{
+  // symTab argument is ignored...
+  LLVMContext& ctx = ptr->getContext();
+  switch (ptr->getTypeID()) {
+    case Type::StructTyID:
+      {
+        const StructType *ST = cast<StructType>(ptr);
+        if (!ST->isOpaque())
+          return "struct";
+        // ptr is a pre-LLVM 3.0 "opaque" type.
+        StringRef name = ST->getName();
+        if (name.equals( "struct._event_t" ))         return "event";
+        if (name.equals( "struct._image1d_t" ))       return "image1d";
+        if (name.equals( "struct._image1d_array_t" )) return "image1d_array";
+        if (name.equals( "struct._image2d_t" ))       return "image2d";
+        if (name.equals( "struct._image2d_array_t" )) return "image2d_array";
+        if (name.equals( "struct._image3d_t" ))       return "image3d";
+        if (name.equals( "struct._counter32_t" ))     return "counter32";
+        if (name.equals( "struct._counter64_t" ))     return "counter64";
+        return "opaque";
+        break;
+      }
+    case Type::FloatTyID:
+      return "float";
+    case Type::DoubleTyID: 
+      {
+        const AMDILSubtarget *mSTM = mTM->getSubtargetImpl();
+        if (!mSTM->device()->usesHardware(AMDILDeviceInfo::DoubleOps)) {
+          mMFI->addErrorMsg(amd::CompilerErrorMessage[DOUBLE_NOT_SUPPORTED]);
+        }
+        return "double";
+      }
+    case Type::IntegerTyID: 
+      {
+        if (ptr == Type::getInt8Ty(ctx)) {
+          return "i8";
+        } else if (ptr == Type::getInt16Ty(ctx)) {
+          return "i16";
+        } else if (ptr == Type::getInt32Ty(ctx)) {
+          return "i32";
+        } else if (ptr == Type::getInt64Ty(ctx)) {
+          return "i64";
+        }
+        break;
+      }
+    default:
+      break;
+    case Type::ArrayTyID: 
+      {
+        const ArrayType *AT = cast<ArrayType>(ptr);
+        const Type *name = AT->getElementType();
+        return getTypeName(name, symTab);
+        break;
+      }
+    case Type::VectorTyID: 
+      {
+        const VectorType *VT = cast<VectorType>(ptr);
+        const Type *name = VT->getElementType();
+        return getTypeName(name, symTab);
+        break;
+      }
+    case Type::PointerTyID: 
+      {
+        const PointerType *PT = cast<PointerType>(ptr);
+        const Type *name = PT->getElementType();
+        return getTypeName(name, symTab);
+        break;
+      }
+    case Type::FunctionTyID: 
+      {
+        const FunctionType *FT = cast<FunctionType>(ptr);
+        const Type *name = FT->getReturnType();
+        return getTypeName(name, symTab);
+        break;
+      }
+  }
+  ptr->dump();
+  mMFI->addErrorMsg(amd::CompilerErrorMessage[UNKNOWN_TYPE_NAME]);
+  return "unknown";
+}
+
+void AMDILKernelManager::emitLiterals(llvm::raw_ostream &O) {
+  char buffer[256];
+  std::map<uint32_t, uint32_t>::iterator ilb, ile;
+  for (ilb = mMFI->begin_32(), ile = mMFI->end_32(); ilb != ile; ++ilb) {
+    uint32_t a = ilb->first;
+    O << "dcl_literal l" << ilb->second << ", ";
+    sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x", a, a, a, a);
+    O << buffer << "; f32:i32 " << ilb->first << "\n";
+  }
+  std::map<uint64_t, uint32_t>::iterator llb, lle;
+  for (llb = mMFI->begin_64(), lle = mMFI->end_64(); llb != lle; ++llb) {
+    uint32_t v[2];
+    uint64_t a = llb->first;
+    memcpy(v, &a, sizeof(uint64_t));
+    O << "dcl_literal l" << llb->second << ", ";
+    sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x; f64:i64 ",
+        v[0], v[1], v[0], v[1]);
+    O << buffer << llb->first << "\n";
+  }
+  std::map<std::pair<uint64_t, uint64_t>, uint32_t>::iterator vlb, vle;
+  for (vlb = mMFI->begin_128(), vle = mMFI->end_128(); vlb != vle; ++vlb) {
+    uint32_t v[2][2];
+    uint64_t a = vlb->first.first;
+    uint64_t b = vlb->first.second;
+    memcpy(v[0], &a, sizeof(uint64_t));
+    memcpy(v[1], &b, sizeof(uint64_t));
+    O << "dcl_literal l" << vlb->second << ", ";
+    sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x; f128:i128 ",
+        v[0][0], v[0][1], v[1][0], v[1][1]);
+    O << buffer << vlb->first.first << vlb->first.second << "\n";
+  }
+}
+
+// Record the UAV ID for the given pointer value in mValueIDMap; a NULL
+// value is ignored, and getUAVID falls back to the device default.
+void AMDILKernelManager::setUAVID(const Value *value, uint32_t ID) {
+  if (value) {
+    mValueIDMap[value] = ID;
+  }
+}
+
+uint32_t AMDILKernelManager::getUAVID(const Value *value) {
+  if (mValueIDMap.find(value) != mValueIDMap.end()) {
+    return mValueIDMap[value];
+  }
+
+  if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
+    return mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+  } else {
+    return mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+  }
+}
+
diff --git a/src/gallium/drivers/radeon/AMDILKernelManager.h b/src/gallium/drivers/radeon/AMDILKernelManager.h
new file mode 100644 (file)
index 0000000..d5eb296
--- /dev/null
@@ -0,0 +1,177 @@
+//===-- AMDILKernelManager.h - Kernel metadata/ABI manager ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// 
+// Class that handles the metadata/abi management for the
+// ASM printer. Handles the parsing and generation of the metadata
+// for each kernel and keeps track of its arguments.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDILKERNELMANAGER_H_
+#define _AMDILKERNELMANAGER_H_
+#include "AMDIL.h"
+#include "AMDILDevice.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Function.h"
+
+#include <map>
+#include <set>
+#include <string>
+
+#define IMAGETYPE_2D 0
+#define IMAGETYPE_3D 1
+#define RESERVED_LIT_COUNT 6
+
+namespace llvm {
+class AMDILGlobalManager;
+class AMDILSubtarget;
+class AMDILMachineFunctionInfo;
+class AMDILTargetMachine;
+class AMDILAsmPrinter;
+class StructType;
+class Value;
+class TypeSymbolTable;
+class MachineFunction;
+class MachineInstr;
+class ConstantFP;
+class PrintfInfo;
+
+
+class AMDILKernelManager {
+public:
+  typedef enum {
+    RELEASE_ONLY,
+    DEBUG_ONLY,
+    ALWAYS
+  } ErrorMsgEnum;
+  AMDILKernelManager(AMDILTargetMachine *TM, AMDILGlobalManager *GM);
+  virtual ~AMDILKernelManager();
+  
+  /// Clear the state of the KernelManager putting it in its most initial state.
+  void clear();
+  void setMF(MachineFunction *MF);
+
+  /// Process the specific kernel parsing out the parameter information for the
+  /// kernel.
+  void processArgMetadata(llvm::raw_ostream &O,
+                          uint32_t buf, bool kernel);
+
+
+  /// Prints the header for the kernel which includes the groupsize declaration
+  /// and calculation of the local/group/global id's.
+  void printHeader(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O,
+                   const std::string &name);
+
+  virtual void printDecls(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O);
+  virtual void printGroupSize(llvm::raw_ostream &O);
+
+  /// Copies the data from the runtime setup constant buffers into registers so
+  /// that the program can correctly access memory or data that was set by the
+  /// host program.
+  void printArgCopies(llvm::raw_ostream &O, AMDILAsmPrinter* RegNames);
+
+  /// Prints out the end of the function.
+  void printFooter(llvm::raw_ostream &O);
+  
+  /// Prints out the metadata for the specific function depending if it is a
+  /// kernel or not.
+  void printMetaData(llvm::raw_ostream &O, uint32_t id, bool isKernel = false);
+  
+  /// Set bool value on whether to consider the function a kernel or a normal
+  /// function.
+  void setKernel(bool kernel);
+
+  /// Set the unique ID of the kernel/function.
+  void setID(uint32_t id);
+
+  /// Set the name of the kernel/function.
+  void setName(const std::string &name);
+
+  /// Flag to specify whether the function is a kernel or not.
+  bool isKernel();
+
+  /// Flag that specifies whether this function has a kernel wrapper.
+  bool wasKernel();
+
+  void getIntrinsicSetup(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O); 
+
+  // Returns whether a compiler needs to insert a write to memory or not.
+  bool useCompilerWrite(const MachineInstr *MI);
+
+  // Set the flag that there exists an image write.
+  void setImageWrite();
+  void setOutputInst();
+
+  const char *getTypeName(const Type *name, const char * symTab);
+
+  void emitLiterals(llvm::raw_ostream &O);
+
+  // Set the uav id for the specific pointer value.  If value is NULL, then the
+  // ID sets the default ID.
+  void setUAVID(const Value *value, uint32_t ID);
+
+  // Get the UAV id for the specific pointer value.
+  uint32_t getUAVID(const Value *value);
+
+private:
+
+  /// Helper function that prints the actual metadata and should only be called
+  /// by printMetaData.
+  void printKernelArgs(llvm::raw_ostream &O);
+  void printCopyStructPrivate(const StructType *ST,
+                              llvm::raw_ostream &O,
+                              size_t stackSize,
+                              uint32_t Buffer,
+                              uint32_t mLitIdx,
+                              uint32_t &counter);
+  virtual void
+  printConstantToRegMapping(AMDILAsmPrinter *RegNames,
+                            uint32_t &LII,
+                            llvm::raw_ostream &O,
+                            uint32_t &counter,
+                            uint32_t Buffer,
+                            uint32_t n,
+                            const char *lit = NULL,
+                            uint32_t fcall = 0,
+                            bool isImage = false,
+                            bool isHWCB = false);
+  void updatePtrArg(llvm::Function::const_arg_iterator Ip,
+                    int numWriteImages,
+                    int raw_uav_buffer,
+                    int counter,
+                    bool isKernel,
+                    const Function *F);
+  /// Name of the current kernel.
+  std::string mName;
+  uint32_t mUniqueID;
+  bool mIsKernel;
+  bool mWasKernel;
+  bool mCompilerWrite;
+  /// Flag to specify if an image write has occurred, in order to not add a
+  /// compiler-specific write if no other writes to memory occurred.
+  bool mHasImageWrite;
+  bool mHasOutputInst;
+  
+  /// Map from const Value * to UAV ID.
+  std::map<const Value *, uint32_t> mValueIDMap;
+
+  AMDILTargetMachine * mTM;
+  const AMDILSubtarget * mSTM;
+  AMDILGlobalManager * mGM;
+  /// The machine function currently being processed.
+  MachineFunction *mMF;
+  AMDILMachineFunctionInfo *mMFI;
+}; // class AMDILKernelManager
+
+} // llvm namespace
+#endif // _AMDILKERNELMANAGER_H_
diff --git a/src/gallium/drivers/radeon/AMDILLiteralManager.cpp b/src/gallium/drivers/radeon/AMDILLiteralManager.cpp
new file mode 100644 (file)
index 0000000..43167f5
--- /dev/null
@@ -0,0 +1,128 @@
+//===--- AMDILLiteralManager.cpp - AMDIL Literal Manager Pass --*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "literal_manager"
+
+#include "AMDIL.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+
+// The AMDIL Literal Manager pass traverses all of the LOADCONST instructions
+// and converts them from an immediate value to a literal index. The literal
+// index is valid IL, but the immediate values are not. The immediate values
+// must be aggregated and declared for clarity and to reduce the number of
+// literals that are used. It is also illegal to declare the same literal
+// twice, so this keeps that from occurring.
+
+namespace {
+  class AMDILLiteralManager : public MachineFunctionPass {
+  public:
+    static char ID;
+    AMDILLiteralManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+    virtual const char *getPassName() const;
+
+    bool runOnMachineFunction(MachineFunction &MF);
+  private:
+    bool trackLiterals(MachineBasicBlock::iterator *bbb);
+    TargetMachine &TM;
+    const AMDILSubtarget *mSTM;
+    AMDILKernelManager *mKM;
+    AMDILMachineFunctionInfo *mMFI;
+    int32_t mLitIdx;
+    bool mChanged;
+  };
+  char AMDILLiteralManager::ID = 0;
+}
+
+namespace llvm {
+  FunctionPass *
+  createAMDILLiteralManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) {
+    return new AMDILLiteralManager(tm AMDIL_OPT_LEVEL_VAR);
+  }
+  
+}
+
+AMDILLiteralManager::AMDILLiteralManager(TargetMachine &tm
+                                         AMDIL_OPT_LEVEL_DECL)
+  : MachineFunctionPass(ID),
+    TM(tm) {
+}
+
+bool AMDILLiteralManager::runOnMachineFunction(MachineFunction &MF) {
+  mChanged = false;
+  mMFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  const AMDILTargetMachine *amdtm =
+    reinterpret_cast<const AMDILTargetMachine *>(&TM);
+  mSTM = dynamic_cast<const AMDILSubtarget *>(amdtm->getSubtargetImpl());
+  mKM = const_cast<AMDILKernelManager *>(mSTM->getKernelManager());
+  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+      std::bind1st(std::mem_fun(&AMDILLiteralManager::trackLiterals), this));
+  return mChanged;
+}
+
+bool AMDILLiteralManager::trackLiterals(MachineBasicBlock::iterator *bbb) {
+  MachineInstr *MI = *bbb;
+  uint32_t Opcode = MI->getOpcode();
+  switch(Opcode) {
+  default:
+    return false;
+  case AMDIL::LOADCONST_i8:
+  case AMDIL::LOADCONST_i16:
+  case AMDIL::LOADCONST_i32:
+  case AMDIL::LOADCONST_i64:
+  case AMDIL::LOADCONST_f32:
+  case AMDIL::LOADCONST_f64:
+    break;
+  };
+  MachineOperand &dstOp = MI->getOperand(0);
+  MachineOperand &litOp = MI->getOperand(1);
+  if (!litOp.isImm() && !litOp.isFPImm()) {
+    return false;
+  }
+  if (!dstOp.isReg()) {
+    return false;
+  }
+  // Change the literal to the correct index for each literal that is found.
+  if (litOp.isImm()) {
+    int64_t immVal = litOp.getImm();
+    uint32_t idx = MI->getOpcode() == AMDIL::LOADCONST_i64 
+                     ? mMFI->addi64Literal(immVal)
+                     : mMFI->addi32Literal(static_cast<int>(immVal), Opcode);
+    litOp.ChangeToImmediate(idx);
+    return false;
+  } 
+
+  if (litOp.isFPImm()) {
+    const ConstantFP *fpVal = litOp.getFPImm();
+    uint32_t idx = MI->getOpcode() == AMDIL::LOADCONST_f64
+                     ? mMFI->addf64Literal(fpVal)
+                     : mMFI->addf32Literal(fpVal);
+    litOp.ChangeToImmediate(idx);
+    return false;
+  }
+
+  return false;
+}
+
+const char* AMDILLiteralManager::getPassName() const {
+    return "AMDIL Constant Propagation";
+}
+
+
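The pass above deduplicates literals by mapping each distinct value to a stable index, sign-extending sub-32-bit immediates first (see `addi32Literal` later in this patch). A minimal standalone sketch of that idea, not part of the patch itself (`LiteralTable` and its `add` method are illustrative names, and it ignores the reserved-literal accounting the real `getNumLiterals()` performs):

```cpp
#include <cassert>
#include <cstdint>
#include <map>

// Illustrative sketch: sub-32-bit immediates are sign-extended to 32 bits,
// then mapped to a literal index so each distinct value is declared once.
struct LiteralTable {
  std::map<uint32_t, uint32_t> lits;  // value -> literal index

  uint32_t add(uint32_t val, unsigned bits) {
    // Sign-extend 8/16-bit immediates so comparisons against the 32-bit
    // register contents behave correctly.
    if (bits == 16)
      val = (uint32_t)(((int32_t)val << 16) >> 16);
    else if (bits == 8)
      val = (uint32_t)(((int32_t)val << 24) >> 24);
    std::map<uint32_t, uint32_t>::iterator it = lits.find(val);
    if (it != lits.end())
      return it->second;          // already declared: reuse its index
    uint32_t idx = (uint32_t)lits.size();
    lits[val] = idx;              // first occurrence: allocate a new index
    return idx;
  }
};
```

After sign extension, an 8-bit `0xFF` and a 32-bit `0xFFFFFFFF` collapse to the same literal, which is exactly why the extension must happen before the lookup.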
diff --git a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp b/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp
new file mode 100644 (file)
index 0000000..9366f2e
--- /dev/null
@@ -0,0 +1,158 @@
+//===---- AMDILMCCodeEmitter.cpp - Convert AMDIL text to AMDIL binary ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "amdil-emitter"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+#if 0
+namespace {
+  class AMDILMCCodeEmitter : public MCCodeEmitter {
+    AMDILMCCodeEmitter(const AMDILMCCodeEmitter &);// DO NOT IMPLEMENT
+    void operator=(const AMDILMCCodeEmitter &); // DO NOT IMPLEMENT
+    const TargetMachine &TM;
+    const TargetInstrInfo &TII;
+    MCContext &Ctx;
+    bool Is64BitMode;
+    public:
+    AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit);
+    ~AMDILMCCodeEmitter();
+    unsigned getNumFixupKinds() const;
+    const MCFixupKindInfo& getFixupKindInfo(MCFixupKind Kind) const;
+    static unsigned GetAMDILRegNum(const MCOperand &MO);
+    void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const;
+    void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+        raw_ostream &OS) const;
+    void EmitImmediate(const MCOperand &Disp, unsigned ImmSize,
+        MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &os,
+        SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
+
+    void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+        SmallVectorImpl<MCFixup> &Fixups) const;
+
+  }; // class AMDILMCCodeEmitter
+} // anonymous namespace
+
+namespace llvm {
+  MCCodeEmitter *createAMDILMCCodeEmitter(const Target &,
+      TargetMachine &TM, MCContext &Ctx)
+  {
+    return new AMDILMCCodeEmitter(TM, Ctx, false);
+  }
+}
+
+AMDILMCCodeEmitter::AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx
+    , bool is64Bit)
+: TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx)
+{
+  Is64BitMode = is64Bit;
+}
+
+AMDILMCCodeEmitter::~AMDILMCCodeEmitter()
+{
+}
+
+unsigned
+AMDILMCCodeEmitter::getNumFixupKinds() const
+{
+  return 0;
+}
+
+const MCFixupKindInfo &
+AMDILMCCodeEmitter::getFixupKindInfo(MCFixupKind Kind) const
+{
+//  const static MCFixupKindInfo Infos[] = {};
+  if (Kind < FirstTargetFixupKind) {
+    return MCCodeEmitter::getFixupKindInfo(Kind);
+  }
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+      "Invalid kind!");
+  return MCCodeEmitter::getFixupKindInfo(Kind);
+ // return Infos[Kind - FirstTargetFixupKind];
+
+}
+
+void
+AMDILMCCodeEmitter::EmitByte(unsigned char C, unsigned &CurByte,
+    raw_ostream &OS) const
+{
+  OS << (char) C;
+  ++CurByte;
+}
+void
+AMDILMCCodeEmitter::EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+    raw_ostream &OS) const
+{
+  // Output the constant in little endian byte order
+  for (unsigned i = 0; i != Size; ++i) {
+    EmitByte(Val & 255, CurByte, OS);
+    Val >>= 8;
+  }
+}
+void
+AMDILMCCodeEmitter::EmitImmediate(const MCOperand &DispOp, unsigned ImmSize,
+    MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+    SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const
+{
+  // If this is a simple integer displacement that doesn't require a
+  // relocation, emit it now and return; everything else is emitted as a
+  // fixup below.
+  if (DispOp.isImm()) {
+    EmitConstant(DispOp.getImm() + ImmOffset, ImmSize, CurByte, OS);
+    return;
+  }
+
+  // If we have an immoffset, add it to the expression
+  const MCExpr *Expr = DispOp.getExpr();
+
+  if (ImmOffset) {
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+        MCConstantExpr::Create(ImmOffset, Ctx), Ctx);
+  }
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
+  // TODO: Why the 4 zeros?
+  EmitConstant(0, ImmSize, CurByte, OS);
+}
+
+void
+AMDILMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+    SmallVectorImpl<MCFixup> &Fixups) const
+{
+#if 0
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = TII.get(Opcode);
+  unsigned TSFlags = Desc.TSFlags;
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+
+  unsigned char BaseOpcode = 0;
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (// !Desc.isVariadic() &&
+      CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+#endif
+}
+#endif
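`EmitConstant` above writes a value in little-endian byte order by emitting the low byte and shifting down. A standalone sketch of the same loop, not part of the patch (`emitConstant` here is an illustrative free function writing into a vector instead of a `raw_ostream`):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative sketch of little-endian constant emission: the low byte is
// written first and the value shifted down until Size bytes are emitted.
inline void emitConstant(uint64_t val, unsigned size,
                         std::vector<uint8_t> &out) {
  for (unsigned i = 0; i != size; ++i) {
    out.push_back(static_cast<uint8_t>(val & 255)); // low byte first
    val >>= 8;
  }
}
```

Emitting `0x01020304` with `size == 4` yields the bytes `04 03 02 01`, matching the on-disk order a little-endian target expects.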
diff --git a/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp
new file mode 100644 (file)
index 0000000..0061d29
--- /dev/null
@@ -0,0 +1,597 @@
+//===-- AMDILMachineFunctionInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILCompilerErrors.h"
+#include "AMDILModuleInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+static const AMDILConstPtr *getConstPtr(const AMDILKernel *krnl, const std::string &arg) {
+  llvm::SmallVector<AMDILConstPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end;
+  for (begin = krnl->constPtr.begin(), end = krnl->constPtr.end();
+       begin != end; ++begin) {
+    if (!strcmp(begin->name.data(),arg.c_str())) {
+      return &(*begin);
+    }
+  }
+  return NULL;
+}
+
+void PrintfInfo::addOperand(size_t idx, uint32_t size) {
+  mOperands.resize((unsigned)(idx + 1));
+  mOperands[(unsigned)idx] = size;
+}
+
+uint32_t PrintfInfo::getPrintfID() {
+  return mPrintfID;
+}
+
+void PrintfInfo::setPrintfID(uint32_t id) {
+  mPrintfID = id;
+}
+
+size_t PrintfInfo::getNumOperands() {
+  return mOperands.size();
+}
+
+uint32_t PrintfInfo::getOperandID(uint32_t idx) {
+  return mOperands[idx];
+}
+
+AMDILMachineFunctionInfo::AMDILMachineFunctionInfo()
+  : CalleeSavedFrameSize(0), BytesToPopOnReturn(0),
+  DecorationStyle(None), ReturnAddrIndex(0),
+  TailCallReturnAddrDelta(0),
+  SRetReturnReg(0), UsesLDS(false), LDSArg(false),
+  UsesGDS(false), GDSArg(false),
+  mReservedLits(9)
+{
+  for (uint32_t x = 0; x < AMDILDevice::MAX_IDS; ++x) {
+    mUsedMem[x] = false;
+  }
+  mMF = NULL;
+  mKernel = NULL;
+  mScratchSize = -1;
+  mArgSize = -1;
+  mStackSize = -1;
+}
+
+AMDILMachineFunctionInfo::AMDILMachineFunctionInfo(MachineFunction& MF)
+  : CalleeSavedFrameSize(0), BytesToPopOnReturn(0),
+  DecorationStyle(None), ReturnAddrIndex(0),
+  TailCallReturnAddrDelta(0),
+  SRetReturnReg(0), UsesLDS(false), LDSArg(false),
+  UsesGDS(false), GDSArg(false),
+  mReservedLits(9)
+{
+  for (uint32_t x = 0; x < AMDILDevice::MAX_IDS; ++x) {
+    mUsedMem[x] = false;
+  }
+  const Function *F = MF.getFunction();
+  mMF = &MF;
+  MachineModuleInfo &mmi = MF.getMMI();
+  const AMDILTargetMachine *TM = 
+      reinterpret_cast<const AMDILTargetMachine*>(&MF.getTarget());
+  AMDILModuleInfo *AMI = &(mmi.getObjFileInfo<AMDILModuleInfo>());
+  AMI->processModule(mmi.getModule(), TM);
+  mSTM = TM->getSubtargetImpl();
+  mKernel = AMI->getKernel(F->getName());
+
+  mScratchSize = -1;
+  mArgSize = -1;
+  mStackSize = -1;
+}
+
+AMDILMachineFunctionInfo::~AMDILMachineFunctionInfo()
+{
+  for (std::map<std::string, PrintfInfo*>::iterator pfb = printf_begin(),
+      pfe = printf_end(); pfb != pfe; ++pfb) {
+    delete pfb->second;
+  }
+}
+unsigned int
+AMDILMachineFunctionInfo::getCalleeSavedFrameSize() const
+{
+  return CalleeSavedFrameSize;
+}
+void
+AMDILMachineFunctionInfo::setCalleeSavedFrameSize(unsigned int bytes)
+{
+  CalleeSavedFrameSize = bytes;
+}
+unsigned int
+AMDILMachineFunctionInfo::getBytesToPopOnReturn() const
+{
+  return BytesToPopOnReturn;
+}
+void
+AMDILMachineFunctionInfo::setBytesToPopOnReturn(unsigned int bytes)
+{
+  BytesToPopOnReturn = bytes;
+}
+NameDecorationStyle
+AMDILMachineFunctionInfo::getDecorationStyle() const
+{
+  return DecorationStyle;
+}
+void
+AMDILMachineFunctionInfo::setDecorationStyle(NameDecorationStyle style)
+{
+  DecorationStyle = style;
+}
+int
+AMDILMachineFunctionInfo::getRAIndex() const
+{
+  return ReturnAddrIndex;
+}
+void
+AMDILMachineFunctionInfo::setRAIndex(int index)
+{
+  ReturnAddrIndex = index;
+}
+int
+AMDILMachineFunctionInfo::getTCReturnAddrDelta() const
+{
+  return TailCallReturnAddrDelta;
+}
+void
+AMDILMachineFunctionInfo::setTCReturnAddrDelta(int delta)
+{
+  TailCallReturnAddrDelta = delta;
+}
+unsigned int
+AMDILMachineFunctionInfo::getSRetReturnReg() const
+{
+  return SRetReturnReg;
+}
+void
+AMDILMachineFunctionInfo::setSRetReturnReg(unsigned int reg)
+{
+  SRetReturnReg = reg;
+}
+
+void 
+AMDILMachineFunctionInfo::setUsesLocal()
+{
+  UsesLDS = true;
+}
+
+bool
+AMDILMachineFunctionInfo::usesLocal() const
+{
+  return UsesLDS;
+}
+
+void 
+AMDILMachineFunctionInfo::setHasLocalArg()
+{
+  LDSArg = true;
+}
+
+bool
+AMDILMachineFunctionInfo::hasLocalArg() const
+{
+  return LDSArg;
+}
+
+
+
+void
+AMDILMachineFunctionInfo::setUsesRegion()
+{
+  UsesGDS = true;
+}
+
+bool
+AMDILMachineFunctionInfo::usesRegion() const
+{
+  return UsesGDS;
+}
+
+void 
+AMDILMachineFunctionInfo::setHasRegionArg()
+{
+  GDSArg = true;
+}
+
+bool
+AMDILMachineFunctionInfo::hasRegionArg() const
+{
+  return GDSArg;
+}
+
+
+bool
+AMDILMachineFunctionInfo::usesHWConstant(std::string name) const
+{
+  const AMDILConstPtr *curConst = getConstPtr(mKernel, name);
+  if (curConst) {
+    return curConst->usesHardware;
+  } else {
+    return false;
+  }
+}
+
+uint32_t
+AMDILMachineFunctionInfo::getLocal(uint32_t dim)
+{
+  if (mKernel && mKernel->sgv) {
+    AMDILKernelAttr *sgv = mKernel->sgv;
+    switch (dim) {
+    default: break;
+    case 0:
+    case 1:
+    case 2:
+      return sgv->reqGroupSize[dim];
+      break;
+    case 3:
+      return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2];
+    };
+  }
+  switch (dim) {
+  default:
+    return 1;
+  case 3:
+    return mSTM->getDefaultSize(0) *
+           mSTM->getDefaultSize(1) *
+           mSTM->getDefaultSize(2);
+  case 2:
+  case 1:
+  case 0:
+    return mSTM->getDefaultSize(dim);
+    break;
+  };
+  return 1;
+}
+bool
+AMDILMachineFunctionInfo::isKernel() const
+{
+  return mKernel != NULL && mKernel->mKernel;
+}
+
+AMDILKernel*
+AMDILMachineFunctionInfo::getKernel()
+{
+  return mKernel;
+}
+
+std::string
+AMDILMachineFunctionInfo::getName()
+{
+  if (mMF) {
+    return mMF->getFunction()->getName();
+  } else {
+    return "";
+  }
+}
+
+uint32_t
+AMDILMachineFunctionInfo::getArgSize()
+{
+  if (mArgSize == -1) {
+    Function::const_arg_iterator I = mMF->getFunction()->arg_begin();
+    Function::const_arg_iterator Ie = mMF->getFunction()->arg_end();
+    uint32_t Counter = 0;
+    while (I != Ie) {
+      Type* curType = I->getType();
+      if (curType->isIntegerTy() || curType->isFloatingPointTy()) {
+        ++Counter;
+      } else if (const VectorType *VT = dyn_cast<VectorType>(curType)) {
+        Type *ET = VT->getElementType();
+        int numEle = VT->getNumElements();
+        switch (ET->getPrimitiveSizeInBits()) {
+          default:
+            if (numEle == 3) {
+              Counter++;
+            } else {
+              Counter += ((numEle + 2) >> 2);
+            }
+            break;
+          case 64:
+            if (numEle == 3) {
+              Counter += 2;
+            } else {
+              Counter += (numEle >> 1);
+            }
+            break;
+          case 16:
+          case 8:
+            switch (numEle) {
+              default:
+                Counter += ((numEle + 2) >> 2);
+              case 2:
+                Counter++;
+                break;
+            }
+            break;
+        }
+      } else if (const PointerType *PT = dyn_cast<PointerType>(curType)) {
+        Type *CT = PT->getElementType();
+        const StructType *ST = dyn_cast<StructType>(CT);
+        if (ST && ST->isOpaque()) {
+          bool i1d  = ST->getName() == "struct._image1d_t";
+          bool i1da = ST->getName() == "struct._image1d_array_t";
+          bool i1db = ST->getName() == "struct._image1d_buffer_t";
+          bool i2d  = ST->getName() == "struct._image2d_t";
+          bool i2da = ST->getName() == "struct._image2d_array_t";
+          bool i3d  = ST->getName() == "struct._image3d_t";
+          bool is_image = i1d || i1da || i1db || i2d || i2da || i3d;
+          if (is_image) {
+            if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) {
+              Counter += 2;
+            } else {
+              addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]);
+            }
+          } else {
+            Counter++;
+          }
+        } else if (CT->isStructTy()
+            && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+          StructType *ST = dyn_cast<StructType>(CT);
+          Counter += ((getTypeSize(ST) + 15) & ~15) >> 4;
+        } else if (CT->isIntOrIntVectorTy()
+            || CT->isFPOrFPVectorTy()
+            || CT->isArrayTy()
+            || CT->isPointerTy()
+            || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+          ++Counter;
+        } else {
+          assert(0 && "Current type is not supported!");
+          addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+        }
+      } else {
+        assert(0 && "Current type is not supported!");
+        addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]);
+      }
+      ++I;
+    }
+    // Convert from slots to bytes by multiplying by 16 (shift left by 4).
+    mArgSize = Counter << 4;
+  }
+  return (uint32_t)mArgSize;
+}
+  uint32_t
+AMDILMachineFunctionInfo::getScratchSize()
+{
+  if (mScratchSize == -1) {
+    mScratchSize = 0;
+    Function::const_arg_iterator I = mMF->getFunction()->arg_begin();
+    Function::const_arg_iterator Ie = mMF->getFunction()->arg_end();
+    while (I != Ie) {
+      Type *curType = I->getType();
+      mScratchSize += ((getTypeSize(curType) + 15) & ~15);
+      ++I;
+    }
+    mScratchSize += ((mScratchSize + 15) & ~15);
+  }
+  return (uint32_t)mScratchSize;
+}
+
+  uint32_t
+AMDILMachineFunctionInfo::getStackSize()
+{
+  if (mStackSize == -1) {
+    uint32_t privSize = 0;
+    const MachineFrameInfo *MFI = mMF->getFrameInfo();
+    privSize = MFI->getOffsetAdjustment() + MFI->getStackSize();
+    const AMDILTargetMachine *TM = 
+      reinterpret_cast<const AMDILTargetMachine*>(&mMF->getTarget());
+    bool addStackSize = TM->getOptLevel() == CodeGenOpt::None;
+    Function::const_arg_iterator I = mMF->getFunction()->arg_begin();
+    Function::const_arg_iterator Ie = mMF->getFunction()->arg_end();
+    while (I != Ie) {
+      Type *curType = I->getType();
+      ++I;
+      if (dyn_cast<PointerType>(curType)) {
+        Type *CT = dyn_cast<PointerType>(curType)->getElementType();
+        if (CT->isStructTy()
+            && dyn_cast<PointerType>(curType)->getAddressSpace() 
+            == AMDILAS::PRIVATE_ADDRESS) {
+          addStackSize = true;
+        }
+      }
+    }
+    if (addStackSize) {
+      privSize += getScratchSize();
+    }
+    mStackSize = privSize;
+  }
+  return (uint32_t)mStackSize;
+
+}
+
+uint32_t 
+AMDILMachineFunctionInfo::addi32Literal(uint32_t val, int Opcode) {
+  // Since we have emulated 16/8/1-bit register types with a 32-bit real
+  // register, we need to sign-extend the constants to 32 bits in order for
+  // comparisons against the constants to work correctly. This fixes some
+  // conformance failures we had related to saturation.
+  if (Opcode == AMDIL::LOADCONST_i16) {
+    val = (((int32_t)val << 16) >> 16);
+  } else if (Opcode == AMDIL::LOADCONST_i8) {
+    val = (((int32_t)val << 24) >> 24);
+  }
+  if (mIntLits.find(val) == mIntLits.end()) {
+    mIntLits[val] = getNumLiterals();
+  }
+  return mIntLits[val];
+}
+
+uint32_t 
+AMDILMachineFunctionInfo::addi64Literal(uint64_t val) {
+  if (mLongLits.find(val) == mLongLits.end()) {
+    mLongLits[val] = getNumLiterals();
+  }
+  return mLongLits[val];
+}
+
+uint32_t 
+AMDILMachineFunctionInfo::addi128Literal(uint64_t val_lo, uint64_t val_hi) {
+  std::pair<uint64_t, uint64_t> a;
+  a.first = val_lo;
+  a.second = val_hi;
+  if (mVecLits.find(a) == mVecLits.end()) {
+    mVecLits[a] = getNumLiterals();
+  }
+  return mVecLits[a];
+}
+
+uint32_t 
+AMDILMachineFunctionInfo::addf32Literal(const ConstantFP *CFP) {
+  uint32_t val = (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+  if (mIntLits.find(val) == mIntLits.end()) {
+    mIntLits[val] = getNumLiterals();
+  }
+  return mIntLits[val];
+}
+
+uint32_t 
+AMDILMachineFunctionInfo::addf64Literal(const ConstantFP *CFP) {
+  union dtol_union {
+    double d;
+    uint64_t ul;
+  } dval;
+  const APFloat &APF = CFP->getValueAPF();
+  if (&APF.getSemantics() == (const llvm::fltSemantics *)&APFloat::IEEEsingle) {
+    float fval = APF.convertToFloat();
+    dval.d = (double)fval;
+  } else {
+    dval.d = APF.convertToDouble();
+  }
+  if (mLongLits.find(dval.ul) == mLongLits.end()) {
+    mLongLits[dval.ul] = getNumLiterals();
+  }
+  return mLongLits[dval.ul];
+}
+
+  uint32_t 
+AMDILMachineFunctionInfo::getIntLits(uint32_t offset) 
+{
+  return mIntLits[offset];
+}
+
+  uint32_t 
+AMDILMachineFunctionInfo::getLongLits(uint64_t offset) 
+{
+  return mLongLits[offset];
+}
+
+  uint32_t
+AMDILMachineFunctionInfo::getVecLits(uint64_t low64, uint64_t high64)
+{
+  return mVecLits[std::pair<uint64_t, uint64_t>(low64, high64)];
+}
+
+size_t 
+AMDILMachineFunctionInfo::getNumLiterals() const {
+  return mLongLits.size() + mIntLits.size() + mVecLits.size() + mReservedLits;
+}
+
+  void
+AMDILMachineFunctionInfo::addReservedLiterals(uint32_t size)
+{
+  mReservedLits += size;
+}
+
+  uint32_t 
+AMDILMachineFunctionInfo::addSampler(std::string name, uint32_t val)
+{
+  if (mSamplerMap.find(name) != mSamplerMap.end()) {
+    SamplerInfo newVal = mSamplerMap[name];
+    assert(newVal.val == val 
+        && "Found a sampler with same name but different values!");
+    return mSamplerMap[name].idx;
+  } else {
+    SamplerInfo curVal;
+    curVal.name = name;
+    curVal.val = val;
+    curVal.idx = mSamplerMap.size();
+    mSamplerMap[name] = curVal;
+    return curVal.idx;
+  }
+}
+
+void
+AMDILMachineFunctionInfo::setUsesMem(unsigned id) {
+  assert(id < AMDILDevice::MAX_IDS &&
+      "Must set the ID to be less than MAX_IDS!");
+  mUsedMem[id] = true;
+}
+
+bool 
+AMDILMachineFunctionInfo::usesMem(unsigned id) {
+  assert(id < AMDILDevice::MAX_IDS &&
+      "Must set the ID to be less than MAX_IDS!");
+  return mUsedMem[id];
+}
+
+  void 
+AMDILMachineFunctionInfo::addErrorMsg(const char *msg, ErrorMsgEnum val) 
+{
+  if (val == DEBUG_ONLY) {
+#if defined(DEBUG) || defined(_DEBUG)
+    mErrors.insert(msg);
+#endif
+  }  else if (val == RELEASE_ONLY) {
+#if !defined(DEBUG) && !defined(_DEBUG)
+    mErrors.insert(msg);
+#endif
+  } else if (val == ALWAYS) {
+    mErrors.insert(msg);
+  }
+}
+
+  uint32_t 
+AMDILMachineFunctionInfo::addPrintfString(std::string &name, unsigned offset) 
+{
+  if (mPrintfMap.find(name) != mPrintfMap.end()) {
+    return mPrintfMap[name]->getPrintfID();
+  } else {
+    PrintfInfo *info = new PrintfInfo;
+    info->setPrintfID(mPrintfMap.size() + offset);
+    mPrintfMap[name] = info;
+    return info->getPrintfID();
+  }
+}
+
+  void 
+AMDILMachineFunctionInfo::addPrintfOperand(std::string &name, 
+    size_t idx,
+    uint32_t size) 
+{
+  mPrintfMap[name]->addOperand(idx, size);
+}
+
+  void 
+AMDILMachineFunctionInfo::addMetadata(const char *md, bool kernelOnly) 
+{
+  addMetadata(std::string(md), kernelOnly);
+}
+
+  void 
+AMDILMachineFunctionInfo::addMetadata(std::string md, bool kernelOnly) 
+{
+  if (kernelOnly) {
+    mMetadataKernel.push_back(md);
+  } else {
+    mMetadataFunc.insert(md);
+  }
+}
+
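`getArgSize`, `getScratchSize`, and `getStackSize` above all round sizes up to a multiple of 16 with the `(x + 15) & ~15` idiom, because arguments occupy 128-bit (16-byte) slots. A minimal sketch of that round-up, not part of the patch (`roundUp16` is an illustrative name):

```cpp
#include <cassert>
#include <cstdint>

// Illustrative sketch: round a byte count up to the next multiple of 16.
// Adding 15 carries any partial slot into the next one; masking the low
// four bits then truncates back down to a 16-byte boundary.
inline uint32_t roundUp16(uint32_t bytes) {
  return (bytes + 15u) & ~15u;
}
```

This only works because 16 is a power of two; for a general alignment `a` the mask would be `~(a - 1)` with `a` a power of two.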
diff --git a/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h
new file mode 100644 (file)
index 0000000..45f5751
--- /dev/null
@@ -0,0 +1,422 @@
+//== AMDILMachineFunctionInfo.h - AMD il Machine Function Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file declares AMDIL-specific per-machine-function information
+//
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILMACHINEFUNCTIONINFO_H_
+#define _AMDILMACHINEFUNCTIONINFO_H_
+#include "AMDIL.h"
+#include "AMDILDevice.h"
+#include "AMDILKernel.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Function.h"
+
+#include <map>
+#include <set>
+#include <string>
+
+namespace llvm
+{
+  class AMDILSubtarget;
+  class PrintfInfo {
+    uint32_t mPrintfID;
+    SmallVector<uint32_t, DEFAULT_VEC_SLOTS> mOperands;
+    public:
+    void addOperand(size_t idx, uint32_t size);
+    uint32_t getPrintfID();
+    void setPrintfID(uint32_t idx);
+    size_t getNumOperands();
+    uint32_t getOperandID(uint32_t idx);
+  }; // class PrintfInfo
+
+  enum NameDecorationStyle
+  {
+    None,
+    StdCall,
+    FastCall
+  };
+  typedef struct SamplerInfoRec {
+    std::string name; // The name of the sampler
+    uint32_t val; // The value of the sampler
+    uint32_t idx; // The sampler resource id
+  } SamplerInfo;
+  // Some typedefs that will help with using the various iterators
+  // of the machine function info class.
+  typedef std::map<uint32_t, uint32_t>::iterator lit32_iterator;
+  typedef std::map<uint64_t, uint32_t>::iterator lit64_iterator;
+  typedef std::map<std::pair<uint64_t, uint64_t>, uint32_t>::iterator
+    lit128_iterator;
+  typedef StringMap<SamplerInfo>::iterator sampler_iterator;
+  typedef DenseSet<uint32_t>::iterator func_iterator;
+  typedef DenseSet<uint32_t>::iterator intr_iterator;
+  typedef DenseSet<uint32_t>::iterator uav_iterator;
+  typedef DenseSet<uint32_t>::iterator read_image2d_iterator;
+  typedef DenseSet<uint32_t>::iterator read_image3d_iterator;
+  typedef DenseSet<uint32_t>::iterator write_image2d_iterator;
+  typedef DenseSet<uint32_t>::iterator write_image3d_iterator;
+  typedef DenseSet<const char*>::iterator error_iterator;
+  typedef std::map<std::string, PrintfInfo*>::iterator printf_iterator;
+  typedef std::set<std::string>::iterator func_md_iterator;
+  typedef std::vector<std::string>::iterator kernel_md_iterator;
+  // AMDILMachineFunctionInfo - This class, derived from
+  // MachineFunctionInfo, holds private AMDIL target-specific
+  // information for each MachineFunction.
+  class AMDILMachineFunctionInfo : public MachineFunctionInfo
+  {
+    // CalleeSavedFrameSize - Size of the callee-saved
+    // register portion of the
+    // stack frame in bytes.
+    unsigned int CalleeSavedFrameSize;
+    // BytesToPopOnReturn - Number of bytes function pops on return.
+    // Used on windows platform for stdcall & fastcall name decoration
+    unsigned int BytesToPopOnReturn;
+    // DecorationStyle - If the function requires additional
+    // name decoration,
+    // DecorationStyle holds the right way to do so.
+    NameDecorationStyle DecorationStyle;
+    // ReturnAddrIndex - FrameIndex for return slot.
+    int ReturnAddrIndex;
+
+    // TailCallReturnAddrDelta - The delta by which the ReturnAddr stack
+    // slot is moved. Used for creating an area before the register spill
+    // area on the stack; the return address can be safely moved into
+    // this area.
+    int TailCallReturnAddrDelta;
+
+    // SRetReturnReg - Some subtargets require that sret lowering includes
+    // returning the value of the returned struct in a register.
+    // This field holds the virtual register into which the sret
+    // argument is passed.
+    unsigned int SRetReturnReg;
+
+    // UsesLocal - Specifies that this function uses LDS memory and
+    // that it needs to be allocated.
+    bool UsesLDS;
+
+    // LDSArg - Flag that specifies whether this function has an LDS
+    // (local) argument.
+    bool LDSArg;
+
+    // UsesGDS - Specifies that this function uses GDS memory and
+    // that it needs to be allocated.
+    bool UsesGDS;
+
+    // GDSArg - Flag that specifies whether this function has a GDS
+    // (region) argument.
+    bool GDSArg;
+
+    // The size in bytes required to host all of the kernel arguments.
+    // -1 means this value has not been determined yet.
+    int32_t mArgSize;
+
+    // The size in bytes required to host the stack and the kernel arguments
+    // in private memory.
+    // -1 means this value has not been determined yet.
+    int32_t mScratchSize;
+
+    // The size in bytes required to host the kernel arguments
+    // on the stack.
+    // -1 means this value has not been determined yet.
+    int32_t mStackSize;
+
+    /// A map of constant to literal mapping for all of the 32bit or
+    /// smaller literals in the current function.
+    std::map<uint32_t, uint32_t> mIntLits;
+
+    /// A map of constant to literal mapping for all of the 64bit
+    /// literals in the current function.
+    std::map<uint64_t, uint32_t> mLongLits;
+
+    /// A map of constant to literal mapping for all of the 128bit
+    /// literals in the current function.
+    std::map<std::pair<uint64_t, uint64_t>, uint32_t> mVecLits;
+
+    /// The number of literals that should be reserved.
+    /// TODO: Remove this when the wrapper emitter is added.
+    uint32_t mReservedLits;
+
+    /// A map of name to sampler information that is used to emit
+    /// metadata to the IL stream that the runtimes can use for
+    /// hardware setup.
+    StringMap<SamplerInfo> mSamplerMap;
+
+    /// Array of flags to specify if a specific memory type is used or not.
+    bool mUsedMem[AMDILDevice::MAX_IDS];
+
+    /// Set of all functions that this function calls.
+    DenseSet<uint32_t> mFuncs;
+
+    /// Set of all intrinsics that this function calls.
+    DenseSet<uint32_t> mIntrs;
+
+    /// Set of all read only 2D images.
+    DenseSet<uint32_t> mRO2D;
+    /// Set of all read only 3D images.
+    DenseSet<uint32_t> mRO3D;
+    /// Set of all write only 2D images.
+    DenseSet<uint32_t> mWO2D;
+    /// Set of all write only 3D images.
+    DenseSet<uint32_t> mWO3D;
+    /// Set of all the raw uavs.
+    DenseSet<uint32_t> mRawUAV;
+    /// Set of all the arena uavs.
+    DenseSet<uint32_t> mArenaUAV;
+
+    /// A set of all errors that occurred in the backend for this function.
+    DenseSet<const char *> mErrors;
+
+    /// A mapping of printf data and the printf string
+    std::map<std::string, PrintfInfo*> mPrintfMap;
+
+    /// A set of all of the metadata that is used for the current function.
+    std::set<std::string> mMetadataFunc;
+
+    /// A set of all of the metadata that is used for the function wrapper.
+    std::vector<std::string> mMetadataKernel;
+
+    /// Information about the kernel, NULL if the function is not a kernel.
+    AMDILKernel *mKernel;
+
+    /// Pointer to the machine function that this information belongs to.
+    MachineFunction *mMF;
+
+    /// Pointer to the subtarget for this function.
+    const AMDILSubtarget *mSTM;
+    public:
+    AMDILMachineFunctionInfo();
+    AMDILMachineFunctionInfo(MachineFunction &MF);
+    virtual ~AMDILMachineFunctionInfo();
+    unsigned int
+      getCalleeSavedFrameSize() const;
+    void
+      setCalleeSavedFrameSize(unsigned int bytes);
+
+    unsigned int
+      getBytesToPopOnReturn() const;
+    void
+      setBytesToPopOnReturn (unsigned int bytes);
+
+    NameDecorationStyle
+      getDecorationStyle() const;
+    void
+      setDecorationStyle(NameDecorationStyle style);
+
+    int
+      getRAIndex() const;
+    void
+      setRAIndex(int Index);
+
+    int
+      getTCReturnAddrDelta() const;
+    void
+      setTCReturnAddrDelta(int delta);
+
+    unsigned int
+      getSRetReturnReg() const;
+    void
+      setSRetReturnReg(unsigned int Reg);
+
+    void 
+      setUsesLocal();
+    bool 
+      usesLocal() const;
+    void
+      setHasLocalArg();
+    bool 
+      hasLocalArg() const;
+
+    void 
+      setUsesRegion();
+    bool 
+      usesRegion() const;
+    void
+      setHasRegionArg();
+    bool 
+      hasRegionArg() const;
+
+    bool
+      usesHWConstant(std::string name) const;
+    uint32_t
+      getLocal(uint32_t);
+    bool
+      isKernel() const;
+    AMDILKernel*
+      getKernel();
+
+    std::string
+      getName();
+
+    /// Get the size in bytes that are required to host all of
+    /// arguments based on the argument alignment rules in the AMDIL 
+    /// Metadata spec.
+    uint32_t getArgSize();
+
+    /// Get the size in bytes that are required to host all of
+    /// arguments and stack memory in scratch.
+    uint32_t getScratchSize();
+
+    /// Get the size in bytes that is required to host all of
+    /// the arguments on the stack.
+    uint32_t getStackSize();
+
+    ///
+    /// @param val value to add to the lookup table
+    /// @param Opcode opcode of the literal instruction
+    /// @brief Adds the specified value of the type represented by the
+    /// Opcode to the literal-to-integer and integer-to-literal
+    /// mappings.
+    ///
+    /// Add a 32bit integer value to the literal table.
+    uint32_t addi32Literal(uint32_t val, int Opcode = AMDIL::LOADCONST_i32);
+
+    /// Add a 32bit floating point value to the literal table.
+    uint32_t addf32Literal(const ConstantFP *CFP);
+
+    /// Add a 64bit integer value to the literal table.
+    uint32_t addi64Literal(uint64_t val);
+
+    /// Add a 128 bit integer value to the literal table.
+    uint32_t addi128Literal(uint64_t val_lo, uint64_t val_hi);
+
+    /// Add a 64bit floating point literal as a 64bit integer value.
+    uint32_t addf64Literal(const ConstantFP *CFP);
+
+    /// Get the number of literals that have currently been allocated.
+    size_t getNumLiterals() const;
+
+    /// Get the literal ID of an integer literal with the given value.
+    uint32_t getIntLits(uint32_t lit);
+
+    /// Get the literal ID of a long literal with the given value.
+    uint32_t getLongLits(uint64_t lit);
+
+    /// Get the literal ID of a 128-bit literal with the given low and
+    /// high 64-bit values.
+    uint32_t getVecLits(uint64_t low64, uint64_t high64);
+
+    /// Increase the number of reserved literals by the given amount.
+    void addReservedLiterals(uint32_t);
+
+    // Functions that return iterators to the beginning and end
+    // of the various literal maps.
+    // Functions that return the beginning and end of the 32bit literal map
+    lit32_iterator begin_32() { return mIntLits.begin(); }
+    lit32_iterator end_32() { return mIntLits.end(); }
+
+    // Functions that return the beginning and end of the 64bit literal map
+    lit64_iterator begin_64() { return mLongLits.begin(); }
+    lit64_iterator end_64() { return mLongLits.end(); }
+
+    // Functions that return the beginning and end of the 2x64bit literal map
+    lit128_iterator begin_128() { return mVecLits.begin(); }
+    lit128_iterator end_128() { return mVecLits.end(); }
+
+    // Add a sampler to the set of known samplers for the current kernel.
+    uint32_t addSampler(std::string name, uint32_t value);
+    
+    // Iterators that point to the beginning and end of the sampler map.
+    sampler_iterator sampler_begin() { return mSamplerMap.begin(); }
+    sampler_iterator sampler_end() { return mSamplerMap.end(); }
+
+
+    /// Set the flag for the memory ID to true for the current function.
+    void setUsesMem(unsigned);
+    /// Retrieve the flag for the memory ID.
+    bool usesMem(unsigned);
+
+    /// Add called functions to the set of all functions this function calls.
+    void addCalledFunc(uint32_t id) { mFuncs.insert(id); }
+    void eraseCalledFunc(uint32_t id) { mFuncs.erase(id); }
+    size_t func_size() { return mFuncs.size(); }
+    bool func_empty() { return mFuncs.empty(); }
+    func_iterator func_begin() { return mFuncs.begin(); }
+    func_iterator func_end() { return mFuncs.end(); }
+
+    /// Add called intrinsics to the set of all intrinsics this function calls.
+    void addCalledIntr(uint32_t id) { mIntrs.insert(id); }
+    size_t intr_size() { return mIntrs.size(); }
+    bool intr_empty() { return mIntrs.empty(); }
+    intr_iterator intr_begin() { return mIntrs.begin(); }
+    intr_iterator intr_end() { return mIntrs.end(); }
+
+    /// Add a 2D read_only image id.
+    void addROImage2D(uint32_t id) { mRO2D.insert(id); }
+    size_t read_image2d_size() { return mRO2D.size(); }
+    read_image2d_iterator read_image2d_begin() { return mRO2D.begin(); }
+    read_image2d_iterator read_image2d_end() { return mRO2D.end(); }
+
+    /// Add a 3D read_only image id.
+    void addROImage3D(uint32_t id) { mRO3D.insert(id); }
+    size_t read_image3d_size() { return mRO3D.size(); }
+    read_image3d_iterator read_image3d_begin() { return mRO3D.begin(); }
+    read_image3d_iterator read_image3d_end() { return mRO3D.end(); }
+
+    /// Add a 2D write_only image id.
+    void addWOImage2D(uint32_t id) { mWO2D.insert(id); }
+    size_t write_image2d_size() { return mWO2D.size(); }
+    write_image2d_iterator write_image2d_begin() { return mWO2D.begin(); }
+    write_image2d_iterator write_image2d_end() { return mWO2D.end(); }
+
+    /// Add a 3D write_only image id.
+    void addWOImage3D(uint32_t id) { mWO3D.insert(id); }
+    size_t write_image3d_size() { return mWO3D.size(); }
+    write_image3d_iterator write_image3d_begin() { return mWO3D.begin(); }
+    write_image3d_iterator write_image3d_end() { return mWO3D.end(); }
+
+    /// Add a raw uav id.
+    void uav_insert(uint32_t id) { mRawUAV.insert(id); }
+    bool uav_count(uint32_t id) { return mRawUAV.count(id); }
+    size_t uav_size() { return mRawUAV.size(); }
+    uav_iterator uav_begin() { return mRawUAV.begin(); }
+    uav_iterator uav_end() { return mRawUAV.end(); }
+
+    /// Add an arena uav id.
+    void arena_insert(uint32_t id) { mArenaUAV.insert(id); }
+    bool arena_count(uint32_t id) { return mArenaUAV.count(id); }
+    size_t arena_size() { return mArenaUAV.size(); }
+    uav_iterator arena_begin() { return mArenaUAV.begin(); }
+    uav_iterator arena_end() { return mArenaUAV.end(); }
+
+    // Add an error to the output for the current function.
+    typedef enum {
+      RELEASE_ONLY, ///< Only emit the error message in release mode.
+      DEBUG_ONLY,   ///< Only emit the error message in debug mode.
+      ALWAYS        ///< Always emit the error message.
+    } ErrorMsgEnum;
+    /// Add an error message to the set of all error messages.
+    void addErrorMsg(const char* msg, ErrorMsgEnum val = ALWAYS);
+    bool errors_empty() { return mErrors.empty(); }
+    error_iterator errors_begin() { return mErrors.begin(); }
+    error_iterator errors_end() { return mErrors.end(); }
+
+    /// Add a string to the printf map
+    uint32_t addPrintfString(std::string &name, unsigned offset);
+    /// Add an operand to the printf string.
+    void addPrintfOperand(std::string &name, size_t idx, uint32_t size);
+    bool printf_empty() { return mPrintfMap.empty(); }
+    size_t printf_size() { return mPrintfMap.size(); }
+    printf_iterator printf_begin() { return mPrintfMap.begin(); }
+    printf_iterator printf_end() { return mPrintfMap.end(); }
+
+    /// Add a string to the metadata set for a function/kernel wrapper
+    void addMetadata(const char *md, bool kernelOnly = false);
+    void addMetadata(std::string md, bool kernelOnly = false);
+    func_md_iterator func_md_begin() { return mMetadataFunc.begin(); }
+    func_md_iterator func_md_end() { return mMetadataFunc.end(); }
+    kernel_md_iterator kernel_md_begin() { return mMetadataKernel.begin(); }
+    kernel_md_iterator kernel_md_end() { return mMetadataKernel.end(); }
+  };
+} // llvm namespace
+#endif // _AMDILMACHINEFUNCTIONINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
new file mode 100644 (file)
index 0000000..b8e5363
--- /dev/null
@@ -0,0 +1,173 @@
+//===-- AMDILMachinePeephole.cpp - AMDIL Machine Peephole Pass -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+
+#define DEBUG_TYPE "machine_peephole"
+#if !defined(NDEBUG)
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME (false)
+#endif
+
+#include "AMDIL.h"
+#include "AMDILSubtarget.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+namespace
+{
+  class AMDILMachinePeephole : public MachineFunctionPass
+  {
+    public:
+      static char ID;
+      AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+      //virtual ~AMDILMachinePeephole();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &MF);
+    private:
+      void insertFence(MachineBasicBlock::iterator &MIB);
+      TargetMachine &TM;
+      bool mDebug;
+  }; // AMDILMachinePeephole
+  char AMDILMachinePeephole::ID = 0;
+} // anonymous namespace
+
+namespace llvm
+{
+  FunctionPass*
+    createAMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+    {
+      return new AMDILMachinePeephole(tm AMDIL_OPT_LEVEL_VAR);
+    }
+} // llvm namespace
+
+AMDILMachinePeephole::AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : MachineFunctionPass(ID), TM(tm)
+{
+  mDebug = DEBUGME;
+}
+
+bool
+AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
+{
+  bool Changed = false;
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  for (MachineFunction::iterator MBB = MF.begin(), MBE = MF.end();
+      MBB != MBE; ++MBB) {
+    MachineBasicBlock *mb = MBB;
+    for (MachineBasicBlock::iterator MIB = mb->begin(), MIE = mb->end();
+        MIB != MIE; ++MIB) {
+      MachineInstr *mi = MIB;
+      const char *name = TM.getInstrInfo()->getName(mi->getOpcode());
+      switch (mi->getOpcode()) {
+        default:
+          if (isAtomicInst(TM.getInstrInfo(), mi)) {
+            // If we don't support the hardware accelerated address spaces,
+            // then the atomic needs to be transformed to the global atomic.
+            if (strstr(name, "_L_")
+                && STM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) {
+              BuildMI(*mb, MIB, mi->getDebugLoc(), 
+                  TM.getInstrInfo()->get(AMDIL::ADD_i32), AMDIL::R1011)
+                .addReg(mi->getOperand(1).getReg())
+                .addReg(AMDIL::T2);
+              mi->getOperand(1).setReg(AMDIL::R1011);
+              mi->setDesc(
+                  TM.getInstrInfo()->get(
+                    (mi->getOpcode() - AMDIL::ATOM_L_ADD) + AMDIL::ATOM_G_ADD));
+            } else if (strstr(name, "_R_")
+                && STM->device()->usesSoftware(AMDILDeviceInfo::RegionMem)) {
+              assert(!"Software region memory is not supported!");
+              mi->setDesc(
+                  TM.getInstrInfo()->get(
+                    (mi->getOpcode() - AMDIL::ATOM_R_ADD) + AMDIL::ATOM_G_ADD));
+            }
+          } else if ((isLoadInst(TM.getInstrInfo(), mi)
+                || isStoreInst(TM.getInstrInfo(), mi))
+              && isVolatileInst(TM.getInstrInfo(), mi)) {
+            insertFence(MIB);
+          }
+          continue;
+        case AMDIL::USHR_i16:
+        case AMDIL::USHR_v2i16:
+        case AMDIL::USHR_v4i16:
+        case AMDIL::USHRVEC_i16:
+        case AMDIL::USHRVEC_v2i16:
+        case AMDIL::USHRVEC_v4i16:
+          if (TM.getSubtarget<AMDILSubtarget>()
+              .device()->usesSoftware(AMDILDeviceInfo::ShortOps)) {
+            unsigned lReg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRI32RegClass);
+            unsigned Reg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRV4I32RegClass);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::LOADCONST_i32),
+                lReg).addImm(0xFFFF);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32),
+                Reg)
+              .addReg(mi->getOperand(1).getReg())
+              .addReg(lReg);
+            mi->getOperand(1).setReg(Reg);
+          }
+          break;
+        case AMDIL::USHR_i8:
+        case AMDIL::USHR_v2i8:
+        case AMDIL::USHR_v4i8:
+        case AMDIL::USHRVEC_i8:
+        case AMDIL::USHRVEC_v2i8:
+        case AMDIL::USHRVEC_v4i8:
+          if (TM.getSubtarget<AMDILSubtarget>()
+              .device()->usesSoftware(AMDILDeviceInfo::ByteOps)) {
+            unsigned lReg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRI32RegClass);
+            unsigned Reg = MF.getRegInfo()
+              .createVirtualRegister(&AMDIL::GPRV4I32RegClass);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::LOADCONST_i32),
+                lReg).addImm(0xFF);
+            BuildMI(*mb, MIB, mi->getDebugLoc(),
+                TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32),
+                Reg)
+              .addReg(mi->getOperand(1).getReg())
+              .addReg(lReg);
+            mi->getOperand(1).setReg(Reg);
+          }
+          break;
+      }
+    }
+  }
+  return Changed;
+}
+
+const char*
+AMDILMachinePeephole::getPassName() const
+{
+  return "AMDIL Generic Machine Peephole Optimization Pass";
+}
+
+void
+AMDILMachinePeephole::insertFence(MachineBasicBlock::iterator &MIB)
+{
+  MachineInstr *MI = MIB;
+  MachineInstr *fence = BuildMI(*(MI->getParent()->getParent()),
+        MI->getDebugLoc(),
+        TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1);
+
+  MI->getParent()->insert(MIB, fence);
+  fence = BuildMI(*(MI->getParent()->getParent()),
+        MI->getDebugLoc(),
+        TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1);
+  MIB = MI->getParent()->insertAfter(MIB, fence);
+}
diff --git a/src/gallium/drivers/radeon/AMDILModuleInfo.cpp b/src/gallium/drivers/radeon/AMDILModuleInfo.cpp
new file mode 100644 (file)
index 0000000..82c3e4c
--- /dev/null
@@ -0,0 +1,1266 @@
+//===-- AMDILModuleInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILModuleInfo.h"
+#include "AMDILDevices.h"
+#include "AMDILKernel.h"
+#include "AMDILSubtarget.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstdio>
+
+#define CB_BASE_OFFSET 2
+using namespace llvm;
+
+AMDILModuleInfo::AMDILModuleInfo(const MachineModuleInfo &MMI)
+{
+  mMMI = &MMI;
+  mOffset = 0;
+  mReservedBuffs = 0;
+  symTab = NULL;
+  mCurrentCPOffset = 0;
+  mPrintfOffset = 0;
+}
+
+AMDILModuleInfo::~AMDILModuleInfo() {
+  for (StringMap<AMDILKernel*>::iterator kb = mKernels.begin(), ke = mKernels.end();
+      kb != ke; ++kb) {
+    StringMapEntry<AMDILKernel*> cur = *kb;
+    AMDILKernel *ptr = cur.getValue();
+    delete ptr;
+  }
+}
+
+static const AMDILConstPtr *getConstPtr(const AMDILKernel *krnl, const std::string &arg) {
+  llvm::SmallVector<AMDILConstPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end;
+  for (begin = krnl->constPtr.begin(), end = krnl->constPtr.end();
+       begin != end; ++begin) {
+    if (!strcmp(begin->name.data(),arg.c_str())) {
+      return &(*begin);
+    }
+  }
+  return NULL;
+}
+#if 0
+static bool structContainsSub32bitType(const StructType *ST) {
+  StructType::element_iterator eib, eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    Type *ptr = *eib;
+    uint32_t size = (uint32_t)GET_SCALAR_SIZE(ptr);
+    if (!size) {
+      if (const StructType *ST = dyn_cast<StructType>(ptr)) {
+        if (structContainsSub32bitType(ST)) {
+          return true;
+        }
+      }
+    } else if (size < 32) {
+      return true;
+    }
+  }
+  return false;
+}
+#endif
+
+void AMDILModuleInfo::processModule(const Module *M,
+                                       const AMDILTargetMachine *mTM)
+{
+  Module::const_global_iterator GI;
+  Module::const_global_iterator GE;
+  mSTM = mTM->getSubtargetImpl();
+  for (GI = M->global_begin(), GE = M->global_end(); GI != GE; ++GI) {
+    const GlobalValue *GV = GI;
+    llvm::StringRef GVName = GV->getName();
+    const char *name = GVName.data();
+    if (!strncmp(name, "sgv", 3)) {
+      mKernelArgs[GVName] = parseSGV(GV);
+    } else if (!strncmp(name, "fgv", 3)) {
+      // we can ignore this since we don't care about the filename
+      // string
+    } else if (!strncmp(name, "lvgv", 4)) {
+      mLocalArgs[GVName] = parseLVGV(GV);
+    } else if (!strncmp(name, "llvm.image.annotations", 22)) {
+      parseImageAnnotate(GV);
+    } else if (!strncmp(name, "llvm.global.annotations", 23)) {
+      parseGlobalAnnotate(GV);
+    } else if (!strncmp(name, "llvm.constpointer.annotations", 29)) {
+      parseConstantPtrAnnotate(GV);
+    } else if (!strncmp(name, "llvm.readonlypointer.annotations", 32)) {
+      // These are skipped as we handle them later in AMDILPointerManager.cpp
+    } else if (GV->getType()->getAddressSpace() == 3) { // *** Match cl_kernel.h local AS #
+      parseAutoArray(GV, false);
+    } else if (strstr(name, "clregion")) {
+      parseAutoArray(GV, true);
+    } else if (!GV->use_empty()
+               && mIgnoreStr.find(GVName) == mIgnoreStr.end()) {
+      parseConstantPtr(GV);
+    }
+  }
+  allocateGlobalCB();
+
+  safeForEach(M->begin(), M->end(),
+      std::bind1st(
+        std::mem_fun(&AMDILModuleInfo::checkConstPtrsUseHW),
+        this));
+}
+
+void AMDILModuleInfo::allocateGlobalCB(void) {
+  uint32_t maxCBSize = mSTM->device()->getMaxCBSize();
+  uint32_t offset = 0;
+  uint32_t curCB = 0;
+  uint32_t swoffset = 0;
+  for (StringMap<AMDILConstPtr>::iterator cpb = mConstMems.begin(),
+       cpe = mConstMems.end(); cpb != cpe; ++cpb) {
+    bool constHW = mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+    cpb->second.usesHardware = false;
+    if (constHW) {
+      // If we have a limit on the max CB Size, then we need to make sure that
+      // the constant sizes fall within the limits.
+      if (cpb->second.size <= maxCBSize) {
+        if (offset + cpb->second.size > maxCBSize) {
+          offset = 0;
+          curCB++;
+        }
+        if (curCB < mSTM->device()->getMaxNumCBs()) {
+          cpb->second.cbNum = curCB + CB_BASE_OFFSET;
+          cpb->second.offset = offset;
+          offset += (cpb->second.size + 15) & (~15);
+          cpb->second.usesHardware = true;
+          continue;
+        }
+      }
+    }
+    cpb->second.cbNum = 0;
+    cpb->second.offset = swoffset;
+    swoffset += (cpb->second.size + 15) & (~15);
+  }
+  if (!mConstMems.empty()) {
+    mReservedBuffs = curCB + 1;
+  }
+}
+
+bool AMDILModuleInfo::checkConstPtrsUseHW(llvm::Module::const_iterator *FCI)
+{
+  Function::const_arg_iterator AI, AE;
+  const Function *func = *FCI;
+  std::string name = func->getName();
+  if (!strstr(name.c_str(), "__OpenCL")
+      || !strstr(name.c_str(), "_AMDILKernel")) {
+    return false;
+  }
+  AMDILKernel *krnl = mKernels[name];
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    for (AI = func->arg_begin(), AE = func->arg_end();
+         AI != AE; ++AI) {
+      const Argument *Arg = &(*AI);
+      const PointerType *P = dyn_cast<PointerType>(Arg->getType());
+      if (!P) {
+        continue;
+      }
+      if (P->getAddressSpace() != AMDILAS::CONSTANT_ADDRESS) {
+        continue;
+      }
+      const AMDILConstPtr *ptr = getConstPtr(krnl, Arg->getName());
+      if (ptr) {
+        continue;
+      }
+      AMDILConstPtr constAttr;
+      constAttr.name = Arg->getName();
+      constAttr.size = this->mSTM->device()->getMaxCBSize();
+      constAttr.base = Arg;
+      constAttr.isArgument = true;
+      constAttr.isArray = false;
+      constAttr.offset = 0;
+      constAttr.usesHardware =
+        mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+      if (constAttr.usesHardware) {
+        constAttr.cbNum = krnl->constPtr.size() + 2;
+      } else {
+        constAttr.cbNum = 0;
+      }
+      krnl->constPtr.push_back(constAttr);
+    }
+  }
+  // Now let's make sure that only the N largest buffers
+  // get allocated in hardware if we have too many buffers.
+  uint32_t numPtrs = krnl->constPtr.size();
+  if (numPtrs > (this->mSTM->device()->getMaxNumCBs() - mReservedBuffs)) {
+    // TODO: Change this routine so it sorts
+    // AMDILConstPtr instead of pulling the sizes out
+    // and then grab the N largest and disable the rest
+    llvm::SmallVector<uint32_t, 16> sizes;
+    for (uint32_t x = 0; x < numPtrs; ++x) {
+      sizes.push_back(krnl->constPtr[x].size);
+    }
+    std::sort(sizes.begin(), sizes.end());
+    uint32_t numToDisable = numPtrs - (mSTM->device()->getMaxNumCBs() -
+                                       mReservedBuffs);
+    uint32_t safeSize = sizes[numToDisable-1];
+    for (uint32_t x = 0; x < numPtrs && numToDisable; ++x) {
+      if (krnl->constPtr[x].size <= safeSize) {
+        krnl->constPtr[x].usesHardware = false;
+        --numToDisable;
+      }
+    }
+  }
+  // Renumber all of the valid CBs so that
+  // their IDs increase linearly.
+  uint32_t CBid = 2 + mReservedBuffs;
+  for (uint32_t x = 0; x < numPtrs; ++x) {
+    if (krnl->constPtr[x].usesHardware) {
+      krnl->constPtr[x].cbNum = CBid++;
+    }
+  }
+  for (StringMap<AMDILConstPtr>::iterator cpb = mConstMems.begin(),
+       cpe = mConstMems.end(); cpb != cpe; ++cpb) {
+    if (cpb->second.usesHardware) {
+      krnl->constPtr.push_back(cpb->second);
+    }
+  }
+  for (uint32_t x = 0; x < krnl->constPtr.size(); ++x) {
+    AMDILConstPtr &c = krnl->constPtr[x];
+    uint32_t cbNum = c.cbNum - CB_BASE_OFFSET;
+    if (cbNum < HW_MAX_NUM_CB && c.cbNum >= CB_BASE_OFFSET) {
+      if ((c.size + c.offset) > krnl->constSizes[cbNum]) {
+        krnl->constSizes[cbNum] =
+          ((c.size + c.offset) + 15) & ~15;
+      }
+    } else {
+      krnl->constPtr[x].usesHardware = false;
+    }
+  }
+  return false;
+}
+
+int32_t AMDILModuleInfo::getArrayOffset(const llvm::StringRef &a) const {
+  StringMap<AMDILArrayMem>::const_iterator iter = mArrayMems.find(a);
+  if (iter != mArrayMems.end()) {
+    return iter->second.offset;
+  } else {
+    return -1;
+  }
+}
+
+int32_t AMDILModuleInfo::getConstOffset(const llvm::StringRef &a) const {
+  StringMap<AMDILConstPtr>::const_iterator iter = mConstMems.find(a);
+  if (iter != mConstMems.end()) {
+    return iter->second.offset;
+  } else {
+    return -1;
+  }
+}
+
+bool AMDILModuleInfo::getConstHWBit(const llvm::StringRef &name) const {
+  StringMap<AMDILConstPtr>::const_iterator iter = mConstMems.find(name);
+  if (iter != mConstMems.end()) {
+    return iter->second.usesHardware;
+  } else {
+    return false;
+  }
+}
+
+// As of right now we only care about the required group size
+// so we can skip the variable encoding
+AMDILKernelAttr AMDILModuleInfo::parseSGV(const GlobalValue *G) {
+  AMDILKernelAttr nArg;
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  memset(&nArg, 0, sizeof(nArg));
+  for (int x = 0; x < 3; ++x) {
+    nArg.reqGroupSize[x] = mSTM->getDefaultSize(x);
+    nArg.reqRegionSize[x] = mSTM->getDefaultSize(x);
+  }
+  if (!GV || !GV->hasInitializer()) {
+    return nArg;
+  }
+  const Constant *CV = GV->getInitializer();
+  const ConstantDataArray *CA = dyn_cast_or_null<ConstantDataArray>(CV);
+  if (!CA || !CA->isString()) {
+    return nArg;
+  }
+  std::string init = CA->getAsString();
+  size_t pos = init.find("RWG");
+  if (pos != llvm::StringRef::npos) {
+    pos += 3;
+    std::string LWS = init.substr(pos, init.length() - pos);
+    const char *lws = LWS.c_str();
+    sscanf(lws, "%u,%u,%u", &(nArg.reqGroupSize[0]),
+           &(nArg.reqGroupSize[1]),
+           &(nArg.reqGroupSize[2]));
+    nArg.mHasRWG = true;
+  }
+  pos = init.find("RWR");
+  if (pos != llvm::StringRef::npos) {
+    pos += 3;
+    std::string LWS = init.substr(pos, init.length() - pos);
+    const char *lws = LWS.c_str();
+    sscanf(lws, "%u,%u,%u", &(nArg.reqRegionSize[0]),
+           &(nArg.reqRegionSize[1]),
+           &(nArg.reqRegionSize[2]));
+    nArg.mHasRWR = true;
+  }
+  return nArg;
+}
+
+AMDILLocalArg AMDILModuleInfo::parseLVGV(const GlobalValue *G) {
+  AMDILLocalArg nArg;
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  nArg.name = "";
+  if (!GV || !GV->hasInitializer()) {
+    return nArg;
+  }
+  const ConstantArray *CA =
+    dyn_cast_or_null<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return nArg;
+  }
+  for (size_t x = 0, y = CA->getNumOperands(); x < y; ++x) {
+    const Value *local = CA->getOperand(x);
+    const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(local);
+    if (!CE || !CE->getNumOperands()) {
+      continue;
+    }
+    nArg.name = (*(CE->op_begin()))->getName();
+    if (mArrayMems.find(nArg.name) != mArrayMems.end()) {
+      nArg.local.push_back(&(mArrayMems[nArg.name]));
+    }
+  }
+  return nArg;
+}
+
+void AMDILModuleInfo::parseConstantPtrAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(G);
+  const ConstantArray *CA =
+    dyn_cast_or_null<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return;
+  }
+  uint32_t numOps = CA->getNumOperands();
+  for (uint32_t x = 0; x < numOps; ++x) {
+    const Value *V = CA->getOperand(x);
+    const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V);
+    if (!CS) {
+      continue;
+    }
+    assert(CS->getNumOperands() == 2 && "There can only be 2"
+           " fields, a name and size");
+    const ConstantExpr *nameField = dyn_cast<ConstantExpr>(CS->getOperand(0));
+    const ConstantInt *sizeField = dyn_cast<ConstantInt>(CS->getOperand(1));
+    assert(nameField && "There must be a constant name field");
+    assert(sizeField && "There must be a constant size field");
+    const GlobalVariable *nameGV =
+      dyn_cast<GlobalVariable>(nameField->getOperand(0));
+    const ConstantDataArray *nameArray =
+      dyn_cast<ConstantDataArray>(nameGV->getInitializer());
+    // Let's add this string to the set of strings we should ignore processing.
+    mIgnoreStr.insert(nameGV->getName());
+    if (mConstMems.find(nameGV->getName())
+        != mConstMems.end()) {
+      // If we already processed this string as a constant, let's remove it from
+      // the list of known constants.  This way we don't process unneeded data
+      // and don't generate code/metadata for strings that are never used.
+      mConstMems.erase(mConstMems.find(nameGV->getName()));
+    } else {
+      mIgnoreStr.insert(CS->getOperand(0)->getName());
+    }
+    AMDILConstPtr constAttr;
+    constAttr.name = nameArray->getAsString();
+    constAttr.size = (sizeField->getZExtValue() + 15) & ~15;
+    constAttr.base = CS;
+    constAttr.isArgument = true;
+    constAttr.isArray = false;
+    constAttr.cbNum = 0;
+    constAttr.offset = 0;
+    constAttr.usesHardware = (constAttr.size <= mSTM->device()->getMaxCBSize());
+    // Now that we have all our constant information,
+    // let's update the AMDILKernel.
+    llvm::StringRef AMDILKernelName = G->getName().data() + 30;
+    AMDILKernel *k;
+    if (mKernels.find(AMDILKernelName) != mKernels.end()) {
+      k = mKernels[AMDILKernelName];
+    } else {
+      k = new AMDILKernel;
+      k->curSize = 0;
+      k->curRSize = 0;
+      k->curHWSize = 0;
+      k->curHWRSize = 0;
+      k->constSize = 0;
+      k->lvgv = NULL;
+      k->sgv = NULL;
+      memset(k->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+    }
+    constAttr.cbNum = k->constPtr.size() + 2;
+    k->constPtr.push_back(constAttr);
+    mKernels[AMDILKernelName] = k;
+  }
+}
+
+void AMDILModuleInfo::parseImageAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  const ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!CA) {
+    return;
+  }
+  if (isa<GlobalValue>(CA)) {
+    return;
+  }
+  uint32_t e = CA->getNumOperands();
+  if (!e) {
+    return;
+  }
+  AMDILKernel *k;
+  llvm::StringRef name = G->getName().data() + 23;
+  if (mKernels.find(name) != mKernels.end()) {
+    k = mKernels[name];
+  } else {
+    k = new AMDILKernel;
+    k->curSize = 0;
+    k->curRSize = 0;
+    k->curHWSize = 0;
+    k->curHWRSize = 0;
+    k->constSize = 0;
+    k->lvgv = NULL;
+    k->sgv = NULL;
+    memset(k->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+  }
+  for (uint32_t i = 0; i != e; ++i) {
+    const Value *V = CA->getOperand(i);
+    const Constant *C = dyn_cast<Constant>(V);
+    const ConstantStruct *CS = dyn_cast<ConstantStruct>(C);
+    if (CS && CS->getNumOperands() == 2) {
+      if (mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()) !=
+          mConstMems.end()) {
+        // If we already processed this string as a constant, let's remove it
+        // from the list of known constants.  This way we don't process unneeded
+        // data and don't generate code/metadata for strings that are never
+        // used.
+        mConstMems.erase(
+            mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()));
+      } else {
+        mIgnoreStr.insert(CS->getOperand(0)->getOperand(0)->getName());
+      }
+      const ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(1));
+      uint32_t val = (uint32_t)CI->getZExtValue();
+      if (val == 1) {
+        k->readOnly.insert(i);
+      } else if (val == 2) {
+        k->writeOnly.insert(i);
+      } else {
+        assert(!"Unknown image type value!");
+      }
+    }
+  }
+  mKernels[name] = k;
+}
+
+void AMDILModuleInfo::parseAutoArray(const GlobalValue *GV, bool isRegion) {
+  const GlobalVariable *G = dyn_cast<GlobalVariable>(GV);
+  Type *Ty = (G) ? G->getType() : NULL;
+  AMDILArrayMem tmp;
+  tmp.isHW = true;
+  tmp.offset = 0;
+  tmp.vecSize = getTypeSize(Ty, true);
+  tmp.isRegion = isRegion;
+  mArrayMems[GV->getName()] = tmp;
+}
+
+void AMDILModuleInfo::parseConstantPtr(const GlobalValue *GV) {
+  const GlobalVariable *G = dyn_cast<GlobalVariable>(GV);
+  Type *Ty = (G) ? G->getType() : NULL;
+  AMDILConstPtr constAttr;
+  constAttr.name = G->getName();
+  constAttr.size = getTypeSize(Ty, true);
+  constAttr.base = GV;
+  constAttr.isArgument = false;
+  constAttr.isArray = true;
+  constAttr.offset = 0;
+  constAttr.cbNum = 0;
+  constAttr.usesHardware = false;
+  mConstMems[GV->getName()] = constAttr;
+}
+
+void AMDILModuleInfo::parseGlobalAnnotate(const GlobalValue *G) {
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(G);
+  if (!GV->hasInitializer()) {
+    return;
+  }
+  const Constant *CT = GV->getInitializer();
+  if (!CT || isa<GlobalValue>(CT)) {
+    return;
+  }
+  const ConstantArray *CA = dyn_cast<ConstantArray>(CT);
+  if (!CA) {
+    return;
+  }
+
+  unsigned int nKernels = CA->getNumOperands();
+  for (unsigned int i = 0, e = nKernels; i != e; ++i) {
+    parseKernelInformation(CA->getOperand(i));
+  }
+}
+
+void AMDILModuleInfo::parseKernelInformation(const Value *V) {
+  if (isa<GlobalValue>(V)) {
+    return;
+  }
+  const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V);
+  if (!CS) {
+    return;
+  }
+  uint32_t N = CS->getNumOperands();
+  if (N != 5) {
+    return;
+  }
+  AMDILKernel *tmp;
+
+  // The first operand is always a pointer to the AMDILKernel.
+  const Constant *CV = dyn_cast<Constant>(CS->getOperand(0));
+  llvm::StringRef AMDILKernelName = "";
+  if (CV->getNumOperands()) {
+    AMDILKernelName = (*(CV->op_begin()))->getName();
+  }
+
+  // If we have images, then we have already created the AMDILKernel and we just need
+  // to get the AMDILKernel information.
+  if (mKernels.find(AMDILKernelName) != mKernels.end()) {
+    tmp = mKernels[AMDILKernelName];
+  } else {
+    tmp = new AMDILKernel;
+    tmp->curSize = 0;
+    tmp->curRSize = 0;
+    tmp->curHWSize = 0;
+    tmp->curHWRSize = 0;
+    tmp->constSize = 0;
+    tmp->lvgv = NULL;
+    tmp->sgv = NULL;
+    memset(tmp->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB);
+  }
+
+
+  // The second operand is SGV, there can only be one so we don't need to worry
+  // about parsing out multiple data points.
+  CV = dyn_cast<Constant>(CS->getOperand(1));
+
+  llvm::StringRef sgvName;
+  if (CV && CV->getNumOperands()) {
+    sgvName = (*(CV->op_begin()))->getName();
+  }
+
+  if (mKernelArgs.find(sgvName) != mKernelArgs.end()) {
+    tmp->sgv = &mKernelArgs[sgvName];
+  }
+  // The third operand is FGV, which is skipped.
+  // The fourth operand is LVGV.
+  // There can be multiple local arrays, so we
+  // need to handle each one separately.
+  CV = dyn_cast<Constant>(CS->getOperand(3));
+  llvm::StringRef lvgvName = "";
+  if (CV && CV->getNumOperands()) {
+    lvgvName = (*(CV->op_begin()))->getName();
+  }
+  if (mLocalArgs.find(lvgvName) != mLocalArgs.end()) {
+    AMDILLocalArg *ptr = &mLocalArgs[lvgvName];
+    tmp->lvgv = ptr;
+    llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS>::iterator ib, ie;
+    for (ib = ptr->local.begin(), ie = ptr->local.end(); ib != ie; ++ib) {
+      if ((*ib)->isRegion) {
+        if ((*ib)->isHW) {
+          (*ib)->offset = tmp->curHWRSize;
+          tmp->curHWRSize += ((*ib)->vecSize + 15) & ~15;
+        } else {
+          (*ib)->offset = tmp->curRSize;
+          tmp->curRSize += ((*ib)->vecSize + 15) & ~15;
+        }
+      } else {
+        if ((*ib)->isHW) {
+          (*ib)->offset = tmp->curHWSize;
+          tmp->curHWSize += ((*ib)->vecSize + 15) & ~15;
+        } else {
+          (*ib)->offset = tmp->curSize;
+          tmp->curSize += ((*ib)->vecSize + 15) & ~15;
+        }
+      }
+    }
+  }
+
+  // The fifth operand is NULL
+  mKernels[AMDILKernelName] = tmp;
+}
+
+AMDILKernel *
+AMDILModuleInfo::getKernel(const llvm::StringRef &name) {
+  StringMap<AMDILKernel*>::iterator iter = mKernels.find(name);
+  if (iter == mKernels.end()) {
+    return NULL;
+  } else {
+    return iter->second;
+  }
+}
+
+bool AMDILModuleInfo::isKernel(const llvm::StringRef &name) const {
+  return (mKernels.find(name) != mKernels.end());
+}
+
+bool AMDILModuleInfo::isWriteOnlyImage(const llvm::StringRef &name,
+                                          uint32_t iID) const {
+  const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return false;
+  }
+  return kiter->second->writeOnly.count(iID);
+}
+#if 0
+uint32_t
+AMDILModuleInfo::getNumWriteImages(const llvm::StringRef &name) const {
+  char *env = NULL;
+  env = getenv("GPU_DISABLE_RAW_UAV");
+  if (env && env[0] == '1') {
+    return 8;
+  }
+  const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return 0;
+  } else {
+    return kiter->second->writeOnly.size();
+  }
+}
+#endif
+bool AMDILModuleInfo::isReadOnlyImage(const llvm::StringRef &name,
+                                         uint32_t iID) const {
+  const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name);
+  if (kiter == mKernels.end()) {
+    return false;
+  }
+  return kiter->second->readOnly.count(iID);
+}
+#if 0
+bool AMDILModuleInfo::hasRWG(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    AMDILKernelAttr *ptr = iter->second->sgv;
+    if (ptr) {
+      return ptr->mHasRWG;
+    }
+  }
+  return false;
+}
+
+bool AMDILModuleInfo::hasRWR(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    AMDILKernelAttr *ptr = iter->second->sgv;
+    if (ptr) {
+      return ptr->mHasRWR;
+    }
+  }
+  return false;
+}
+
+uint32_t
+AMDILModuleInfo::getMaxGroupSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    AMDILKernelAttr *sgv = iter->second->sgv;
+    if (sgv) {
+      return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2];
+    }
+  }
+  return mSTM->getDefaultSize(0) *
+         mSTM->getDefaultSize(1) *
+         mSTM->getDefaultSize(2);
+}
+
+uint32_t
+AMDILModuleInfo::getMaxRegionSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    AMDILKernelAttr *sgv = iter->second->sgv;
+    if (sgv) {
+      return sgv->reqRegionSize[0] *
+             sgv->reqRegionSize[1] *
+             sgv->reqRegionSize[2];
+    }
+  }
+  return mSTM->getDefaultSize(0) *
+         mSTM->getDefaultSize(1) *
+         mSTM->getDefaultSize(2);
+}
+
+uint32_t AMDILModuleInfo::getRegionSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second->curRSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILModuleInfo::getLocalSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second->curSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILModuleInfo::getConstSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second->constSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILModuleInfo::getHWRegionSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second->curHWRSize;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILModuleInfo::getHWLocalSize(const llvm::StringRef &name) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end()) {
+    return iter->second->curHWSize;
+  } else {
+    return 0;
+  }
+}
+#endif
+
+int32_t AMDILModuleInfo::getArgID(const Argument *arg) {
+  DenseMap<const Argument *, int32_t>::iterator argiter = mArgIDMap.find(arg);
+  if (argiter != mArgIDMap.end()) {
+    return argiter->second;
+  } else {
+    return -1;
+  }
+}
+
+uint32_t
+AMDILModuleInfo::getRegion(const llvm::StringRef &name, uint32_t dim) const {
+  StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name);
+  if (iter != mKernels.end() && iter->second->sgv) {
+    AMDILKernelAttr *sgv = iter->second->sgv;
+    switch (dim) {
+    default: break;
+    case 0:
+    case 1:
+    case 2:
+      return sgv->reqRegionSize[dim];
+      break;
+    case 3:
+      return sgv->reqRegionSize[0] *
+             sgv->reqRegionSize[1] *
+             sgv->reqRegionSize[2];
+    };
+  }
+  switch (dim) {
+  default:
+    return 1;
+  case 3:
+    return mSTM->getDefaultSize(0) *
+           mSTM->getDefaultSize(1) *
+           mSTM->getDefaultSize(2);
+  case 2:
+  case 1:
+  case 0:
+    return mSTM->getDefaultSize(dim);
+    break;
+  };
+  return 1;
+}
+
+StringMap<AMDILConstPtr>::iterator AMDILModuleInfo::consts_begin() {
+  return mConstMems.begin();
+}
+
+
+StringMap<AMDILConstPtr>::iterator AMDILModuleInfo::consts_end() {
+  return mConstMems.end();
+}
+
+bool AMDILModuleInfo::byteStoreExists(StringRef S) const {
+  return mByteStore.find(S) != mByteStore.end();
+}
+
+uint32_t AMDILModuleInfo::getConstPtrSize(const AMDILKernel *krnl,
+                                             const llvm::StringRef &arg)
+{
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->size;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILModuleInfo::getConstPtrOff(const AMDILKernel *krnl,
+                                            const llvm::StringRef &arg)
+{
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->offset;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t AMDILModuleInfo::getConstPtrCB(const AMDILKernel *krnl,
+                                           const llvm::StringRef &arg)
+{
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->cbNum;
+  } else {
+    return 0;
+  }
+}
+
+void AMDILModuleInfo::calculateCPOffsets(const MachineFunction *MF,
+                                            AMDILKernel *krnl)
+{
+  const MachineConstantPool *MCP = MF->getConstantPool();
+  if (!MCP) {
+    return;
+  }
+  const std::vector<MachineConstantPoolEntry> consts = MCP->getConstants();
+  size_t numConsts = consts.size();
+  for (size_t x = 0; x < numConsts; ++x) {
+    krnl->CPOffsets.push_back(
+        std::make_pair<uint32_t, const Constant*>(
+          mCurrentCPOffset, consts[x].Val.ConstVal));
+    size_t curSize = getTypeSize(consts[x].Val.ConstVal->getType(), true);
+    // Align the size to the vector boundary
+    curSize = (curSize + 15) & (~15);
+    mCurrentCPOffset += curSize;
+  }
+}
+
+bool AMDILModuleInfo::isConstPtrArray(const AMDILKernel *krnl,
+                                         const llvm::StringRef &arg) {
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->isArray;
+  } else {
+    return false;
+  }
+}
+
+bool AMDILModuleInfo::isConstPtrArgument(const AMDILKernel *krnl,
+                                            const llvm::StringRef &arg)
+{
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->isArgument;
+  } else {
+    return false;
+  }
+}
+
+const Value *AMDILModuleInfo::getConstPtrValue(const AMDILKernel *krnl,
+                                                  const llvm::StringRef &arg) {
+  const AMDILConstPtr *curConst = getConstPtr(krnl, arg);
+  if (curConst) {
+    return curConst->base;
+  } else {
+    return NULL;
+  }
+}
+
+static void
+dumpZeroElements(StructType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(IntegerType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(ArrayType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(VectorType * const T, llvm::raw_ostream &O, bool asBytes);
+static void
+dumpZeroElements(Type * const T, llvm::raw_ostream &O, bool asBytes);
+
+void dumpZeroElements(Type * const T, llvm::raw_ostream &O, bool asBytes) {
+  if (!T) {
+    return;
+  }
+  switch(T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::DoubleTyID:
+    if (asBytes) {
+      O << ":0:0:0:0:0:0:0:0";
+    } else {
+      O << ":0";
+    }
+    break;
+  case Type::FloatTyID:
+  case Type::PointerTyID:
+  case Type::FunctionTyID:
+    if (asBytes) {
+      O << ":0:0:0:0";
+    } else {
+      O << ":0";
+    }
+    break;
+  case Type::IntegerTyID:
+    dumpZeroElements(dyn_cast<IntegerType>(T), O, asBytes);
+    break;
+  case Type::StructTyID:
+    {
+      const StructType *ST = cast<StructType>(T);
+      if (!ST->isOpaque()) {
+        dumpZeroElements(dyn_cast<StructType>(T), O, asBytes);
+      } else { // A pre-LLVM 3.0 opaque type
+        if (asBytes) {
+          O << ":0:0:0:0";
+        } else {
+          O << ":0";
+        }
+      }
+    }
+    break;
+  case Type::ArrayTyID:
+    dumpZeroElements(dyn_cast<ArrayType>(T), O, asBytes);
+    break;
+  case Type::VectorTyID:
+    dumpZeroElements(dyn_cast<VectorType>(T), O, asBytes);
+    break;
+  };
+}
+
+void
+dumpZeroElements(StructType * const ST, llvm::raw_ostream &O, bool asBytes) {
+  if (!ST) {
+    return;
+  }
+  Type *curType;
+  StructType::element_iterator eib = ST->element_begin();
+  StructType::element_iterator eie = ST->element_end();
+  for (;eib != eie; ++eib) {
+    curType = *eib;
+    dumpZeroElements(curType, O, asBytes);
+  }
+}
+
+void
+dumpZeroElements(IntegerType * const IT, llvm::raw_ostream &O, bool asBytes) {
+  if (asBytes) {
+    unsigned byteWidth = (IT->getBitWidth() >> 3);
+    for (unsigned x = 0; x < byteWidth; ++x) {
+      O << ":0";
+    }
+  }
+}
+
+void
+dumpZeroElements(ArrayType * const AT, llvm::raw_ostream &O, bool asBytes) {
+  size_t size = AT->getNumElements();
+  for (size_t x = 0; x < size; ++x) {
+    dumpZeroElements(AT->getElementType(), O, asBytes);
+  }
+}
+
+void
+dumpZeroElements(VectorType * const VT, llvm::raw_ostream &O, bool asBytes) {
+  size_t size = VT->getNumElements();
+  for (size_t x = 0; x < size; ++x) {
+    dumpZeroElements(VT->getElementType(), O, asBytes);
+  }
+}
+
+void AMDILModuleInfo::printConstantValue(const Constant *CAval,
+                                            llvm::raw_ostream &O, bool asBytes) {
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CAval)) {
+    bool isDouble = &CFP->getValueAPF().getSemantics() == &APFloat::IEEEdouble;
+    if (isDouble) {
+      double val = CFP->getValueAPF().convertToDouble();
+      union dtol_union {
+        double d;
+        uint64_t l;
+        char c[8];
+      } conv;
+      conv.d = val;
+      if (!asBytes) {
+        O << ":";
+        O.write_hex(conv.l);
+      } else {
+        for (int i = 0; i < 8; ++i) {
+          O << ":";
+          O.write_hex((unsigned)conv.c[i] & 0xFF);
+        }
+      }
+    } else {
+      float val = CFP->getValueAPF().convertToFloat();
+      union ftoi_union {
+        float f;
+        uint32_t u;
+        char c[4];
+      } conv;
+      conv.f = val;
+      if (!asBytes) {
+        O << ":";
+        O.write_hex(conv.u);
+      } else {
+        for (int i = 0; i < 4; ++i) {
+          O << ":";
+          O.write_hex((unsigned)conv.c[i] & 0xFF);
+        }
+      }
+    }
+  } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CAval)) {
+    uint64_t zVal = CI->getValue().getZExtValue();
+    if (!asBytes) {
+      O << ":";
+      O.write_hex(zVal);
+    } else {
+      switch (CI->getBitWidth()) {
+      default:
+        {
+          union ltob_union {
+            uint64_t l;
+            char c[8];
+          } conv;
+          conv.l = zVal;
+          for (int i = 0; i < 8; ++i) {
+            O << ":";
+            O.write_hex((unsigned)conv.c[i] & 0xFF);
+          }
+        }
+        break;
+      case 8:
+        O << ":";
+        O.write_hex(zVal & 0xFF);
+        break;
+      case 16:
+        {
+          union stob_union {
+            uint16_t s;
+            char c[2];
+          } conv;
+          conv.s = (uint16_t)zVal;
+          O << ":";
+          O.write_hex((unsigned)conv.c[0] & 0xFF);
+          O << ":";
+          O.write_hex((unsigned)conv.c[1] & 0xFF);
+        }
+        break;
+      case 32:
+        {
+          union itob_union {
+            uint32_t i;
+            char c[4];
+          } conv;
+          conv.i = (uint32_t)zVal;
+          for (int i = 0; i < 4; ++i) {
+            O << ":";
+            O.write_hex((unsigned)conv.c[i] & 0xFF);
+          }
+        }
+        break;
+      }
+    }
+  } else if (const ConstantVector *CV = dyn_cast<ConstantVector>(CAval)) {
+    int y = CV->getNumOperands()-1;
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CV->getOperand(x), O, asBytes);
+    }
+    printConstantValue(CV->getOperand(x), O, asBytes);
+  } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CAval)) {
+    int y = CS->getNumOperands();
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CS->getOperand(x), O, asBytes);
+    }
+  } else if (const ConstantAggregateZero *CAZ
+      = dyn_cast<ConstantAggregateZero>(CAval)) {
+    int y = CAZ->getNumOperands();
+    if (y > 0) {
+      int x = 0;
+      for (; x < y; ++x) {
+        printConstantValue((llvm::Constant *)CAZ->getOperand(x),
+            O, asBytes);
+      }
+    } else {
+      if (asBytes) {
+        dumpZeroElements(CAval->getType(), O, asBytes);
+      } else {
+        int y = getNumElements(CAval->getType())-1;
+        for (int x = 0; x < y; ++x) {
+          O << ":0";
+        }
+        O << ":0";
+      }
+    }
+  } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CAval)) {
+    int y = CA->getNumOperands();
+    int x = 0;
+    for (; x < y; ++x) {
+      printConstantValue(CA->getOperand(x), O, asBytes);
+    }
+  } else if (dyn_cast<ConstantPointerNull>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else if (dyn_cast<ConstantExpr>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else if (dyn_cast<UndefValue>(CAval)) {
+    O << ":0";
+    //assert(0 && "Hit condition which was not expected");
+  } else {
+    assert(0 && "Hit condition which was not expected");
+  }
+}
+#if 0
+static bool isStruct(Type * const T)
+{
+  if (!T) {
+    return false;
+  }
+  switch (T->getTypeID()) {
+  default:
+    return false;
+  case Type::PointerTyID:
+    return isStruct(T->getContainedType(0));
+  case Type::StructTyID:
+    return true;
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+    return isStruct(dyn_cast<SequentialType>(T)->getElementType());
+  };
+}
+
+void AMDILModuleInfo::dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km,
+                                      uint32_t id) {
+  uint32_t size = 0;
+  for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(),
+      cme = consts_end(); cmb != cme; ++cmb) {
+    if (id == cmb->second.cbNum) {
+      size += (cmb->second.size + 15) & (~15);
+    }
+  }
+  if (id == 0) {
+    O << ";#DATASTART:" << (size + mCurrentCPOffset) << "\n";
+    if (mCurrentCPOffset) {
+      for (StringMap<AMDILKernel*>::iterator kcpb = mKernels.begin(),
+          kcpe = mKernels.end(); kcpb != kcpe; ++kcpb) {
+        const AMDILKernel *k = kcpb->second;
+        size_t numConsts = k->CPOffsets.size();
+        for (size_t x = 0; x < numConsts; ++x) {
+          size_t offset = k->CPOffsets[x].first;
+          const Constant *C = k->CPOffsets[x].second;
+          Type *Ty = C->getType();
+          size_t size = (isStruct(Ty) ? getTypeSize(Ty, true)
+                                      : getNumElements(Ty));
+          O << ";#" << km->getTypeName(Ty, symTab) << ":";
+          O << offset << ":" << size ;
+          printConstantValue(C, O, isStruct(Ty));
+          O << "\n";
+        }
+      }
+    }
+  } else {
+    O << ";#DATASTART:" << id << ":" << size << "\n";
+  }
+
+  for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(), cme = consts_end();
+       cmb != cme; ++cmb) {
+    if (cmb->second.cbNum != id) {
+      continue;
+    }
+    const GlobalVariable *G = dyn_cast<GlobalVariable>(cmb->second.base);
+    Type *Ty = (G) ? G->getType() : NULL;
+    size_t offset = cmb->second.offset;
+    const Constant *C = (G) ? G->getInitializer() : NULL;
+    size_t size = (isStruct(Ty)
+        ? getTypeSize(Ty, true)
+        : getNumElements(Ty));
+    O << ";#" << km->getTypeName(Ty, symTab) << ":";
+    if (!id) {
+      O << (offset + mCurrentCPOffset) << ":" << size;
+    } else {
+      O << offset << ":" << size;
+    }
+    if (C) {
+      printConstantValue(C, O, isStruct(Ty));
+    } else {
+      assert(0 && "Cannot have a constant pointer"
+          " without an initializer!");
+    }
+    O <<"\n";
+  }
+  if (id == 0) {
+    O << ";#DATAEND\n";
+  } else {
+    O << ";#DATAEND:" << id << "\n";
+  }
+}
+
+void
+AMDILModuleInfo::dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km) {
+  if (mConstMems.empty() && !mCurrentCPOffset) {
+    return;
+  } else {
+    llvm::DenseSet<uint32_t> const_set;
+    for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(), cme = consts_end();
+         cmb != cme; ++cmb) {
+      const_set.insert(cmb->second.cbNum);
+    }
+    if (mCurrentCPOffset) {
+      const_set.insert(0);
+    }
+    for (llvm::DenseSet<uint32_t>::iterator setb = const_set.begin(),
+           sete = const_set.end(); setb != sete; ++setb) {
+      dumpDataToCB(O, km, *setb);
+    }
+  }
+}
+#endif
+/// Create a function ID if it is not known or return the known
+/// function ID.
+uint32_t AMDILModuleInfo::getOrCreateFunctionID(const GlobalValue* func) {
+  if (func->getName().size()) {
+    return getOrCreateFunctionID(func->getName());
+  }
+  uint32_t id;
+  if (mFuncPtrNames.find(func) == mFuncPtrNames.end()) {
+    id = mFuncPtrNames.size() + RESERVED_FUNCS + mFuncNames.size();
+    mFuncPtrNames[func] = id;
+  } else {
+    id = mFuncPtrNames[func];
+  }
+  return id;
+}
+
+uint32_t AMDILModuleInfo::getOrCreateFunctionID(const std::string &func) {
+  uint32_t id;
+  if (mFuncNames.find(func) == mFuncNames.end()) {
+    id = mFuncNames.size() + RESERVED_FUNCS + mFuncPtrNames.size();
+    mFuncNames[func] = id;
+  } else {
+    id = mFuncNames[func];
+  }
+  return id;
+}
diff --git a/src/gallium/drivers/radeon/AMDILModuleInfo.h b/src/gallium/drivers/radeon/AMDILModuleInfo.h
new file mode 100644 (file)
index 0000000..5111b87
--- /dev/null
@@ -0,0 +1,159 @@
+//===--------------- AMDILModuleInfo.h -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for AMDIL targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _AMDIL_MACHINE_MODULE_INFO_H_
+#define _AMDIL_MACHINE_MODULE_INFO_H_
+#include "AMDIL.h"
+#include "AMDILKernel.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Module.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <set>
+#include <string>
+
+namespace llvm {
+  class AMDILKernel;
+  class Argument;
+  class TypeSymbolTable;
+  class GlobalValue;
+  class MachineFunction;
+
+  class AMDILModuleInfo : public MachineModuleInfoImpl {
+    protected:
+      const MachineModuleInfo *mMMI;
+    public:
+      AMDILModuleInfo(const MachineModuleInfo &);
+      virtual ~AMDILModuleInfo();
+
+      /// Process the given module and parse out the global variable metadata
+      /// passed down from the frontend compiler.
+      void processModule(const Module *MF, const AMDILTargetMachine* mTM);
+
+      /// Returns true if the image ID corresponds to a read only image.
+      bool isReadOnlyImage(const llvm::StringRef &name, uint32_t iID) const;
+
+      /// Returns true if the image ID corresponds to a write only image.
+      bool isWriteOnlyImage(const llvm::StringRef &name, uint32_t iID) const;
+
+      /// Gets the region size of the kernel for the given dimension.
+      uint32_t getRegion(const llvm::StringRef &name, uint32_t dim) const;
+
+      /// Get the offset of the array for the kernel.
+      int32_t getArrayOffset(const llvm::StringRef &name) const;
+
+      /// Get the offset of the const memory for the kernel.
+      int32_t getConstOffset(const llvm::StringRef &name) const;
+
+      /// Get the boolean value if this particular constant uses HW or not.
+      bool getConstHWBit(const llvm::StringRef &name) const;
+
+      /// Get a reference to the kernel metadata information for the given function
+      /// name.
+      AMDILKernel *getKernel(const llvm::StringRef &name);
+      bool isKernel(const llvm::StringRef &name) const;
+
+      /// Dump the data section to the output stream for the given kernel.
+      //void dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km);
+
+      /// Iterate through the constants that are global to the compilation unit.
+      StringMap<AMDILConstPtr>::iterator consts_begin();
+      StringMap<AMDILConstPtr>::iterator consts_end();
+
+      /// Query if the kernel has a byte store.
+      bool byteStoreExists(llvm::StringRef S) const;
+
+      /// Query if the constant pointer is an argument.
+      bool isConstPtrArgument(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Query if the constant pointer is an array that is globally scoped.
+      bool isConstPtrArray(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Query the size of the constant pointer.
+      uint32_t getConstPtrSize(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Query the offset of the constant pointer.
+      uint32_t getConstPtrOff(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Query the constant buffer number for a constant pointer.
+      uint32_t getConstPtrCB(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Query the Value* that the constant pointer originates from.
+      const Value *getConstPtrValue(const AMDILKernel *krnl, const llvm::StringRef &arg);
+
+      /// Get the ID of the argument.
+      int32_t getArgID(const Argument *arg);
+
+      /// Get the unique function ID for the specific function name and create a new
+      /// unique ID if it is not found.
+      uint32_t getOrCreateFunctionID(const GlobalValue* func);
+      uint32_t getOrCreateFunctionID(const std::string& func);
+
+      /// Calculate the offsets of the constant pool for the given kernel and
+      /// machine function.
+      void calculateCPOffsets(const MachineFunction *MF, AMDILKernel *krnl);
+
+      void add_printf_offset(uint32_t offset) { mPrintfOffset += offset; }
+      uint32_t get_printf_offset() { return mPrintfOffset; }
+
+    private:
+      /// Various functions that parse global value information and store them in
+      /// the global manager. This approach is used instead of dynamic parsing as it
+      /// might require more space, but should allow caching of data that gets
+      /// requested multiple times.
+      AMDILKernelAttr parseSGV(const GlobalValue *GV);
+      AMDILLocalArg  parseLVGV(const GlobalValue *GV);
+      void parseGlobalAnnotate(const GlobalValue *G);
+      void parseImageAnnotate(const GlobalValue *G);
+      void parseConstantPtrAnnotate(const GlobalValue *G);
+      void printConstantValue(const Constant *CAval,
+          llvm::raw_ostream& O,
+          bool asByte);
+      void parseKernelInformation(const Value *V);
+      void parseAutoArray(const GlobalValue *G, bool isRegion);
+      void parseConstantPtr(const GlobalValue *G);
+      void allocateGlobalCB();
+      bool checkConstPtrsUseHW(Module::const_iterator *F);
+
+      llvm::StringMap<AMDILKernel*> mKernels;
+      llvm::StringMap<AMDILKernelAttr> mKernelArgs;
+      llvm::StringMap<AMDILArrayMem> mArrayMems;
+      llvm::StringMap<AMDILConstPtr> mConstMems;
+      llvm::StringMap<AMDILLocalArg> mLocalArgs;
+      llvm::StringMap<uint32_t> mFuncNames;
+      llvm::DenseMap<const GlobalValue*, uint32_t> mFuncPtrNames;
+      llvm::DenseMap<uint32_t, llvm::StringRef> mImageNameMap;
+      std::set<llvm::StringRef> mByteStore;
+      std::set<llvm::StringRef> mIgnoreStr;
+      llvm::DenseMap<const Argument *, int32_t> mArgIDMap;
+      const TypeSymbolTable *symTab;
+      const AMDILSubtarget *mSTM;
+      size_t mOffset;
+      uint32_t mReservedBuffs;
+      uint32_t mCurrentCPOffset;
+      uint32_t mPrintfOffset;
+  };
+
+} // end namespace llvm
+
+#endif // _AMDIL_MACHINE_MODULE_INFO_H_
+
diff --git a/src/gallium/drivers/radeon/AMDILMultiClass.td b/src/gallium/drivers/radeon/AMDILMultiClass.td
new file mode 100644 (file)
index 0000000..92691db
--- /dev/null
@@ -0,0 +1,1440 @@
+//===-- AMDILMultiClass.td - Multiclasses for AMDIL instructions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// Multiclass that handles branch instructions
+multiclass BranchConditional<SDNode Op> {
+    def _i8 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRI8:$src0),
+        "; i8 Pseudo branch instruction",
+  [(Op bb:$target, GPRI8:$src0)]>;
+    def _i16 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRI16:$src0),
+        "; i16 Pseudo branch instruction",
+  [(Op bb:$target, GPRI16:$src0)]>;
+    def _i32 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRI32:$src0),
+        "; i32 Pseudo branch instruction",
+  [(Op bb:$target, GPRI32:$src0)]>;
+    def _f32 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRF32:$src0),
+        "; f32 Pseudo branch instruction",
+  [(Op bb:$target, GPRF32:$src0)]>;
+    def _i64 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRI64:$src0),
+        "; i64 Pseudo branch instruction",
+  [(Op bb:$target, (i64 GPRI64:$src0))]>;
+    def _f64 : ILFormat<IL_OP_IFC, (outs),
+  (ins brtarget:$target, GPRF64:$src0),
+        "; f64 Pseudo branch instruction",
+  [(Op bb:$target, (f64 GPRF64:$src0))]>;
+}
+// Multiclass that handles compare instructions
+// When a definition is added here, a corresponding definition
+// needs to be added at:
+// AMDILISelLowering.cpp@EmitInstrWithCustomInserter
+multiclass Compare<string asm> {
+  def _i8 : ILFormat<IL_OP_CMP, (outs GPRI8:$dst),
+      (ins i32imm:$cc, GPRI8:$src0, GPRI8:$src1),
+      !strconcat("; i8 ", asm),
+      [(set GPRI8:$dst, (IL_cmp imm:$cc, GPRI8:$src0, GPRI8:$src1))]>;
+  def _i16 : ILFormat<IL_OP_CMP, (outs GPRI16:$dst),
+      (ins i32imm:$cc, GPRI16:$src0, GPRI16:$src1),
+      !strconcat("; i16 ", asm),
+      [(set GPRI16:$dst, (IL_cmp imm:$cc, GPRI16:$src0, GPRI16:$src1))]>;
+  def _i32 : ILFormat<IL_OP_CMP, (outs GPRI32:$dst),
+      (ins i32imm:$cc, GPRI32:$src0, GPRI32:$src1),
+      !strconcat("; i32 ", asm),
+      [(set GPRI32:$dst, (IL_cmp imm:$cc, GPRI32:$src0, GPRI32:$src1))]>;
+  def _i64 : ILFormat<IL_OP_CMP, (outs GPRI64:$dst),
+      (ins i32imm:$cc, GPRI64:$src0, GPRI64:$src1),
+      !strconcat("; i64 ", asm),
+      [(set GPRI64:$dst, (IL_cmp imm:$cc, GPRI64:$src0, GPRI64:$src1))]>;
+  def _f32 : ILFormat<IL_OP_CMP, (outs GPRF32:$dst),
+      (ins i32imm:$cc, GPRF32:$src0, GPRF32:$src1),
+      !strconcat("; f32 ", asm),
+      [(set GPRF32:$dst, (IL_cmp imm:$cc, GPRF32:$src0, GPRF32:$src1))]>;
+  def _f64 : ILFormat<IL_OP_CMP, (outs GPRF64:$dst),
+      (ins i32imm:$cc, GPRF64:$src0, GPRF64:$src1),
+      !strconcat("; f64 ", asm),
+      [(set GPRF64:$dst, (IL_cmp imm:$cc, GPRF64:$src0, GPRF64:$src1))]>;
+  def _v2i8 : ILFormat<IL_OP_CMP, (outs GPRV2I8:$dst),
+      (ins i32imm:$cc, GPRV2I8:$src0, GPRV2I8:$src1),
+      !strconcat("; i8 ", asm),
+      [(set GPRV2I8:$dst, (IL_cmp imm:$cc, GPRV2I8:$src0, GPRV2I8:$src1))]>;
+  def _v2i16 : ILFormat<IL_OP_CMP, (outs GPRV2I16:$dst),
+      (ins i32imm:$cc, GPRV2I16:$src0, GPRV2I16:$src1),
+      !strconcat("; i16 ", asm),
+      [(set GPRV2I16:$dst, (IL_cmp imm:$cc, GPRV2I16:$src0, GPRV2I16:$src1))]>;
+  def _v2i32 : ILFormat<IL_OP_CMP, (outs GPRV2I32:$dst),
+      (ins i32imm:$cc, GPRV2I32:$src0, GPRV2I32:$src1),
+      !strconcat("; i32 ", asm),
+      [(set GPRV2I32:$dst, (IL_cmp imm:$cc, GPRV2I32:$src0, GPRV2I32:$src1))]>;
+  def _v2i64 : ILFormat<IL_OP_CMP, (outs GPRV2I64:$dst),
+      (ins i32imm:$cc, GPRV2I64:$src0, GPRV2I64:$src1),
+      !strconcat("; i64 ", asm),
+      [(set GPRV2I64:$dst, (IL_cmp imm:$cc, GPRV2I64:$src0, GPRV2I64:$src1))]>;
+  def _v2f32 : ILFormat<IL_OP_CMP, (outs GPRV2F32:$dst),
+      (ins i32imm:$cc, GPRV2F32:$src0, GPRV2F32:$src1),
+      !strconcat("; f32 ", asm),
+      [(set GPRV2F32:$dst, (IL_cmp imm:$cc, GPRV2F32:$src0, GPRV2F32:$src1))]>;
+  def _v2f64 : ILFormat<IL_OP_CMP, (outs GPRV2F64:$dst),
+      (ins i32imm:$cc, GPRV2F64:$src0, GPRV2F64:$src1),
+      !strconcat("; f64 ", asm),
+      [(set GPRV2F64:$dst, (IL_cmp imm:$cc, GPRV2F64:$src0, GPRV2F64:$src1))]>;
+  def _v4i8 : ILFormat<IL_OP_CMP, (outs GPRV4I8:$dst),
+      (ins i32imm:$cc, GPRV4I8:$src0, GPRV4I8:$src1),
+      !strconcat("; i8 ", asm),
+      [(set GPRV4I8:$dst, (IL_cmp imm:$cc, GPRV4I8:$src0, GPRV4I8:$src1))]>;
+  def _v4i16 : ILFormat<IL_OP_CMP, (outs GPRV4I16:$dst),
+      (ins i32imm:$cc, GPRV4I16:$src0, GPRV4I16:$src1),
+      !strconcat("; i16 ", asm),
+      [(set GPRV4I16:$dst, (IL_cmp imm:$cc, GPRV4I16:$src0, GPRV4I16:$src1))]>;
+  def _v4i32 : ILFormat<IL_OP_CMP, (outs GPRV4I32:$dst),
+      (ins i32imm:$cc, GPRV4I32:$src0, GPRV4I32:$src1),
+      !strconcat("; i32 ", asm),
+      [(set GPRV4I32:$dst, (IL_cmp imm:$cc, GPRV4I32:$src0, GPRV4I32:$src1))]>;
+  def _v4f32 : ILFormat<IL_OP_CMP, (outs GPRV4F32:$dst),
+      (ins i32imm:$cc, GPRV4F32:$src0, GPRV4F32:$src1),
+      !strconcat("; f32 ", asm),
+      [(set GPRV4F32:$dst, (IL_cmp imm:$cc, GPRV4F32:$src0, GPRV4F32:$src1))]>;
+}
+
+// Multiclass that handles constant values
+multiclass ILConstant<string asm> {
+  def _i8 : ILFormat<IL_OP_MOV, (outs GPRI8:$dst),
+      (ins i8imm:$val),
+      asm, [(set GPRI8:$dst, imm:$val)]>;
+
+  //  def _v2i8 : ILFormat<IL_OP_MOV, (outs GPRV2I8:$dst),
+  //      (ins i8imm:$val),
+  //      asm, [(set GPRV2I8:$dst, GPRV2I8:$val)]>;
+
+  //  def _v4i8 : ILFormat<IL_OP_MOV, (outs GPRV4I8:$dst),
+  //      (ins i8imm:$val),
+  //      asm, [(set GPRV4I8:$dst, GPRV4I8:$val)]>;
+
+  def _i16 : ILFormat<IL_OP_MOV, (outs GPRI16:$dst),
+      (ins i16imm:$val),
+      asm, [(set GPRI16:$dst, imm:$val)]>;
+
+  //  def _v2i16 : ILFormat<IL_OP_MOV, (outs GPRV2I16:$dst),
+  //      (ins i16imm:$val),
+  //      asm, [(set GPRV2I16:$dst, GPRV2I16:$val)]>;
+
+  //  def _v4i16 : ILFormat<IL_OP_MOV, (outs GPRV4I16:$dst),
+  //      (ins i16imm:$val),
+  //      asm, [(set GPRV4I16:$dst, GPRV4I16:$val)]>;
+
+  def _i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
+      (ins i32imm:$val),
+      asm, [(set GPRI32:$dst, imm:$val)]>;
+
+  //  def _v2i32 : ILFormat<IL_OP_MOV, (outs GPRV2I32:$dst),
+  //      (ins i32imm:$val),
+  //      asm, [(set GPRV2I32:$dst, GPRV2I32:$val)]>;
+
+  //  def _v4i32 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+  //      (ins GPRV4I32:$val),
+  //      asm, [(set GPRV4I32:$dst, GPRV4I32:$val)]>;
+
+  def _i64 : ILFormat<IL_OP_MOV, (outs GPRI64:$dst),
+      (ins i64imm:$val),
+      asm, [(set GPRI64:$dst, imm:$val)]>;
+
+  //  def _v2i64 : ILFormat<IL_OP_MOV, (outs GPRV2I64:$dst),
+  //      (ins i64imm:$val),
+  //      asm, [(set GPRV2I64:$dst, GPRV2I64:$val)]>;
+
+  def _f32 : ILFormat<IL_OP_MOV, (outs GPRF32:$dst),
+      (ins f32imm:$val),
+      asm, [(set GPRF32:$dst, fpimm:$val)]>;
+
+  //  def _v2f32 : ILFormat<IL_OP_MOV, (outs GPRV2F32:$dst),
+  //      (ins f32imm:$val),
+  //      asm, [(set GPRV2F32:$dst, GPRV2F32:$val)]>;
+
+  //  def _v4f32 : ILFormat<IL_OP_MOV, (outs GPRV4F32:$dst),
+  //      (ins f32imm:$val),
+  //      asm, [(set GPRV4F32:$dst, GPRV4F32:$val)]>;
+
+  def _f64 : ILFormat<IL_OP_MOV, (outs GPRF64:$dst),
+      (ins f64imm:$val),
+      asm, [(set GPRF64:$dst, fpimm:$val)]>;
+
+  //  def _v2f64 : ILFormat<IL_OP_MOV, (outs GPRV2F64:$dst),
+  //      (ins f64imm:$val),
+  //      asm, [(set GPRV2F64:$dst, GPRV2F64:$val)]>;
+
+}
+
+// Multiclass that handles truncating stores to global memory
+multiclass GTRUNCSTORE<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to local memory
+multiclass LTRUNCSTORE<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to private memory
+multiclass PTRUNCSTORE<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to region memory
+multiclass RTRUNCSTORE<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
+}
+
+// Multiclass that handles memory store operations
+multiclass STORE<string asm, PatFrag OpNode> {
+  def _i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI8:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI8:$val, ADDR:$ptr)]>;
+  def _i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI16:$val, ADDR:$ptr)]>;
+  def _i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI32:$val, ADDR:$ptr)]>;
+  def _f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRF32:$val, ADDR:$ptr)]>;
+  def _i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI64:$val, ADDR:$ptr)]>;
+  def _f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRF64:$val, ADDR:$ptr)]>;
+  def _v4f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4F32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4F32:$val, ADDR:$ptr)]>;
+  def _v2f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2F32:$val, ADDR:$ptr)]>;
+  def _v4i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I32:$val, ADDR:$ptr)]>;
+  def _v2i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I8:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I8:$val, ADDR:$ptr)]>;
+  def _v2i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I16:$val, ADDR:$ptr)]>;
+  def _v4i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I8:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I8:$val, ADDR:$ptr)]>;
+  def _v4i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I16:$val, ADDR:$ptr)]>;
+  def _v2i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I32:$val, ADDR:$ptr)]>;
+  def _v2f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2F64:$val, ADDR:$ptr)]>;
+  def _v2i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I64:$val, ADDR:$ptr)]>;
+}
+
+// Multiclass that handles load operations
+multiclass LOAD<string asm, PatFrag OpNode> {
+  def _i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI8:$dst, (OpNode ADDR:$ptr))]>;
+  def _i16 : OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI16:$dst, (OpNode ADDR:$ptr))]>;
+  def _i32 : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI32:$dst, (OpNode ADDR:$ptr))]>;
+  def _f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRF32:$dst, (OpNode ADDR:$ptr))]>;
+  def _i64 : OneInOneOut<IL_OP_MOV, (outs GPRI64:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI64:$dst, (OpNode ADDR:$ptr))]>;
+  def _f64 : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRF64:$dst, (OpNode ADDR:$ptr))]>;
+  def _v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4F32:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2f32 : OneInOneOut<IL_OP_MOV, (outs GPRV2F32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2F32:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2F64:$dst, (OpNode ADDR:$ptr))]>;
+  def _v4i32 : OneInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I32:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2i8 : OneInOneOut<IL_OP_MOV, (outs GPRV2I8:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I8:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2i16 : OneInOneOut<IL_OP_MOV, (outs GPRV2I16:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I16:$dst, (OpNode ADDR:$ptr))]>;
+  def _v4i8 : OneInOneOut<IL_OP_MOV, (outs GPRV4I8:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I8:$dst, (OpNode ADDR:$ptr))]>;
+  def _v4i16 : OneInOneOut<IL_OP_MOV, (outs GPRV4I16:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I16:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2i32 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I32:$dst, (OpNode ADDR:$ptr))]>;
+  def _v2i64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I64:$dst), (ins MEMI32:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I64:$dst, (OpNode ADDR:$ptr))]>;
+}
+
+// Multiclass that handles truncating stores to global memory with 64-bit addressing
+multiclass GTRUNCSTORE64<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(global_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to local memory with 64-bit addressing
+multiclass LTRUNCSTORE64<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(local_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to private memory with 64-bit addressing
+multiclass PTRUNCSTORE64<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(private_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+}
+
+// Multiclass that handles truncating stores to region memory with 64-bit addressing
+multiclass RTRUNCSTORE64<string asm> {
+  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>;
+  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>;
+  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>;
+  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>;
+  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>;
+  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>;
+  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>;
+  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(region_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>;
+}
+
+// Multiclass that handles store operations with 64-bit pointers
+multiclass STORE64<string asm, PatFrag OpNode> {
+  def _i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI8:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI8:$val, ADDR64:$ptr)]>;
+  def _i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI16:$val, ADDR64:$ptr)]>;
+  def _i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI32:$val, ADDR64:$ptr)]>;
+  def _f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRF32:$val, ADDR64:$ptr)]>;
+  def _i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRI64:$val, ADDR64:$ptr)]>;
+  def _f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRF64:$val, ADDR64:$ptr)]>;
+  def _v4f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4F32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4F32:$val, ADDR64:$ptr)]>;
+  def _v2f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2F32:$val, ADDR64:$ptr)]>;
+  def _v4i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I32:$val, ADDR64:$ptr)]>;
+  def _v2i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I8:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I8:$val, ADDR64:$ptr)]>;
+  def _v2i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I16:$val, ADDR64:$ptr)]>;
+  def _v4i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I8:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I8:$val, ADDR64:$ptr)]>;
+  def _v4i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV4I16:$val, ADDR64:$ptr)]>;
+  def _v2i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I32:$val, ADDR64:$ptr)]>;
+  def _v2f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2F64:$val, ADDR64:$ptr)]>;
+  def _v2i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr),
+      !strconcat(asm, " $val $ptr"),
+      [(OpNode GPRV2I64:$val, ADDR64:$ptr)]>;
+}
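+// Usage note: multiclasses such as STORE64 are intended to be instantiated
+// with defm, which expands one definition per type suffix. A sketch with
+// hypothetical instruction/PatFrag names:
+//   defm GLOBALSTORE64 : STORE64<"mov", global_store>;
+// would define GLOBALSTORE64_i8, GLOBALSTORE64_i16, ..., GLOBALSTORE64_v2i64.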
+
+// Multiclass that handles load operations with 64-bit pointers
+multiclass LOAD64<string asm, PatFrag OpNode> {
+  def _i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI8:$dst, (OpNode ADDR64:$ptr))]>;
+  def _i16 : OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI16:$dst, (OpNode ADDR64:$ptr))]>;
+  def _i32 : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRF32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _i64 : OneInOneOut<IL_OP_MOV, (outs GPRI64:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRI64:$dst, (OpNode ADDR64:$ptr))]>;
+  def _f64 : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRF64:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4F32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2f32 : OneInOneOut<IL_OP_MOV, (outs GPRV2F32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2F32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2F64:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v4i32 : OneInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2i8 : OneInOneOut<IL_OP_MOV, (outs GPRV2I8:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I8:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2i16 : OneInOneOut<IL_OP_MOV, (outs GPRV2I16:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I16:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v4i8 : OneInOneOut<IL_OP_MOV, (outs GPRV4I8:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I8:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v4i16 : OneInOneOut<IL_OP_MOV, (outs GPRV4I16:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV4I16:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2i32 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I32:$dst, (OpNode ADDR64:$ptr))]>;
+  def _v2i64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I64:$dst), (ins MEMI64:$ptr),
+      !strconcat(asm, " $dst $ptr"),
+      [(set GPRV2I64:$dst, (OpNode ADDR64:$ptr))]>;
+}
+
+// Only scalar types should generate flow control
+multiclass BranchInstr<ILOpCode opc> {
+  def _i8 : UnaryOpNoRet<opc, (outs), (ins GPRI8:$src),
+      !strconcat(opc.Text, " $src"), []>;
+  def _i16 : UnaryOpNoRet<opc, (outs), (ins GPRI16:$src),
+      !strconcat(opc.Text, " $src"), []>;
+  def _i32 : UnaryOpNoRet<opc, (outs), (ins GPRI32:$src),
+      !strconcat(opc.Text, " $src"), []>;
+  def _i64 : UnaryOpNoRet<opc, (outs), (ins GPRI64:$src),
+      !strconcat(opc.Text, " $src"), []>;
+  def _f32 : UnaryOpNoRet<opc, (outs), (ins GPRF32:$src),
+      !strconcat(opc.Text, " $src"), []>;
+  def _f64 : UnaryOpNoRet<opc, (outs), (ins GPRF64:$src),
+      !strconcat(opc.Text, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<ILOpCode opc> {
+  def _i8 : BinaryOpNoRet<opc, (outs),  (ins  GPRI8:$src0,  GPRI8:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+  def _i16 : BinaryOpNoRet<opc, (outs), (ins GPRI16:$src0, GPRI16:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+  def _i32 : BinaryOpNoRet<opc, (outs), (ins GPRI32:$src0, GPRI32:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+  def _i64 : BinaryOpNoRet<opc, (outs), (ins GPRI64:$src0, GPRI64:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+  def _f32 : BinaryOpNoRet<opc, (outs), (ins GPRF32:$src0, GPRF32:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+  def _f64 : BinaryOpNoRet<opc, (outs), (ins GPRF64:$src0, GPRF64:$src1),
+      !strconcat(opc.Text, " $src0, $src1"), []>;
+}
+
+// Multiclass that handles the various vector extract patterns
+multiclass VectorExtract<SDNode OpNode> {
+  def _v2f64 : ExtractVectorClass<GPRF64, GPRV2F64, OpNode>;
+  def _v4f32: ExtractVectorClass<GPRF32, GPRV4F32, OpNode>;
+  def _v2f32 : ExtractVectorClass<GPRF32, GPRV2F32, OpNode>;
+  def _v2i64 : ExtractVectorClass<GPRI64, GPRV2I64, OpNode>;
+  def _v4i8 : ExtractVectorClass<GPRI8, GPRV4I8, OpNode>;
+  def _v4i16 : ExtractVectorClass<GPRI16, GPRV4I16, OpNode>;
+  def _v4i32 : ExtractVectorClass<GPRI32, GPRV4I32, OpNode>;
+  def _v2i8 : ExtractVectorClass<GPRI8, GPRV2I8, OpNode>;
+  def _v2i16 : ExtractVectorClass<GPRI16, GPRV2I16, OpNode>;
+  def _v2i32 : ExtractVectorClass<GPRI32, GPRV2I32, OpNode>;
+}
+
+multiclass VectorConcat<SDNode OpNode> {
+  def _v2f64 : VectorConcatClass<GPRV2F64, GPRF64, OpNode>;
+  def _v2i64 : VectorConcatClass<GPRV2I64, GPRI64, OpNode>;
+  def _v4f32 : VectorConcatClass<GPRV4F32, GPRV2F32, OpNode>;
+  def _v4i32 : VectorConcatClass<GPRV4I32, GPRV2I32, OpNode>;
+  def _v4i16 : VectorConcatClass<GPRV4I16, GPRV2I16, OpNode>;
+  def _v4i8 : VectorConcatClass<GPRV4I8, GPRV2I8, OpNode>;
+  def _v2f32 : VectorConcatClass<GPRV2F32, GPRF32, OpNode>;
+  def _v2i32 : VectorConcatClass<GPRV2I32, GPRI32, OpNode>;
+  def _v2i16 : VectorConcatClass<GPRV2I16, GPRI16, OpNode>;
+  def _v2i8 : VectorConcatClass<GPRV2I8, GPRI8, OpNode>;
+}
+
+// Multiclass that handles the various vector insert patterns
+multiclass VectorInsert<SDNode OpNode> {
+  def _v2f64 : InsertVectorClass<IL_OP_I_ADD, GPRV2F64,
+      GPRF64, OpNode, "iadd">;
+  def _v4f32: InsertVectorClass<IL_OP_I_ADD, GPRV4F32,
+      GPRF32, OpNode, "iadd">;
+  def _v2f32 : InsertVectorClass<IL_OP_I_ADD, GPRV2F32,
+      GPRF32, OpNode, "iadd">;
+  def _v2i64 : InsertVectorClass<IL_OP_I_ADD, GPRV2I64,
+      GPRI64, OpNode, "iadd">;
+  def _v4i8 : InsertVectorClass<IL_OP_I_ADD, GPRV4I8,
+      GPRI8, OpNode, "iadd">;
+  def _v4i16 : InsertVectorClass<IL_OP_I_ADD, GPRV4I16,
+      GPRI16, OpNode, "iadd">;
+  def _v4i32 : InsertVectorClass<IL_OP_I_ADD, GPRV4I32,
+      GPRI32, OpNode, "iadd">;
+  def _v2i8 : InsertVectorClass<IL_OP_I_ADD, GPRV2I8,
+      GPRI8, OpNode, "iadd">;
+  def _v2i16 : InsertVectorClass<IL_OP_I_ADD, GPRV2I16,
+      GPRI16, OpNode, "iadd">;
+  def _v2i32 : InsertVectorClass<IL_OP_I_ADD, GPRV2I32,
+      GPRI32, OpNode, "iadd">;
+}
+
+// Generic multiclass that handles math instructions for OneInOneOut
+// instruction patterns
+multiclass UnaryOpMC<ILOpCode OpCode, SDNode OpNode> {
+  def _i8    : UnaryOp<OpCode, OpNode, GPRI8, GPRI8>;
+  def _i16    : UnaryOp<OpCode, OpNode, GPRI16, GPRI16>;
+  def _i32    : UnaryOp<OpCode, OpNode, GPRI32, GPRI32>;
+  def _f32    : UnaryOp<OpCode, OpNode, GPRF32, GPRF32>;
+  def _f64    : UnaryOp<OpCode, OpNode, GPRF64, GPRF64>;
+  def _i64    : UnaryOp<OpCode, OpNode, GPRI64, GPRI64>;
+  def _v4f32: UnaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32>;
+  def _v4i16  : UnaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16>;
+  def _v4i8  : UnaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8>;
+  def _v4i32  : UnaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32>;
+  def _v2f32  : UnaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32>;
+  def _v2i16  : UnaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16>;
+  def _v2i8  : UnaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8>;
+  def _v2i32  : UnaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32>;
+  def _v2f64  : UnaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64>;
+  def _v2i64  : UnaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64>;
+}
+multiclass UnaryOpMCVec<ILOpCode OpCode, SDNode OpNode> {
+  def _v4f32: UnaryOp<OpCode, OpNode, GPRV4F32, GPRF32>;
+  def _v4i16  : UnaryOp<OpCode, OpNode, GPRV4I16, GPRI16>;
+  def _v4i8  : UnaryOp<OpCode, OpNode, GPRV4I8, GPRI8>;
+  def _v4i32  : UnaryOp<OpCode, OpNode, GPRV4I32, GPRI32>;
+  def _v2f32  : UnaryOp<OpCode, OpNode, GPRV2F32, GPRF32>;
+  def _v2i16  : UnaryOp<OpCode, OpNode, GPRV2I16, GPRI16>;
+  def _v2i8  : UnaryOp<OpCode, OpNode, GPRV2I8, GPRI8>;
+  def _v2i32  : UnaryOp<OpCode, OpNode, GPRV2I32, GPRI32>;
+  def _v2f64  : UnaryOp<OpCode, OpNode, GPRV2F64, GPRF64>;
+  def _v2i64  : UnaryOp<OpCode, OpNode, GPRV2I64, GPRI64>;
+}
+
+multiclass UnaryOpMCf32<ILOpCode f32OpCode, SDNode OpNode> {
+  def _f32   : UnaryOp<f32OpCode, OpNode, GPRF32, GPRF32>;
+  def _v4f32 : UnaryOp<f32OpCode, OpNode, GPRV4F32, GPRV4F32>;
+  def _v2f32 : UnaryOp<f32OpCode, OpNode, GPRV2F32, GPRV2F32>;
+}
+
+multiclass UnaryOpMCi32<ILOpCode i32OpCode, SDNode OpNode> {
+  def _i8    : UnaryOp<i32OpCode, OpNode, GPRI8, GPRI8>;
+  def _i16   : UnaryOp<i32OpCode, OpNode, GPRI16, GPRI16>;
+  def _i32   : UnaryOp<i32OpCode, OpNode, GPRI32, GPRI32>;
+  def _v4i16 : UnaryOp<i32OpCode, OpNode, GPRV4I16, GPRV4I16>;
+  def _v4i8  : UnaryOp<i32OpCode, OpNode, GPRV4I8, GPRV4I8>;
+  def _v4i32 : UnaryOp<i32OpCode, OpNode, GPRV4I32, GPRV4I32>;
+  def _v2i16 : UnaryOp<i32OpCode, OpNode, GPRV2I16, GPRV2I16>;
+  def _v2i8  : UnaryOp<i32OpCode, OpNode, GPRV2I8, GPRV2I8>;
+  def _v2i32 : UnaryOp<i32OpCode, OpNode, GPRV2I32, GPRV2I32>;
+}
+
+multiclass BinaryOpMC<ILOpCode OpCode, SDNode OpNode> {
+  def _i8    : BinaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8>;
+  def _i16    : BinaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16>;
+  def _i32    : BinaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32>;
+  def _f32    : BinaryOp<OpCode, OpNode, GPRF32, GPRF32, GPRF32>;
+  def _f64    : BinaryOp<OpCode, OpNode, GPRF64, GPRF64, GPRF64>;
+  def _i64    : BinaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64>;
+  def _v4f32: BinaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32, GPRV4F32>;
+  def _v4i16  : BinaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16, GPRV4I16>;
+  def _v4i8  : BinaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8, GPRV4I8>;
+  def _v4i32  : BinaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32, GPRV4I32>;
+  def _v2f32  : BinaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32, GPRV2F32>;
+  def _v2i16  : BinaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16, GPRV2I16>;
+  def _v2i8  : BinaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8, GPRV2I8>;
+  def _v2i32  : BinaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32, GPRV2I32>;
+  def _v2f64  : BinaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64, GPRV2F64>;
+  def _v2i64  : BinaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64, GPRV2I64>;
+}
+
+multiclass BinaryOpMCInt<ILOpCode OpCode, SDNode OpNode> {
+  def _i8    : BinaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8>;
+  def _i16    : BinaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16>;
+  def _i32    : BinaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32>;
+  def _i64    : BinaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64>;
+  def _v4i16  : BinaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16, GPRV4I16>;
+  def _v4i8  : BinaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8, GPRV4I8>;
+  def _v4i32  : BinaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32, GPRV4I32>;
+  def _v2i16  : BinaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16, GPRV2I16>;
+  def _v2i8  : BinaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8, GPRV2I8>;
+  def _v2i32  : BinaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32, GPRV2I32>;
+  def _v2i64  : BinaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64, GPRV2I64>;
+}
+
+// Generic multiclass that handles math instructions for ThreeInOneOut
+// instruction patterns
+multiclass TernaryOpMC<ILOpCode OpCode, SDNode OpNode> {
+  def _i8    : TernaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8, GPRI8>;
+  def _i16    : TernaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16, GPRI16>;
+  def _i32    : TernaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32, GPRI32>;
+  def _f32    : TernaryOp<OpCode, OpNode, GPRF32, GPRF32, GPRF32, GPRF32>;
+  def _f64    : TernaryOp<OpCode, OpNode, GPRF64, GPRF64, GPRF64, GPRF64>;
+  def _i64    : TernaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64, GPRI64>;
+  def _v4f32: TernaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32,
+      GPRV4F32, GPRV4F32>;
+  def _v4i8  : TernaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8,
+      GPRV4I8, GPRV4I8>;
+  def _v4i16  : TernaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16,
+      GPRV4I16, GPRV4I16>;
+  def _v4i32  : TernaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32,
+      GPRV4I32, GPRV4I32>;
+  def _v2f32  : TernaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32,
+      GPRV2F32, GPRV2F32>;
+  def _v2i8  : TernaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8,
+      GPRV2I8, GPRV2I8>;
+  def _v2i16  : TernaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16,
+      GPRV2I16, GPRV2I16>;
+  def _v2i32  : TernaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32,
+      GPRV2I32, GPRV2I32>;
+  def _v2f64  : TernaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64,
+      GPRV2F64, GPRV2F64>;
+  def _v2i64  : TernaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64,
+      GPRV2I64, GPRV2I64>;
+}
+multiclass BinaryOpMCi32<ILOpCode i32OpCode, SDNode OpNode> {
+  def _i8    : BinaryOp<i32OpCode, OpNode, GPRI8, GPRI8, GPRI8>;
+  def _i16   : BinaryOp<i32OpCode, OpNode, GPRI16, GPRI16, GPRI16>;
+  def _i32   : BinaryOp<i32OpCode, OpNode, GPRI32, GPRI32, GPRI32>;
+  def _v4i16 : BinaryOp<i32OpCode, OpNode, GPRV4I16,
+      GPRV4I16, GPRV4I16>;
+  def _v4i8  : BinaryOp<i32OpCode, OpNode, GPRV4I8,
+      GPRV4I8, GPRV4I8>;
+  def _v4i32 : BinaryOp<i32OpCode, OpNode, GPRV4I32,
+      GPRV4I32, GPRV4I32>;
+  def _v2i16 : BinaryOp<i32OpCode, OpNode, GPRV2I16,
+      GPRV2I16, GPRV2I16>;
+  def _v2i8  : BinaryOp<i32OpCode, OpNode, GPRV2I8,
+      GPRV2I8, GPRV2I8>;
+  def _v2i32 : BinaryOp<i32OpCode, OpNode, GPRV2I32,
+      GPRV2I32, GPRV2I32>;
+}
+multiclass BinaryOpMCi64<ILOpCode i64OpCode, SDNode OpNode> {
+  def _i64   : BinaryOp<i64OpCode, OpNode, GPRI64, GPRI64, GPRI64>;
+  def _v2i64 : BinaryOp<i64OpCode, OpNode, GPRV2I64,
+      GPRV2I64, GPRV2I64>;
+}
+multiclass BinaryOpMCi32Const<ILOpCode i32OpCode, SDNode OpNode> {
+  def _i8    : BinaryOp<i32OpCode, OpNode, GPRI8, GPRI8, GPRI32>;
+  def _i16   : BinaryOp<i32OpCode, OpNode, GPRI16, GPRI16, GPRI32>;
+  def _i32   : BinaryOp<i32OpCode, OpNode, GPRI32, GPRI32, GPRI32>;
+  def _v4i16 : BinaryOp<i32OpCode, OpNode, GPRV4I32,
+      GPRV4I32, GPRI32>;
+  def _v4i8  : BinaryOp<i32OpCode, OpNode, GPRV4I32,
+      GPRV4I32, GPRI32>;
+  def _v4i32 : BinaryOp<i32OpCode, OpNode, GPRV4I32,
+      GPRV4I32, GPRI32>;
+  def _v2i16 : BinaryOp<i32OpCode, OpNode, GPRV2I32,
+      GPRV2I32, GPRI32>;
+  def _v2i8  : BinaryOp<i32OpCode, OpNode, GPRV2I32,
+      GPRV2I32, GPRI32>;
+  def _v2i32 : BinaryOp<i32OpCode, OpNode, GPRV2I32,
+      GPRV2I32, GPRI32>;
+}
+multiclass BinaryOpMCf32<ILOpCode f32OpCode, SDNode OpNode> {
+  def _f32    : BinaryOp<f32OpCode, OpNode, GPRF32,
+      GPRF32, GPRF32>;
+  def _v4f32: BinaryOp<f32OpCode, OpNode, GPRV4F32,
+      GPRV4F32, GPRV4F32>;
+  def _v2f32  : BinaryOp<f32OpCode, OpNode, GPRV2F32,
+      GPRV2F32, GPRV2F32>;
+}
+
+multiclass TernaryOpMCf64<ILOpCode f64OpCode, SDNode OpNode> {
+  def _f64    : TernaryOp<f64OpCode, OpNode, GPRF64,
+      GPRF64, GPRF64, GPRF64>;
+}
+
+multiclass TernaryOpMCf32<ILOpCode f32OpCode, SDNode OpNode> {
+  def _f32    : TernaryOp<f32OpCode, OpNode, GPRF32,
+      GPRF32, GPRF32, GPRF32>;
+  def _v4f32: TernaryOp<f32OpCode, OpNode, GPRV4F32,
+      GPRV4F32, GPRV4F32, GPRV4F32>;
+  def _v2f32  : TernaryOp<f32OpCode, OpNode, GPRV2F32,
+      GPRV2F32, GPRV2F32, GPRV2F32>;
+}
+multiclass BinaryOpMCFloat<ILOpCode f32OpCode, ILOpCode f64OpCode,
+                           SDNode OpNode> {
+  def _f64   : BinaryOp<f64OpCode, OpNode, GPRF64, GPRF64, GPRF64>;
+  def _v2f64 : BinaryOp<f64OpCode, OpNode, GPRV2F64, GPRV2F64, GPRV2F64>;
+  def _f32   : BinaryOp<f32OpCode, OpNode, GPRF32, GPRF32, GPRF32>;
+  def _v2f32 : BinaryOp<f32OpCode, OpNode, GPRV2F32, GPRV2F32, GPRV2F32>;
+  def _v4f32 : BinaryOp<f32OpCode, OpNode, GPRV4F32, GPRV4F32, GPRV4F32>;
+}
+
+multiclass TernaryOpMCScalar<ILOpCode opcode, SDNode node>
+{
+    def _i8:  TernaryOp<opcode, node, GPRI8, GPRI8, GPRI8, GPRI8>;
+    def _i16: TernaryOp<opcode, node, GPRI16, GPRI8, GPRI16, GPRI16>;
+    def _i32: TernaryOp<opcode, node, GPRI32, GPRI8, GPRI32, GPRI32>;
+    def _i64: TernaryOp<opcode, node, GPRI64, GPRI8, GPRI64, GPRI64>;
+    def _f32: TernaryOp<opcode, node, GPRF32, GPRI8, GPRF32, GPRF32>;
+    def _f64: TernaryOp<opcode, node, GPRF64, GPRI8, GPRF64, GPRF64>;
+}
+
+multiclass BitConversion<ILOpCode opcode, RegisterClass Regs, SDNode OpNode>
+{
+    def _i8    : UnaryOp<opcode, OpNode, Regs,    GPRI8>;
+    def _i16   : UnaryOp<opcode, OpNode, Regs,   GPRI16>;
+    def _i32   : UnaryOp<opcode, OpNode, Regs,   GPRI32>;
+    def _f32   : UnaryOp<opcode, OpNode, Regs,   GPRF32>;
+    def _i64   : UnaryOp<opcode, OpNode, Regs,   GPRI64>;
+    def _f64   : UnaryOp<opcode, OpNode, Regs,   GPRF64>;
+    def _v2i8  : UnaryOp<opcode, OpNode, Regs,  GPRV2I8>;
+    def _v2i16 : UnaryOp<opcode, OpNode, Regs, GPRV2I16>;
+    def _v2i32 : UnaryOp<opcode, OpNode, Regs, GPRV2I32>;
+    def _v2f32 : UnaryOp<opcode, OpNode, Regs, GPRV2F32>;
+    def _v2i64 : UnaryOp<opcode, OpNode, Regs, GPRV2I64>;
+    def _v2f64 : UnaryOp<opcode, OpNode, Regs, GPRV2F64>;
+    def _v4i8  : UnaryOp<opcode, OpNode, Regs,  GPRV4I8>;
+    def _v4i16 : UnaryOp<opcode, OpNode, Regs, GPRV4I16>;
+    def _v4i32 : UnaryOp<opcode, OpNode, Regs, GPRV4I32>;
+    def _v4f32 : UnaryOp<opcode, OpNode, Regs, GPRV4F32>;
+}
+
+multiclass UnaryIntrinsicInt<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRI32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRI32:$dst, (intr GPRI32:$src))]>;
+def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2I32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2I32:$dst, (intr GPRV2I32:$src))]>;
+def _v4i32 : OneInOneOut<opcode, (outs GPRV4I32:$dst),
+      (ins GPRV4I32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4I32:$dst, (intr GPRV4I32:$src))]>;
+}
+
+multiclass IntrConvertF32TOF16<ILOpCode opcode, Intrinsic intr>
+{
+def _i16 : OneInOneOut<opcode, (outs GPRI16:$dst),
+      (ins GPRF32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRI16:$dst, (intr GPRF32:$src))]>;
+def _v2i16 : OneInOneOut<opcode, (outs GPRV2I16:$dst),
+      (ins GPRV2F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2I16:$dst, (intr GPRV2F32:$src))]>;
+def _v4i16 : OneInOneOut<opcode, (outs GPRV4I16:$dst),
+      (ins GPRV4F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4I16:$dst, (intr GPRV4F32:$src))]>;
+}
+
+multiclass IntrConvertF32TOI32<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRF32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRI32:$dst, (intr GPRF32:$src))]>;
+def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2I32:$dst, (intr GPRV2F32:$src))]>;
+def _v4i32 : OneInOneOut<opcode, (outs GPRV4I32:$dst),
+      (ins GPRV4F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4I32:$dst, (intr GPRV4F32:$src))]>;
+}
+
+multiclass IntrConvertF64TOI32<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRF64:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRI32:$dst, (intr GPRF64:$src))]>;
+def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2F64:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2I32:$dst, (intr GPRV2F64:$src))]>;
+}
+
+multiclass IntrConvertF16TOF32<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRI16:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF32:$dst, (intr GPRI16:$src))]>;
+def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst),
+      (ins GPRV2I16:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2F32:$dst, (intr GPRV2I16:$src))]>;
+def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst),
+      (ins GPRV4I16:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4F32:$dst, (intr GPRV4I16:$src))]>;
+}
+
+multiclass IntrConvertI32TOF32<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRI32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF32:$dst, (intr GPRI32:$src))]>;
+def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst),
+      (ins GPRV2I32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2F32:$dst, (intr GPRV2I32:$src))]>;
+def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst),
+      (ins GPRV4I32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4F32:$dst, (intr GPRV4I32:$src))]>;
+}
+
+multiclass BinaryIntrinsicLong<ILOpCode opcode, Intrinsic intr>
+{
+def _i64 : TwoInOneOut<opcode, (outs GPRI64:$dst),
+      (ins GPRI64:$src, GPRI64:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRI64:$dst,
+      (intr GPRI64:$src, GPRI64:$src2))]>;
+}
+
+multiclass BinaryIntrinsicInt<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : TwoInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRI32:$src, GPRI32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRI32:$dst,
+      (intr GPRI32:$src, GPRI32:$src2))]>;
+def _v2i32 : TwoInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2I32:$src, GPRV2I32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRV2I32:$dst,
+      (intr GPRV2I32:$src, GPRV2I32:$src2))]>;
+def _v4i32 : TwoInOneOut<opcode, (outs GPRV4I32:$dst),
+      (ins GPRV4I32:$src, GPRV4I32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRV4I32:$dst,
+      (intr GPRV4I32:$src, GPRV4I32:$src2))]>;
+}
+
+multiclass TernaryIntrinsicInt<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : ThreeInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRI32:$src, GPRI32:$src2, GPRI32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRI32:$dst,
+      (intr GPRI32:$src, GPRI32:$src2, GPRI32:$src3))]>;
+def _v2i32 : ThreeInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRV2I32:$dst,
+      (intr GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3))]>;
+def _v4i32 : ThreeInOneOut<opcode, (outs GPRV4I32:$dst),
+      (ins GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRV4I32:$dst,
+      (intr GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3))]>;
+}
+
+multiclass TernaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : ThreeInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRF32:$src, GPRF32:$src2, GPRF32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRF32:$dst,
+      (intr GPRF32:$src, GPRF32:$src2, GPRF32:$src3))]>;
+def _v2f32 : ThreeInOneOut<opcode, (outs GPRV2F32:$dst),
+      (ins GPRV2F32:$src, GPRV2F32:$src2, GPRV2F32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRV2F32:$dst,
+      (intr GPRV2F32:$src, GPRV2F32:$src2, GPRV2F32:$src3))]>;
+def _v4f32 : ThreeInOneOut<opcode, (outs GPRV4F32:$dst),
+      (ins GPRV4F32:$src, GPRV4F32:$src2, GPRV4F32:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRV4F32:$dst,
+      (intr GPRV4F32:$src, GPRV4F32:$src2, GPRV4F32:$src3))]>;
+}
+
+multiclass BinaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : TwoInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src, GPRF64:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRF64:$dst,
+      (intr GPRF64:$src, GPRF64:$src2))]>;
+}
+
+multiclass TernaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : ThreeInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src, GPRF64:$src2, GPRF64:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRF64:$dst,
+      (intr GPRF64:$src, GPRF64:$src2, GPRF64:$src3))]>;
+}
+
+multiclass TernaryIntrinsicLongScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _i64 : ThreeInOneOut<opcode, (outs GPRI64:$dst),
+      (ins GPRI64:$src, GPRI64:$src2, GPRI64:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRI64:$dst,
+      (intr GPRI64:$src, GPRI64:$src2, GPRI64:$src3))]>;
+}
+
+multiclass QuaternaryIntrinsicInt<ILOpCode opcode, Intrinsic intr>
+{
+def _i32 : FourInOneOut<opcode, (outs GPRI32:$dst),
+      (ins GPRI32:$src, GPRI32:$src2, GPRI32:$src3, GPRI32:$src4),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"),
+      [(set GPRI32:$dst,
+      (intr GPRI32:$src, GPRI32:$src2, GPRI32:$src3, GPRI32:$src4))]>;
+def _v2i32 : FourInOneOut<opcode, (outs GPRV2I32:$dst),
+      (ins GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3, GPRV2I32:$src4),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"),
+      [(set GPRV2I32:$dst,
+      (intr GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3, GPRV2I32:$src4))]>;
+def _v4i32 : FourInOneOut<opcode, (outs GPRV4I32:$dst),
+      (ins GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3, GPRV4I32:$src4),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"),
+      [(set GPRV4I32:$dst,
+      (intr GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3, GPRV4I32:$src4))]>;
+}
+
+multiclass UnaryIntrinsicFloatScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRF32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF32:$dst, (intr GPRF32:$src))]>;
+}
+
+multiclass UnaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRF32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF32:$dst, (intr GPRF32:$src))]>;
+def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst),
+      (ins GPRV2F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2F32:$dst, (intr GPRV2F32:$src))]>;
+def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst),
+      (ins GPRV4F32:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV4F32:$dst, (intr GPRV4F32:$src))]>;
+}
+
+multiclass BinaryIntrinsicFloatScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : TwoInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRF32:$src, GPRF32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRF32:$dst,
+      (intr GPRF32:$src, GPRF32:$src2))]>;
+}
+multiclass BinaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr>
+{
+def _f32 : TwoInOneOut<opcode, (outs GPRF32:$dst),
+      (ins GPRF32:$src, GPRF32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRF32:$dst,
+      (intr GPRF32:$src, GPRF32:$src2))]>;
+def _v2f32 : TwoInOneOut<opcode, (outs GPRV2F32:$dst),
+      (ins GPRV2F32:$src, GPRV2F32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRV2F32:$dst,
+      (intr GPRV2F32:$src, GPRV2F32:$src2))]>;
+def _v4f32 : TwoInOneOut<opcode, (outs GPRV4F32:$dst),
+      (ins GPRV4F32:$src, GPRV4F32:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRV4F32:$dst,
+      (intr GPRV4F32:$src, GPRV4F32:$src2))]>;
+}
+
+multiclass UnaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : OneInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF64:$dst, (intr GPRF64:$src))]>;
+}
+
+multiclass UnaryIntrinsicDouble<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : OneInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRF64:$dst, (intr GPRF64:$src))]>;
+def _v2f64 : OneInOneOut<opcode, (outs GPRV2F64:$dst),
+      (ins GPRV2F64:$src),
+      !strconcat(opcode.Text, " $dst, $src"),
+      [(set GPRV2F64:$dst, (intr GPRV2F64:$src))]>;
+}
+
+multiclass BinaryIntrinsicDouble<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : TwoInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src, GPRF64:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRF64:$dst,
+      (intr GPRF64:$src, GPRF64:$src2))]>;
+def _v2f64 : TwoInOneOut<opcode, (outs GPRV2F64:$dst),
+      (ins GPRV2F64:$src, GPRV2F64:$src2),
+      !strconcat(opcode.Text, " $dst, $src, $src2"),
+      [(set GPRV2F64:$dst,
+      (intr GPRV2F64:$src, GPRV2F64:$src2))]>;
+}
+
+multiclass TernaryIntrinsicDouble<ILOpCode opcode, Intrinsic intr>
+{
+def _f64 : TwoInOneOut<opcode, (outs GPRF64:$dst),
+      (ins GPRF64:$src, GPRF64:$src2, GPRF64:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRF64:$dst,
+      (intr GPRF64:$src, GPRF64:$src2, GPRF64:$src3))]>;
+def _v2f64 : TwoInOneOut<opcode, (outs GPRV2F64:$dst),
+      (ins GPRV2F64:$src, GPRV2F64:$src2, GPRV2F64:$src3),
+      !strconcat(opcode.Text, " $dst, $src, $src2, $src3"),
+      [(set GPRV2F64:$dst,
+      (intr GPRV2F64:$src, GPRV2F64:$src2, GPRV2F64:$src3))]>;
+}
diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.cpp b/src/gallium/drivers/radeon/AMDILNIDevice.cpp
new file mode 100644 (file)
index 0000000..8fda1c1
--- /dev/null
@@ -0,0 +1,71 @@
+//===-- AMDILNIDevice.cpp - NI device information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILNIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+
+AMDILNIDevice::AMDILNIDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST)
+{
+  std::string name = ST->getDeviceName();
+  if (name == "caicos") {
+    mDeviceFlag = OCL_DEVICE_CAICOS;
+  } else if (name == "turks") {
+    mDeviceFlag = OCL_DEVICE_TURKS;
+  } else if (name == "cayman") {
+    mDeviceFlag = OCL_DEVICE_CAYMAN;
+  } else {
+    mDeviceFlag = OCL_DEVICE_BARTS;
+  }
+}
+AMDILNIDevice::~AMDILNIDevice()
+{
+}
+
+size_t
+AMDILNIDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILNIDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD6XXX;
+}
+
+
+AMDILCaymanDevice::AMDILCaymanDevice(AMDILSubtarget *ST)
+  : AMDILNIDevice(ST)
+{
+  setCaps();
+}
+
+AMDILCaymanDevice::~AMDILCaymanDevice()
+{
+}
+
+void
+AMDILCaymanDevice::setCaps()
+{
+  if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDILDeviceInfo::DoubleOps);
+    mHWBits.set(AMDILDeviceInfo::FMA);
+  }
+  mHWBits.set(AMDILDeviceInfo::Signed24BitOps);
+  mSWBits.reset(AMDILDeviceInfo::Signed24BitOps);
+  mSWBits.set(AMDILDeviceInfo::ArenaSegment);
+}
+
diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.h b/src/gallium/drivers/radeon/AMDILNIDevice.h
new file mode 100644 (file)
index 0000000..556670a
--- /dev/null
@@ -0,0 +1,59 @@
+//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===---------------------------------------------------------------------===//
+// This file defines the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef _AMDILNIDEVICE_H_
+#define _AMDILNIDEVICE_H_
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===---------------------------------------------------------------------===//
+// NI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+// The AMDILNIDevice is the base class for all Northern Islands series of
+// cards. It is very similar to the AMDILEvergreenDevice, with the major
+// exceptions being differences in wavefront size and hardware capabilities.
+// NI devices all use 64-wide wavefronts and add support for signed 24-bit
+// integer operations.
+
+  class AMDILNIDevice : public AMDILEvergreenDevice {
+    public:
+      AMDILNIDevice(AMDILSubtarget*);
+      virtual ~AMDILNIDevice();
+      virtual size_t getMaxLDSSize() const;
+      virtual uint32_t getGeneration() const;
+    protected:
+  }; // AMDILNIDevice
+
+// Just as the AMDILCypressDevice is the double-capable version of the
+// AMDILEvergreenDevice, the AMDILCaymanDevice is the double-capable version
+// of the AMDILNIDevice. The other major difference, though less relevant from
+// a compiler standpoint, is that the Cayman device has 4-wide ALUs, whereas
+// the rest of the NI family is 5-wide.
+
+  class AMDILCaymanDevice: public AMDILNIDevice {
+    public:
+      AMDILCaymanDevice(AMDILSubtarget*);
+      virtual ~AMDILCaymanDevice();
+    private:
+      virtual void setCaps();
+  }; // AMDILCaymanDevice
+
+  static const unsigned int MAX_LDS_SIZE_900 = AMDILDevice::MAX_LDS_SIZE_800;
+} // namespace llvm
+#endif // _AMDILNIDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILNodes.td b/src/gallium/drivers/radeon/AMDILNodes.td
new file mode 100644 (file)
index 0000000..8cf07a5
--- /dev/null
@@ -0,0 +1,325 @@
+//===- AMDILNodes.td - AMD IL nodes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Conversion DAG Nodes
+//===----------------------------------------------------------------------===//
+// Double to Single conversion
+def IL_d2f : SDNode<"AMDILISD::DP_TO_FP"   , SDTIL_DPToFPOp>;
+
+def IL_inttoany: SDNode<"AMDILISD::INTTOANY", SDTIL_IntToAny>;
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond      : SDNode<"AMDILISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// Comparison DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_cmp       : SDNode<"AMDILISD::CMP", SDTIL_Cmp>;
+
+//===----------------------------------------------------------------------===//
+// Call/Return DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_callseq_start : SDNode<"ISD::CALLSEQ_START", SDTIL_CallSeqStart,
+    [SDNPHasChain, SDNPOutGlue]>;
+def IL_callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDTIL_CallSeqEnd,
+    [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def IL_call      : SDNode<"AMDILISD::CALL", SDTIL_Call,
+    [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
+def IL_retflag       : SDNode<"AMDILISD::RET_FLAG", SDTNone,
+    [SDNPHasChain, SDNPOptInGlue]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic DAG Nodes
+//===----------------------------------------------------------------------===//
+// Address modification nodes
+def IL_addaddrri : SDNode<"AMDILISD::ADDADDR", SDTIL_AddAddrri,
+    [SDNPCommutative, SDNPAssociative]>;
+def IL_addaddrir : SDNode<"AMDILISD::ADDADDR", SDTIL_AddAddrir,
+    [SDNPCommutative, SDNPAssociative]>;
+
+//===--------------------------------------------------------------------===//
+// Instructions
+//===--------------------------------------------------------------------===//
+// Floating point math functions
+def IL_cmov_logical : SDNode<"AMDILISD::CMOVLOG", SDTIL_GenTernaryOp>;
+def IL_add       : SDNode<"AMDILISD::ADD"     , SDTIL_GenBinaryOp>;
+def IL_cmov        : SDNode<"AMDILISD::CMOV"    , SDTIL_GenBinaryOp>;
+def IL_or      : SDNode<"AMDILISD::OR"    , SDTIL_GenBinaryOp>;
+def IL_and     : SDNode<"AMDILISD::AND"   , SDTIL_GenBinaryOp>;
+def IL_xor          : SDNode<"AMDILISD::XOR", SDTIL_GenBinaryOp>;
+def IL_not          : SDNode<"AMDILISD::NOT", SDTIL_GenUnaryOp>;
+def IL_div_inf      : SDNode<"AMDILISD::DIV_INF", SDTIL_GenBinaryOp>;
+def IL_mad          : SDNode<"AMDILISD::MAD", SDTIL_GenTernaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Integer functions
+//===----------------------------------------------------------------------===//
+def IL_inegate     : SDNode<"AMDILISD::INEGATE" , SDTIntUnaryOp>;
+def IL_umul        : SDNode<"AMDILISD::UMUL"    , SDTIntBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def IL_mov        : SDNode<"AMDILISD::MOVE", SDTIL_GenUnaryOp>;
+def IL_phimov      : SDNode<"AMDILISD::PHIMOVE", SDTIL_GenUnaryOp>;
+def IL_bitconv     : SDNode<"AMDILISD::BITCONV", SDTIL_GenBitConv>;
+def IL_ffb_hi      : SDNode<"AMDILISD::IFFB_HI", SDTIL_GenUnaryOp>;
+def IL_ffb_lo      : SDNode<"AMDILISD::IFFB_LO", SDTIL_GenUnaryOp>;
+def IL_smax        : SDNode<"AMDILISD::SMAX", SDTIL_GenBinaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Double functions
+//===----------------------------------------------------------------------===//
+def IL_dcreate     : SDNode<"AMDILISD::DCREATE"   , SDTIL_DCreate>;
+def IL_dcomphi     : SDNode<"AMDILISD::DCOMPHI"     , SDTIL_DComp>;
+def IL_dcomplo     : SDNode<"AMDILISD::DCOMPLO"     , SDTIL_DComp>;
+def IL_dcreate2     : SDNode<"AMDILISD::DCREATE2"   , SDTIL_DCreate2>;
+def IL_dcomphi2     : SDNode<"AMDILISD::DCOMPHI2"     , SDTIL_DComp2>;
+def IL_dcomplo2     : SDNode<"AMDILISD::DCOMPLO2"     , SDTIL_DComp2>;
+
+//===----------------------------------------------------------------------===//
+// Long functions
+//===----------------------------------------------------------------------===//
+def IL_lcreate     : SDNode<"AMDILISD::LCREATE"   , SDTIL_LCreate>;
+def IL_lcreate2    : SDNode<"AMDILISD::LCREATE2"   , SDTIL_LCreate2>;
+def IL_lcomphi     : SDNode<"AMDILISD::LCOMPHI"     , SDTIL_LComp>;
+def IL_lcomphi2    : SDNode<"AMDILISD::LCOMPHI2"     , SDTIL_LComp2>;
+def IL_lcomplo     : SDNode<"AMDILISD::LCOMPLO"     , SDTIL_LComp>;
+def IL_lcomplo2    : SDNode<"AMDILISD::LCOMPLO2"     , SDTIL_LComp2>;
+
+//===----------------------------------------------------------------------===//
+// Vector functions
+//===----------------------------------------------------------------------===//
+def IL_vbuild     : SDNode<"AMDILISD::VBUILD", SDTIL_GenVecBuild,
+    []>;
+def IL_vextract   : SDNode<"AMDILISD::VEXTRACT", SDTIL_GenVecExtract,
+    []>;
+def IL_vinsert    : SDNode<"AMDILISD::VINSERT", SDTIL_GenVecInsert,
+    []>;
+def IL_vconcat    : SDNode<"AMDILISD::VCONCAT", SDTIL_GenVecConcat,
+    []>;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Image Custom SDNodes
+//===----------------------------------------------------------------------===//
+def image2d_read  : SDNode<"AMDILISD::IMAGE2D_READ", SDTIL_ImageRead,
+    [SDNPHasChain, SDNPMayLoad]>;
+def image2d_write : SDNode<"AMDILISD::IMAGE2D_WRITE", SDTIL_ImageWrite,
+    [SDNPHasChain, SDNPMayStore]>;
+def image2d_info0 : SDNode<"AMDILISD::IMAGE2D_INFO0", SDTIL_ImageInfo, []>;
+def image2d_info1 : SDNode<"AMDILISD::IMAGE2D_INFO1", SDTIL_ImageInfo, []>;
+def image3d_read  : SDNode<"AMDILISD::IMAGE3D_READ", SDTIL_ImageRead,
+    [SDNPHasChain, SDNPMayLoad]>;
+def image3d_write : SDNode<"AMDILISD::IMAGE3D_WRITE", SDTIL_ImageWrite3D,
+    [SDNPHasChain, SDNPMayStore]>;
+def image3d_info0 : SDNode<"AMDILISD::IMAGE3D_INFO0", SDTIL_ImageInfo, []>;
+def image3d_info1 : SDNode<"AMDILISD::IMAGE3D_INFO1", SDTIL_ImageInfo, []>;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Atomic Custom SDNodes
+//===----------------------------------------------------------------------===//
+//===-------------- 32 bit global atomics with return values --------------===//
+def atom_g_add : SDNode<"AMDILISD::ATOM_G_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_and : SDNode<"AMDILISD::ATOM_G_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_cmpxchg : SDNode<"AMDILISD::ATOM_G_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_dec : SDNode<"AMDILISD::ATOM_G_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_inc : SDNode<"AMDILISD::ATOM_G_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_max : SDNode<"AMDILISD::ATOM_G_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umax : SDNode<"AMDILISD::ATOM_G_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_min : SDNode<"AMDILISD::ATOM_G_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umin : SDNode<"AMDILISD::ATOM_G_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_or : SDNode<"AMDILISD::ATOM_G_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_sub : SDNode<"AMDILISD::ATOM_G_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_rsub : SDNode<"AMDILISD::ATOM_G_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xchg : SDNode<"AMDILISD::ATOM_G_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xor : SDNode<"AMDILISD::ATOM_G_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===------------- 32 bit global atomics without return values ------------===//
+def atom_g_add_noret : SDNode<"AMDILISD::ATOM_G_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_and_noret : SDNode<"AMDILISD::ATOM_G_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_cmpxchg_noret : SDNode<"AMDILISD::ATOM_G_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_cmp_noret : SDNode<"AMDILISD::ATOM_G_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_dec_noret : SDNode<"AMDILISD::ATOM_G_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_inc_noret : SDNode<"AMDILISD::ATOM_G_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_max_noret : SDNode<"AMDILISD::ATOM_G_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umax_noret: SDNode<"AMDILISD::ATOM_G_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_min_noret : SDNode<"AMDILISD::ATOM_G_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umin_noret: SDNode<"AMDILISD::ATOM_G_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_or_noret : SDNode<"AMDILISD::ATOM_G_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_sub_noret : SDNode<"AMDILISD::ATOM_G_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_rsub_noret : SDNode<"AMDILISD::ATOM_G_RSUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xchg_noret: SDNode<"AMDILISD::ATOM_G_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xor_noret : SDNode<"AMDILISD::ATOM_G_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===--------------- 32 bit local atomics with return values --------------===//
+def atom_l_add : SDNode<"AMDILISD::ATOM_L_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_and : SDNode<"AMDILISD::ATOM_L_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_cmpxchg : SDNode<"AMDILISD::ATOM_L_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_dec : SDNode<"AMDILISD::ATOM_L_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_inc : SDNode<"AMDILISD::ATOM_L_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_max : SDNode<"AMDILISD::ATOM_L_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umax : SDNode<"AMDILISD::ATOM_L_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_min : SDNode<"AMDILISD::ATOM_L_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umin : SDNode<"AMDILISD::ATOM_L_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_or : SDNode<"AMDILISD::ATOM_L_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_mskor : SDNode<"AMDILISD::ATOM_L_MSKOR", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_sub : SDNode<"AMDILISD::ATOM_L_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_rsub : SDNode<"AMDILISD::ATOM_L_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xchg : SDNode<"AMDILISD::ATOM_L_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xor : SDNode<"AMDILISD::ATOM_L_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===-------------- 32 bit local atomics without return values ------------===//
+def atom_l_add_noret : SDNode<"AMDILISD::ATOM_L_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_and_noret : SDNode<"AMDILISD::ATOM_L_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_cmpxchg_noret : SDNode<"AMDILISD::ATOM_L_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_dec_noret : SDNode<"AMDILISD::ATOM_L_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_inc_noret : SDNode<"AMDILISD::ATOM_L_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_max_noret : SDNode<"AMDILISD::ATOM_L_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umax_noret: SDNode<"AMDILISD::ATOM_L_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_min_noret : SDNode<"AMDILISD::ATOM_L_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umin_noret: SDNode<"AMDILISD::ATOM_L_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_or_noret : SDNode<"AMDILISD::ATOM_L_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_mskor_noret : SDNode<"AMDILISD::ATOM_L_MSKOR_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_sub_noret : SDNode<"AMDILISD::ATOM_L_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_rsub_noret : SDNode<"AMDILISD::ATOM_L_RSUB_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xchg_noret: SDNode<"AMDILISD::ATOM_L_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xor_noret : SDNode<"AMDILISD::ATOM_L_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===-------------- 32 bit region atomics with return values --------------===//
+def atom_r_add : SDNode<"AMDILISD::ATOM_R_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_and : SDNode<"AMDILISD::ATOM_R_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_cmpxchg : SDNode<"AMDILISD::ATOM_R_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_dec : SDNode<"AMDILISD::ATOM_R_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_inc : SDNode<"AMDILISD::ATOM_R_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_max : SDNode<"AMDILISD::ATOM_R_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umax : SDNode<"AMDILISD::ATOM_R_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_min : SDNode<"AMDILISD::ATOM_R_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umin : SDNode<"AMDILISD::ATOM_R_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_or : SDNode<"AMDILISD::ATOM_R_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_mskor : SDNode<"AMDILISD::ATOM_R_MSKOR", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_sub : SDNode<"AMDILISD::ATOM_R_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_rsub : SDNode<"AMDILISD::ATOM_R_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xchg : SDNode<"AMDILISD::ATOM_R_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xor : SDNode<"AMDILISD::ATOM_R_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===------------- 32 bit region atomics without return values ------------===//
+def atom_r_add_noret : SDNode<"AMDILISD::ATOM_R_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_and_noret : SDNode<"AMDILISD::ATOM_R_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_cmpxchg_noret : SDNode<"AMDILISD::ATOM_R_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_dec_noret : SDNode<"AMDILISD::ATOM_R_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_inc_noret : SDNode<"AMDILISD::ATOM_R_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_max_noret : SDNode<"AMDILISD::ATOM_R_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umax_noret: SDNode<"AMDILISD::ATOM_R_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_min_noret : SDNode<"AMDILISD::ATOM_R_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umin_noret: SDNode<"AMDILISD::ATOM_R_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_or_noret : SDNode<"AMDILISD::ATOM_R_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_mskor_noret : SDNode<"AMDILISD::ATOM_R_MSKOR_NORET", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_sub_noret : SDNode<"AMDILISD::ATOM_R_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_rsub_noret : SDNode<"AMDILISD::ATOM_R_RSUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xchg_noret: SDNode<"AMDILISD::ATOM_R_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xor_noret : SDNode<"AMDILISD::ATOM_R_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===--------------- 32 bit atomic counter instructions -------------------===//
+def append_alloc : SDNode<"AMDILISD::APPEND_ALLOC", SDTIL_Append,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+def append_consume : SDNode<"AMDILISD::APPEND_CONSUME", SDTIL_Append,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+def append_alloc_noret : SDNode<"AMDILISD::APPEND_ALLOC_NORET", SDTIL_Append,
+    [SDNPHasChain, SDNPMayStore]>;
+def append_consume_noret : SDNode<"AMDILISD::APPEND_CONSUME_NORET",
+    SDTIL_Append, [SDNPHasChain, SDNPMayStore]>;
diff --git a/src/gallium/drivers/radeon/AMDILOperands.td b/src/gallium/drivers/radeon/AMDILOperands.td
new file mode 100644 (file)
index 0000000..b22c67b
--- /dev/null
@@ -0,0 +1,37 @@
+//===- AMDILOperands.td - AMD IL Operands ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Custom memory operand
+//===----------------------------------------------------------------------===//
+
+def MEMI32  : Operand<i32> {
+    let PrintMethod = "printMemOperand";
+    let MIOperandInfo = (ops GPRI32, GPRI32);
+}
+
+def MEMI64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPRI64, GPRI64);
+}
+
+// Call target types
+def calltarget   : Operand<i32>;
+def brtarget   : Operand<OtherVT>;
+
+// def v2i8imm : Operand<v2i8>;
+// def v4i8imm : Operand<v4i8>;
+// def v2i16imm : Operand<v2i16>;
+// def v4i16imm : Operand<v4i16>;
+// def v2i32imm : Operand<v2i32>;
+// def v4i32imm : Operand<v4i32>;
+// def v2i64imm : Operand<v2i64>;
+// def v2f32imm : Operand<v2f32>;
+// def v4f32imm : Operand<v4f32>;
+// def v2f64imm : Operand<v2f64>;
+
diff --git a/src/gallium/drivers/radeon/AMDILPatterns.td b/src/gallium/drivers/radeon/AMDILPatterns.td
new file mode 100644 (file)
index 0000000..aa59bcb
--- /dev/null
@@ -0,0 +1,504 @@
+//===- AMDILPatterns.td - AMDIL Target Patterns ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Store pattern fragments
+//===----------------------------------------------------------------------===//
+def truncstorei64 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+def truncstorev2i8 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i8;
+}]>;
+def truncstorev2i16 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16;
+}]>;
+def truncstorev2i32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i32;
+}]>;
+def truncstorev2i64 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i64;
+}]>;
+def truncstorev2f32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2f32;
+}]>;
+def truncstorev2f64 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2f64;
+}]>;
+def truncstorev4i8 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8;
+}]>;
+def truncstorev4i16 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i16;
+}]>;
+def truncstorev4i32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i32;
+}]>;
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+                            (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+        return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+        return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+        return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def region_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+        return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Load pattern fragments
+//===----------------------------------------------------------------------===//
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Private address space loads
+def private_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+    return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+    return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+    return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Local address space loads
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Region address space loads
+def region_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+    return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+    return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+    return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Constant address space loads
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+// Constant pool loads
+def cp_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Complex addressing mode patterns
+//===----------------------------------------------------------------------===//
+def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
+def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
+def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
+def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Instruction Pattern Leafs
+//===----------------------------------------------------------------------===//
+class IL_CC_Op<int N> : PatLeaf<(i32 N)>;
+def IL_CC_D_EQ  : IL_CC_Op<0>;
+def IL_CC_D_GE  : IL_CC_Op<1>;
+def IL_CC_D_LT  : IL_CC_Op<2>;
+def IL_CC_D_NE  : IL_CC_Op<3>;
+def IL_CC_F_EQ  : IL_CC_Op<4>;
+def IL_CC_F_GE  : IL_CC_Op<5>;
+def IL_CC_F_LT  : IL_CC_Op<6>;
+def IL_CC_F_NE  : IL_CC_Op<7>;
+def IL_CC_I_EQ  : IL_CC_Op<8>;
+def IL_CC_I_GE  : IL_CC_Op<9>;
+def IL_CC_I_LT  : IL_CC_Op<10>;
+def IL_CC_I_NE  : IL_CC_Op<11>;
+def IL_CC_U_GE  : IL_CC_Op<12>;
+def IL_CC_U_LT  : IL_CC_Op<13>;
+// Pseudo IL comparison instructions that aren't natively supported
+def IL_CC_F_GT  : IL_CC_Op<14>;
+def IL_CC_U_GT  : IL_CC_Op<15>;
+def IL_CC_I_GT  : IL_CC_Op<16>;
+def IL_CC_D_GT  : IL_CC_Op<17>;
+def IL_CC_F_LE  : IL_CC_Op<18>;
+def IL_CC_U_LE  : IL_CC_Op<19>;
+def IL_CC_I_LE  : IL_CC_Op<20>;
+def IL_CC_D_LE  : IL_CC_Op<21>;
+def IL_CC_F_UNE : IL_CC_Op<22>;
+def IL_CC_F_UEQ : IL_CC_Op<23>;
+def IL_CC_F_ULT : IL_CC_Op<24>;
+def IL_CC_F_UGT : IL_CC_Op<25>;
+def IL_CC_F_ULE : IL_CC_Op<26>;
+def IL_CC_F_UGE : IL_CC_Op<27>;
+def IL_CC_F_ONE : IL_CC_Op<28>;
+def IL_CC_F_OEQ : IL_CC_Op<29>;
+def IL_CC_F_OLT : IL_CC_Op<30>;
+def IL_CC_F_OGT : IL_CC_Op<31>;
+def IL_CC_F_OLE : IL_CC_Op<32>;
+def IL_CC_F_OGE : IL_CC_Op<33>;
+def IL_CC_D_UNE : IL_CC_Op<34>;
+def IL_CC_D_UEQ : IL_CC_Op<35>;
+def IL_CC_D_ULT : IL_CC_Op<36>;
+def IL_CC_D_UGT : IL_CC_Op<37>;
+def IL_CC_D_ULE : IL_CC_Op<38>;
+def IL_CC_D_UGE : IL_CC_Op<39>;
+def IL_CC_D_ONE : IL_CC_Op<40>;
+def IL_CC_D_OEQ : IL_CC_Op<41>;
+def IL_CC_D_OLT : IL_CC_Op<42>;
+def IL_CC_D_OGT : IL_CC_Op<43>;
+def IL_CC_D_OLE : IL_CC_Op<44>;
+def IL_CC_D_OGE : IL_CC_Op<45>;
+def IL_CC_U_EQ  : IL_CC_Op<46>;
+def IL_CC_U_NE  : IL_CC_Op<47>;
+def IL_CC_F_O   : IL_CC_Op<48>;
+def IL_CC_D_O   : IL_CC_Op<49>;
+def IL_CC_F_UO  : IL_CC_Op<50>;
+def IL_CC_D_UO  : IL_CC_Op<51>;
+def IL_CC_L_LE  : IL_CC_Op<52>;
+def IL_CC_L_GE  : IL_CC_Op<53>;
+def IL_CC_L_EQ  : IL_CC_Op<54>;
+def IL_CC_L_NE  : IL_CC_Op<55>;
+def IL_CC_L_LT  : IL_CC_Op<56>;
+def IL_CC_L_GT  : IL_CC_Op<57>;
+def IL_CC_UL_LE  : IL_CC_Op<58>;
+def IL_CC_UL_GE  : IL_CC_Op<59>;
+def IL_CC_UL_EQ  : IL_CC_Op<60>;
+def IL_CC_UL_NE  : IL_CC_Op<61>;
+def IL_CC_UL_LT  : IL_CC_Op<62>;
+def IL_CC_UL_GT  : IL_CC_Op<63>;
diff --git a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
new file mode 100644 (file)
index 0000000..9383bfc
--- /dev/null
@@ -0,0 +1,1211 @@
+//===-- AMDILPeepholeOptimizer.cpp - AMDIL peephole optimizations -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "PeepholeOpt"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <sstream>
+
+#if 0
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+    "assignments discovered");
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
+#endif
+STATISTIC(LocalFuncs, "Number of get_local_size(N) functions removed");
+
+using namespace llvm;
+// The peephole optimization pass is used to do simple last-minute optimizations
+// that are required for correct code or to remove redundant functions.
+namespace {
+class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
+public:
+  TargetMachine &TM;
+  static char ID;
+  AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+  ~AMDILPeepholeOpt();
+  const char *getPassName() const;
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+protected:
+private:
+  // Function to initiate all of the instruction level optimizations.
+  bool instLevelOptimizations(BasicBlock::iterator *inst);
+  // Quick check to see if we need to dump all of the pointers into the
+  // arena. If so, we mark all pointers as living in the arena. This is a
+  // workaround for aliasing of pointers inside a struct/union.
+  bool dumpAllIntoArena(Function &F);
+  // Because we don't want to invalidate any pointers while inside
+  // safeNestedForEach, we push atomic conversions to a vector and handle
+  // them later. This function does the conversions if required.
+  void doAtomicConversionIfNeeded(Function &F);
+  // Because __amdil_is_constant cannot be properly evaluated when
+  // optimizations are disabled, the calls are placed in a vector and
+  // evaluated after the __amdil_image* functions have been handled,
+  // which should allow __amdil_is_constant to be evaluated correctly.
+  void doIsConstCallConversionIfNeeded();
+  bool mChanged;
+  bool mDebug;
+  bool mRWGOpt;
+  bool mConvertAtomics;
+  CodeGenOpt::Level optLevel;
+  // Run a series of tests to see if we can optimize a CALL instruction.
+  bool optimizeCallInst(BasicBlock::iterator *bbb);
+  // A peephole optimization to optimize bit extract sequences.
+  bool optimizeBitExtract(Instruction *inst);
+  // A peephole optimization to optimize bit insert sequences.
+  bool optimizeBitInsert(Instruction *inst);
+  bool setupBitInsert(Instruction *base, 
+                      Instruction *&src, 
+                      Constant *&mask, 
+                      Constant *&shift);
+  // Expand the bit field insert instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFI(CallInst *CI);
+  // Expand the bit field mask instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFM(CallInst *CI);
+  // 7XX and 8XX hardware does not have 24-bit signed operations, so in
+  // that case we need to expand them. These functions check for 24-bit
+  // functions and then expand them.
+  bool isSigned24BitOps(CallInst *CI);
+  void expandSigned24BitOps(CallInst *CI);
+  // If the required workgroup size is specified, then the result of
+  // get_local_size is known at compile time and can be returned
+  // accordingly.
+  bool isRWGLocalOpt(CallInst *CI);
+  void expandRWGLocalOpt(CallInst *CI);
+  // On Northern Islands cards, division is slightly less accurate than on
+  // previous generations, so we need to use a more accurate division there;
+  // on all other cards we can translate the accurate divide to a normal
+  // divide.
+  bool convertAccurateDivide(CallInst *CI);
+  void expandAccurateDivide(CallInst *CI);
+  // If the alignment is set incorrectly, it can produce really inefficient
+  // code. This checks for this scenario and fixes it if possible.
+  bool correctMisalignedMemOp(Instruction *inst);
+
+  // If we are in no-opt mode, we need to make sure that local samplers
+  // are properly propagated, as constant propagation doesn't occur and
+  // we need to know the value of kernel-defined samplers at compile time.
+  bool propagateSamplerInst(CallInst *CI);
+
+  LLVMContext *mCTX;
+  Function *mF;
+  const AMDILSubtarget *mSTM;
+  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
+  SmallVector<CallInst *, 16> isConstVec;
+}; // class AMDILPeepholeOpt
+  char AMDILPeepholeOpt::ID = 0;
+} // anonymous namespace
+
+namespace llvm {
+  FunctionPass *
+  createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) 
+  {
+    return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
+  }
+} // llvm namespace
+
+AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+  : FunctionPass(ID), TM(tm) 
+{
+  mDebug = DEBUGME;
+  optLevel = TM.getOptLevel();
+
+}
+
+AMDILPeepholeOpt::~AMDILPeepholeOpt() 
+{
+}
+
+const char *
+AMDILPeepholeOpt::getPassName() const 
+{
+  return "AMDIL PeepHole Optimization Pass";
+}
+
+bool 
+containsPointerType(Type *Ty) 
+{
+  if (!Ty) {
+    return false;
+  }
+  switch(Ty->getTypeID()) {
+  default:
+    return false;
+  case Type::StructTyID: {
+    const StructType *ST = dyn_cast<StructType>(Ty);
+    for (StructType::element_iterator stb = ST->element_begin(),
+           ste = ST->element_end(); stb != ste; ++stb) {
+      if (containsPointerType(*stb)) {
+        return true;
+      }
+    }
+    break;
+  }
+  case Type::VectorTyID:
+  case Type::ArrayTyID:
+    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
+  case Type::PointerTyID:
+    return true;
+  };
+  return false;
+}
+
+bool 
+AMDILPeepholeOpt::dumpAllIntoArena(Function &F) 
+{
+  bool dumpAll = false;
+  for (Function::const_arg_iterator cab = F.arg_begin(),
+       cae = F.arg_end(); cab != cae; ++cab) {
+    const Argument *arg = cab;
+    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
+    if (!PT) {
+      continue;
+    }
+    Type *DereferencedType = PT->getElementType();
+    if (!dyn_cast<StructType>(DereferencedType)) {
+      continue;
+    }
+    if (!containsPointerType(DereferencedType)) {
+      continue;
+    }
+    // FIXME: Because a pointer inside of a struct/union may be aliased to
+    // another pointer we need to take the conservative approach and place all
+    // pointers into the arena until more advanced detection is implemented.
+    dumpAll = true;
+  }
+  return dumpAll;
+}
+void
+AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
+{
+  if (isConstVec.empty()) {
+    return;
+  }
+  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
+    CallInst *CI = isConstVec[x];
+    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+      : ConstantInt::get(aType, 0);
+    CI->replaceAllUsesWith(Val);
+    CI->eraseFromParent();
+  }
+  isConstVec.clear();
+}
+void 
+AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F) 
+{
+  // Don't do anything if we don't have any atomic operations.
+  if (atomicFuncs.empty()) {
+    return;
+  }
+  // Change the function name for the atomic if it is required
+  uint32_t size = atomicFuncs.size();
+  for (uint32_t x = 0; x < size; ++x) {
+    atomicFuncs[x].first->setOperand(
+        atomicFuncs[x].first->getNumOperands()-1, 
+        atomicFuncs[x].second);
+
+  }
+  mChanged = true;
+  if (mConvertAtomics) {
+    return;
+  }
+  // If we did not convert all of the atomics, then we need to make sure that
+  // the atomics that were not converted have their base pointers set to use the
+  // arena path.
+  Function::arg_iterator argB = F.arg_begin();
+  Function::arg_iterator argE = F.arg_end();
+  AMDILKernelManager *KM = mSTM->getKernelManager();
+  AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
+    .getInfo<AMDILMachineFunctionInfo>();
+  for (; argB != argE; ++argB) {
+    if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
+      KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
+      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
+    } else {
+      KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
+      mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
+    }
+  }
+}
+
+bool 
+AMDILPeepholeOpt::runOnFunction(Function &MF) 
+{
+  mChanged = false;
+  mF = &MF;
+  mSTM = &TM.getSubtarget<AMDILSubtarget>();
+  if (mDebug) {
+    MF.dump();
+  }
+  mCTX = &MF.getType()->getContext();
+  mConvertAtomics = true;
+  if (dumpAllIntoArena(MF)) {
+    for (Function::const_arg_iterator cab = MF.arg_begin(),
+         cae = MF.arg_end(); cab != cae; ++cab) {
+      const Argument *arg = cab;
+      AMDILKernelManager *KM = mSTM->getKernelManager();
+      KM->setUAVID(getBasePointerValue(arg),
+          mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
+    }
+  }
+  mRWGOpt = mSTM->getGlobalManager()->hasRWG(MF.getName());
+  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+     std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
+                  this));
+
+  doAtomicConversionIfNeeded(MF);
+  doIsConstCallConversionIfNeeded();
+
+  if (mDebug) {
+    MF.dump();
+  }
+  return mChanged;
+}
+
+bool 
+AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) 
+{
+  Instruction *inst = (*bbb);
+  CallInst *CI = dyn_cast<CallInst>(inst);
+  if (!CI) {
+    return false;
+  }
+  if (isSigned24BitOps(CI)) {
+    expandSigned24BitOps(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (isRWGLocalOpt(CI)) {
+    expandRWGLocalOpt(CI);
+    return false;
+  }
+  if (propagateSamplerInst(CI)) {
+    return false;
+  }
+  if (expandBFI(CI) || expandBFM(CI)) {
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  if (convertAccurateDivide(CI)) {
+    expandAccurateDivide(CI);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+
+  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
+  if (calleeName.startswith("__amdil_is_constant")) {
+    // If optimizations are disabled, this cannot be properly
+    // evaluated, so we add the call instruction to a vector and
+    // process it at the end, after the samplers have been
+    // correctly handled.
+    if (optLevel == CodeGenOpt::None) {
+      isConstVec.push_back(CI);
+      return false;
+    } else {
+      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
+      Type *aType = Type::getInt32Ty(*mCTX);
+      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
+        : ConstantInt::get(aType, 0);
+      CI->replaceAllUsesWith(Val);
+      ++(*bbb);
+      CI->eraseFromParent();
+      return true;
+    }
+  }
+
+  if (calleeName.equals("__amdil_is_asic_id_i32")) {
+    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = CV;
+    if (Val) {
+      Val = ConstantInt::get(aType, 
+          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
+    } else {
+      Val = ConstantInt::get(aType, 0);
+    }
+    CI->replaceAllUsesWith(Val);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
+  if (!F) {
+    return false;
+  } 
+  if (F->getName().startswith("__atom") && !CI->getNumUses() 
+      && F->getName().find("_xchg") == StringRef::npos) {
+    std::string buffer(F->getName().str() + "_noret");
+    F = dyn_cast<Function>(
+          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
+    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
+  }
+  
+  if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
+      && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+    return false;
+  }
+  if (!mConvertAtomics) {
+    return false;
+  }
+  StringRef name = F->getName();
+  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
+    Value *ptr = CI->getOperand(0);
+    const Value *basePtr = getBasePointerValue(ptr);
+    const Argument *Arg = dyn_cast<Argument>(basePtr);
+    if (Arg) {
+      AMDILGlobalManager *GM = mSTM->getGlobalManager();
+      int32_t id = GM->getArgID(Arg);
+      if (id >= 0) {
+        std::stringstream ss;
+        ss << name.data() << "_" << id << '\n';
+        std::string val;
+        ss >> val;
+        F = dyn_cast<Function>(
+              F->getParent() ->getOrInsertFunction(val, F->getFunctionType()));
+        atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
+      } else {
+        mConvertAtomics = false;
+      }
+    } else {
+      mConvertAtomics = false;
+    }
+  }
+  return false;
+}
+
+bool
+AMDILPeepholeOpt::setupBitInsert(Instruction *base, 
+    Instruction *&src, 
+    Constant *&mask, 
+    Constant *&shift)
+{
+  if (!base) {
+    if (mDebug) {
+      dbgs() << "Null pointer passed into function.\n";
+    }
+    return false;
+  }
+  bool andOp = false;
+  if (base->getOpcode() == Instruction::Shl) {
+    shift = dyn_cast<Constant>(base->getOperand(1));
+  } else if (base->getOpcode() == Instruction::And) {
+    mask = dyn_cast<Constant>(base->getOperand(1));
+    andOp = true;
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
+    }
+    // If the base is neither a Shl nor an And, we don't fit any of the patterns above.
+    return false;
+  }
+  src = dyn_cast<Instruction>(base->getOperand(0));
+  if (!src) {
+    if (mDebug) {
+      dbgs() << "Failed setup since the base operand is not an instruction!\n";
+    }
+    return false;
+  }
+  // If we find an 'and' operation, then we don't need to
+  // find the next operation as we already know the
+  // bits that are valid at this point.
+  if (andOp) {
+    return true;
+  }
+  if (src->getOpcode() == Instruction::Shl && !shift) {
+    shift = dyn_cast<Constant>(src->getOperand(1));
+    src = dyn_cast<Instruction>(src->getOperand(0));
+  } else if (src->getOpcode() == Instruction::And && !mask) {
+    mask = dyn_cast<Constant>(src->getOperand(1));
+  }
+  if (!mask && !shift) {
+    if (mDebug) {
+      dbgs() << "Failed setup since both mask and shift are NULL!\n";
+    }
+    // Did not find a constant mask or a shift.
+    return false;
+  }
+  return true;
+}
+bool
+AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst) 
+{
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::Or) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do an optimization on a sequence of ops that in the end equals a
+  // single ISA instruction.
+  // The base pattern for this optimization is: ((A & B) << C) | ((D & E) << F)
+  // Some simplified versions of this pattern are as follows:
+  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
+  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
+  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
+  // (A & B) | (D << F) when (1 << F) >= B
+  // (A << C) | (D & E) when (1 << C) >= E
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // The HD4XXX hardware doesn't support the ubit_insert instruction.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This optimization only works on 32bit integers.
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  // TODO: Handle vectors.
+  if (isVector) {
+    if (mDebug) {
+      dbgs() << "!!! Vectors are not supported yet!\n";
+    }
+    return false;
+  }
+  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
+  Constant *LHSMask = NULL, *RHSMask = NULL;
+  Constant *LHSShift = NULL, *RHSShift = NULL;
+  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
+  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
+  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (LHS) { LHS->dump(); }
+      if (LHSSrc) { LHSSrc->dump(); }
+      if (LHSMask) { LHSMask->dump(); }
+      if (LHSShift) { LHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (RHS) { RHS->dump(); }
+      if (RHSSrc) { RHSSrc->dump(); }
+      if (RHSMask) { RHSMask->dump(); }
+      if (RHSShift) { RHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Found an OR operation that can possibly be optimized to a ubit insert!\n";
+    dbgs() << "Op:        "; inst->dump();
+    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
+  }
+  Constant *offset = NULL;
+  Constant *width = NULL;
+  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
+  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
+  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
+  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
+  lhsMaskVal = (int32_t)(LHSMask 
+      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
+  rhsMaskVal = (int32_t)(RHSMask 
+      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
+  lhsShiftVal = (int32_t)(LHSShift 
+      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
+  rhsShiftVal = (int32_t)(RHSShift 
+      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
+  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
+  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
+  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
+  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
+  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
+  if (mDebug) {
+      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
+      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
+      dbgs() << (RHSMask ? " & E)" : ")");
+      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
+      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
+      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
+      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
+      dbgs() << "width(B) = " << lhsMaskWidth;
+      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
+      dbgs() << "offset(B) = " << lhsMaskOffset;
+      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
+      dbgs() << "Constraints: \n";
+      dbgs() << "\t(1) B ^ E == 0\n";
+      dbgs() << "\t(2-LHS) B is a mask\n";
+      dbgs() << "\t(2-RHS) E is a mask\n";
+      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
+      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
+  }
+  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
+    if (mDebug) {
+      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
+      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
+      dbgs() << "Failed constraint 1!\n";
+    }
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "LHS = " << lhsMaskOffset << "";
+    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
+    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
+    dbgs() << "\nRHS = " << rhsMaskOffset << "";
+    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
+    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
+    dbgs() << "\n";
+  }
+  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
+    offset = ConstantInt::get(aType, lhsMaskOffset, false);
+    width = ConstantInt::get(aType, lhsMaskWidth, false);
+    RHSSrc = RHS;
+    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
+      if (mDebug) {
+        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
+        dbgs() << "Failed constraint 2!\n";
+      }
+      return false;
+    }
+    if (!LHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    } else if (lhsShiftVal != lhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", LHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing LHS!\n";
+    }
+  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
+    offset = ConstantInt::get(aType, rhsMaskOffset, false);
+    width = ConstantInt::get(aType, rhsMaskWidth, false);
+    LHSSrc = RHSSrc;
+    RHSSrc = LHS;
+    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
+      if (mDebug) {
+        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
+        dbgs() << "Failed constraint 2!\n";
+      }
+      return false;
+    }
+    if (!RHSShift) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    } else if (rhsShiftVal != rhsMaskOffset) {
+      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
+          "MaskShr", RHS);
+    }
+    if (mDebug) {
+      dbgs() << "Optimizing RHS!\n";
+    }
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed constraint 3!\n";
+    }
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
+  }
+  if (!offset || !width) {
+    if (mDebug) {
+      dbgs() << "Either width or offset is NULL, failed detection!\n";
+    }
+    return false;
+  }
+  // Let's create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_insert";
+  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+        getOrInsertFunction(llvm::StringRef(name), funcType));
+  Value *Operands[4] = {
+    width,
+    offset,
+    LHSSrc,
+    RHSSrc
+  };
+  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
+  if (mDebug) {
+    dbgs() << "Old Inst: ";
+    inst->dump();
+    dbgs() << "New Inst: ";
+    CI->dump();
+    dbgs() << "\n\n";
+  }
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
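For reference, the single ISA instruction this pattern targets can be modeled as below. This is a hedged sketch: `ubit_insert_ref` is a hypothetical name, and the exact operand semantics are an assumption inferred from the pattern comments in this pass, not taken from the AMD IL spec text.

```cpp
#include <cstdint>

// Hypothetical reference model of the __amdil_ubit_insert intrinsic,
// inferred only from the pattern comments in this pass: replace 'width'
// bits of 'dst' starting at bit 'offset' with the low 'width' bits of 'src'.
static uint32_t ubit_insert_ref(uint32_t width, uint32_t offset,
                                uint32_t src, uint32_t dst) {
  // Guard width == 32, since (1u << 32) is undefined behavior.
  uint32_t field = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1u);
  uint32_t mask = field << offset;
  return (dst & ~mask) | ((src << offset) & mask);
}
```

Under this model, `(A & 0xFF) | ((D & 0xFF) << 8)` collapses to `ubit_insert_ref(8, 8, D, A & 0xFF)`, which is the kind of reduction the constraints above are checking for.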
+
+bool 
+AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst) 
+{
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::And) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do some simple optimizations on Shift right/And patterns. The
+  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
+  // value smaller than 32 and C is a mask. If C is a constant value, then the
+  // following transformation can occur. For signed integers, it turns into the
+  // function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
+  // integers, it turns into the function call dst =
+  // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
+  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
+  // Evergreen hardware.
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
+    // This does not work on HD4XXX hardware.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This only works on 32bit integers
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
+  // If the first operand is not a shift instruction, then we can return as it
+  // doesn't match this pattern.
+  if (!ShiftInst || !ShiftInst->isShift()) {
+    return false;
+  }
+  // If this is a shift left, then it doesn't match this pattern.
+  if (ShiftInst->getOpcode() == Instruction::Shl) {
+    return false;
+  }
+  bool isSigned = ShiftInst->isArithmeticShift();
+  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
+  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
+  // Let's make sure that the shift value and the and mask are constant integers.
+  if (!AndMask || !ShrVal) {
+    return false;
+  }
+  Constant *newMaskConst;
+  Constant *shiftValConst;
+  if (isVector) {
+    // Handle the vector case
+    std::vector<Constant *> maskVals;
+    std::vector<Constant *> shiftVals;
+    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
+    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
+    Type *scalarType = AndMaskVec->getType()->getScalarType();
+    assert(AndMaskVec->getNumOperands() ==
+           ShrValVec->getNumOperands() && "cannot have a "
+           "combination where the number of elements to a "
+           "shift and an and are different!");
+    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
+      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
+      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
+      if (!AndCI || !ShiftIC) {
+        return false;
+      }
+      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
+      if (!isMask_32(maskVal)) {
+        return false;
+      }
+      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
+      // If the mask or shiftval is greater than the bitcount, then break out.
+      if (maskVal >= 32 || shiftVal >= 32) {
+        return false;
+      }
+      // If the mask val is greater than the number of original bits left
+      // then this optimization is invalid.
+      if (maskVal > (32 - shiftVal)) {
+        return false;
+      }
+      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
+      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
+    }
+    newMaskConst = ConstantVector::get(maskVals);
+    shiftValConst = ConstantVector::get(shiftVals);
+  } else {
+    // Handle the scalar case
+    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
+    // This must be a mask value where all lower bits are set to 1 and then any
+    // bit higher is set to 0.
+    if (!isMask_32(maskVal)) {
+      return false;
+    }
+    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+    // Count the number of bits set in the mask, this is the width of the
+    // resulting bit set that is extracted from the source value.
+    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
+    // If the mask or shift val is greater than the bitcount, then break out.
+    if (maskVal >= 32 || shiftVal >= 32) {
+      return false;
+    }
+    // If the mask val is greater than the number of original bits left then
+    // this optimization is invalid.
+    if (maskVal > (32 - shiftVal)) {
+      return false;
+    }
+    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
+    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
+  }
+  // Let's create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_extract";
+  if (isVector) {
+    name += "_v" + itostr(numEle) + "i32";
+  } else {
+    name += "_i32";
+  }
+  // Let's create the function.
+  Function *Func = 
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+  Value *Operands[3] = {
+    newMaskConst,
+    shiftValConst,
+    ShiftInst->getOperand(0)
+  };
+  // Let's create the Call with the operands.
+  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
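The intrinsic the pass emits here can be modeled as follows. Again a hedged sketch: `ubit_extract_ref` is a hypothetical name, and the semantics are inferred from the `(A >> B) & C` pattern matched above rather than quoted from the IL spec.

```cpp
#include <cstdint>

// Hypothetical reference model of __amdil_ubit_extract(width, offset, src),
// inferred from the (A >> B) & C pattern matched above: extract 'width'
// bits of 'src' starting at bit 'offset'.
static uint32_t ubit_extract_ref(uint32_t width, uint32_t offset,
                                 uint32_t src) {
  // Guard width == 32, since (1u << 32) is undefined behavior.
  uint32_t mask = (width >= 32) ? 0xFFFFFFFFu : ((1u << width) - 1u);
  return (src >> offset) & mask;
}
```

For example, `(A >> 8) & 0xFFF` (a mask of 12 trailing ones) maps to `ubit_extract_ref(12, 8, A)`.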
+
+bool
+AMDILPeepholeOpt::expandBFI(CallInst *CI)
+{
+  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfi")) {
+    return false;
+  }
+  Type* type = CI->getOperand(0)->getType();
+  Constant *negOneConst = NULL;
+  if (type->isVectorTy()) {
+    std::vector<Constant *> negOneVals;
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      negOneVals.push_back(negOneConst);
+    }
+    negOneConst = ConstantVector::get(negOneVals);
+  } else {
+    negOneConst = ConstantInt::get(CI->getContext(), 
+        APInt(32, StringRef("-1"), 10));
+  }
+  // __amdil_bfi => (A & B) | (~A & C)
+  BinaryOperator *lhs = 
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        CI->getOperand(1), "bfi_and", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
+        "bfi_not", CI);
+  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
+      "bfi_and", CI);
+  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
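The expansion above follows directly from the comment `__amdil_bfi => (A & B) | (~A & C)`; a minimal scalar model of that identity:

```cpp
#include <cstdint>

// Reference model of the bfi expansion above: (A & B) | (~A & C). Each
// result bit is taken from 'b' where the corresponding bit of 'a' is set,
// and from 'c' where it is clear.
static uint32_t bfi_ref(uint32_t a, uint32_t b, uint32_t c) {
  return (a & b) | (~a & c);
}
```

With an all-ones selector `bfi_ref` returns `b`, and with an all-zeros selector it returns `c`, which matches the bit-select reading of the expansion.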
+
+bool
+AMDILPeepholeOpt::expandBFM(CallInst *CI)
+{
+  if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  if (!LHS->getName().startswith("__amdil_bfm")) {
+    return false;
+  }
+  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)
+  Constant *newMaskConst = NULL;
+  Constant *newShiftConst = NULL;
+  Type* type = CI->getOperand(0)->getType();
+  if (type->isVectorTy()) {
+    std::vector<Constant*> newMaskVals, newShiftVals;
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+    for (size_t x = 0,
+        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
+      newMaskVals.push_back(newMaskConst);
+      newShiftVals.push_back(newShiftConst);
+    }
+    newMaskConst = ConstantVector::get(newMaskVals);
+    newShiftConst = ConstantVector::get(newShiftVals);
+  } else {
+    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
+    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
+  }
+  BinaryOperator *lhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
+      lhs, "bfm_shl", CI);
+  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
+      newShiftConst, "bfm_sub", CI);
+  BinaryOperator *rhs =
+    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
+        newMaskConst, "bfm_mask", CI);
+  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
+  CI->replaceAllUsesWith(lhs);
+  return true;
+}
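A scalar model of the bfm formula the expansion implements, `((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)`:

```cpp
#include <cstdint>

// Reference model of the bfm expansion above: a bit-field mask of width
// (s0 & 0x1F) placed at bit offset (s1 & 0x1F). Only the low 5 bits of
// each operand participate.
static uint32_t bfm_ref(uint32_t s0, uint32_t s1) {
  return ((1u << (s0 & 0x1Fu)) - 1u) << (s1 & 0x1Fu);
}
```

For instance, width 8 at offset 4 yields the mask `0xFF0`.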
+
+bool
+AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 
+{
+  Instruction *inst = (*bbb);
+  if (optimizeCallInst(bbb)) {
+    return true;
+  }
+  if (optimizeBitExtract(inst)) {
+    return false;
+  }
+  if (optimizeBitInsert(inst)) {
+    return false;
+  }
+  if (correctMisalignedMemOp(inst)) {
+    return false;
+  }
+  return false;
+}
+bool
+AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
+{
+  LoadInst *linst = dyn_cast<LoadInst>(inst);
+  StoreInst *sinst = dyn_cast<StoreInst>(inst);
+  unsigned alignment;
+  Type* Ty = inst->getType();
+  if (linst) {
+    alignment = linst->getAlignment();
+  } else if (sinst) {
+    alignment = sinst->getAlignment();
+    Ty = sinst->getValueOperand()->getType();
+  } else {
+    return false;
+  }
+  unsigned size = getTypeSize(Ty);
+  if (size <= alignment) {
+    return false;
+  }
+  if (!Ty->isStructTy()) {
+    return false;
+  }
+  if (alignment < 4) {
+    if (linst) {
+      linst->setAlignment(0);
+      return true;
+    } else if (sinst) {
+      sinst->setAlignment(0);
+      return true;
+    }
+  }
+  return false;
+}
+bool 
+AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI) 
+{
+  if (!CI) {
+    return false;
+  }
+  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
+  std::string namePrefix = LHS->getName().substr(0, 14);
+  // Note: "__amdil_imul24_high" shares the "__amdil_imul24" prefix, so the
+  // second comparison covers it as well.
+  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24") {
+    return false;
+  }
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
+    return false;
+  }
+  return true;
+}
+
+void 
+AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI) 
+{
+  assert(isSigned24BitOps(CI) && "Must be a "
+      "signed 24 bit operation to call this function!");
+  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
+  // On 7XX and 8XX we do not have signed 24bit, so we need to
+  // expand it to the following:
+  // imul24 turns into 32bit imul
+  // imad24 turns into 32bit imad
+  // imul24_high turns into 32bit imulhigh
+  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
+    Type *aType = CI->getOperand(0)->getType();
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    callTypes.push_back(CI->getOperand(2)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imad";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+    Value *Operands[3] = {
+      CI->getOperand(0),
+      CI->getOperand(1),
+      CI->getOperand(2)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
+    BinaryOperator *mulOp =
+      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
+          CI->getOperand(1), "imul24", CI);
+    CI->replaceAllUsesWith(mulOp);
+  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
+    Type *aType = CI->getOperand(0)->getType();
+
+    bool isVector = aType->isVectorTy();
+    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
+    std::vector<Type*> callTypes;
+    callTypes.push_back(CI->getOperand(0)->getType());
+    callTypes.push_back(CI->getOperand(1)->getType());
+    FunctionType *funcType =
+      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
+    std::string name = "__amdil_imul_high";
+    if (isVector) {
+      name += "_v" + itostr(numEle) + "i32";
+    } else {
+      name += "_i32";
+    }
+    Function *Func = dyn_cast<Function>(
+                       CI->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+    Value *Operands[2] = {
+      CI->getOperand(0),
+      CI->getOperand(1)
+    };
+    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
+    nCI->insertBefore(CI);
+    CI->replaceAllUsesWith(nCI);
+  }
+}
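A sketch of what the imul24_high case computes once lowered to a full 32-bit multiply-high. `imul_high_ref` is a hypothetical name: the pass actually emits a call to `__amdil_imul_high` rather than this C++ code, so this only models the intended result.

```cpp
#include <cstdint>

// Hypothetical reference model of a 32-bit signed multiply-high: the upper
// 32 bits of the full 64-bit product. Assumes arithmetic right shift of
// negative values, which holds on mainstream compilers.
static int32_t imul_high_ref(int32_t a, int32_t b) {
  return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}
```

E.g. `(1 << 16) * (1 << 16)` is 2^32, whose high word is 1.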
+
+bool 
+AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI) 
+{
+  return (CI != NULL && mRWGOpt
+          && CI->getOperand(CI->getNumOperands() - 1)->getName() 
+          == "__amdil_get_local_size_int");
+}
+
+void 
+AMDILPeepholeOpt::expandRWGLocalOpt(CallInst *CI) 
+{
+  assert(isRWGLocalOpt(CI) &&
+         "This optimization only works when the call inst is get_local_size!");
+  std::vector<Constant *> consts;
+  for (uint32_t x = 0; x < 3; ++x) {
+    uint32_t val = mSTM->getGlobalManager()->getLocal(mF->getName(), x);
+    consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), val));
+  }
+  consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), 0));
+  Value *cVec = ConstantVector::get(consts);
+  CI->replaceAllUsesWith(cVec);
+  ++LocalFuncs;
+  return;
+}
+
+bool 
+AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI) 
+{
+  if (!CI) {
+    return false;
+  }
+  if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
+      && (mSTM->getDeviceName() == "cayman")) {
+    return false;
+  }
+  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) 
+      == "__amdil_improved_div";
+}
+
+void 
+AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI) 
+{
+  assert(convertAccurateDivide(CI)
+         && "expanding accurate divide can only happen if it is expandable!");
+  BinaryOperator *divOp =
+    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
+                           CI->getOperand(1), "fdiv32", CI);
+  CI->replaceAllUsesWith(divOp);
+}
+
+bool
+AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
+{
+  if (optLevel != CodeGenOpt::None) {
+    return false;
+  }
+
+  if (!CI) {
+    return false;
+  }
+
+  unsigned funcNameIdx = CI->getNumOperands() - 1;
+  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
+  if (calleeName != "__amdil_image2d_read_norm"
+   && calleeName != "__amdil_image2d_read_unnorm"
+   && calleeName != "__amdil_image3d_read_norm"
+   && calleeName != "__amdil_image3d_read_unnorm") {
+    return false;
+  }
+
+  unsigned samplerIdx = 1;
+  Value *sampler = CI->getOperand(samplerIdx);
+  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
+  if (!lInst) {
+    return false;
+  }
+
+  if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
+    return false;
+  }
+
+  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
+  // If we are loading from what is not a global value, then we
+  // fail and return.
+  if (!gv) {
+    return false;
+  }
+
+  // If there is no initializer, or the initializer is not a
+  // 32bit integer, we fail.
+  if (!gv->hasInitializer() 
+      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
+      return false;
+  }
+
+  // Now that we have the global variable initializer, lets replace
+  // all uses of the load instruction with the samplerVal and
+  // reparse the __amdil_is_constant() function.
+  Constant *samplerVal = gv->getInitializer();
+  lInst->replaceAllUsesWith(samplerVal);
+  return true;
+}
+
+bool 
+AMDILPeepholeOpt::doInitialization(Module &M) 
+{
+  return false;
+}
+
+bool 
+AMDILPeepholeOpt::doFinalization(Module &M) 
+{
+  return false;
+}
+
+void 
+AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 
+{
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
diff --git a/src/gallium/drivers/radeon/AMDILPointerManager.cpp b/src/gallium/drivers/radeon/AMDILPointerManager.cpp
new file mode 100644 (file)
index 0000000..9cac61c
--- /dev/null
@@ -0,0 +1,2551 @@
+//===-------- AMDILPointerManager.cpp - Manage Pointers for HW-------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// Implementation for the AMDILPointerManager classes. See header file for
+// more documentation of class.
+// TODO: This fails when function calls are enabled, must always be inlined
+//===----------------------------------------------------------------------===//
+#include "AMDILPointerManager.h"
+#include "AMDILCompilerErrors.h"
+#include "AMDILDeviceInfo.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Metadata.h"
+#include "llvm/Module.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <stdio.h>
+using namespace llvm;
+char AMDILPointerManager::ID = 0;
+namespace llvm {
+  FunctionPass*
+    createAMDILPointerManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+    {
+      return tm.getSubtarget<AMDILSubtarget>()
+        .device()->getPointerManager(tm AMDIL_OPT_LEVEL_VAR);
+    }
+}
+
+AMDILPointerManager::AMDILPointerManager(
+    TargetMachine &tm
+    AMDIL_OPT_LEVEL_DECL) :
+  MachineFunctionPass(ID),
+  TM(tm)
+{
+  mDebug = DEBUGME;
+  initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+}
+
+AMDILPointerManager::~AMDILPointerManager()
+{
+}
+
+const char*
+AMDILPointerManager::getPassName() const
+{
+  return "AMD IL Default Pointer Manager Pass";
+}
+
+void
+AMDILPointerManager::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.setPreservesAll();
+  AU.addRequiredID(MachineDominatorsID);
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDILEGPointerManager::AMDILEGPointerManager(
+    TargetMachine &tm
+    AMDIL_OPT_LEVEL_DECL) :
+  AMDILPointerManager(tm AMDIL_OPT_LEVEL_VAR),
+  TM(tm)
+{
+}
+
+AMDILEGPointerManager::~AMDILEGPointerManager()
+{
+}
+std::string
+findSamplerName(MachineInstr* MI,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    const TargetMachine *TM)
+{
+  std::string sampler = "unknown";
+  assert(MI->getNumOperands() == 5 && "Only an "
+      "image read instruction with 5 arguments can "
+      "have a sampler.");
+  assert(MI->getOperand(3).isReg() && 
+      "Argument 3 must be a register to call this function");
+  unsigned reg = MI->getOperand(3).getReg();
+  // If this register points to an argument, then
+  // we can return the argument name.
+  if (lookupTable[reg].second && dyn_cast<Argument>(lookupTable[reg].second)) {
+    return lookupTable[reg].second->getName();
+  }
+  // Otherwise the sampler is coming from memory somewhere.
+  // If the sampler memory location can be tracked, then
+  // we ascertain the sampler name that way.
+  // The most common case is when optimizations are disabled
+  // or mem2reg is not run; the sampler, when it is an
+  // argument, is then passed through the frame index.
+
+  // In the optimized case, the instruction that defines the
+  // register from operand #3 is a private load.
+  MachineRegisterInfo &regInfo = MI->getParent()->getParent()->getRegInfo();
+  assert(!regInfo.def_empty(reg) 
+      && "We don't have any defs of this register, but we aren't an argument!");
+  MachineOperand *defOp = regInfo.getRegUseDefListHead(reg);
+  MachineInstr *defMI = defOp->getParent();
+  if (isPrivateInst(TM->getInstrInfo(), defMI) && isLoadInst(TM->getInstrInfo(), defMI)) {
+    if (defMI->getOperand(1).isFI()) {
+      RegValPair &fiRVP = FIToPtrMap[reg];
+      if (fiRVP.second && dyn_cast<Argument>(fiRVP.second)) {
+        return fiRVP.second->getName();
+      } else {
+        // FIXME: Fix the case where the value stored is not a kernel argument.
+        assert(!"Found a private load of a sampler where the value isn't an argument!");
+      }
+    } else {
+      // FIXME: Fix the case where someone dynamically loads a sampler value
+      // from private memory. This is problematic because we need to know the
+      // sampler value at compile time and if it is dynamically loaded, we won't
+      // know what sampler value to use.
+      assert(!"Found a private load of a sampler that isn't from a frame index!");
+    }
+  } else {
+    // FIXME: Handle the case where the def is neither a private instruction
+    // nor a load instruction. This shouldn't occur, but assert here
+    // just to make sure that it doesn't.
+    assert(!"Found a case which we don't handle.");
+  }
+  return sampler;
+}
+
+const char*
+AMDILEGPointerManager::getPassName() const
+{
+  return "AMD IL EG Pointer Manager Pass";
+}
+
+// Helper function to determine if the current pointer is from the
+// local, region or private address spaces.
+static bool
+isLRPInst(MachineInstr *MI,
+    const AMDILTargetMachine *ATM)
+{
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  if (!MI) {
+    return false;
+  }
+  if ((isRegionInst(ATM->getInstrInfo(), MI) 
+        && STM->device()->usesHardware(AMDILDeviceInfo::RegionMem))
+      || (isLocalInst(ATM->getInstrInfo(), MI)
+        && STM->device()->usesHardware(AMDILDeviceInfo::LocalMem))
+      || (isPrivateInst(ATM->getInstrInfo(), MI)
+        && STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem))) {
+    return true;
+  }
+  return false;
+}
+
+/// Helper function to determine if the I/O instruction uses
+/// global device memory or not.
+static bool
+usesGlobal(
+    const AMDILTargetMachine *ATM,
+    MachineInstr *MI) {
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  switch(MI->getOpcode()) {
+    ExpandCaseToAllTypes(AMDIL::GLOBALSTORE);
+    ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE);
+    ExpandCaseToAllTypes(AMDIL::GLOBALLOAD);
+    ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD);
+    return true;
+    ExpandCaseToAllTypes(AMDIL::REGIONLOAD);
+    ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::REGIONSTORE);
+    ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE);
+    return !STM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
+    ExpandCaseToAllTypes(AMDIL::LOCALLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::LOCALSTORE);
+    ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE);
+    return !STM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
+    ExpandCaseToAllTypes(AMDIL::CPOOLLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD);
+    return !STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+    ExpandCaseToAllTypes(AMDIL::PRIVATELOAD);
+    ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD);
+    ExpandCaseToAllTypes(AMDIL::PRIVATESTORE);
+    ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE);
+    return !STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem);
+    default:
+    return false;
+  }
+  return false;
+}
+
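The `ExpandCaseToAllTypes`/`ExpandCaseToAllTruncTypes` macros in `usesGlobal()` expand one logical opcode into `case` labels for every typed variant, which then fall through to the shared `return` that follows them. A minimal standalone sketch of that pattern (the opcode names and both function/macro names here are ours, not the real AMDIL tables):

```cpp
#include <cassert>

// Hypothetical opcode set standing in for the per-type AMDIL variants.
enum Opcode {
  GLOBALLOAD_i32, GLOBALLOAD_f32, GLOBALLOAD_v4i32,
  LOCALLOAD_i32, LOCALLOAD_f32, LOCALLOAD_v4i32,
  OTHER_OP
};

// Mirrors how ExpandCaseToAllTypes turns one logical opcode into a run of
// fall-through case labels sharing the return statement that follows.
#define EXPAND_CASE_TO_ALL_TYPES(op) \
  case op##_i32:                     \
  case op##_f32:                     \
  case op##_v4i32:

// Global accesses always hit device memory; local accesses only do when
// the device has no hardware local memory, as in usesGlobal() above.
bool usesGlobalSketch(Opcode op, bool hwLocalMem) {
  switch (op) {
    EXPAND_CASE_TO_ALL_TYPES(GLOBALLOAD)
      return true;
    EXPAND_CASE_TO_ALL_TYPES(LOCALLOAD)
      return !hwLocalMem;
    default:
      return false;
  }
}
```

The token-pasting macro keeps the real switch readable even with dozens of typed load/store variants per address space.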
+// Helper function that allocates the default resource ID for the
+// respective I/O types.
+static void
+allocateDefaultID(
+    const AMDILTargetMachine *ATM,
+    AMDILAS::InstrResEnc &curRes,
+    MachineInstr *MI,
+    bool mDebug)
+{
+  AMDILMachineFunctionInfo *mMFI =
+    MI->getParent()->getParent()->getInfo<AMDILMachineFunctionInfo>();
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  if (mDebug) {
+    dbgs() << "Assigning instruction to default ID. Inst:";
+    MI->dump();
+  }
+  // If we use global memory, let's set the operand to
+  // the ARENA_UAV_ID.
+  if (usesGlobal(ATM, MI)) {
+    curRes.bits.ResourceID =
+      STM->device()->getResourceID(AMDILDevice::GLOBAL_ID);
+    if (isAtomicInst(ATM->getInstrInfo(), MI)) {
+      MI->getOperand(MI->getNumOperands()-1)
+        .setImm(curRes.bits.ResourceID);
+    }
+    AMDILKernelManager *KM = STM->getKernelManager();
+    if (curRes.bits.ResourceID == 8 
+        && !STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) {
+        KM->setUAVID(NULL, curRes.bits.ResourceID);
+        mMFI->uav_insert(curRes.bits.ResourceID);
+    }
+  } else if (isPrivateInst(ATM->getInstrInfo(), MI)) {
+    curRes.bits.ResourceID =
+      STM->device()->getResourceID(AMDILDevice::SCRATCH_ID);
+  } else if (isLocalInst(ATM->getInstrInfo(), MI) || isLocalAtomic(ATM->getInstrInfo(), MI)) {
+    curRes.bits.ResourceID =
+      STM->device()->getResourceID(AMDILDevice::LDS_ID);
+    mMFI->setUsesLocal();
+    if (isAtomicInst(ATM->getInstrInfo(), MI)) {
+      assert(curRes.bits.ResourceID && "Atomic resource ID "
+          "cannot be zero!");
+      MI->getOperand(MI->getNumOperands()-1)
+        .setImm(curRes.bits.ResourceID);
+    }
+  } else if (isRegionInst(ATM->getInstrInfo(), MI) || isRegionAtomic(ATM->getInstrInfo(), MI)) {
+    curRes.bits.ResourceID =
+      STM->device()->getResourceID(AMDILDevice::GDS_ID);
+    mMFI->setUsesRegion();
+    if (isAtomicInst(ATM->getInstrInfo(), MI)) {
+      assert(curRes.bits.ResourceID && "Atomic resource ID "
+          "cannot be zero!");
+      MI->getOperand(MI->getNumOperands()-1)
+        .setImm(curRes.bits.ResourceID);
+    }
+  } else if (isConstantInst(ATM->getInstrInfo(), MI)) {
+    // If we are an unknown constant instruction and the base pointer is known,
+    // set the resource ID accordingly; otherwise use the default constant ID.
+    // FIXME: this should not require the base pointer to be known to
+    // determine which constant buffer it is from.
+    AMDILGlobalManager *GM = STM->getGlobalManager();
+    MachineFunction *MF = MI->getParent()->getParent();
+    if (GM->isKernel(MF->getFunction()->getName())) {
+      const kernel &krnl = GM->getKernel(MF->getFunction()->getName());
+      const Value *V = getBasePointerValue(MI);
+      if (V && !dyn_cast<AllocaInst>(V)) {
+        curRes.bits.ResourceID = GM->getConstPtrCB(krnl, V->getName());
+        curRes.bits.HardwareInst = 1;
+      } else if (V && dyn_cast<AllocaInst>(V)) {
+          // FIXME: Need a better way to fix this. Requires a rewrite of how
+          // we lower global addresses to various address spaces.
+          // So for now, let's assume that there is only a single
+          // constant buffer that can be accessed from a load instruction
+          // that is derived from an alloca instruction.
+          curRes.bits.ResourceID = 2;
+          curRes.bits.HardwareInst = 1;
+      } else {
+        if (isStoreInst(ATM->getInstrInfo(), MI)) {
+          if (mDebug) {
+            dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+            MI->dump();
+          }
+          curRes.bits.ByteStore = 1;
+        }
+        curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::CONSTANT_ID);
+      }
+    } else {
+      if (isStoreInst(ATM->getInstrInfo(), MI)) {
+        if (mDebug) {
+          dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+          MI->dump();
+        }
+        curRes.bits.ByteStore = 1;
+      }
+      curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::GLOBAL_ID);
+      AMDILKernelManager *KM = STM->getKernelManager();
+      KM->setUAVID(NULL, curRes.bits.ResourceID);
+      mMFI->uav_insert(curRes.bits.ResourceID);
+    }
+  } else if (isAppendInst(ATM->getInstrInfo(), MI)) {
+    unsigned opcode = MI->getOpcode();
+    if (opcode == AMDIL::APPEND_ALLOC
+        || opcode == AMDIL::APPEND_ALLOC_NORET) {
+      curRes.bits.ResourceID = 1;
+    } else {
+      curRes.bits.ResourceID = 2;
+    }
+  }
+  setAsmPrinterFlags(MI, curRes);
+}
+
+// Function that parses the arguments and updates the lookupTable with the
+// pointer -> register mapping. This function also checks for cacheable
+// pointers and updates the CacheableSet with the arguments that
+// can be cached based on the readonlypointer annotation. The final
+// purpose of this function is to update the imageSet and counterSet
+// with all pointers that are either images or atomic counters.
+uint32_t
+parseArguments(MachineFunction &MF,
+    RVPVec &lookupTable,
+    const AMDILTargetMachine *ATM,
+    CacheableSet &cacheablePtrs,
+    ImageSet &imageSet,
+    AppendSet &counterSet,
+    bool mDebug)
+{
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  uint32_t writeOnlyImages = 0;
+  uint32_t readOnlyImages = 0;
+  std::string cachedKernelName = "llvm.readonlypointer.annotations.";
+  cachedKernelName.append(MF.getFunction()->getName());
+  GlobalVariable *GV = MF.getFunction()->getParent()
+    ->getGlobalVariable(cachedKernelName);
+  unsigned cbNum = 0;
+  unsigned regNum = AMDIL::R1;
+  AMDILMachineFunctionInfo *mMFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  for (Function::const_arg_iterator I = MF.getFunction()->arg_begin(),
+      E = MF.getFunction()->arg_end(); I != E; ++I) {
+    const Argument *curArg = I;
+    if (mDebug) {
+      dbgs() << "Argument: ";
+      curArg->dump();
+    }
+    Type *curType = curArg->getType();
+    // We are either a scalar or vector type that
+    // is passed by value and is not an opaque/struct
+    // type. We just need to increment regNum
+    // the correct number of times to match the number
+    // of registers that it takes up.
+    if (curType->isFPOrFPVectorTy() ||
+        curType->isIntOrIntVectorTy()) {
+      // We are scalar, so increment once and
+      // move on
+      if (!curType->isVectorTy()) {
+        lookupTable[regNum] = std::make_pair<unsigned, const Value*>(~0U, curArg);
+        ++regNum;
+        ++cbNum;
+        continue;
+      }
+      VectorType *VT = dyn_cast<VectorType>(curType);
+      // We are a vector type. If we are a 64-bit type, then
+      // we increment length / 2 times, otherwise we
+      // increment length / 4 times. The only corner case
+      // is vec3, where the vector gets scalarized and
+      // therefore we need a loop count of 3.
+      size_t loopCount = VT->getNumElements();
+      if (loopCount != 3) {
+        if (VT->getScalarSizeInBits() == 64) {
+          loopCount = loopCount >> 1;
+        } else {
+          loopCount = (loopCount + 2) >> 2;
+        }
+        cbNum += loopCount;
+      } else {
+        cbNum++;
+      }
+      while (loopCount--) {
+        lookupTable[regNum] = std::make_pair<unsigned, const Value*>(~0U, curArg);
+        ++regNum;
+      }
+    } else if (curType->isPointerTy()) {
+      Type *CT = dyn_cast<PointerType>(curType)->getElementType();
+      const StructType *ST = dyn_cast<StructType>(CT);
+      if (ST && ST->isOpaque()) {
+        StringRef name = ST->getName();
+        bool i1d_type  = name == "struct._image1d_t";
+        bool i1da_type = name == "struct._image1d_array_t";
+        bool i1db_type = name == "struct._image1d_buffer_t";
+        bool i2d_type  = name == "struct._image2d_t";
+        bool i2da_type = name == "struct._image2d_array_t";
+        bool i3d_type  = name == "struct._image3d_t";
+        bool c32_type  = name == "struct._counter32_t";
+        bool c64_type  = name == "struct._counter64_t";
+        if (i2d_type || i3d_type || i2da_type ||
+            i1d_type || i1db_type || i1da_type) {
+          imageSet.insert(I);
+          uint32_t imageNum = readOnlyImages + writeOnlyImages;
+          if (STM->getGlobalManager()
+              ->isReadOnlyImage(MF.getFunction()->getName(), imageNum)) {
+            if (mDebug) {
+              dbgs() << "Pointer: '" << curArg->getName()
+                << "' is a read only image # " << readOnlyImages << "!\n";
+            }
+            // We store the cbNum along with the image number so that we can
+            // correctly encode the 'info' intrinsics.
+            lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+              ((cbNum << 16 | readOnlyImages++), curArg);
+          } else if (STM->getGlobalManager()
+              ->isWriteOnlyImage(MF.getFunction()->getName(), imageNum)) {
+            if (mDebug) {
+              dbgs() << "Pointer: '" << curArg->getName()
+                << "' is a write only image # " << writeOnlyImages << "!\n";
+            }
+            // We store the cbNum along with the image number so that we can
+            // correctly encode the 'info' intrinsics.
+            lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+              ((cbNum << 16 | writeOnlyImages++), curArg);
+          } else {
+            assert(!"Read/Write images are not supported!");
+          }
+          ++regNum;
+          cbNum += 2;
+          continue;
+        } else if (c32_type || c64_type) {
+          if (mDebug) {
+            dbgs() << "Pointer: '" << curArg->getName()
+              << "' is a " << (c32_type ? "32" : "64")
+              << " bit atomic counter type!\n";
+          }
+          counterSet.push_back(I);
+        }
+      }
+
+      if (STM->device()->isSupported(AMDILDeviceInfo::CachedMem) 
+          && GV && GV->hasInitializer()) {
+        const ConstantArray *nameArray 
+          = dyn_cast_or_null<ConstantArray>(GV->getInitializer());
+        if (nameArray) {
+          for (unsigned x = 0, y = nameArray->getNumOperands(); x < y; ++x) {
+            const GlobalVariable *gV = dyn_cast_or_null<GlobalVariable>(
+                nameArray->getOperand(x)->getOperand(0));
+            const ConstantDataArray *argName =
+              dyn_cast_or_null<ConstantDataArray>(gV->getInitializer());
+            if (!argName) {
+              continue;
+            }
+            std::string argStr = argName->getAsString();
+            std::string curStr = curArg->getName();
+            if (!strcmp(argStr.data(), curStr.data())) {
+              if (mDebug) {
+                dbgs() << "Pointer: '" << curArg->getName() 
+                  << "' is cacheable!\n";
+              }
+              cacheablePtrs.insert(curArg);
+            }
+          }
+        }
+      }
+      uint32_t as = dyn_cast<PointerType>(curType)->getAddressSpace();
+      // Handle the case where the kernel argument is a pointer
+      if (mDebug) {
+        dbgs() << "Pointer: " << curArg->getName() << " is assigned ";
+        if (as == AMDILAS::GLOBAL_ADDRESS) {
+          dbgs() << "uav " << STM->device()
+            ->getResourceID(AMDILDevice::GLOBAL_ID);
+        } else if (as == AMDILAS::PRIVATE_ADDRESS) {
+          dbgs() << "scratch " << STM->device()
+            ->getResourceID(AMDILDevice::SCRATCH_ID);
+        } else if (as == AMDILAS::LOCAL_ADDRESS) {
+          dbgs() << "lds " << STM->device()
+            ->getResourceID(AMDILDevice::LDS_ID);
+        } else if (as == AMDILAS::CONSTANT_ADDRESS) {
+          dbgs() << "cb " << STM->device()
+            ->getResourceID(AMDILDevice::CONSTANT_ID);
+        } else if (as == AMDILAS::REGION_ADDRESS) {
+          dbgs() << "gds " << STM->device()
+            ->getResourceID(AMDILDevice::GDS_ID);
+        } else {
+          assert(!"Found an address space that we don't support!");
+        }
+        dbgs() << " @ register " << regNum << ". Inst: ";
+        curArg->dump();
+      }
+      switch (as) {
+        default:
+          lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+            (STM->device()->getResourceID(AMDILDevice::GLOBAL_ID), curArg);
+          break;
+        case AMDILAS::LOCAL_ADDRESS:
+          lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+            (STM->device()->getResourceID(AMDILDevice::LDS_ID), curArg);
+          mMFI->setHasLocalArg();
+          break;
+        case AMDILAS::REGION_ADDRESS:
+          lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+            (STM->device()->getResourceID(AMDILDevice::GDS_ID), curArg);
+          mMFI->setHasRegionArg();
+          break;
+        case AMDILAS::CONSTANT_ADDRESS:
+          lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+            (STM->device()->getResourceID(AMDILDevice::CONSTANT_ID), curArg);
+          break;
+        case AMDILAS::PRIVATE_ADDRESS:
+          lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+            (STM->device()->getResourceID(AMDILDevice::SCRATCH_ID), curArg);
+          break;
+      }
+      // In this case we need to increment it once.
+      ++regNum;
+      ++cbNum;
+    } else {
+      // Is anything missing that is legal in CL?
+      assert(0 && "Current type is not supported!");
+      lookupTable[regNum] = std::make_pair<unsigned, const Value*>
+        (STM->device()->getResourceID(AMDILDevice::GLOBAL_ID), curArg);
+      ++regNum;
+      ++cbNum;
+    }
+  }
+  return writeOnlyImages;
+}
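The vector-argument slot accounting in `parseArguments()` can be isolated into a small helper. This is a sketch of the rule only (the function name is ours; the arithmetic is taken from the loop-count computation above):

```cpp
#include <cassert>
#include <cstddef>

// Register-slot count for a by-value vector kernel argument: vec3 stays
// scalarized (3 slots), 64-bit elements pack two per register, and
// narrower elements pack up to four per register.
std::size_t vectorArgRegCount(std::size_t numElements, unsigned scalarBits) {
  if (numElements == 3)
    return 3;                     // vec3 is scalarized, one slot per lane
  if (scalarBits == 64)
    return numElements >> 1;      // two 64-bit lanes per register
  return (numElements + 2) >> 2;  // up to four narrower lanes per register
}
```

So, for example, a `float4` occupies one slot while a `double4` occupies two, matching how many times the pass advances `regNum` for the argument.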
+// The call stack is interesting in that even in SSA form, it assigns
+// registers to the same values over and over again. So we need to
+// ignore the values that are assigned and just deal with the input
+// and return registers.
+static void
+parseCall(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    RVPVec &lookupTable,
+    MachineBasicBlock::iterator &mBegin,
+    MachineBasicBlock::iterator mEnd,
+    bool mDebug)
+{
+  SmallVector<unsigned, 8> inputRegs;
+  AMDILAS::InstrResEnc curRes;
+  if (mDebug) {
+    dbgs() << "Parsing Call Stack Start.\n";
+  }
+  MachineBasicBlock::iterator callInst = mBegin;
+  MachineInstr *CallMI = callInst;
+  getAsmPrinterFlags(CallMI, curRes);
+  MachineInstr *MI = --mBegin;
+  unsigned reg = AMDIL::R1;
+  // First we need to check the input registers.
+  do {
+    // We stop if we hit the beginning of the call stack
+    // adjustment.
+    if (MI->getOpcode() == AMDIL::ADJCALLSTACKDOWN
+        || MI->getOpcode() == AMDIL::ADJCALLSTACKUP
+        || MI->getNumOperands() != 2
+        || !MI->getOperand(0).isReg()) {
+      break;
+    }
+    reg = MI->getOperand(0).getReg();
+    if (MI->getOperand(1).isReg()) {
+      unsigned reg1 = MI->getOperand(1).getReg();
+      inputRegs.push_back(reg1);
+      if (lookupTable[reg1].second) {
+        curRes.bits.PointerPath = 1;
+      }
+    }
+    lookupTable.erase(reg);
+    if ((signed)reg < 0 
+    || mBegin == CallMI->getParent()->begin()) {
+      break;
+    }
+    MI = --mBegin;
+  } while (1);
+  mBegin = callInst;
+  MI = ++mBegin;
+  // If the next instruction's operand 1 is not a register or that register
+  // is not R1, then we don't have any return values.
+  if (MI->getNumOperands() == 2 
+      && MI->getOperand(1).isReg() 
+      && MI->getOperand(1).getReg() == AMDIL::R1) {
+    // Next we check the output register.
+    reg = MI->getOperand(0).getReg();
+    // Now we link the inputs to the output.
+    for (unsigned x = 0; x < inputRegs.size(); ++x) {
+      if (lookupTable[inputRegs[x]].second) {
+        curRes.bits.PointerPath = 1;
+        lookupTable[reg] = lookupTable[inputRegs[x]];
+        InstToPtrMap[CallMI].insert(
+            lookupTable[reg].second);
+        break;
+      }
+    }
+    lookupTable.erase(MI->getOperand(1).getReg());
+  }
+  setAsmPrinterFlags(CallMI, curRes);
+  if (mDebug) {
+    dbgs() << "Parsing Call Stack End.\n";
+  }
+  return;
+}
+
+// Detect if the current instruction conflicts with another instruction
+// and add the instruction to the correct location accordingly.
+static void
+detectConflictInst(
+    MachineInstr *MI,
+    AMDILAS::InstrResEnc &curRes,
+    RVPVec &lookupTable,
+    InstPMap &InstToPtrMap,
+    bool isLoadStore,
+    unsigned reg,
+    unsigned dstReg,
+    bool mDebug)
+{
+  // If the instruction does not have a pointer path flag
+  // associated with it, then we know that no other pointer
+  // hits this instruction.
+  if (!curRes.bits.PointerPath) {
+    if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) {
+      curRes.bits.PointerPath = 1;
+    }
+    // We don't want to transfer the pointer to the destination
+    // register for loads/stores, because the load dest can be on a
+    // completely different pointer path and the store doesn't have a
+    // real destination register.
+    if (!isLoadStore) {
+      if (mDebug) {
+        if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) {
+          dbgs() << "Pointer: " << lookupTable[reg].second->getName();
+          assert(dyn_cast<PointerType>(lookupTable[reg].second->getType())
+              && "Must be a pointer type for an instruction!");
+          switch (dyn_cast<PointerType>(
+                lookupTable[reg].second->getType())->getAddressSpace())
+          {
+            case AMDILAS::GLOBAL_ADDRESS:  dbgs() << " UAV: "; break;
+            case AMDILAS::LOCAL_ADDRESS: dbgs() << " LDS: "; break;
+            case AMDILAS::REGION_ADDRESS: dbgs() << " GDS: "; break;
+            case AMDILAS::PRIVATE_ADDRESS: dbgs() << " SCRATCH: "; break;
+            case AMDILAS::CONSTANT_ADDRESS: dbgs() << " CB: "; break;
+
+          }
+          dbgs() << lookupTable[reg].first << " Reg: " << reg
+            << " assigned to reg " << dstReg << ". Inst: ";
+          MI->dump();
+        }
+      }
+      // We don't want to do any copies if the register is not virtual
+      // as it is the result of a CALL. ParseCallInst handles the
+      // case where the input and output need to be linked up 
+      // if it occurs. The easiest way to check for virtual
+      // is to check the top bit.
+      lookupTable[dstReg] = lookupTable[reg];
+    }
+  } else {
+    if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) {
+      // Otherwise we have a conflict between two pointers somehow.
+      curRes.bits.ConflictPtr = 1;
+      if (mDebug) {
+        dbgs() << "Pointer: " << lookupTable[reg].second->getName();
+        assert(dyn_cast<PointerType>(lookupTable[reg].second->getType())
+            && "Must be a pointer type for a conflict instruction!");
+        switch (dyn_cast<PointerType>(
+              lookupTable[reg].second->getType())->getAddressSpace())
+        {
+          case AMDILAS::GLOBAL_ADDRESS:  dbgs() << " UAV: "; break;
+          case AMDILAS::LOCAL_ADDRESS: dbgs() << " LDS: "; break;
+          case AMDILAS::REGION_ADDRESS: dbgs() << " GDS: "; break;
+          case AMDILAS::PRIVATE_ADDRESS: dbgs() << " SCRATCH: "; break;
+          case AMDILAS::CONSTANT_ADDRESS: dbgs() << " CB: "; break;
+
+        }
+        dbgs() << lookupTable[reg].first << " Reg: " << reg;
+        if (InstToPtrMap[MI].size() > 1) {
+          dbgs() << " conflicts with:\n ";
+          for (PtrSet::iterator psib = InstToPtrMap[MI].begin(),
+              psie = InstToPtrMap[MI].end(); psib != psie; ++psib) {
+            dbgs() << "\t\tPointer: " << (*psib)->getName() << " ";
+            assert(dyn_cast<PointerType>((*psib)->getType())
+                && "Must be a pointer type for a conflict instruction!");
+            (*psib)->dump();
+          }
+        } else {
+          dbgs() << ".";
+        }
+        dbgs() << " Inst: ";
+        MI->dump();
+      }
+    }
+    // Add the conflicting values to the pointer set for the instruction
+    InstToPtrMap[MI].insert(lookupTable[reg].second);
+    // We don't want to add the destination register if
+    // we are a load or store.
+    if (!isLoadStore) {
+      InstToPtrMap[MI].insert(lookupTable[dstReg].second);
+    }
+  }
+  setAsmPrinterFlags(MI, curRes);
+}
+
+// In this case we want to handle a load instruction.
+static void
+parseLoadInst(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    CPoolSet &cpool,
+    BlockCacheableInfo &bci,
+    MachineInstr *MI,
+    bool mDebug)
+{
+  assert(isLoadInst(ATM->getInstrInfo(), MI) && "Only a load instruction can be parsed by "
+      "the parseLoadInst function.");
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  unsigned dstReg = MI->getOperand(0).getReg();
+  unsigned idx = 0;
+  const Value *basePtr = NULL;
+  if (MI->getOperand(1).isReg()) {
+    idx = MI->getOperand(1).getReg();
+    basePtr = lookupTable[idx].second;
+    // If we don't know what value the register
+    // is assigned to, then we need to special case
+    // this instruction.
+  } else if (MI->getOperand(1).isFI()) {
+    idx = MI->getOperand(1).getIndex();
+    lookupTable[dstReg] = FIToPtrMap[idx];
+  } else if (MI->getOperand(1).isCPI()) {
+    cpool.insert(MI);
+  } 
+  // If we are a hardware local, then we don't need to track as there
+  // is only one resource ID that we need to know about, so we
+  // map it using allocateDefaultID, which maps it to the default.
+  // This is also the case for REGION_ADDRESS and PRIVATE_ADDRESS.
+  if (isLRPInst(MI, ATM) || !basePtr) {
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+    return;
+  }
+  // We have a load instruction so we map this instruction
+  // to the pointer and insert it into the set of known
+  // load instructions.
+  InstToPtrMap[MI].insert(basePtr);
+  PtrToInstMap[basePtr].push_back(MI);
+
+  if (isGlobalInst(ATM->getInstrInfo(), MI)) {
+    // Add to the cacheable set for the block. If there was a store earlier
+    // in the block, this call won't actually add it to the cacheable set.
+    bci.addPossiblyCacheableInst(ATM, MI);
+  }
+
+  if (mDebug) {
+    dbgs() << "Assigning instruction to pointer ";
+    dbgs() << basePtr->getName() << ". Inst: ";
+    MI->dump();
+  }
+  detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true,
+      idx, dstReg, mDebug);
+}
+
+// In this case we want to handle a store instruction.
+static void
+parseStoreInst(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    CPoolSet &cpool,
+    BlockCacheableInfo &bci,
+    MachineInstr *MI,
+    ByteSet &bytePtrs,
+    ConflictSet &conflictPtrs,
+    bool mDebug)
+{
+  assert(isStoreInst(ATM->getInstrInfo(), MI) && "Only a store instruction can be parsed by "
+      "the parseStoreInst function.");
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  unsigned dstReg = MI->getOperand(0).getReg();
+
+  // If the data part of the store instruction is known to
+  // be a pointer, then we need to mark this pointer as being
+  // a byte pointer. This is the conservative case that needs
+  // to be handled correctly.
+  if (lookupTable[dstReg].second && lookupTable[dstReg].first != ~0U) {
+    curRes.bits.ConflictPtr = 1;
+    if (mDebug) {
+      dbgs() << "Found a case where the pointer is being stored!\n";
+      MI->dump();
+      dbgs() << "Pointer is ";
+      lookupTable[dstReg].second->print(dbgs());
+      dbgs() << "\n";
+    }
+    //PtrToInstMap[lookupTable[dstReg].second].push_back(MI);
+    if (lookupTable[dstReg].second->getType()->isPointerTy()) {
+      conflictPtrs.insert(lookupTable[dstReg].second);
+    }
+  }
+
+  // Before we go through the special cases, for the cacheable information
+  // all we care about is whether the store is global or not.
+  if (!isLRPInst(MI, ATM)) {
+    bci.setReachesExit();
+  }
+
+  // If the address is not a register address,
+  // then we need to lower it as an unknown id.
+  if (!MI->getOperand(1).isReg()) {
+    if (MI->getOperand(1).isCPI()) {
+      if (mDebug) {
+        dbgs() << "Found an instruction with a CPI index #"
+          << MI->getOperand(1).getIndex() << "!\n";
+      }
+      cpool.insert(MI);
+    } else if (MI->getOperand(1).isFI()) {
+      if (mDebug) {
+        dbgs() << "Found an instruction with a frame index #"
+          << MI->getOperand(1).getIndex() << "!\n";
+      }
+      // If we are a frame index and we are storing a pointer there, let's
+      // go ahead and assign the pointer to the location within the frame
+      // index map so that we can get the value out later.
+      FIToPtrMap[MI->getOperand(1).getIndex()] = lookupTable[dstReg];
+    }
+
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+    return;
+  }
+  unsigned reg = MI->getOperand(1).getReg();
+  // If we don't know what value the register
+  // is assigned to, then we need to special case
+  // this instruction.
+  if (!lookupTable[reg].second) {
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+    return;
+  }
+  // const Value *basePtr = lookupTable[reg].second;
+  // If we are a hardware local, then we don't need to track as there
+  // is only one resource ID that we need to know about, so we
+  // map it using allocateDefaultID, which maps it to the default.
+  // This is also the case for REGION_ADDRESS and PRIVATE_ADDRESS.
+  if (isLRPInst(MI, ATM)) {
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+    return;
+  }
+
+  // We have a store instruction so we map this instruction
+  // to the pointer and insert it into the set of known
+  // store instructions.
+  InstToPtrMap[MI].insert(lookupTable[reg].second);
+  PtrToInstMap[lookupTable[reg].second].push_back(MI);
+  uint16_t RegClass = MI->getDesc().OpInfo[0].RegClass;
+  switch (RegClass) {
+    default:
+      break;
+    case AMDIL::GPRI8RegClassID:
+    case AMDIL::GPRV2I8RegClassID:
+    case AMDIL::GPRI16RegClassID:
+      if (usesGlobal(ATM, MI)) {
+        if (mDebug) {
+          dbgs() << "Annotating instruction as Byte Store. Inst: ";
+          MI->dump();
+        }
+        curRes.bits.ByteStore = 1;
+        setAsmPrinterFlags(MI, curRes);
+        const PointerType *PT = dyn_cast<PointerType>(
+            lookupTable[reg].second->getType());
+        if (PT) {
+          bytePtrs.insert(lookupTable[reg].second);
+        }
+      }
+      break;
+  }
+  // If we are a truncating store, then we need to determine the
+  // size of the pointer that we are truncating to, and if we
+  // are less than 32 bits, we need to mark the pointer as a
+  // byte store pointer.
+  switch (MI->getOpcode()) {
+    case AMDIL::GLOBALTRUNCSTORE_i16i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i16i8:
+    case AMDIL::GLOBALTRUNCSTORE_i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i32i8:
+    case AMDIL::GLOBALTRUNCSTORE_i64i8:
+    case AMDIL::GLOBALTRUNCSTORE_v2i64i8:
+    case AMDIL::GLOBALTRUNCSTORE_i32i16:
+    case AMDIL::GLOBALTRUNCSTORE_i64i16:
+    case AMDIL::GLOBALSTORE_i8:
+    case AMDIL::GLOBALSTORE_i16:
+      curRes.bits.ByteStore = 1;
+      setAsmPrinterFlags(MI, curRes);
+      bytePtrs.insert(lookupTable[reg].second);
+      break;
+    default:
+      break;
+  }
+
+  if (mDebug) {
+    dbgs() << "Assigning instruction to pointer ";
+    dbgs() << lookupTable[reg].second->getName() << ". Inst: ";
+    MI->dump();
+  }
+  detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true,
+      reg, dstReg, mDebug);
+}
+
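The two switches in `parseStoreInst()` both implement one width rule: a global store needs the slower byte-store path whenever the width actually written to memory is below 32 bits (the sub-32-bit register classes, the `*TRUNCSTORE_*i8`/`*i16` variants, and `GLOBALSTORE_i8`/`_i16`). A sketch of just that rule; the real code matches specific opcodes rather than taking a width parameter, and the function name is ours:

```cpp
#include <cassert>

// True when a store must be flagged ByteStore and its base pointer added
// to the byte-pointer set: only global stores narrower than 32 bits.
bool storeNeedsBytePath(unsigned memBits, bool isGlobalStore) {
  return isGlobalStore && memBits < 32;
}
```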
+// In this case we want to handle an atomic instruction.
+static void
+parseAtomicInst(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    RVPVec &lookupTable,
+    BlockCacheableInfo &bci,
+    MachineInstr *MI,
+    ByteSet &bytePtrs,
+    bool mDebug)
+{
+  assert(isAtomicInst(ATM->getInstrInfo(), MI) && "Only an atomic instruction can be parsed by "
+      "the parseAtomicInst function.");
+  AMDILAS::InstrResEnc curRes;
+  unsigned dstReg = MI->getOperand(0).getReg();
+  unsigned reg = 0;
+  getAsmPrinterFlags(MI, curRes);
+  unsigned numOps = MI->getNumOperands();
+  bool found = false;
+  while (--numOps) {
+    MachineOperand &Op = MI->getOperand(numOps);
+    if (!Op.isReg()) {
+      continue;
+    }
+    reg = Op.getReg();
+    // If the register is not known to be owned by a pointer
+    // then we can ignore it
+    if (!lookupTable[reg].second) {
+      continue;
+    }
+    // if the pointer is known to be local, region or private, then we
+    // can ignore it.  Although there are no private atomics, we still
+    // do this check so we don't have to write a new function to check
+    // for only local and region.
+    if (isLRPInst(MI, ATM)) {
+      continue;
+    }
+    found = true;
+    InstToPtrMap[MI].insert(lookupTable[reg].second);
+    PtrToInstMap[lookupTable[reg].second].push_back(MI);
+
+    // We now know we have an atomic operation on global memory.
+    // This is a store, so we must update the cacheable information.
+    bci.setReachesExit();
+
+    // Only do this if we have an SC with the arena atomic bug fix (EPR 326883).
+    // TODO: enable once an SC with EPR 326883 has been promoted to CAL.
+    if (ATM->getSubtargetImpl()->calVersion() >= CAL_VERSION_SC_150) {
+      // Force pointers that are used by atomics to be in the arena.
+      // If they were allowed to be accessed as RAW they would cause
+      // all accesses to use the slow complete path.
+      if (mDebug) {
+        dbgs() << __LINE__ << ": Setting byte store bit on atomic instruction: ";
+        MI->dump();
+      }
+      curRes.bits.ByteStore = 1;
+      bytePtrs.insert(lookupTable[reg].second);
+    }
+
+    if (mDebug) {
+      dbgs() << "Assigning instruction to pointer ";
+      dbgs() << lookupTable[reg].second->getName() << ". Inst: ";
+      MI->dump();
+    }
+    detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true,
+        reg, dstReg, mDebug);
+  }
+  if (!found) {
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+  }
+}
+// In this case we want to handle a counter instruction.
+static void
+parseAppendInst(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    RVPVec &lookupTable,
+    MachineInstr *MI,
+    bool mDebug)
+{
+  assert(isAppendInst(ATM->getInstrInfo(), MI) && "Only an atomic counter instruction can be "
+      "parsed by the parseAppendInst function.");
+  AMDILAS::InstrResEnc curRes;
+  unsigned dstReg = MI->getOperand(0).getReg();
+  unsigned reg = MI->getOperand(1).getReg();
+  getAsmPrinterFlags(MI, curRes);
+  // If the register is not known to be owned by a pointer
+  // then we set it to the default
+  if (!lookupTable[reg].second) {
+    allocateDefaultID(ATM, curRes, MI, mDebug);
+    return;
+  }
+  InstToPtrMap[MI].insert(lookupTable[reg].second);
+  PtrToInstMap[lookupTable[reg].second].push_back(MI);
+  if (mDebug) {
+    dbgs() << "Assigning instruction to pointer ";
+    dbgs() << lookupTable[reg].second->getName() << ". Inst: ";
+    MI->dump();
+  }
+  detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true,
+      reg, dstReg, mDebug);
+}
+// In this case we want to handle an Image instruction.
+static void
+parseImageInst(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    MachineInstr *MI,
+    bool mDebug)
+{
+  assert(isImageInst(ATM->getInstrInfo(), MI) && "Only an image instruction can be "
+      "parsed by the parseImageInst function.");
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  // AMDILKernelManager *km = 
+  //   (AMDILKernelManager *)ATM->getSubtargetImpl()->getKernelManager();
+  AMDILMachineFunctionInfo *mMFI = MI->getParent()->getParent()
+    ->getInfo<AMDILMachineFunctionInfo>();
+  if (MI->getOpcode() == AMDIL::IMAGE2D_WRITE
+      || MI->getOpcode() == AMDIL::IMAGE3D_WRITE) {
+    unsigned dstReg = MI->getOperand(0).getReg();
+    curRes.bits.ResourceID = lookupTable[dstReg].first & 0xFFFF;
+    curRes.bits.isImage = 1;
+    InstToPtrMap[MI].insert(lookupTable[dstReg].second);
+    PtrToInstMap[lookupTable[dstReg].second].push_back(MI);
+    if (mDebug) {
+      dbgs() << "Assigning instruction to pointer ";
+      dbgs() << lookupTable[dstReg].second->getName() << ". Inst: ";
+      MI->dump();
+    }
+  } else {
+    // unsigned dstReg = MI->getOperand(0).getReg();
+    unsigned reg = MI->getOperand(1).getReg();
+
+    // If the register is not known to be owned by a pointer
+    // then we set it to the default
+    if (!lookupTable[reg].second) {
+      assert(!"This should not happen for images!");
+      allocateDefaultID(ATM, curRes, MI, mDebug);
+      return;
+    }
+    InstToPtrMap[MI].insert(lookupTable[reg].second);
+    PtrToInstMap[lookupTable[reg].second].push_back(MI);
+    if (mDebug) {
+      dbgs() << "Assigning instruction to pointer ";
+      dbgs() << lookupTable[reg].second->getName() << ". Inst: ";
+      MI->dump();
+    }    
+    switch (MI->getOpcode()) {
+      case AMDIL::IMAGE2D_READ:
+      case AMDIL::IMAGE2D_READ_UNNORM:
+      case AMDIL::IMAGE3D_READ:
+      case AMDIL::IMAGE3D_READ_UNNORM:
+        curRes.bits.ResourceID = lookupTable[reg].first & 0xFFFF;
+        if (MI->getOperand(3).isReg()) {
+          // Our sampler is not a literal value.
+          std::string sampler_name;
+          unsigned reg = MI->getOperand(3).getReg();
+          if (lookupTable[reg].second) {
+            sampler_name = lookupTable[reg].second->getName();
+          }
+          if (sampler_name.empty()) {
+            sampler_name = findSamplerName(MI, lookupTable, FIToPtrMap, ATM);
+          }
+          uint32_t val = mMFI->addSampler(sampler_name, ~0U);
+          if (mDebug) {
+            dbgs() << "Mapping kernel sampler " << sampler_name
+              << " to sampler number " << val << " for Inst:\n";
+            MI->dump();
+          }
+          MI->getOperand(3).ChangeToImmediate(val);
+        } else {
+          // Our sampler is a literal value; make sure
+          // that the metadata for it is known.
+          char buffer[256];
+          snprintf(buffer, sizeof(buffer), "_%d",
+              (int32_t)MI->getOperand(3).getImm());
+          std::string sampler_name = std::string("unknown") + buffer;
+          uint32_t val = mMFI->addSampler(sampler_name, MI->getOperand(3).getImm());
+          if (mDebug) {
+            dbgs() << "Mapping internal sampler " << sampler_name 
+              << " to sampler number " << val << " for Inst:\n";
+            MI->dump();
+          }
+          MI->getOperand(3).setImm(val);
+        }
+        break;
+      case AMDIL::IMAGE2D_INFO0:
+      case AMDIL::IMAGE3D_INFO0:
+        curRes.bits.ResourceID = lookupTable[reg].first >> 16;
+        break;
+      case AMDIL::IMAGE2D_INFO1:
+      case AMDIL::IMAGE2DA_INFO1:
+        curRes.bits.ResourceID = (lookupTable[reg].first >> 16) + 1;
+        break;
+    }
+    curRes.bits.isImage = 1;
+  }
+  setAsmPrinterFlags(MI, curRes);
+}
+// This case handles the rest of the instructions
+static void
+parseInstruction(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    RVPVec &lookupTable,
+    CPoolSet &cpool,
+    MachineInstr *MI,
+    bool mDebug)
+{
+  assert(!isAtomicInst(ATM->getInstrInfo(), MI) && !isStoreInst(ATM->getInstrInfo(), MI) && !isLoadInst(ATM->getInstrInfo(), MI) &&
+      !isAppendInst(ATM->getInstrInfo(), MI) && !isImageInst(ATM->getInstrInfo(), MI) &&
+      "Atomic/Load/Store/Append/Image insts should not be handled here!");
+  unsigned numOps = MI->getNumOperands();
+  // If we don't have any operands, we can skip this instruction
+  if (!numOps) {
+    return;
+  }
+  // if the dst operand is not a register, then we can skip
+  // this instruction. That is because we are probably a branch
+  // or jump instruction.
+  if (!MI->getOperand(0).isReg()) {
+    return;
+  }
+  // If we are a LOADCONST_i32, we might be a sampler, so we need
+  // to propagate the LOADCONST to IMAGE[2|3]D_READ instructions.
+  if (MI->getOpcode() == AMDIL::LOADCONST_i32) {
+    uint32_t val = MI->getOperand(1).getImm();
+    MachineOperand* oldPtr = &MI->getOperand(0);
+    MachineOperand* moPtr = oldPtr->getNextOperandForReg();
+    while (moPtr) {
+      oldPtr = moPtr;
+      moPtr = oldPtr->getNextOperandForReg();
+      switch (oldPtr->getParent()->getOpcode()) {
+        default:
+          break;
+        case AMDIL::IMAGE2D_READ:
+        case AMDIL::IMAGE2D_READ_UNNORM:
+        case AMDIL::IMAGE3D_READ:
+        case AMDIL::IMAGE3D_READ_UNNORM:
+          if (mDebug) {
+            dbgs() << "Found a constant sampler for image read inst: ";
+            oldPtr->getParent()->print(dbgs());
+          }
+          oldPtr->ChangeToImmediate(val);
+          break;
+      }
+    }
+  }
+  AMDILAS::InstrResEnc curRes;
+  getAsmPrinterFlags(MI, curRes);
+  unsigned dstReg = MI->getOperand(0).getReg();
+  unsigned reg = 0;
+  while (--numOps) {
+    MachineOperand &Op = MI->getOperand(numOps);
+    // if the operand is not a register, then we can ignore it
+    if (!Op.isReg()) {
+      if (Op.isCPI()) {
+        cpool.insert(MI);
+      }
+      continue;
+    }
+    reg = Op.getReg();
+    // If the register is not known to be owned by a pointer
+    // then we can ignore it
+    if (!lookupTable[reg].second) {
+      continue;
+    }
+    detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, false,
+        reg, dstReg, mDebug);
+
+  }
+}
+
+// This function parses the basic block and based on the instruction type,
+// calls the function to finish parsing the instruction.
+static void
+parseBasicBlock(
+    const AMDILTargetMachine *ATM,
+    MachineBasicBlock *MB,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    ByteSet &bytePtrs,
+    ConflictSet &conflictPtrs, 
+    CPoolSet &cpool,
+    BlockCacheableInfo &bci,
+    bool mDebug)
+{
+  for (MachineBasicBlock::iterator mbb = MB->begin(), mbe = MB->end();
+      mbb != mbe; ++mbb) {
+    MachineInstr *MI = mbb;
+    if (MI->getOpcode() == AMDIL::CALL) {
+      parseCall(ATM, InstToPtrMap, PtrToInstMap, lookupTable,
+          mbb, mbe, mDebug);
+    } else if (isLoadInst(ATM->getInstrInfo(), MI)) {
+      parseLoadInst(ATM, InstToPtrMap, PtrToInstMap,
+          FIToPtrMap, lookupTable, cpool, bci, MI, mDebug);
+    } else if (isStoreInst(ATM->getInstrInfo(), MI)) {
+      parseStoreInst(ATM, InstToPtrMap, PtrToInstMap,
+          FIToPtrMap, lookupTable, cpool, bci, MI, bytePtrs, conflictPtrs, mDebug);
+    } else if (isAtomicInst(ATM->getInstrInfo(), MI)) {
+      parseAtomicInst(ATM, InstToPtrMap, PtrToInstMap,
+          lookupTable, bci, MI, bytePtrs, mDebug);
+    } else if (isAppendInst(ATM->getInstrInfo(), MI)) {
+      parseAppendInst(ATM, InstToPtrMap, PtrToInstMap,
+          lookupTable, MI, mDebug);
+    } else if (isImageInst(ATM->getInstrInfo(), MI)) {
+      parseImageInst(ATM, InstToPtrMap, PtrToInstMap,
+          FIToPtrMap, lookupTable, MI, mDebug);
+    } else {
+      parseInstruction(ATM, InstToPtrMap, PtrToInstMap,
+          lookupTable, cpool, MI, mDebug);
+    }
+  }
+}
+
+// Follows the Reverse Post Order Traversal of the basic blocks to
+// determine which order to parse basic blocks in.
+void
+parseFunction(
+    const AMDILPointerManager *PM,
+    const AMDILTargetMachine *ATM,
+    MachineFunction &MF,
+    InstPMap &InstToPtrMap,
+    PtrIMap &PtrToInstMap,
+    FIPMap &FIToPtrMap,
+    RVPVec &lookupTable,
+    ByteSet &bytePtrs,
+    ConflictSet &conflictPtrs,
+    CPoolSet &cpool,
+    MBBCacheableMap &mbbCacheable,
+    bool mDebug)
+{
+  if (mDebug) {
+    MachineDominatorTree *dominatorTree = &PM
+      ->getAnalysis<MachineDominatorTree>();
+    dominatorTree->dump();
+  }
+
+  std::list<MachineBasicBlock*> prop_worklist;
+
+  ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+  for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
+      curBlock = RPOT.begin(), endBlock = RPOT.end();
+      curBlock != endBlock; ++curBlock) {
+    MachineBasicBlock *MB = (*curBlock);
+    BlockCacheableInfo &bci = mbbCacheable[MB];
+    for (MachineBasicBlock::pred_iterator mbbit = MB->pred_begin(),
+        mbbitend = MB->pred_end();
+        mbbit != mbbitend;
+        mbbit++) {
+      MBBCacheableMap::const_iterator mbbcmit = mbbCacheable.find(*mbbit);
+      if (mbbcmit != mbbCacheable.end() &&
+          mbbcmit->second.storeReachesExit()) {
+        bci.setReachesTop();
+        break;
+      }
+    }
+
+    if (mDebug) {
+      dbgs() << "[BlockOrdering] Parsing CurrentBlock: "
+        << MB->getNumber() << "\n";
+    }
+    parseBasicBlock(ATM, MB, InstToPtrMap, PtrToInstMap,
+        FIToPtrMap, lookupTable, bytePtrs, conflictPtrs, cpool, bci, mDebug);
+
+    if (bci.storeReachesExit())
+      prop_worklist.push_back(MB);
+
+    if (mDebug) {
+      dbgs() << "BCI info: Top: " << bci.storeReachesTop() << " Exit: " 
+        << bci.storeReachesExit() << "\n Instructions:\n";
+      for (CacheableInstrSet::const_iterator cibit = bci.cacheableBegin(),
+          cibitend = bci.cacheableEnd();
+          cibit != cibitend;
+          cibit++)
+      {
+        (*cibit)->dump();
+      }
+    }
+  }
+
+  // This loop pushes any "storeReachesExit" flags into successor
+  // blocks until the flags have been fully propagated. This will
+  // ensure that blocks that have reachable stores due to loops
+  // are labeled appropriately.
+  while (!prop_worklist.empty()) {
+    MachineBasicBlock *wlb = prop_worklist.front();
+    prop_worklist.pop_front();
+    for (MachineBasicBlock::succ_iterator mbbit = wlb->succ_begin(),
+        mbbitend = wlb->succ_end();
+        mbbit != mbbitend;
+        mbbit++)
+    {
+      BlockCacheableInfo &blockCache = mbbCacheable[*mbbit];
+      if (!blockCache.storeReachesTop()) {
+        blockCache.setReachesTop();
+        prop_worklist.push_back(*mbbit);
+      }
+      if (mDebug) {
+        dbgs() << "BCI Prop info: " << (*mbbit)->getNumber() << " Top: " 
+          << blockCache.storeReachesTop() << " Exit: " 
+          << blockCache.storeReachesExit()
+          << "\n";
+      }
+    }
+  }
+}
+
+// Helper function that dumps to dbgs() information about
+// a pointer set.
+void
+dumpPointers(AppendSet &Ptrs, const char *str)
+{
+  if (Ptrs.empty()) {
+    return;
+  }
+  dbgs() << "[Dump]" << str << " found: " << "\n";
+  for (AppendSet::iterator sb = Ptrs.begin();
+      sb != Ptrs.end(); ++sb) {
+    (*sb)->dump();
+  }
+  dbgs() << "\n";
+}
+// Helper function that dumps to dbgs() information about
+// a pointer set.
+void
+dumpPointers(PtrSet &Ptrs, const char *str)
+{
+  if (Ptrs.empty()) {
+    return;
+  }
+  dbgs() << "[Dump]" << str << " found: " << "\n";
+  for (PtrSet::iterator sb = Ptrs.begin();
+      sb != Ptrs.end(); ++sb) {
+    (*sb)->dump();
+  }
+  dbgs() << "\n";
+}
+// Function that detects all the conflicting pointers and adds
+// them to the conflict set; all other pointers are added to the
+// raw or byte set based on their usage.
+void
+detectConflictingPointers(
+    const AMDILTargetMachine *ATM,
+    InstPMap &InstToPtrMap,
+    ByteSet &bytePtrs,
+    RawSet &rawPtrs,
+    ConflictSet &conflictPtrs,
+    bool mDebug)
+{
+  if (InstToPtrMap.empty()) {
+    return;
+  }
+  PtrSet aliasedPtrs;
+  const AMDILSubtarget *STM = ATM->getSubtargetImpl();
+  for (InstPMap::iterator
+      mapIter = InstToPtrMap.begin(), iterEnd = InstToPtrMap.end();
+      mapIter != iterEnd; ++mapIter) {
+    if (mDebug) {
+      dbgs() << "Instruction: ";
+      (mapIter)->first->dump();
+    }
+    MachineInstr* MI = mapIter->first;
+    AMDILAS::InstrResEnc curRes;
+    getAsmPrinterFlags(MI, curRes);
+    if (curRes.bits.isImage) {
+      continue;
+    }
+    bool byte = false;
+    // We might have a case where more than one pointer feeds the same
+    // I/O instruction.
+    if (mDebug) {
+      dbgs() << "Base Pointer[s]:\n";
+    }
+    for (PtrSet::iterator cfIter = mapIter->second.begin(),
+        cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) {
+      if (mDebug) {
+        (*cfIter)->dump();
+      }
+      if (bytePtrs.count(*cfIter)) {
+        if (mDebug) {
+          dbgs() << "Byte pointer found!\n";
+        }
+        byte = true;
+        break;
+      }
+    }
+    if (byte) {
+      for (PtrSet::iterator cfIter = mapIter->second.begin(),
+          cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) {
+        const Value *ptr = (*cfIter);
+        if (isLRPInst(mapIter->first, ATM)) {
+          // We don't need to deal with pointers to local/region/private
+          // memory regions
+          continue;
+        }
+        if (mDebug) {
+          dbgs() << "Adding pointer " << (ptr)->getName()
+            << " to byte set!\n";
+        }
+        const PointerType *PT = dyn_cast<PointerType>(ptr->getType());
+        if (PT) {
+          bytePtrs.insert(ptr);
+        }
+      }
+    } else {
+      for (PtrSet::iterator cfIter = mapIter->second.begin(),
+          cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) {
+        const Value *ptr = (*cfIter);
+        // bool aliased = false;
+        if (isLRPInst(mapIter->first, ATM)) {
+          // We don't need to deal with pointers to local/region/private
+          // memory regions
+          continue;
+        }
+        const Argument *arg = dyn_cast_or_null<Argument>(*cfIter);
+        if (!arg) {
+          continue;
+        }
+        if (!STM->device()->isSupported(AMDILDeviceInfo::NoAlias) 
+            && !arg->hasNoAliasAttr()) {
+          if (mDebug) {
+            dbgs() << "Possible aliased pointer found!\n";
+          }
+          aliasedPtrs.insert(ptr);
+        }
+        if (mapIter->second.size() > 1) {
+          if (mDebug) {
+            dbgs() << "Adding pointer " << ptr->getName()
+              << " to conflict set!\n";
+          }
+          const PointerType *PT = dyn_cast<PointerType>(ptr->getType());
+          if (PT) {
+            conflictPtrs.insert(ptr);
+          }
+        }
+        if (mDebug) {
+          dbgs() << "Adding pointer " << ptr->getName()
+            << " to raw set!\n";
+        }
+        const PointerType *PT = dyn_cast<PointerType>(ptr->getType());
+        if (PT) {
+          rawPtrs.insert(ptr);
+        }
+      }
+    }
+    if (mDebug) {
+      dbgs() << "\n";
+    }
+  }
+  // If we have any aliased pointers and byte pointers exist,
+  // then make sure that all of the aliased pointers are 
+  // part of the byte pointer set.
+  if (!bytePtrs.empty()) {
+    for (PtrSet::iterator aIter = aliasedPtrs.begin(),
+        aEnd = aliasedPtrs.end(); aIter != aEnd; ++aIter) {
+      if (mDebug) {
+        dbgs() << "Moving " << (*aIter)->getName() 
+          << " from raw to byte.\n";
+      }
+      bytePtrs.insert(*aIter);
+      rawPtrs.erase(*aIter);
+    }
+  }
+}
+// Function that detects aliased constant pool operations.
+void
+detectAliasedCPoolOps(
+    TargetMachine &TM,
+    CPoolSet &cpool,
+    bool mDebug
+    )
+{
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  if (mDebug && !cpool.empty()) {
+    dbgs() << "Instructions w/ CPool Ops: \n";
+  }
+  // The algorithm for detecting aliased cpool operations is as follows:
+  //   for each instruction that has a cpool argument,
+  //     follow the def-use chain;
+  //       if an instruction is a load and the load is a private load,
+  //         switch it to a constant pool load.
+  for (CPoolSet::iterator cpb = cpool.begin(), cpe = cpool.end();
+      cpb != cpe; ++cpb) {
+    if (mDebug) {
+      (*cpb)->dump();
+    }
+    std::queue<MachineInstr*> queue;
+    std::set<MachineInstr*> visited;
+    queue.push(*cpb);
+    MachineInstr *cur;
+    while (!queue.empty()) {
+      cur = queue.front();
+      queue.pop();
+      if (visited.count(cur)) {
+        continue;
+      }
+      if (isLoadInst(TM.getInstrInfo(), cur) && isPrivateInst(TM.getInstrInfo(), cur)) { 
+        // If we are a private load and the register is
+        // used in the address register, we need to
+        // switch from private to constant pool load.
+        if (mDebug) {
+          dbgs() << "Found an instruction that is a private load "
+            << "but should be a constant pool load.\n";
+          cur->print(dbgs());
+          dbgs() << "\n";
+        }
+        AMDILAS::InstrResEnc curRes;
+        getAsmPrinterFlags(cur, curRes);
+        curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::GLOBAL_ID);
+        curRes.bits.ConflictPtr = 1;
+        setAsmPrinterFlags(cur, curRes);
+        cur->setDesc(TM.getInstrInfo()->get(
+              (cur->getOpcode() - AMDIL::PRIVATEAEXTLOAD_f32) 
+              + AMDIL::CPOOLAEXTLOAD_f32));
+      } else {
+        if (cur->getOperand(0).isReg()) {
+          MachineOperand* ptr = cur->getOperand(0).getNextOperandForReg();
+          while (ptr && !ptr->isDef() && ptr->isReg()) {
+            queue.push(ptr->getParent());
+            ptr = ptr->getNextOperandForReg();
+          } 
+        }
+      }
+      visited.insert(cur);
+    }
+  }
+}
+// Function that detects fully cacheable pointers. Fully cacheable pointers
+// are pointers that are never written to and for which -fno-alias is
+// specified.
+void
+detectFullyCacheablePointers(
+    const AMDILTargetMachine *ATM,
+    PtrIMap &PtrToInstMap,
+    RawSet &rawPtrs,
+    CacheableSet &cacheablePtrs,
+    ConflictSet &conflictPtrs,
+    bool mDebug
+    )
+{
+  if (PtrToInstMap.empty()) {
+    return;
+  }
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  // 4XXX hardware doesn't support cached UAV opcodes, and we assume
+  // no aliasing for this to work. Also, in debug mode we don't do
+  // any caching.
+  if (STM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX
+      || !STM->device()->isSupported(AMDILDeviceInfo::CachedMem)) {
+    return;
+  }
+  if (STM->device()->isSupported(AMDILDeviceInfo::NoAlias)) {
+    for (PtrIMap::iterator mapIter = PtrToInstMap.begin(), 
+        iterEnd = PtrToInstMap.end(); mapIter != iterEnd; ++mapIter) {
+      if (mDebug) {
+        dbgs() << "Instruction: ";
+        mapIter->first->dump();
+      }
+      // Skip the pointer if we have already detected it.
+      if (cacheablePtrs.count(mapIter->first)) {
+        continue;
+      }
+      bool cacheable = true;
+      for (std::vector<MachineInstr*>::iterator 
+          miBegin = mapIter->second.begin(),
+          miEnd = mapIter->second.end(); miBegin != miEnd; ++miBegin) {
+        if (isStoreInst(ATM->getInstrInfo(), *miBegin) ||
+            isImageInst(ATM->getInstrInfo(), *miBegin) ||
+            isAtomicInst(ATM->getInstrInfo(), *miBegin)) {
+          cacheable = false;
+          break;
+        }
+      }
+      // This pointer isn't cacheable, so let's move on to the next one.
+      if (!cacheable) {
+        continue;
+      }
+      // If we are in the conflict set, let's move to the next instruction.
+      // FIXME: we need to check to see if the pointers that conflict with
+      // the current pointer are also cacheable. If they are, then add them
+      // to the cacheable list and not fail.
+      if (conflictPtrs.count(mapIter->first)) {
+        continue;
+      }
+      // Otherwise if we have no stores and no conflicting pointers, we can
+      // be added to the cacheable set.
+      if (mDebug) {
+        dbgs() << "Adding pointer " << mapIter->first->getName();
+        dbgs() << " to cached set!\n";
+      }
+      const PointerType *PT = dyn_cast<PointerType>(mapIter->first->getType());
+      if (PT) {
+        cacheablePtrs.insert(mapIter->first);
+      }
+    }
+  }
+}
+
+// Are any of the pointers in PtrSet also in the BytePtrs or the CachePtrs?
+static bool
+ptrSetIntersectsByteOrCache(
+    PtrSet &cacheSet,
+    ByteSet &bytePtrs,
+    CacheableSet &cacheablePtrs
+    )
+{
+  for (PtrSet::const_iterator psit = cacheSet.begin(),
+      psitend = cacheSet.end();
+      psit != psitend;
+      psit++) {
+    if (bytePtrs.find(*psit) != bytePtrs.end() ||
+        cacheablePtrs.find(*psit) != cacheablePtrs.end()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Function that detects which instructions are cacheable even if
+// all instructions of the pointer are not cacheable. The resulting
+// set of instructions will not contain Ptrs that are in the cacheable
+// ptr set (under the assumption they will get marked cacheable already)
+// or pointers in the byte set, since they are not cacheable.
+void
+detectCacheableInstrs(
+    MBBCacheableMap &bbCacheable,
+    InstPMap &InstToPtrMap,
+    CacheableSet &cacheablePtrs,
+    ByteSet &bytePtrs,
+    CacheableInstrSet &cacheableSet,
+    bool mDebug
+    )
+{
+  for (MBBCacheableMap::const_iterator mbbcit = bbCacheable.begin(),
+      mbbcitend = bbCacheable.end();
+      mbbcit != mbbcitend;
+      mbbcit++) {
+    for (CacheableInstrSet::const_iterator bciit 
+        = mbbcit->second.cacheableBegin(),
+        bciitend
+        = mbbcit->second.cacheableEnd();
+        bciit != bciitend;
+        bciit++) {
+      if (!ptrSetIntersectsByteOrCache(InstToPtrMap[*bciit],
+            bytePtrs, 
+            cacheablePtrs)) {
+        cacheableSet.insert(*bciit);
+      }
+    }
+  }
+}
+// This function annotates the cacheable pointers with the
+// CacheableRead bit. The cacheable read bit is set
+// when the number of write images is not equal to the max
+// or if the default RAW_UAV_ID is equal to 11. The first
+// condition means that there is a raw uav between 0 and 7
+// that is available for cacheable reads and the second
+// condition means that UAV 11 is available for cacheable
+// reads.
+void
+annotateCacheablePtrs(
+    TargetMachine &TM,
+    PtrIMap &PtrToInstMap,
+    CacheableSet &cacheablePtrs,
+    ByteSet &bytePtrs,
+    uint32_t numWriteImages,
+    bool mDebug)
+{
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager();
+  PtrSet::iterator siBegin, siEnd;
+  std::vector<MachineInstr*>::iterator miBegin, miEnd;
+  AMDILMachineFunctionInfo *mMFI = NULL;
+  // First we can check the cacheable pointers
+  for (siBegin = cacheablePtrs.begin(), siEnd = cacheablePtrs.end();
+      siBegin != siEnd; ++siBegin) {
+    assert(!bytePtrs.count(*siBegin) && "Found a cacheable pointer "
+        "that also exists as a byte pointer!");
+    for (miBegin = PtrToInstMap[*siBegin].begin(),
+        miEnd = PtrToInstMap[*siBegin].end();
+        miBegin != miEnd; ++miBegin) {
+      if (mDebug) {
+        dbgs() << "Annotating pointer as cacheable. Inst: ";
+        (*miBegin)->dump();
+      }
+      AMDILAS::InstrResEnc curRes;
+      getAsmPrinterFlags(*miBegin, curRes);
+      assert(!curRes.bits.ByteStore && "No cacheable pointers should have the "
+          "byte Store flag set!");
+      // If UAV11 is enabled, then we can enable cached reads.
+      if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) {
+        curRes.bits.CacheableRead = 1;
+        curRes.bits.ResourceID = 11;
+        setAsmPrinterFlags(*miBegin, curRes);
+        if (!mMFI) {
+          mMFI = (*miBegin)->getParent()->getParent()
+            ->getInfo<AMDILMachineFunctionInfo>();
+        }
+        mMFI->uav_insert(curRes.bits.ResourceID);
+      }
+    }
+  }
+}
+
+// A byte pointer is a pointer that along the pointer path has a
+// byte store assigned to it.
+void
+annotateBytePtrs(
+    TargetMachine &TM,
+    PtrIMap &PtrToInstMap,
+    ByteSet &bytePtrs,
+    RawSet &rawPtrs,
+    bool mDebug
+    )
+{
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  AMDILKernelManager *KM = STM->getKernelManager();
+  PtrSet::iterator siBegin, siEnd;
+  std::vector<MachineInstr*>::iterator miBegin, miEnd;
+  uint32_t arenaID = STM->device()
+      ->getResourceID(AMDILDevice::ARENA_UAV_ID);
+  if (STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) {
+    arenaID = ARENA_SEGMENT_RESERVED_UAVS + 1;
+  }
+  AMDILMachineFunctionInfo *mMFI = NULL;
+  for (siBegin = bytePtrs.begin(), siEnd = bytePtrs.end();
+      siBegin != siEnd; ++siBegin) {
+    const Value* val = (*siBegin);
+    const PointerType *PT = dyn_cast<PointerType>(val->getType());
+    if (!PT) {
+      continue;
+    }
+    const Argument *curArg = dyn_cast<Argument>(val);
+    assert(!rawPtrs.count(*siBegin) && "Found a byte pointer "
+        "that also exists as a raw pointer!");
+    bool arenaInc = false;
+    for (miBegin = PtrToInstMap[*siBegin].begin(),
+        miEnd = PtrToInstMap[*siBegin].end();
+        miBegin != miEnd; ++miBegin) {
+      if (mDebug) {
+        dbgs() << "Annotating pointer as arena. Inst: ";
+        (*miBegin)->dump();
+      }
+      AMDILAS::InstrResEnc curRes;
+      getAsmPrinterFlags(*miBegin, curRes);
+
+      if (STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)
+          && PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) {
+        // If hardware constant mem is enabled, then we need to
+        // get the constant pointer CB number and use that to specify
+        // the resource ID.
+        AMDILGlobalManager *GM = STM->getGlobalManager();
+        const StringRef funcName = (*miBegin)->getParent()->getParent()
+          ->getFunction()->getName();
+        if (GM->isKernel(funcName)) {
+          const kernel &krnl = GM->getKernel(funcName);
+          curRes.bits.ResourceID = GM->getConstPtrCB(krnl,
+              (*siBegin)->getName());
+          curRes.bits.HardwareInst = 1;
+        } else {
+          curRes.bits.ResourceID = STM->device()
+            ->getResourceID(AMDILDevice::CONSTANT_ID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::LocalMem)
+          && PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) {
+        // If hardware local mem is enabled, get the local mem ID from
+        // the device to use as the ResourceID
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::LDS_ID);
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin)) {
+          assert(curRes.bits.ResourceID && "Atomic resource ID "
+              "cannot be zero!");
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::RegionMem)
+          && PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) {
+        // If hardware region mem is enabled, get the gds mem ID from
+        // the device to use as the ResourceID
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::GDS_ID);
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin)) {
+          assert(curRes.bits.ResourceID && "Atomic resource ID "
+              "cannot be zero!");
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)
+          && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::SCRATCH_ID);
+      } else { 
+        if (mDebug) {
+          dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+          (*miBegin)->print(dbgs());
+        }
+        curRes.bits.ByteStore = 1;
+        curRes.bits.ResourceID = (curArg && curArg->hasNoAliasAttr()) ? arenaID
+          : STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID);
+        if (STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) {
+          arenaInc = true;
+        }
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin) &&
+            STM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+          // If we are an arena instruction, we need to switch the atomic opcode
+          // from the global version to the arena version.
+          MachineInstr *MI = *miBegin;
+          MI->setDesc(
+              TM.getInstrInfo()->get(
+                (MI->getOpcode() - AMDIL::ATOM_G_ADD) + AMDIL::ATOM_A_ADD));
+        }
+        if (mDebug) {
+          dbgs() << "Annotating pointer as arena. Inst: ";
+          (*miBegin)->dump();
+        }
+      }
+      setAsmPrinterFlags(*miBegin, curRes);
+      KM->setUAVID(*siBegin, curRes.bits.ResourceID);
+      if (!mMFI) {
+        mMFI = (*miBegin)->getParent()->getParent()
+          ->getInfo<AMDILMachineFunctionInfo>();
+      }
+      mMFI->uav_insert(curRes.bits.ResourceID);
+    }
+    if (arenaInc) {
+      ++arenaID;
+    }
+  }
+}
+// An append pointer is an opaque object that has append instructions
+// in its path.
+void
+annotateAppendPtrs(
+    TargetMachine &TM,
+    PtrIMap &PtrToInstMap,
+    AppendSet &appendPtrs,
+    bool mDebug)
+{
+  unsigned currentCounter = 0;
+  // const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager();
+  MachineFunction *MF = NULL;
+  for (AppendSet::iterator asBegin = appendPtrs.begin(),
+      asEnd = appendPtrs.end(); asBegin != asEnd; ++asBegin)
+  {
+    bool usesWrite = false;
+    bool usesRead = false;
+    const Value* curVal = *asBegin;
+    if (mDebug) {
+      dbgs() << "Counter: " << curVal->getName() 
+        << " assigned the counter " << currentCounter << "\n";
+    }
+    for (std::vector<MachineInstr*>::iterator 
+        miBegin = PtrToInstMap[curVal].begin(),
+        miEnd = PtrToInstMap[curVal].end(); miBegin != miEnd; ++miBegin) {
+      MachineInstr *MI = *miBegin;
+      if (!MF) {
+        MF = MI->getParent()->getParent();
+      }
+      unsigned opcode = MI->getOpcode();
+      switch (opcode) {
+        default:
+          if (mDebug) {
+            dbgs() << "Skipping instruction: ";
+            MI->dump();
+          }
+          break;
+        case AMDIL::APPEND_ALLOC:
+        case AMDIL::APPEND_ALLOC_NORET:
+          usesWrite = true;
+          MI->getOperand(1).ChangeToImmediate(currentCounter);
+          if (mDebug) {
+            dbgs() << "Assigning to counter " << currentCounter << " Inst: ";
+            MI->dump();
+          }
+          break;
+        case AMDIL::APPEND_CONSUME:
+        case AMDIL::APPEND_CONSUME_NORET:
+          usesRead = true;
+          MI->getOperand(1).ChangeToImmediate(currentCounter);
+          if (mDebug) {
+            dbgs() << "Assigning to counter " << currentCounter << " Inst: ";
+            MI->dump();
+          }
+          break;
+      };
+    }
+    if (usesWrite && usesRead && MF) {
+      MF->getInfo<AMDILMachineFunctionInfo>()->addErrorMsg(
+          amd::CompilerErrorMessage[INCORRECT_COUNTER_USAGE]);
+    }
+    ++currentCounter;
+  }
+}
+// A raw pointer is any pointer that does not have a byte store in its path.
+static void
+annotateRawPtrs(
+    TargetMachine &TM,
+    PtrIMap &PtrToInstMap,
+    RawSet &rawPtrs,
+    ByteSet &bytePtrs,
+    uint32_t numWriteImages,
+    bool mDebug
+    )
+{
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  AMDILKernelManager *KM = STM->getKernelManager();
+  PtrSet::iterator siBegin, siEnd;
+  std::vector<MachineInstr*>::iterator miBegin, miEnd;
+  AMDILMachineFunctionInfo *mMFI = NULL;
+
+  // Now all of the raw pointers will go to the raw uav.
+  for (siBegin = rawPtrs.begin(), siEnd = rawPtrs.end();
+      siBegin != siEnd; ++siBegin) {
+    const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType());
+    if (!PT) {
+      continue;
+    }
+    assert(!bytePtrs.count(*siBegin) && "Found a raw pointer "
+        "that also exists as a byte pointer!");
+    for (miBegin = PtrToInstMap[*siBegin].begin(),
+        miEnd = PtrToInstMap[*siBegin].end();
+        miBegin != miEnd; ++miBegin) {
+      if (mDebug) {
+        dbgs() << "Annotating pointer as raw. Inst: ";
+        (*miBegin)->dump();
+      }
+      AMDILAS::InstrResEnc curRes;
+      getAsmPrinterFlags(*miBegin, curRes);
+      if (!curRes.bits.ConflictPtr) {
+        assert(!curRes.bits.ByteStore
+            && "Found an instruction that is marked as "
+            "raw but has a byte store bit set!");
+      } else {
+        // Conflict pointers may carry a stale byte store bit; clear it.
+        curRes.bits.ByteStore = 0;
+      }
+      if (STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)
+          && PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) {
+        // If hardware constant mem is enabled, then we need to
+        // get the constant pointer CB number and use that to specify
+        // the resource ID.
+        AMDILGlobalManager *GM = STM->getGlobalManager();
+        const StringRef funcName = (*miBegin)->getParent()->getParent()
+          ->getFunction()->getName();
+        if (GM->isKernel(funcName)) {
+          const kernel &krnl = GM->getKernel(funcName);
+          curRes.bits.ResourceID = GM->getConstPtrCB(krnl,
+              (*siBegin)->getName());
+          curRes.bits.HardwareInst = 1;
+        } else {
+          curRes.bits.ResourceID = STM->device()
+            ->getResourceID(AMDILDevice::CONSTANT_ID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::LocalMem)
+          && PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) {
+        // If hardware local mem is enabled, get the local mem ID from
+        // the device to use as the ResourceID
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::LDS_ID);
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin)) {
+          assert(curRes.bits.ResourceID && "Atomic resource ID "
+              "cannot be zero!");
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::RegionMem)
+          && PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) {
+        // If hardware region mem is enabled, get the GDS mem ID from
+        // the device to use as the ResourceID
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::GDS_ID);
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin)) {
+          assert(curRes.bits.ResourceID && "Atomic resource ID "
+              "cannot be zero!");
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+        }
+      } else if (STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)
+          && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+        curRes.bits.ResourceID = STM->device()
+          ->getResourceID(AMDILDevice::SCRATCH_ID);
+      } else if (!STM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+        // If multi UAV is not enabled, the resource ID is either the
+        // raw UAV ID, the number of available write images, or the
+        // arena UAV ID as a fallback.
+        if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) >
+            STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+          curRes.bits.ResourceID = STM->device()
+            ->getResourceID(AMDILDevice::RAW_UAV_ID);
+        } else if (numWriteImages != OPENCL_MAX_WRITE_IMAGES) {
+          if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID)
+              < numWriteImages) {
+            curRes.bits.ResourceID = numWriteImages;
+          } else {
+            curRes.bits.ResourceID = STM->device()
+              ->getResourceID(AMDILDevice::RAW_UAV_ID);
+          }
+        } else {
+          if (mDebug) {
+            dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+            (*miBegin)->print(dbgs());
+          }
+          curRes.bits.ByteStore = 1;
+          curRes.bits.ResourceID = STM->device()
+            ->getResourceID(AMDILDevice::ARENA_UAV_ID);
+        }
+        if (isAtomicInst(TM.getInstrInfo(), *miBegin)) {
+          (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+            .setImm(curRes.bits.ResourceID);
+          if (curRes.bits.ResourceID
+              == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+            assert(0 && "Found an atomic instruction that has "
+                "an arena uav id!");
+          }
+        }
+        KM->setUAVID(*siBegin, curRes.bits.ResourceID);
+        if (!mMFI) {
+          mMFI = (*miBegin)->getParent()->getParent()
+            ->getInfo<AMDILMachineFunctionInfo>();
+        }
+        mMFI->uav_insert(curRes.bits.ResourceID);
+      }
+      setAsmPrinterFlags(*miBegin, curRes);
+    }
+  }
+
+}
+
+void
+annotateCacheableInstrs(
+    TargetMachine &TM,
+    CacheableInstrSet &cacheableSet,
+    bool mDebug)
+{
+  const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager();
+
+  CacheableInstrSet::iterator miBegin, miEnd;
+
+  for (miBegin = cacheableSet.begin(),
+      miEnd = cacheableSet.end();
+      miBegin != miEnd; ++miBegin) {
+    if (mDebug) {
+      dbgs() << "Annotating instr as cacheable. Inst: ";
+      (*miBegin)->dump();
+    }
+    AMDILAS::InstrResEnc curRes;
+    getAsmPrinterFlags(*miBegin, curRes);
+    // If UAV11 is enabled, then we can enable cached reads.
+    if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) {
+      curRes.bits.CacheableRead = 1;
+      curRes.bits.ResourceID = 11;
+      setAsmPrinterFlags(*miBegin, curRes);
+    }
+  }
+}
+
+// Annotate the instructions along various pointer paths. The paths that
+// are handled are the raw, byte and cacheable pointer paths.
+static void
+annotatePtrPath(
+    TargetMachine &TM,
+    PtrIMap &PtrToInstMap,
+    RawSet &rawPtrs,
+    ByteSet &bytePtrs,
+    CacheableSet &cacheablePtrs,
+    uint32_t numWriteImages,
+    bool mDebug
+    )
+{
+  if (PtrToInstMap.empty()) {
+    return;
+  }
+  // First we can check the cacheable pointers
+  annotateCacheablePtrs(TM, PtrToInstMap, cacheablePtrs,
+      bytePtrs, numWriteImages, mDebug);
+
+  // Next we annotate the byte pointers
+  annotateBytePtrs(TM, PtrToInstMap, bytePtrs, rawPtrs, mDebug);
+
+  // Next we annotate the raw pointers
+  annotateRawPtrs(TM, PtrToInstMap, rawPtrs, bytePtrs,
+      numWriteImages, mDebug);
+}
+// Allocate MultiUAV pointer IDs for the raw/conflict pointers.
+static void
+allocateMultiUAVPointers(
+    MachineFunction &MF,
+    const AMDILTargetMachine *ATM,
+    PtrIMap &PtrToInstMap,
+    RawSet &rawPtrs,
+    ConflictSet &conflictPtrs,
+    CacheableSet &cacheablePtrs,
+    uint32_t numWriteImages,
+    bool mDebug)
+{
+  if (PtrToInstMap.empty()) {
+    return;
+  }
+  AMDILMachineFunctionInfo *mMFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  uint32_t curUAV = numWriteImages;
+  bool increment = true;
+  const AMDILSubtarget *STM
+    = ATM->getSubtargetImpl();
+  // If the RAW_UAV_ID is a value that is larger than the max number of write
+  // images, then we use that UAV ID.
+  if (numWriteImages >= OPENCL_MAX_WRITE_IMAGES) {
+    curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+    increment = false;
+  }
+  AMDILKernelManager *KM = STM->getKernelManager();
+  PtrSet::iterator siBegin, siEnd;
+  std::vector<MachineInstr*>::iterator miBegin, miEnd;
+  // First let's handle the raw pointers.
+  for (siBegin = rawPtrs.begin(), siEnd = rawPtrs.end();
+      siBegin != siEnd; ++siBegin) {
+    assert((*siBegin)->getType()->isPointerTy() && "We must be a pointer type "
+        "to be processed at this point!");
+    const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType());
+    if (conflictPtrs.count(*siBegin) || !PT) {
+      continue;
+    }
+    // We only want to process global address space pointers
+    if (PT->getAddressSpace() != AMDILAS::GLOBAL_ADDRESS) {
+      if ((PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS
+            && STM->device()->usesSoftware(AMDILDeviceInfo::LocalMem))
+          || (PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS
+            && STM->device()->usesSoftware(AMDILDeviceInfo::ConstantMem))
+          || (PT->getAddressSpace() == AMDILAS::REGION_ADDRESS
+            && STM->device()->usesSoftware(AMDILDeviceInfo::RegionMem))) {
+        // If we are using software emulated hardware features, then
+        // we need to specify that they use the raw uav and not
+        // zero-copy uav. The easiest way to do this is to assume they
+        // conflict with another pointer. Any pointer that conflicts
+        // with another pointer is assigned to the raw uav or the
+        // arena uav if no raw uav exists.
+        // PT was already verified to be non-null above.
+        conflictPtrs.insert(*siBegin);
+      }
+      if (PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+        if (STM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem)) {
+          // PT was already verified to be non-null above.
+          conflictPtrs.insert(*siBegin);
+        } else {
+          if (mDebug) {
+            dbgs() << "Scratch Pointer '" << (*siBegin)->getName()
+              << "' being assigned uav "<<
+              STM->device()->getResourceID(AMDILDevice::SCRATCH_ID) << "\n";
+          }
+          for (miBegin = PtrToInstMap[*siBegin].begin(),
+              miEnd = PtrToInstMap[*siBegin].end();
+              miBegin != miEnd; ++miBegin) {
+            AMDILAS::InstrResEnc curRes;
+            getAsmPrinterFlags(*miBegin, curRes);
+            curRes.bits.ResourceID = STM->device()
+              ->getResourceID(AMDILDevice::SCRATCH_ID);
+            if (mDebug) {
+              dbgs() << "Updated instruction to bitmask ";
+              dbgs().write_hex(curRes.u16all);
+              dbgs() << " with ResID " << curRes.bits.ResourceID;
+              dbgs() << ". Inst: ";
+              (*miBegin)->dump();
+            }
+            setAsmPrinterFlags((*miBegin), curRes);
+            KM->setUAVID(*siBegin, curRes.bits.ResourceID);
+            mMFI->uav_insert(curRes.bits.ResourceID);
+          }
+        }
+      }
+      continue;
+    }
+    // If more than just UAV 11 is cacheable, then we can remove
+    // this check.
+    if (cacheablePtrs.count(*siBegin)) {
+      if (mDebug) {
+        dbgs() << "Raw Pointer '" << (*siBegin)->getName()
+          << "' is cacheable, not allocating a multi-uav for it!\n";
+      }
+      continue;
+    }
+    if (mDebug) {
+      dbgs() << "Raw Pointer '" << (*siBegin)->getName()
+        << "' being assigned uav " << curUAV << "\n";
+    }
+    if (PtrToInstMap[*siBegin].empty()) {
+      KM->setUAVID(*siBegin, curUAV);
+      mMFI->uav_insert(curUAV);
+    }
+    // For all instructions here, we are going to set the new UAV to the curUAV
+    // number and not the value that it currently is set to.
+    for (miBegin = PtrToInstMap[*siBegin].begin(),
+        miEnd = PtrToInstMap[*siBegin].end();
+        miBegin != miEnd; ++miBegin) {
+      AMDILAS::InstrResEnc curRes;
+      getAsmPrinterFlags(*miBegin, curRes);
+      curRes.bits.ResourceID = curUAV;
+      if (isAtomicInst(ATM->getInstrInfo(), *miBegin)) {
+        (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+          .setImm(curRes.bits.ResourceID);
+        if (curRes.bits.ResourceID
+            == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+          assert(0 && "Found an atomic instruction that has "
+              "an arena uav id!");
+        }
+      }
+      if (curUAV == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+        if (mDebug) {
+          dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+          (*miBegin)->print(dbgs());
+        }
+        curRes.bits.ByteStore = 1;
+        curRes.bits.CacheableRead = 0;
+      }
+      if (mDebug) {
+        dbgs() << "Updated instruction to bitmask ";
+        dbgs().write_hex(curRes.u16all);
+        dbgs() << " with ResID " << curRes.bits.ResourceID;
+        dbgs() << ". Inst: ";
+        (*miBegin)->dump();
+      }
+      setAsmPrinterFlags(*miBegin, curRes);
+      KM->setUAVID(*siBegin, curRes.bits.ResourceID);
+      mMFI->uav_insert(curRes.bits.ResourceID);
+    }
+    // If we make it here, we can increment the uav counter if we are less
+    // than the max write image count. Otherwise we set it to the default
+    // UAV and leave it.
+    if (increment && curUAV < (OPENCL_MAX_WRITE_IMAGES - 1)) {
+      ++curUAV;
+    } else {
+      curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+      increment = false;
+    }
+  }
+  if (numWriteImages == OPENCL_MAX_WRITE_IMAGES) {
+    curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID);
+  }
+  // Now let's handle the conflict pointers
+  for (siBegin = conflictPtrs.begin(), siEnd = conflictPtrs.end();
+      siBegin != siEnd; ++siBegin) {
+    assert((*siBegin)->getType()->isPointerTy() && "We must be a pointer type "
+        "to be processed at this point!");
+    const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType());
+    // We only want to process global address space pointers
+    if (!PT || PT->getAddressSpace() != AMDILAS::GLOBAL_ADDRESS) {
+      continue;
+    }
+    if (mDebug) {
+      dbgs() << "Conflict Pointer '" << (*siBegin)->getName()
+        << "' being assigned uav " << curUAV << "\n";
+    }
+    if (PtrToInstMap[*siBegin].empty()) {
+      KM->setUAVID(*siBegin, curUAV);
+      mMFI->uav_insert(curUAV);
+    }
+    for (miBegin = PtrToInstMap[*siBegin].begin(),
+        miEnd = PtrToInstMap[*siBegin].end();
+        miBegin != miEnd; ++miBegin) {
+      AMDILAS::InstrResEnc curRes;
+      getAsmPrinterFlags(*miBegin, curRes);
+      curRes.bits.ResourceID = curUAV;
+      if (isAtomicInst(ATM->getInstrInfo(), *miBegin)) {
+        (*miBegin)->getOperand((*miBegin)->getNumOperands()-1)
+          .setImm(curRes.bits.ResourceID);
+        if (curRes.bits.ResourceID
+            == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+          assert(0 && "Found an atomic instruction that has "
+              "an arena uav id!");
+        }
+      }
+      if (curUAV == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) {
+        if (mDebug) {
+          dbgs() << __LINE__ << ": Setting byte store bit on instruction: ";
+          (*miBegin)->print(dbgs());
+        }
+        curRes.bits.ByteStore = 1;
+      }
+      if (mDebug) {
+        dbgs() << "Updated instruction to bitmask ";
+        dbgs().write_hex(curRes.u16all);
+        dbgs() << " with ResID " << curRes.bits.ResourceID;
+        dbgs() << ". Inst: ";
+        (*miBegin)->dump();
+      }
+      setAsmPrinterFlags(*miBegin, curRes);
+      KM->setUAVID(*siBegin, curRes.bits.ResourceID);
+      mMFI->uav_insert(curRes.bits.ResourceID);
+    }
+  }
+}
+// The first thing we do is allocate the default ID for each
+// load/store/atomic instruction so that it has a valid resource.
+// Everything else after this is just an optimization to more
+// efficiently allocate resource IDs.
+void
+allocateDefaultIDs(
+    const AMDILTargetMachine *ATM,
+    MachineFunction &MF,
+    bool mDebug)
+{
+  for (MachineFunction::iterator mfBegin = MF.begin(),
+      mfEnd = MF.end(); mfBegin != mfEnd; ++mfBegin) {
+    MachineBasicBlock *MB = mfBegin;
+    for (MachineBasicBlock::iterator mbb = MB->begin(), mbe = MB->end();
+        mbb != mbe; ++mbb) {
+      MachineInstr *MI = mbb;
+      if (isLoadInst(ATM->getInstrInfo(), MI) 
+          || isStoreInst(ATM->getInstrInfo(), MI)
+          || isAtomicInst(ATM->getInstrInfo(), MI)) {
+        AMDILAS::InstrResEnc curRes;
+        getAsmPrinterFlags(MI, curRes);
+        allocateDefaultID(ATM, curRes, MI, mDebug);
+      } 
+    }
+  }
+}
+
+  bool
+AMDILEGPointerManager::runOnMachineFunction(MachineFunction &MF)
+{
+  bool changed = false;
+  const AMDILTargetMachine *ATM
+    = reinterpret_cast<const AMDILTargetMachine*>(&TM);
+  AMDILMachineFunctionInfo *mMFI = 
+    MF.getInfo<AMDILMachineFunctionInfo>();
+  if (mDebug) {
+    dbgs() << getPassName() << "\n";
+    dbgs() << MF.getFunction()->getName() << "\n";
+    MF.dump();
+  }
+  // Start out by allocating the default ID's to all instructions in the
+  // function.
+  allocateDefaultIDs(ATM, MF, mDebug);
+
+  // Maps each base pointer to the set of instructions that
+  // are part of its pointer chain.
+  PtrIMap PtrToInstMap;
+
+  // Maps each load, store or pointer-conflict instruction to
+  // the set of all base pointer values that reference it.
+  InstPMap InstToPtrMap;
+
+  // In order to track across stack entries, we need a map between a 
+  // frame index and a pointer. That way when we load from a frame
+  // index, we know what pointer was stored to the frame index.
+  FIPMap FIToPtrMap;
+
+  // Set of all the pointers that are byte pointers. Byte pointers
+  // are required to have their instructions go to the arena.
+  ByteSet bytePtrs;
+
+  // Set of all the pointers that are cacheable. All of the cache pointers
+  // are required to go to a raw uav and cannot go to arena.
+  CacheableSet cacheablePtrs;
+
+  // Set of all the pointers that go into a raw buffer. A pointer can
+  // exist in either rawPtrs or bytePtrs but not both.
+  RawSet rawPtrs;
+
+  // Set of all the pointers that end up having a conflicting instruction
+  // somewhere in the pointer path.
+  ConflictSet conflictPtrs;
+
+  // Set of all pointers that are images
+  ImageSet images;
+
+  // Set of all pointers that are counters
+  AppendSet counters;
+
+  // Set of all pointers that load from a constant pool
+  CPoolSet cpool;
+
+  // Mapping from BB to information about the cacheability of the
+  // global load instructions in it.
+  MBBCacheableMap bbCacheable;
+
+  // A set of load instructions that are cacheable 
+  // even if all the load instructions of the ptr are not. 
+  CacheableInstrSet cacheableSet;
+
+  // The lookup table holds all of the registers that
+  // are used as we assign pointers values to them.
+  // If two pointers collide on the lookup table, then
+  // we assign them to the same UAV. If one of the
+  // pointers is byte addressable, then we assign
+  // them to arena, otherwise we assign them to raw.
+  RVPVec lookupTable;
+
+  // First we need to go through all of the arguments and assign the
+  // live in registers to the lookup table and the pointer mapping.
+  uint32_t numWriteImages = parseArguments(MF, lookupTable, ATM, 
+      cacheablePtrs, images, counters, mDebug);
+
+  // Let's do some error checking on the results of the parsing.
+  if (counters.size() > OPENCL_MAX_NUM_ATOMIC_COUNTERS) {
+    mMFI->addErrorMsg(
+        amd::CompilerErrorMessage[INSUFFICIENT_COUNTER_RESOURCES]);
+  }
+  if (numWriteImages > OPENCL_MAX_WRITE_IMAGES
+      || (images.size() - numWriteImages > OPENCL_MAX_READ_IMAGES)) {
+    mMFI->addErrorMsg(
+        amd::CompilerErrorMessage[INSUFFICIENT_IMAGE_RESOURCES]);
+  }
+
+  // Now let's parse all of the instructions and update our
+  // lookup tables.
+  parseFunction(this, ATM, MF, InstToPtrMap, PtrToInstMap,
+      FIToPtrMap, lookupTable, bytePtrs, conflictPtrs, cpool, 
+      bbCacheable, mDebug);
+
+  // We need to go over our pointer map and find all the conflicting
+  // pointers that have byte stores and put them in the bytePtr map.
+  // All conflicting pointers that don't have byte stores go into
+  // the rawPtr map.
+  detectConflictingPointers(ATM, InstToPtrMap, bytePtrs, rawPtrs,
+      conflictPtrs, mDebug);
+
+  // The next step is to detect whether the pointer should be added to
+  // the fully cacheable set or not. A pointer is marked as cacheable if
+  // no store instruction exists.
+  detectFullyCacheablePointers(ATM, PtrToInstMap, rawPtrs,
+      cacheablePtrs, conflictPtrs, mDebug);
+
+  // Disable partially cacheable for now when multiUAV is on.
+  // SC versions before SC139 have a bug that generates incorrect
+  // addressing for some cached accesses.
+  if (!ATM->getSubtargetImpl()
+      ->device()->isSupported(AMDILDeviceInfo::MultiUAV) &&
+      ATM->getSubtargetImpl()->calVersion() >= CAL_VERSION_SC_139) {
+    // Now we take the set of loads that have no reachable stores and
+    // create a list of additional instructions (those that aren't already
+    // in a cacheablePtr set) that are safe to mark as cacheable. 
+    detectCacheableInstrs(bbCacheable, InstToPtrMap, cacheablePtrs,
+        bytePtrs, cacheableSet, mDebug);
+
+    // Annotate the additional instructions computed above as cacheable.
+    // Note that this should not touch any instructions annotated in
+    // annotatePtrPath.
+    annotateCacheableInstrs(TM, cacheableSet, mDebug);
+  }
+
+  // Now that we have detected everything we need to detect, let's go
+  // through and annotate the instructions along the pointer path for
+  // each of the various pointer types.
+  annotatePtrPath(TM, PtrToInstMap, rawPtrs, bytePtrs,
+      cacheablePtrs, numWriteImages, mDebug);
+
+  // Annotate the atomic counter path if any exists.
+  annotateAppendPtrs(TM, PtrToInstMap, counters, mDebug);
+
+  // If we support MultiUAV, then we need to determine how
+  // many write images exist so that way we know how many UAV are
+  // left to allocate to buffers.
+  if (ATM->getSubtargetImpl()
+      ->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
+    // We now have (OPENCL_MAX_WRITE_IMAGES - numPtrs) buffers open for
+    // multi-uav allocation.
+    allocateMultiUAVPointers(MF, ATM, PtrToInstMap, rawPtrs,
+        conflictPtrs, cacheablePtrs, numWriteImages, mDebug);
+  }
+
+  // The last step is to detect if we have any aliased constant pool operations.
+  // This is not likely, but does happen on occasion with double precision 
+  // operations.
+  detectAliasedCPoolOps(TM, cpool, mDebug);
+  if (mDebug) {
+    dumpPointers(bytePtrs, "Byte Store Ptrs");
+    dumpPointers(rawPtrs, "Raw Ptrs");
+    dumpPointers(cacheablePtrs, "Cache Load Ptrs");
+    dumpPointers(counters, "Atomic Counters");
+    dumpPointers(images, "Images");
+  }
+  return changed;
+}
+
+// The default pointer manager just assigns the default ID's to
+// each load/store instruction and does nothing else. This is
+// the pointer manager for the 7XX series of cards.
+  bool
+AMDILPointerManager::runOnMachineFunction(MachineFunction &MF)
+{
+  bool changed = false;
+  const AMDILTargetMachine *ATM
+    = reinterpret_cast<const AMDILTargetMachine*>(&TM);
+  if (mDebug) {
+    dbgs() << getPassName() << "\n";
+    dbgs() << MF.getFunction()->getName() << "\n";
+    MF.dump();
+  }
+  // On the 7XX we don't have to do any special processing, so we 
+  // can just allocate the default ID and be done with it.
+  allocateDefaultIDs(ATM, MF, mDebug);
+  return changed;
+}
diff --git a/src/gallium/drivers/radeon/AMDILPointerManager.h b/src/gallium/drivers/radeon/AMDILPointerManager.h
new file mode 100644 (file)
index 0000000..2c471fb
--- /dev/null
@@ -0,0 +1,209 @@
+//===-------- AMDILPointerManager.h - Manage Pointers for HW ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// The AMDIL Pointer Manager is a class that does all the checking for
+// different pointer characteristics. Pointers have attributes that need
+// to be attached to them in order to correctly codegen them efficiently.
+// This class will analyze the pointers of a function and then traverse the uses
+// of the pointers and determine if a pointer can be cached, should belong in
+// the arena, and what UAV it should belong to. There are separate classes for
+// each unique generation of devices. This pass only works in SSA form.
+//===----------------------------------------------------------------------===//
+#ifndef _AMDIL_POINTER_MANAGER_H_
+#define _AMDIL_POINTER_MANAGER_H_
+#undef DEBUG_TYPE
+#undef DEBUGME
+#define DEBUG_TYPE "PointerManager"
+#if !defined(NDEBUG)
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME (false)
+#endif
+#include "AMDIL.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <list>
+#include <map>
+#include <queue>
+#include <set>
+
+namespace llvm {
+  class Value;
+  class MachineBasicBlock;
+  // Typedef the multiple different set types so that it is
+  // easier to read what each set is supposed to handle. This
+  // also makes it easier to track which set goes to which
+  // argument in a function call.
+  typedef std::set<const Value*> PtrSet;
+
+  // A Byte set is the set of all base pointers that must
+  // be allocated to the arena path.
+  typedef PtrSet ByteSet;
+
+  // A Raw set is the set of all base pointers that can be
+  // allocated to the raw path.
+  typedef PtrSet RawSet;
+
+  // A cacheable set is the set of all base pointers that
+  // are deemed cacheable based on annotations or
+  // compiler options.
+  typedef PtrSet CacheableSet;
+
+  // A conflict set is a set of all base pointers whose 
+  // use/def chains conflict with another base pointer.
+  typedef PtrSet ConflictSet;
+
+  // An image set is a set of all read/write only image pointers.
+  typedef PtrSet ImageSet;
+
+  // An append set is a set of atomic counter base pointers
+  typedef std::vector<const Value*> AppendSet;
+
+  // A ConstantSet is a set of constant pool instructions
+  typedef std::set<MachineInstr*> CPoolSet;
+
+  // A CacheableInstrSet is a set of instructions that are cacheable
+  // even if the pointer is not generally cacheable.
+  typedef std::set<MachineInstr*> CacheableInstrSet;
+
+  // A pair that maps a virtual register to the equivalent base
+  // pointer value that it was derived from.
+  typedef std::pair<unsigned, const Value*> RegValPair;
+
+  // A map that maps between the base pointer value and an array
+  // of instructions that are part of the pointer chain. A pointer
+  // chain is a recursive def/use chain of all instructions that don't
+  // store data to memory unless the pointer is the data being stored.
+  typedef std::map<const Value*, std::vector<MachineInstr*> > PtrIMap;
+
+  // A map that holds a set of all base pointers that are used in a machine
+  // instruction. This helps to detect when conflict pointers are found
+  // such as when pointer subtraction occurs.
+  typedef std::map<MachineInstr*, PtrSet> InstPMap;
+
+  // A map that holds the frame index to RegValPair so that writes of 
+  // pointers to the stack can be tracked.
+  typedef std::map<unsigned, RegValPair > FIPMap;
+
+  // A map that holds all of the register to base pointer
+  // mappings for a given function.
+  typedef std::map<unsigned, RegValPair> RVPVec;
+
+
+
+  // The default pointer manager. This handles pointer 
+  // resource allocation for default IDs only.
+  // There is no special processing.
+  class AMDILPointerManager : public MachineFunctionPass
+  {
+    public:
+      AMDILPointerManager(
+          TargetMachine &tm
+          AMDIL_OPT_LEVEL_DECL);
+      virtual ~AMDILPointerManager();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &F);
+      virtual void
+        getAnalysisUsage(AnalysisUsage &AU) const;
+      static char ID;
+    protected:
+      bool mDebug;
+    private:
+      TargetMachine &TM;
+  }; // class AMDILPointerManager
+
+  // The pointer manager for Evergreen and Northern Islands
+  // devices. This pointer manager allocates and tracks
+  // cached memory, arena resources, raw resources and
+  // whether multi-UAV is utilized or not.
+  class AMDILEGPointerManager : public AMDILPointerManager
+  {
+    public:
+      AMDILEGPointerManager(
+          TargetMachine &tm
+          AMDIL_OPT_LEVEL_DECL);
+      virtual ~AMDILEGPointerManager();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &F);
+    private:
+      TargetMachine &TM;
+  }; // class AMDILEGPointerManager
+
+  // Information related to the cacheability of instructions in a basic block.
+  // This is used during the parse phase of the pointer algorithm to track
+  // the reachability of stores within a basic block.
+  class BlockCacheableInfo {
+    public:
+      BlockCacheableInfo() :
+        mStoreReachesTop(false),
+        mStoreReachesExit(false),
+        mCacheableSet()
+    {}
+
+      bool storeReachesTop() const  { return mStoreReachesTop; }
+      bool storeReachesExit() const { return mStoreReachesExit; }
+      CacheableInstrSet::const_iterator 
+        cacheableBegin() const { return mCacheableSet.begin(); }
+      CacheableInstrSet::const_iterator 
+        cacheableEnd()   const { return mCacheableSet.end(); }
+
+      // Mark the block as having a global store that reaches it. This
+      // also sets the store-reaches-exit flag and clears the list
+      // of loads (since they are now reachable by a store).
+      bool setReachesTop() {
+        bool changedExit = !mStoreReachesExit;
+
+        if (!mStoreReachesTop)
+          mCacheableSet.clear();
+
+        mStoreReachesTop = true;
+        mStoreReachesExit = true;
+        return changedExit;
+      }
+
+      // Mark the block as having a store that reaches the exit of the 
+      // block.
+      void setReachesExit() {
+        mStoreReachesExit = true;
+      }
+
+      // If the top or the exit of the block are not marked as reachable
+      // by a store, add the load to the list of cacheable loads.
+      void addPossiblyCacheableInst(const TargetMachine * tm, MachineInstr *load) {
+        // By definition, if store reaches top, then store reaches exit.
+        // So, we only test for exit here.
+        // If we have a volatile load we cannot cache it.
+        if (mStoreReachesExit || isVolatileInst(tm->getInstrInfo(), load)) {
+          return;
+        }
+
+        mCacheableSet.insert(load);
+      }
+
+    private:
+      bool mStoreReachesTop; // Does a global store reach the top of this block?
+      bool mStoreReachesExit;// Does a global store reach the exit of this block?
+      // The set of loads in the block not reachable by a global store.
+      CacheableInstrSet mCacheableSet;
+  };
+  // Map from a MachineBasicBlock to its cacheable load info.
+  typedef std::map<MachineBasicBlock*, BlockCacheableInfo> MBBCacheableMap;
+} // end llvm namespace
+#endif // _AMDIL_POINTER_MANAGER_H_
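The store-reachability rules described in BlockCacheableInfo's comments can be modeled outside of LLVM. The following is a minimal stand-alone sketch of that bookkeeping; `Inst` and `BlockInfo` are illustrative stand-ins written for this note, not the actual pass types (the real code tracks `MachineInstr*` loads):

```cpp
#include <cassert>
#include <set>

// Minimal stand-in for a load instruction; the real pass uses MachineInstr*.
struct Inst { int id; };

// A simplified model of BlockCacheableInfo: a load is cacheable only if no
// global store reaches the exit of its block at the time the load is seen.
class BlockInfo {
public:
  bool storeReachesTop = false;
  bool storeReachesExit = false;
  std::set<Inst*> cacheable;

  // A store reaching the top of the block also reaches its exit, and it
  // invalidates every previously collected load. Returns true if the
  // exit flag changed, mirroring setReachesTop() in the header.
  bool setReachesTop() {
    bool changedExit = !storeReachesExit;
    if (!storeReachesTop)
      cacheable.clear();
    storeReachesTop = storeReachesExit = true;
    return changedExit;
  }

  void setReachesExit() { storeReachesExit = true; }

  // Volatile loads, or loads seen after a reaching store, are never cached.
  void addLoad(Inst *load, bool isVolatile) {
    if (storeReachesExit || isVolatile)
      return;
    cacheable.insert(load);
  }
};
```

Running the dataflow this way makes the invariant explicit: once a store reaches the block exit, no later load in that block is ever marked cacheable.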
diff --git a/src/gallium/drivers/radeon/AMDILPrintfConvert.cpp b/src/gallium/drivers/radeon/AMDILPrintfConvert.cpp
new file mode 100644 (file)
index 0000000..95614f4
--- /dev/null
@@ -0,0 +1,293 @@
+//===-- AMDILPrintfConvert.cpp - Printf Conversion pass --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "PrintfConvert"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILKernelManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILModuleInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+
+#include <cstdio>
+
+using namespace llvm;
+namespace
+{
+    class LLVM_LIBRARY_VISIBILITY AMDILPrintfConvert : public FunctionPass
+    {
+        public:
+            TargetMachine &TM;
+            static char ID;
+            AMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
+            ~AMDILPrintfConvert();
+            const char* getPassName() const;
+            bool runOnFunction(Function &F);
+            bool doInitialization(Module &M);
+            bool doFinalization(Module &M);
+            void getAnalysisUsage(AnalysisUsage &AU) const;
+
+        private:
+            bool expandPrintf(BasicBlock::iterator *bbb);
+            AMDILMachineFunctionInfo *mMFI;
+            AMDILKernelManager *mKM;
+            bool mChanged;
+            SmallVector<int64_t, DEFAULT_VEC_SLOTS> bVecMap;
+    };
+    char AMDILPrintfConvert::ID = 0;
+} // anonymous namespace
+
+namespace llvm
+{
+    FunctionPass*
+        createAMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+        {
+            return new AMDILPrintfConvert(tm AMDIL_OPT_LEVEL_VAR);
+        }
+} // llvm namespace
+AMDILPrintfConvert::AMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
+    : FunctionPass(ID), TM(tm)
+{
+}
+AMDILPrintfConvert::~AMDILPrintfConvert()
+{
+}
+    bool
+AMDILPrintfConvert::expandPrintf(BasicBlock::iterator *bbb)
+{
+    Instruction *inst = (*bbb);
+    CallInst *CI = dyn_cast<CallInst>(inst);
+    if (!CI) {
+        return false;
+    }
+    int num_ops = CI->getNumOperands();
+    if (!num_ops) {
+        return false;
+    }
+    if (CI->getOperand(num_ops - 1)->getName() != "printf") {
+        return false;
+    }
+
+    Function *mF = inst->getParent()->getParent();
+    uint64_t bytes = 0;
+    mChanged = true;
+    if (num_ops == 1) {
+        ++(*bbb);
+        Constant *newConst = ConstantInt::getSigned(CI->getType(), bytes);
+        CI->replaceAllUsesWith(newConst);
+        CI->eraseFromParent();
+        return mChanged;
+    }
+    // Deal with the string here
+    Value *op = CI->getOperand(0);
+    ConstantExpr *GEPinst = dyn_cast<ConstantExpr>(op);
+    if (GEPinst) {
+        GlobalVariable *GVar
+            = dyn_cast<GlobalVariable>(GEPinst->getOperand(0));
+        std::string str = "unknown";
+        if (GVar && GVar->hasInitializer()) {
+          ConstantDataArray *CA
+              = dyn_cast<ConstantDataArray>(GVar->getInitializer());
+          str = (CA && CA->isString() ? CA->getAsString() : "unknown");
+        }
+        uint64_t id = (uint64_t)mMFI->addPrintfString(str, 
+            getAnalysis<MachineFunctionAnalysis>().getMF()
+            .getMMI().getObjFileInfo<AMDILModuleInfo>().get_printf_offset());
+        std::string name = "___dumpStringID";
+        Function *nF = NULL;
+        std::vector<Type*> types;
+        types.push_back(Type::getInt32Ty(mF->getContext()));
+        nF = mF->getParent()->getFunction(name);
+        if (!nF) {
+            nF = Function::Create(
+                    FunctionType::get(
+                        Type::getVoidTy(mF->getContext()), types, false),
+                    GlobalValue::ExternalLinkage,
+                    name, mF->getParent());
+        }
+        Constant *C = ConstantInt::get(
+                Type::getInt32Ty(mF->getContext()), id, false);
+        CallInst *nCI = CallInst::Create(nF, C);
+        nCI->insertBefore(CI);
+        bytes = strlen(str.data());
+        for (uint32_t x = 1, y = num_ops - 1; x < y; ++x) {
+            op = CI->getOperand(x);
+            Type *oType = op->getType();
+            uint32_t eleCount = getNumElements(oType);
+            uint32_t eleSize = (uint32_t)GET_SCALAR_SIZE(oType);
+            if (!eleSize) {
+              // Default size is 32bits.
+              eleSize = 32;
+            }
+            if (!eleCount) {
+              // Default num elements is 1.
+              eleCount = 1;
+            }
+            uint32_t totalSize = eleCount * eleSize;
+            mMFI->addPrintfOperand(str, (x - 1),
+                    (uint32_t)totalSize);
+        }
+    }
+    for (uint32_t x = 1, y = num_ops - 1; x < y; ++x) {
+        op = CI->getOperand(x);
+        Type *oType = op->getType();
+        if (oType->isFPOrFPVectorTy()
+                && (oType->getTypeID() != Type::VectorTyID)) {
+            Type *iType = NULL;
+            if (oType->isFloatTy()) {
+                iType = dyn_cast<Type>(
+                        Type::getInt32Ty(oType->getContext()));
+            } else {
+                iType = dyn_cast<Type>(
+                        Type::getInt64Ty(oType->getContext()));
+            }
+            op = new BitCastInst(op, iType, "printfBitCast", CI);
+        } else if (oType->getTypeID() == Type::VectorTyID) {
+            Type *iType = NULL;
+            uint32_t eleCount = getNumElements(oType);
+            uint32_t eleSize = (uint32_t)GET_SCALAR_SIZE(oType);
+            uint32_t totalSize = eleCount * eleSize;
+            switch (eleSize) {
+                default:
+                    eleCount = totalSize / 64;
+                    iType = dyn_cast<Type>(
+                            Type::getInt64Ty(oType->getContext()));
+                    break;
+                case 8:
+                    if (eleCount >= 8) {
+                        eleCount = totalSize / 64;
+                        iType = dyn_cast<Type>(
+                                Type::getInt64Ty(oType->getContext()));
+                    } else if (eleCount >= 4) {
+                        eleCount = 1;
+                        iType = dyn_cast<Type>(
+                                Type::getInt32Ty(oType->getContext()));
+                    } else {
+                        eleCount = 1;
+                        iType = dyn_cast<Type>(
+                                Type::getInt16Ty(oType->getContext()));
+                    }
+                    break;
+                case 16:
+                    if (eleCount >= 4) {
+                        eleCount = totalSize / 64;
+                        iType = dyn_cast<Type>(
+                                Type::getInt64Ty(oType->getContext()));
+                    } else {
+                        eleCount = 1;
+                        iType = dyn_cast<Type>(
+                                Type::getInt32Ty(oType->getContext()));
+                    }
+                    break;
+            }
+            if (eleCount > 1) {
+                iType = dyn_cast<Type>(
+                        VectorType::get(iType, eleCount));
+            }
+            op = new BitCastInst(op, iType, "printfBitCast", CI);
+        }
+        char buffer[256];
+        uint32_t size = (uint32_t)GET_SCALAR_SIZE(oType);
+        if (size) {
+            sprintf(buffer, "___dumpBytes_v%db%u",
+                    1,
+                    (uint32_t)getNumElements(oType) * (uint32_t)size);
+        } else {
+            const PointerType *PT = dyn_cast<PointerType>(oType);
+            if (PT && PT->getAddressSpace() == 0 &&
+                    GET_SCALAR_SIZE(PT->getContainedType(0)) == 8
+                    && getNumElements(PT->getContainedType(0)) == 1) {
+                op = new BitCastInst(op,
+                        Type::getInt8PtrTy(oType->getContext(),
+                            AMDILAS::CONSTANT_ADDRESS),
+                        "printfPtrCast", CI);
+
+                sprintf(buffer, "___dumpBytes_v%dbs", 1);
+            } else {
+                op = new PtrToIntInst(op,
+                        Type::getInt32Ty(oType->getContext()),
+                        "printfPtrCast", CI);
+                sprintf(buffer, "___dumpBytes_v1b32");
+            }
+        }
+        std::vector<Type*> types;
+        types.push_back(op->getType());
+        std::string name = buffer;
+        Function *nF = NULL;
+        nF = mF->getParent()->getFunction(name);
+        if (!nF) {
+            nF = Function::Create(
+                    FunctionType::get(
+                        Type::getVoidTy(mF->getContext()), types, false),
+                    GlobalValue::ExternalLinkage,
+                    name, mF->getParent());
+        }
+        CallInst *nCI = CallInst::Create(nF, op);
+        nCI->insertBefore(CI);
+        bytes += (size - 4);
+    }
+    ++(*bbb);
+    Constant *newConst = ConstantInt::getSigned(CI->getType(), bytes);
+    CI->replaceAllUsesWith(newConst);
+    CI->eraseFromParent();
+    return mChanged;
+}
+    bool
+AMDILPrintfConvert::runOnFunction(Function &MF)
+{
+    mChanged = false;
+    mKM = TM.getSubtarget<AMDILSubtarget>().getKernelManager();
+    mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
+          .getInfo<AMDILMachineFunctionInfo>();
+    bVecMap.clear();
+    safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+            std::bind1st(
+                std::mem_fun(
+                    &AMDILPrintfConvert::expandPrintf), this));
+    return mChanged;
+}
+
+const char*
+AMDILPrintfConvert::getPassName() const
+{
+    return "AMDIL Printf Conversion Pass";
+}
+bool
+AMDILPrintfConvert::doInitialization(Module &M)
+{
+    return false;
+}
+
+bool
+AMDILPrintfConvert::doFinalization(Module &M)
+{
+    return false;
+}
+
+void
+AMDILPrintfConvert::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
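The per-operand sizing in expandPrintf (fall back to 32 bits when the scalar size is unknown and to one element when the element count is unknown, then multiply) reduces to a small pure function. `printfOperandBits` is a hypothetical helper written for illustration only; it is not part of the pass:

```cpp
#include <cstdint>

// Size a printf operand the way expandPrintf does: scalar size in bits times
// element count. GET_SCALAR_SIZE and getNumElements return 0 for unknown
// types in the real pass, hence the 32-bit / 1-element fallbacks.
uint32_t printfOperandBits(uint32_t scalarBits, uint32_t elemCount) {
  if (scalarBits == 0)
    scalarBits = 32; // default size is 32 bits
  if (elemCount == 0)
    elemCount = 1;   // default element count is 1
  return scalarBits * elemCount;
}
```

For example, a `v4i8` operand sizes to 32 bits total, which is why the pass then rebuckets small vectors into i16/i32/i64 chunks before emitting the `___dumpBytes` call.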
diff --git a/src/gallium/drivers/radeon/AMDILProfiles.td b/src/gallium/drivers/radeon/AMDILProfiles.td
new file mode 100644 (file)
index 0000000..60435a8
--- /dev/null
@@ -0,0 +1,174 @@
+//===- AMDILProfiles.td - AMD IL Profiles ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// These are used for custom selection dag type profiles
+
+//===----------------------------------------------------------------------===//
+// Custom Selection DAG Type Profiles
+//===----------------------------------------------------------------------===//
+// SDTCisDP - The specified operand has double type
+// Tablegen needs to be hacked to get this constraint to work
+//class SDTCisDP<int OpNum> : SDTypeConstraint<OpNum>;
+
+//===----------------------------------------------------------------------===//
+// Generic Profile Types
+//===----------------------------------------------------------------------===//
+
+def SDTIL_GenUnaryOp : SDTypeProfile<1, 1, [
+    SDTCisSameAs<0, 1>
+    ]>;
+def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
+    ]>;
+def SDTIL_GenCMovLog : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisInt<1>
+    ]>;
+def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
+    SDTCisEltOfVec<1, 0>
+    ]>;
+
+def SDTIL_GenVecExtract : SDTypeProfile<1, 2, [
+    SDTCisEltOfVec<0, 1>, SDTCisVT<2, i32>
+    ]>;
+
+def SDTIL_GenVecInsert : SDTypeProfile<1, 4, [
+    SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>,
+    SDTCisVT<3, i32>, SDTCisVT<4, i32>
+    ]>;
+
+def SDTIL_GenVecShuffle : SDTypeProfile <1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisVT<2, i32>
+    ]>;
+
+def SDTIL_GenVecConcat : SDTypeProfile <1, 2, [
+    SDTCisSameAs<1, 2>
+    ]>;
+//===----------------------------------------------------------------------===//
+// Conversion Profile Types
+//===----------------------------------------------------------------------===//
+def SDTIL_DPToFPOp : SDTypeProfile<1, 1, [
+    SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>
+    ]>; // d2f
+
+def SDTIL_AnyToInt : SDTypeProfile<1, 1, [
+    SDTCisInt<0>
+    ]>;
+def SDTIL_IntToAny : SDTypeProfile<1, 1, [
+    SDTCisInt<1>
+    ]>;
+def SDTIL_GenBitConv : SDTypeProfile<1, 1, []>;
+//===----------------------------------------------------------------------===//
+// Scalar Profile Types
+//===----------------------------------------------------------------------===//
+
+// Add instruction pattern to handle offsets of memory operations
+def SDTIL_AddAddrri: SDTypeProfile<1, 2, [
+    SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisSameAs<0, 2>
+    ]>;
+def SDTIL_AddAddrir : SDTypeProfile<1, 2, [
+    SDTCisInt<0>, SDTCisPtrTy<2>, SDTCisSameAs<0, 1>
+    ]>;
+
+def SDTIL_LCreate : SDTypeProfile<1, 2, [
+    SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_LCreate2 : SDTypeProfile<1, 2, [
+    SDTCisVT<0, v2i64>, SDTCisVT<1, v2i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_LComp : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisVT<1, i64>
+    ]>;
+def SDTIL_LComp2 : SDTypeProfile<1, 1, [
+    SDTCisVT<0, v2i32>, SDTCisVT<1, v2i64>
+    ]>;
+def SDTIL_DCreate : SDTypeProfile<1, 2, [
+    SDTCisVT<0, f64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_DComp : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisVT<1, f64>
+    ]>;
+def SDTIL_DCreate2 : SDTypeProfile<1, 2, [
+    SDTCisVT<0, v2f64>, SDTCisVT<1, v2i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_DComp2 : SDTypeProfile<1, 1, [
+    SDTCisVT<0, v2i32>, SDTCisVT<1, v2f64>
+    ]>;
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Profile for Normal Call
+def SDTIL_Call : SDTypeProfile<0, 1, [
+    SDTCisVT<0, i32>
+    ]>;
+// Branch instruction where second and third are basic blocks
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+// Comparison instruction
+def SDTIL_Cmp  : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 2>, SDTCisSameAs<2,3>, SDTCisVT<1, i32>
+    ]>;
+
+
+//===----------------------------------------------------------------------===//
+// Call Sequence Profiles
+//===----------------------------------------------------------------------===//
+def SDTIL_CallSeqStart  : SDCallSeqStart< [
+    SDTCisVT<0, i32>
+    ]>;
+def SDTIL_CallSeqEnd    : SDCallSeqEnd< [
+    SDTCisVT<0, i32>, SDTCisVT<1, i32>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Image Operation Profiles
+//===----------------------------------------------------------------------===//
+def SDTIL_ImageRead  : SDTypeProfile<1, 3, 
+    [SDTCisVT<0, v4i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, v4f32>]>;
+def SDTIL_ImageWrite : SDTypeProfile<0, 3,
+    [SDTCisPtrTy<0>, SDTCisVT<1, v2i32>, SDTCisVT<2, v4i32>]>;
+def SDTIL_ImageWrite3D : SDTypeProfile<0, 3,
+    [SDTCisPtrTy<0>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>]>;
+def SDTIL_ImageInfo  : SDTypeProfile<1, 1,
+    [SDTCisVT<0, v4i32>, SDTCisPtrTy<1>]>;
+//===----------------------------------------------------------------------===//
+// Atomic Operation Profiles
+//===----------------------------------------------------------------------===//
+def SDTIL_UniAtomNoRet : SDTypeProfile<0, 2, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>
+    ]>;
+def SDTIL_BinAtomNoRet : SDTypeProfile<0, 3, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>
+    ]>;
+def SDTIL_TriAtomNoRet : SDTypeProfile<0, 4, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>
+    ]>;
+def SDTIL_UniAtom : SDTypeProfile<1, 2, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>
+    ]>;
+def SDTIL_BinAtom : SDTypeProfile<1, 3, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, i32>
+    ]>;
+def SDTIL_TriAtom : SDTypeProfile<1, 4, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>,
+    SDTCisVT<3, i32>, SDTCisVT<4, i32>
+    ]>;
+
+def SDTIL_BinAtomFloat : SDTypeProfile<1, 3, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, f32>, SDTCisVT<3, f32>
+    ]>;
+def SDTIL_BinAtomNoRetFloat : SDTypeProfile<0, 3, [
+    SDTCisPtrTy<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>
+    ]>;
+
+def SDTIL_Append : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>
+    ]>;
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
new file mode 100644 (file)
index 0000000..5588233
--- /dev/null
@@ -0,0 +1,200 @@
+//===- AMDILRegisterInfo.cpp - AMDIL Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILRegisterInfo.h"
+#include "AMDIL.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDILRegisterInfo::AMDILRegisterInfo(AMDILTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDILGenRegisterInfo(0), // RA???
+  TM(tm), TII(tii)
+{
+  baseOffset = 0;
+  nextFuncOffset = 0;
+}
+
+const uint16_t*
+AMDILRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const
+{
+  static const uint16_t CalleeSavedRegs[] = { 0 };
+  // TODO: Does IL need to actually have any callee saved regs?
+  // I don't think we do since we can just use sequential registers
+  // Maybe this would be easier if every function call was inlined first
+  // and then there would be no callee issues to deal with
+  //TODO(getCalleeSavedRegs);
+  return CalleeSavedRegs;
+}
+
+BitVector
+AMDILRegisterInfo::getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  // We reserve the first 256 registers as they are the ones passed in
+  // live-in/live-out and therefore cannot be killed by the scheduler.
+  // This works around a discovered bug that was causing the linear scan
+  // register allocator to kill registers inside of the function that
+  // were also passed as LiveIn registers.
+  for (unsigned int x = 0, y = 256; x < y; ++x) {
+    Reserved.set(x);
+  }
+  return Reserved;
+}
+
+BitVector
+AMDILRegisterInfo::getAllocatableSet(const MachineFunction &MF,
+    const TargetRegisterClass *RC = NULL) const
+{
+  BitVector Allocatable(getNumRegs());
+  Allocatable.clear();
+  return Allocatable;
+}
+
+const TargetRegisterClass* const*
+AMDILRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
+{
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  // TODO: Keep in sync with getCalleeSavedRegs
+  //TODO(getCalleeSavedRegClasses);
+  return CalleeSavedRegClasses;
+}
+void
+AMDILRegisterInfo::eliminateCallFramePseudoInstr(
+    MachineFunction &MF,
+    MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const
+{
+  MBB.erase(I);
+}
+
+// For each frame index we find, we store the offset in the stack which is
+// being pushed back into the global buffer. The offset into the stack where
+// the value is stored is copied into a new register and the frame index is
+// then replaced with that register.
+void 
+AMDILRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+    int SPAdj,
+    RegScavenger *RS) const
+{
+  assert(SPAdj == 0 && "Unexpected");
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned int y = MI.getNumOperands();
+  for (unsigned int x = 0; x < y; ++x) {
+    if (!MI.getOperand(x).isFI()) {
+      continue;
+    }
+    bool def = isStoreInst(TM.getInstrInfo(), &MI);
+    int FrameIndex = MI.getOperand(x).getIndex();
+    int64_t Offset = MFI->getObjectOffset(FrameIndex);
+    //int64_t Size = MF.getFrameInfo()->getObjectSize(FrameIndex);
+    // An optimization is to only use the offsets if the size
+    // is larger than 4, which means we are storing an array
+    // instead of just a pointer. If the size is 4 then we can
+    // just do register copies since we don't need to worry about
+    // indexing dynamically.
+    MachineInstr *nMI = MF.CreateMachineInstr(
+        TII.get(AMDIL::LOADCONST_i32), MI.getDebugLoc());
+    nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, true));
+    nMI->addOperand(
+        MachineOperand::CreateImm(Offset));
+    MI.getParent()->insert(II, nMI);
+    nMI = MF.CreateMachineInstr(
+        TII.get(AMDIL::ADD_i32), MI.getDebugLoc());
+    nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, true));
+    nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, false));
+    nMI->addOperand(MachineOperand::CreateReg(AMDIL::FP, false));
+    
+    MI.getParent()->insert(II, nMI);
+    if (MI.getOperand(x).isReg() == false)  {
+      MI.getOperand(x).ChangeToRegister(
+          nMI->getOperand(0).getReg(), def);
+    } else {
+      MI.getOperand(x).setReg(
+          nMI->getOperand(0).getReg());
+    }
+  }
+}
+
+void
+AMDILRegisterInfo::processFunctionBeforeFrameFinalized(
+    MachineFunction &MF) const
+{
+  //TODO(processFunctionBeforeFrameFinalized);
+  // Here we keep track of the amount of stack that the current function
+  // uses so that we can set the offset to the end of the stack, and any
+  // other function call will not overwrite any stack variables.
+  // baseOffset = nextFuncOffset;
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  for (uint32_t x = 0, y = MFI->getNumObjects(); x < y; ++x) {
+    int64_t size = MFI->getObjectSize(x);
+    if (!(size % 4) && size > 1) {
+      nextFuncOffset += size;
+    } else {
+      nextFuncOffset += 16;
+    }
+  }
+}
+unsigned int
+AMDILRegisterInfo::getRARegister() const
+{
+  return AMDIL::RA;
+}
+
+unsigned int
+AMDILRegisterInfo::getFrameRegister(const MachineFunction &MF) const
+{
+  return AMDIL::FP;
+}
+
+unsigned int
+AMDILRegisterInfo::getEHExceptionRegister() const
+{
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned int
+AMDILRegisterInfo::getEHHandlerRegister() const
+{
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+int64_t
+AMDILRegisterInfo::getStackSize() const
+{
+  return nextFuncOffset - baseOffset;
+}
+
+#define GET_REGINFO_TARGET_DESC
+#include "AMDILGenRegisterInfo.inc"
+
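The frame accounting in processFunctionBeforeFrameFinalized and getStackSize can be sketched as a stand-alone function. `stackBytes` is a hypothetical helper mirroring the loop above (the real code walks `MachineFrameInfo` objects): sizes that are multiples of 4 and greater than 1 are counted exactly, and everything else gets a padded 16-byte slot:

```cpp
#include <cstdint>
#include <vector>

// Accumulate a function's stack footprint the way
// processFunctionBeforeFrameFinalized does: object sizes that are
// multiples of 4 (and larger than 1) are counted as-is; odd-sized
// objects are rounded up to a 16-byte slot.
int64_t stackBytes(const std::vector<int64_t> &objectSizes) {
  int64_t offset = 0;
  for (int64_t size : objectSizes) {
    if (!(size % 4) && size > 1)
      offset += size;
    else
      offset += 16; // padded slot for odd-sized or empty objects
  }
  return offset;
}
```

getStackSize() then reports the delta (`nextFuncOffset - baseOffset`) accumulated by this walk for the current function.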
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.h b/src/gallium/drivers/radeon/AMDILRegisterInfo.h
new file mode 100644 (file)
index 0000000..5207cd8
--- /dev/null
@@ -0,0 +1,91 @@
+//===- AMDILRegisterInfo.h - AMDIL Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILREGISTERINFO_H_
+#define AMDILREGISTERINFO_H_
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_REGINFO_HEADER
+#include "AMDILGenRegisterInfo.inc"
+// See header file for explanation
+
+namespace llvm
+{
+
+  class AMDILTargetMachine;
+  class TargetInstrInfo;
+  class Type;
+
+  /// DWARFFlavour - Flavour of DWARF register numbers
+  ///
+  namespace DWARFFlavour {
+    enum {
+      AMDIL_Generic = 0
+    };
+  }
+
+  struct AMDILRegisterInfo : public AMDILGenRegisterInfo
+  {
+    AMDILTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    AMDILRegisterInfo(AMDILTargetMachine &tm, const TargetInstrInfo &tii);
+    /// Code Generation virtual methods...
+    const uint16_t * getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+    const TargetRegisterClass* const*
+      getCalleeSavedRegClasses(
+          const MachineFunction *MF = 0) const;
+
+    BitVector
+      getReservedRegs(const MachineFunction &MF) const;
+    BitVector
+      getAllocatableSet(const MachineFunction &MF,
+          const TargetRegisterClass *RC) const;
+
+    void
+      eliminateCallFramePseudoInstr(
+          MachineFunction &MF,
+          MachineBasicBlock &MBB,
+          MachineBasicBlock::iterator I) const;
+    void
+      eliminateFrameIndex(MachineBasicBlock::iterator II,
+          int SPAdj, RegScavenger *RS = NULL) const;
+
+    void
+      processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+    // Debug information queries.
+    unsigned int
+      getRARegister() const;
+
+    unsigned int
+      getFrameRegister(const MachineFunction &MF) const;
+
+    // Exception handling queries.
+    unsigned int
+      getEHExceptionRegister() const;
+    unsigned int
+      getEHHandlerRegister() const;
+
+    int64_t
+      getStackSize() const;
+    private:
+    mutable int64_t baseOffset;
+    mutable int64_t nextFuncOffset;
+  };
+
+} // end namespace llvm
+
+#endif // AMDILREGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.td b/src/gallium/drivers/radeon/AMDILRegisterInfo.td
new file mode 100644 (file)
index 0000000..17f4b3b
--- /dev/null
@@ -0,0 +1,964 @@
+//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//  Declarations that describe the AMDIL register file
+//
+//===----------------------------------------------------------------------===//
+
+class AMDILReg<bits<16> num, string n> : Register<n> {
+  field bits<16> Value;
+  let Value = num;
+  let Namespace = "AMDIL";
+}
+
+// We will start with 8 registers for each class before expanding to more
+// Since the swizzle is added based on the register class, we can leave it
+// off here and just specify different registers for different register classes
+def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
+def R21 : AMDILReg<21, "r21">, DwarfRegNum<[21]>;
+def R22 : AMDILReg<22, "r22">, DwarfRegNum<[22]>;
+def R23 : AMDILReg<23, "r23">, DwarfRegNum<[23]>;
+def R24 : AMDILReg<24, "r24">, DwarfRegNum<[24]>;
+def R25 : AMDILReg<25, "r25">, DwarfRegNum<[25]>;
+def R26 : AMDILReg<26, "r26">, DwarfRegNum<[26]>;
+def R27 : AMDILReg<27, "r27">, DwarfRegNum<[27]>;
+def R28 : AMDILReg<28, "r28">, DwarfRegNum<[28]>;
+def R29 : AMDILReg<29, "r29">, DwarfRegNum<[29]>;
+def R30 : AMDILReg<30, "r30">, DwarfRegNum<[30]>;
+def R31 : AMDILReg<31, "r31">, DwarfRegNum<[31]>;
+def R32 : AMDILReg<32, "r32">, DwarfRegNum<[32]>;
+def R33 : AMDILReg<33, "r33">, DwarfRegNum<[33]>;
+def R34 : AMDILReg<34, "r34">, DwarfRegNum<[34]>;
+def R35 : AMDILReg<35, "r35">, DwarfRegNum<[35]>;
+def R36 : AMDILReg<36, "r36">, DwarfRegNum<[36]>;
+def R37 : AMDILReg<37, "r37">, DwarfRegNum<[37]>;
+def R38 : AMDILReg<38, "r38">, DwarfRegNum<[38]>;
+def R39 : AMDILReg<39, "r39">, DwarfRegNum<[39]>;
+def R40 : AMDILReg<40, "r40">, DwarfRegNum<[40]>;
+def R41 : AMDILReg<41, "r41">, DwarfRegNum<[41]>;
+def R42 : AMDILReg<42, "r42">, DwarfRegNum<[42]>;
+def R43 : AMDILReg<43, "r43">, DwarfRegNum<[43]>;
+def R44 : AMDILReg<44, "r44">, DwarfRegNum<[44]>;
+def R45 : AMDILReg<45, "r45">, DwarfRegNum<[45]>;
+def R46 : AMDILReg<46, "r46">, DwarfRegNum<[46]>;
+def R47 : AMDILReg<47, "r47">, DwarfRegNum<[47]>;
+def R48 : AMDILReg<48, "r48">, DwarfRegNum<[48]>;
+def R49 : AMDILReg<49, "r49">, DwarfRegNum<[49]>;
+def R50 : AMDILReg<50, "r50">, DwarfRegNum<[50]>;
+def R51 : AMDILReg<51, "r51">, DwarfRegNum<[51]>;
+def R52 : AMDILReg<52, "r52">, DwarfRegNum<[52]>;
+def R53 : AMDILReg<53, "r53">, DwarfRegNum<[53]>;
+def R54 : AMDILReg<54, "r54">, DwarfRegNum<[54]>;
+def R55 : AMDILReg<55, "r55">, DwarfRegNum<[55]>;
+def R56 : AMDILReg<56, "r56">, DwarfRegNum<[56]>;
+def R57 : AMDILReg<57, "r57">, DwarfRegNum<[57]>;
+def R58 : AMDILReg<58, "r58">, DwarfRegNum<[58]>;
+def R59 : AMDILReg<59, "r59">, DwarfRegNum<[59]>;
+def R60 : AMDILReg<60, "r60">, DwarfRegNum<[60]>;
+def R61 : AMDILReg<61, "r61">, DwarfRegNum<[61]>;
+def R62 : AMDILReg<62, "r62">, DwarfRegNum<[62]>;
+def R63 : AMDILReg<63, "r63">, DwarfRegNum<[63]>;
+def R64 : AMDILReg<64, "r64">, DwarfRegNum<[64]>;
+def R65 : AMDILReg<65, "r65">, DwarfRegNum<[65]>;
+def R66 : AMDILReg<66, "r66">, DwarfRegNum<[66]>;
+def R67 : AMDILReg<67, "r67">, DwarfRegNum<[67]>;
+def R68 : AMDILReg<68, "r68">, DwarfRegNum<[68]>;
+def R69 : AMDILReg<69, "r69">, DwarfRegNum<[69]>;
+def R70 : AMDILReg<70, "r70">, DwarfRegNum<[70]>;
+def R71 : AMDILReg<71, "r71">, DwarfRegNum<[71]>;
+def R72 : AMDILReg<72, "r72">, DwarfRegNum<[72]>;
+def R73 : AMDILReg<73, "r73">, DwarfRegNum<[73]>;
+def R74 : AMDILReg<74, "r74">, DwarfRegNum<[74]>;
+def R75 : AMDILReg<75, "r75">, DwarfRegNum<[75]>;
+def R76 : AMDILReg<76, "r76">, DwarfRegNum<[76]>;
+def R77 : AMDILReg<77, "r77">, DwarfRegNum<[77]>;
+def R78 : AMDILReg<78, "r78">, DwarfRegNum<[78]>;
+def R79 : AMDILReg<79, "r79">, DwarfRegNum<[79]>;
+def R80 : AMDILReg<80, "r80">, DwarfRegNum<[80]>;
+def R81 : AMDILReg<81, "r81">, DwarfRegNum<[81]>;
+def R82 : AMDILReg<82, "r82">, DwarfRegNum<[82]>;
+def R83 : AMDILReg<83, "r83">, DwarfRegNum<[83]>;
+def R84 : AMDILReg<84, "r84">, DwarfRegNum<[84]>;
+def R85 : AMDILReg<85, "r85">, DwarfRegNum<[85]>;
+def R86 : AMDILReg<86, "r86">, DwarfRegNum<[86]>;
+def R87 : AMDILReg<87, "r87">, DwarfRegNum<[87]>;
+def R88 : AMDILReg<88, "r88">, DwarfRegNum<[88]>;
+def R89 : AMDILReg<89, "r89">, DwarfRegNum<[89]>;
+def R90 : AMDILReg<90, "r90">, DwarfRegNum<[90]>;
+def R91 : AMDILReg<91, "r91">, DwarfRegNum<[91]>;
+def R92 : AMDILReg<92, "r92">, DwarfRegNum<[92]>;
+def R93 : AMDILReg<93, "r93">, DwarfRegNum<[93]>;
+def R94 : AMDILReg<94, "r94">, DwarfRegNum<[94]>;
+def R95 : AMDILReg<95, "r95">, DwarfRegNum<[95]>;
+def R96 : AMDILReg<96, "r96">, DwarfRegNum<[96]>;
+def R97 : AMDILReg<97, "r97">, DwarfRegNum<[97]>;
+def R98 : AMDILReg<98, "r98">, DwarfRegNum<[98]>;
+def R99 : AMDILReg<99, "r99">, DwarfRegNum<[99]>;
+def R100 : AMDILReg<100, "r100">, DwarfRegNum<[100]>;
+def R101 : AMDILReg<101, "r101">, DwarfRegNum<[101]>;
+def R102 : AMDILReg<102, "r102">, DwarfRegNum<[102]>;
+def R103 : AMDILReg<103, "r103">, DwarfRegNum<[103]>;
+def R104 : AMDILReg<104, "r104">, DwarfRegNum<[104]>;
+def R105 : AMDILReg<105, "r105">, DwarfRegNum<[105]>;
+def R106 : AMDILReg<106, "r106">, DwarfRegNum<[106]>;
+def R107 : AMDILReg<107, "r107">, DwarfRegNum<[107]>;
+def R108 : AMDILReg<108, "r108">, DwarfRegNum<[108]>;
+def R109 : AMDILReg<109, "r109">, DwarfRegNum<[109]>;
+def R110 : AMDILReg<110, "r110">, DwarfRegNum<[110]>;
+def R111 : AMDILReg<111, "r111">, DwarfRegNum<[111]>;
+def R112 : AMDILReg<112, "r112">, DwarfRegNum<[112]>;
+def R113 : AMDILReg<113, "r113">, DwarfRegNum<[113]>;
+def R114 : AMDILReg<114, "r114">, DwarfRegNum<[114]>;
+def R115 : AMDILReg<115, "r115">, DwarfRegNum<[115]>;
+def R116 : AMDILReg<116, "r116">, DwarfRegNum<[116]>;
+def R117 : AMDILReg<117, "r117">, DwarfRegNum<[117]>;
+def R118 : AMDILReg<118, "r118">, DwarfRegNum<[118]>;
+def R119 : AMDILReg<119, "r119">, DwarfRegNum<[119]>;
+def R120 : AMDILReg<120, "r120">, DwarfRegNum<[120]>;
+def R121 : AMDILReg<121, "r121">, DwarfRegNum<[121]>;
+def R122 : AMDILReg<122, "r122">, DwarfRegNum<[122]>;
+def R123 : AMDILReg<123, "r123">, DwarfRegNum<[123]>;
+def R124 : AMDILReg<124, "r124">, DwarfRegNum<[124]>;
+def R125 : AMDILReg<125, "r125">, DwarfRegNum<[125]>;
+def R126 : AMDILReg<126, "r126">, DwarfRegNum<[126]>;
+def R127 : AMDILReg<127, "r127">, DwarfRegNum<[127]>;
+def R128 : AMDILReg<128, "r128">, DwarfRegNum<[128]>;
+def R129 : AMDILReg<129, "r129">, DwarfRegNum<[129]>;
+def R130 : AMDILReg<130, "r130">, DwarfRegNum<[130]>;
+def R131 : AMDILReg<131, "r131">, DwarfRegNum<[131]>;
+def R132 : AMDILReg<132, "r132">, DwarfRegNum<[132]>;
+def R133 : AMDILReg<133, "r133">, DwarfRegNum<[133]>;
+def R134 : AMDILReg<134, "r134">, DwarfRegNum<[134]>;
+def R135 : AMDILReg<135, "r135">, DwarfRegNum<[135]>;
+def R136 : AMDILReg<136, "r136">, DwarfRegNum<[136]>;
+def R137 : AMDILReg<137, "r137">, DwarfRegNum<[137]>;
+def R138 : AMDILReg<138, "r138">, DwarfRegNum<[138]>;
+def R139 : AMDILReg<139, "r139">, DwarfRegNum<[139]>;
+def R140 : AMDILReg<140, "r140">, DwarfRegNum<[140]>;
+def R141 : AMDILReg<141, "r141">, DwarfRegNum<[141]>;
+def R142 : AMDILReg<142, "r142">, DwarfRegNum<[142]>;
+def R143 : AMDILReg<143, "r143">, DwarfRegNum<[143]>;
+def R144 : AMDILReg<144, "r144">, DwarfRegNum<[144]>;
+def R145 : AMDILReg<145, "r145">, DwarfRegNum<[145]>;
+def R146 : AMDILReg<146, "r146">, DwarfRegNum<[146]>;
+def R147 : AMDILReg<147, "r147">, DwarfRegNum<[147]>;
+def R148 : AMDILReg<148, "r148">, DwarfRegNum<[148]>;
+def R149 : AMDILReg<149, "r149">, DwarfRegNum<[149]>;
+def R150 : AMDILReg<150, "r150">, DwarfRegNum<[150]>;
+def R151 : AMDILReg<151, "r151">, DwarfRegNum<[151]>;
+def R152 : AMDILReg<152, "r152">, DwarfRegNum<[152]>;
+def R153 : AMDILReg<153, "r153">, DwarfRegNum<[153]>;
+def R154 : AMDILReg<154, "r154">, DwarfRegNum<[154]>;
+def R155 : AMDILReg<155, "r155">, DwarfRegNum<[155]>;
+def R156 : AMDILReg<156, "r156">, DwarfRegNum<[156]>;
+def R157 : AMDILReg<157, "r157">, DwarfRegNum<[157]>;
+def R158 : AMDILReg<158, "r158">, DwarfRegNum<[158]>;
+def R159 : AMDILReg<159, "r159">, DwarfRegNum<[159]>;
+def R160 : AMDILReg<160, "r160">, DwarfRegNum<[160]>;
+def R161 : AMDILReg<161, "r161">, DwarfRegNum<[161]>;
+def R162 : AMDILReg<162, "r162">, DwarfRegNum<[162]>;
+def R163 : AMDILReg<163, "r163">, DwarfRegNum<[163]>;
+def R164 : AMDILReg<164, "r164">, DwarfRegNum<[164]>;
+def R165 : AMDILReg<165, "r165">, DwarfRegNum<[165]>;
+def R166 : AMDILReg<166, "r166">, DwarfRegNum<[166]>;
+def R167 : AMDILReg<167, "r167">, DwarfRegNum<[167]>;
+def R168 : AMDILReg<168, "r168">, DwarfRegNum<[168]>;
+def R169 : AMDILReg<169, "r169">, DwarfRegNum<[169]>;
+def R170 : AMDILReg<170, "r170">, DwarfRegNum<[170]>;
+def R171 : AMDILReg<171, "r171">, DwarfRegNum<[171]>;
+def R172 : AMDILReg<172, "r172">, DwarfRegNum<[172]>;
+def R173 : AMDILReg<173, "r173">, DwarfRegNum<[173]>;
+def R174 : AMDILReg<174, "r174">, DwarfRegNum<[174]>;
+def R175 : AMDILReg<175, "r175">, DwarfRegNum<[175]>;
+def R176 : AMDILReg<176, "r176">, DwarfRegNum<[176]>;
+def R177 : AMDILReg<177, "r177">, DwarfRegNum<[177]>;
+def R178 : AMDILReg<178, "r178">, DwarfRegNum<[178]>;
+def R179 : AMDILReg<179, "r179">, DwarfRegNum<[179]>;
+def R180 : AMDILReg<180, "r180">, DwarfRegNum<[180]>;
+def R181 : AMDILReg<181, "r181">, DwarfRegNum<[181]>;
+def R182 : AMDILReg<182, "r182">, DwarfRegNum<[182]>;
+def R183 : AMDILReg<183, "r183">, DwarfRegNum<[183]>;
+def R184 : AMDILReg<184, "r184">, DwarfRegNum<[184]>;
+def R185 : AMDILReg<185, "r185">, DwarfRegNum<[185]>;
+def R186 : AMDILReg<186, "r186">, DwarfRegNum<[186]>;
+def R187 : AMDILReg<187, "r187">, DwarfRegNum<[187]>;
+def R188 : AMDILReg<188, "r188">, DwarfRegNum<[188]>;
+def R189 : AMDILReg<189, "r189">, DwarfRegNum<[189]>;
+def R190 : AMDILReg<190, "r190">, DwarfRegNum<[190]>;
+def R191 : AMDILReg<191, "r191">, DwarfRegNum<[191]>;
+def R192 : AMDILReg<192, "r192">, DwarfRegNum<[192]>;
+def R193 : AMDILReg<193, "r193">, DwarfRegNum<[193]>;
+def R194 : AMDILReg<194, "r194">, DwarfRegNum<[194]>;
+def R195 : AMDILReg<195, "r195">, DwarfRegNum<[195]>;
+def R196 : AMDILReg<196, "r196">, DwarfRegNum<[196]>;
+def R197 : AMDILReg<197, "r197">, DwarfRegNum<[197]>;
+def R198 : AMDILReg<198, "r198">, DwarfRegNum<[198]>;
+def R199 : AMDILReg<199, "r199">, DwarfRegNum<[199]>;
+def R200 : AMDILReg<200, "r200">, DwarfRegNum<[200]>;
+def R201 : AMDILReg<201, "r201">, DwarfRegNum<[201]>;
+def R202 : AMDILReg<202, "r202">, DwarfRegNum<[202]>;
+def R203 : AMDILReg<203, "r203">, DwarfRegNum<[203]>;
+def R204 : AMDILReg<204, "r204">, DwarfRegNum<[204]>;
+def R205 : AMDILReg<205, "r205">, DwarfRegNum<[205]>;
+def R206 : AMDILReg<206, "r206">, DwarfRegNum<[206]>;
+def R207 : AMDILReg<207, "r207">, DwarfRegNum<[207]>;
+def R208 : AMDILReg<208, "r208">, DwarfRegNum<[208]>;
+def R209 : AMDILReg<209, "r209">, DwarfRegNum<[209]>;
+def R210 : AMDILReg<210, "r210">, DwarfRegNum<[210]>;
+def R211 : AMDILReg<211, "r211">, DwarfRegNum<[211]>;
+def R212 : AMDILReg<212, "r212">, DwarfRegNum<[212]>;
+def R213 : AMDILReg<213, "r213">, DwarfRegNum<[213]>;
+def R214 : AMDILReg<214, "r214">, DwarfRegNum<[214]>;
+def R215 : AMDILReg<215, "r215">, DwarfRegNum<[215]>;
+def R216 : AMDILReg<216, "r216">, DwarfRegNum<[216]>;
+def R217 : AMDILReg<217, "r217">, DwarfRegNum<[217]>;
+def R218 : AMDILReg<218, "r218">, DwarfRegNum<[218]>;
+def R219 : AMDILReg<219, "r219">, DwarfRegNum<[219]>;
+def R220 : AMDILReg<220, "r220">, DwarfRegNum<[220]>;
+def R221 : AMDILReg<221, "r221">, DwarfRegNum<[221]>;
+def R222 : AMDILReg<222, "r222">, DwarfRegNum<[222]>;
+def R223 : AMDILReg<223, "r223">, DwarfRegNum<[223]>;
+def R224 : AMDILReg<224, "r224">, DwarfRegNum<[224]>;
+def R225 : AMDILReg<225, "r225">, DwarfRegNum<[225]>;
+def R226 : AMDILReg<226, "r226">, DwarfRegNum<[226]>;
+def R227 : AMDILReg<227, "r227">, DwarfRegNum<[227]>;
+def R228 : AMDILReg<228, "r228">, DwarfRegNum<[228]>;
+def R229 : AMDILReg<229, "r229">, DwarfRegNum<[229]>;
+def R230 : AMDILReg<230, "r230">, DwarfRegNum<[230]>;
+def R231 : AMDILReg<231, "r231">, DwarfRegNum<[231]>;
+def R232 : AMDILReg<232, "r232">, DwarfRegNum<[232]>;
+def R233 : AMDILReg<233, "r233">, DwarfRegNum<[233]>;
+def R234 : AMDILReg<234, "r234">, DwarfRegNum<[234]>;
+def R235 : AMDILReg<235, "r235">, DwarfRegNum<[235]>;
+def R236 : AMDILReg<236, "r236">, DwarfRegNum<[236]>;
+def R237 : AMDILReg<237, "r237">, DwarfRegNum<[237]>;
+def R238 : AMDILReg<238, "r238">, DwarfRegNum<[238]>;
+def R239 : AMDILReg<239, "r239">, DwarfRegNum<[239]>;
+def R240 : AMDILReg<240, "r240">, DwarfRegNum<[240]>;
+def R241 : AMDILReg<241, "r241">, DwarfRegNum<[241]>;
+def R242 : AMDILReg<242, "r242">, DwarfRegNum<[242]>;
+def R243 : AMDILReg<243, "r243">, DwarfRegNum<[243]>;
+def R244 : AMDILReg<244, "r244">, DwarfRegNum<[244]>;
+def R245 : AMDILReg<245, "r245">, DwarfRegNum<[245]>;
+def R246 : AMDILReg<246, "r246">, DwarfRegNum<[246]>;
+def R247 : AMDILReg<247, "r247">, DwarfRegNum<[247]>;
+def R248 : AMDILReg<248, "r248">, DwarfRegNum<[248]>;
+def R249 : AMDILReg<249, "r249">, DwarfRegNum<[249]>;
+def R250 : AMDILReg<250, "r250">, DwarfRegNum<[250]>;
+def R251 : AMDILReg<251, "r251">, DwarfRegNum<[251]>;
+def R252 : AMDILReg<252, "r252">, DwarfRegNum<[252]>;
+def R253 : AMDILReg<253, "r253">, DwarfRegNum<[253]>;
+def R254 : AMDILReg<254, "r254">, DwarfRegNum<[254]>;
+def R255 : AMDILReg<255, "r255">, DwarfRegNum<[255]>;
+def R256 : AMDILReg<256, "r256">, DwarfRegNum<[256]>;
+def R257 : AMDILReg<257, "r257">, DwarfRegNum<[257]>;
+def R258 : AMDILReg<258, "r258">, DwarfRegNum<[258]>;
+def R259 : AMDILReg<259, "r259">, DwarfRegNum<[259]>;
+def R260 : AMDILReg<260, "r260">, DwarfRegNum<[260]>;
+def R261 : AMDILReg<261, "r261">, DwarfRegNum<[261]>;
+def R262 : AMDILReg<262, "r262">, DwarfRegNum<[262]>;
+def R263 : AMDILReg<263, "r263">, DwarfRegNum<[263]>;
+def R264 : AMDILReg<264, "r264">, DwarfRegNum<[264]>;
+def R265 : AMDILReg<265, "r265">, DwarfRegNum<[265]>;
+def R266 : AMDILReg<266, "r266">, DwarfRegNum<[266]>;
+def R267 : AMDILReg<267, "r267">, DwarfRegNum<[267]>;
+def R268 : AMDILReg<268, "r268">, DwarfRegNum<[268]>;
+def R269 : AMDILReg<269, "r269">, DwarfRegNum<[269]>;
+def R270 : AMDILReg<270, "r270">, DwarfRegNum<[270]>;
+def R271 : AMDILReg<271, "r271">, DwarfRegNum<[271]>;
+def R272 : AMDILReg<272, "r272">, DwarfRegNum<[272]>;
+def R273 : AMDILReg<273, "r273">, DwarfRegNum<[273]>;
+def R274 : AMDILReg<274, "r274">, DwarfRegNum<[274]>;
+def R275 : AMDILReg<275, "r275">, DwarfRegNum<[275]>;
+def R276 : AMDILReg<276, "r276">, DwarfRegNum<[276]>;
+def R277 : AMDILReg<277, "r277">, DwarfRegNum<[277]>;
+def R278 : AMDILReg<278, "r278">, DwarfRegNum<[278]>;
+def R279 : AMDILReg<279, "r279">, DwarfRegNum<[279]>;
+def R280 : AMDILReg<280, "r280">, DwarfRegNum<[280]>;
+def R281 : AMDILReg<281, "r281">, DwarfRegNum<[281]>;
+def R282 : AMDILReg<282, "r282">, DwarfRegNum<[282]>;
+def R283 : AMDILReg<283, "r283">, DwarfRegNum<[283]>;
+def R284 : AMDILReg<284, "r284">, DwarfRegNum<[284]>;
+def R285 : AMDILReg<285, "r285">, DwarfRegNum<[285]>;
+def R286 : AMDILReg<286, "r286">, DwarfRegNum<[286]>;
+def R287 : AMDILReg<287, "r287">, DwarfRegNum<[287]>;
+def R288 : AMDILReg<288, "r288">, DwarfRegNum<[288]>;
+def R289 : AMDILReg<289, "r289">, DwarfRegNum<[289]>;
+def R290 : AMDILReg<290, "r290">, DwarfRegNum<[290]>;
+def R291 : AMDILReg<291, "r291">, DwarfRegNum<[291]>;
+def R292 : AMDILReg<292, "r292">, DwarfRegNum<[292]>;
+def R293 : AMDILReg<293, "r293">, DwarfRegNum<[293]>;
+def R294 : AMDILReg<294, "r294">, DwarfRegNum<[294]>;
+def R295 : AMDILReg<295, "r295">, DwarfRegNum<[295]>;
+def R296 : AMDILReg<296, "r296">, DwarfRegNum<[296]>;
+def R297 : AMDILReg<297, "r297">, DwarfRegNum<[297]>;
+def R298 : AMDILReg<298, "r298">, DwarfRegNum<[298]>;
+def R299 : AMDILReg<299, "r299">, DwarfRegNum<[299]>;
+def R300 : AMDILReg<300, "r300">, DwarfRegNum<[300]>;
+def R301 : AMDILReg<301, "r301">, DwarfRegNum<[301]>;
+def R302 : AMDILReg<302, "r302">, DwarfRegNum<[302]>;
+def R303 : AMDILReg<303, "r303">, DwarfRegNum<[303]>;
+def R304 : AMDILReg<304, "r304">, DwarfRegNum<[304]>;
+def R305 : AMDILReg<305, "r305">, DwarfRegNum<[305]>;
+def R306 : AMDILReg<306, "r306">, DwarfRegNum<[306]>;
+def R307 : AMDILReg<307, "r307">, DwarfRegNum<[307]>;
+def R308 : AMDILReg<308, "r308">, DwarfRegNum<[308]>;
+def R309 : AMDILReg<309, "r309">, DwarfRegNum<[309]>;
+def R310 : AMDILReg<310, "r310">, DwarfRegNum<[310]>;
+def R311 : AMDILReg<311, "r311">, DwarfRegNum<[311]>;
+def R312 : AMDILReg<312, "r312">, DwarfRegNum<[312]>;
+def R313 : AMDILReg<313, "r313">, DwarfRegNum<[313]>;
+def R314 : AMDILReg<314, "r314">, DwarfRegNum<[314]>;
+def R315 : AMDILReg<315, "r315">, DwarfRegNum<[315]>;
+def R316 : AMDILReg<316, "r316">, DwarfRegNum<[316]>;
+def R317 : AMDILReg<317, "r317">, DwarfRegNum<[317]>;
+def R318 : AMDILReg<318, "r318">, DwarfRegNum<[318]>;
+def R319 : AMDILReg<319, "r319">, DwarfRegNum<[319]>;
+def R320 : AMDILReg<320, "r320">, DwarfRegNum<[320]>;
+def R321 : AMDILReg<321, "r321">, DwarfRegNum<[321]>;
+def R322 : AMDILReg<322, "r322">, DwarfRegNum<[322]>;
+def R323 : AMDILReg<323, "r323">, DwarfRegNum<[323]>;
+def R324 : AMDILReg<324, "r324">, DwarfRegNum<[324]>;
+def R325 : AMDILReg<325, "r325">, DwarfRegNum<[325]>;
+def R326 : AMDILReg<326, "r326">, DwarfRegNum<[326]>;
+def R327 : AMDILReg<327, "r327">, DwarfRegNum<[327]>;
+def R328 : AMDILReg<328, "r328">, DwarfRegNum<[328]>;
+def R329 : AMDILReg<329, "r329">, DwarfRegNum<[329]>;
+def R330 : AMDILReg<330, "r330">, DwarfRegNum<[330]>;
+def R331 : AMDILReg<331, "r331">, DwarfRegNum<[331]>;
+def R332 : AMDILReg<332, "r332">, DwarfRegNum<[332]>;
+def R333 : AMDILReg<333, "r333">, DwarfRegNum<[333]>;
+def R334 : AMDILReg<334, "r334">, DwarfRegNum<[334]>;
+def R335 : AMDILReg<335, "r335">, DwarfRegNum<[335]>;
+def R336 : AMDILReg<336, "r336">, DwarfRegNum<[336]>;
+def R337 : AMDILReg<337, "r337">, DwarfRegNum<[337]>;
+def R338 : AMDILReg<338, "r338">, DwarfRegNum<[338]>;
+def R339 : AMDILReg<339, "r339">, DwarfRegNum<[339]>;
+def R340 : AMDILReg<340, "r340">, DwarfRegNum<[340]>;
+def R341 : AMDILReg<341, "r341">, DwarfRegNum<[341]>;
+def R342 : AMDILReg<342, "r342">, DwarfRegNum<[342]>;
+def R343 : AMDILReg<343, "r343">, DwarfRegNum<[343]>;
+def R344 : AMDILReg<344, "r344">, DwarfRegNum<[344]>;
+def R345 : AMDILReg<345, "r345">, DwarfRegNum<[345]>;
+def R346 : AMDILReg<346, "r346">, DwarfRegNum<[346]>;
+def R347 : AMDILReg<347, "r347">, DwarfRegNum<[347]>;
+def R348 : AMDILReg<348, "r348">, DwarfRegNum<[348]>;
+def R349 : AMDILReg<349, "r349">, DwarfRegNum<[349]>;
+def R350 : AMDILReg<350, "r350">, DwarfRegNum<[350]>;
+def R351 : AMDILReg<351, "r351">, DwarfRegNum<[351]>;
+def R352 : AMDILReg<352, "r352">, DwarfRegNum<[352]>;
+def R353 : AMDILReg<353, "r353">, DwarfRegNum<[353]>;
+def R354 : AMDILReg<354, "r354">, DwarfRegNum<[354]>;
+def R355 : AMDILReg<355, "r355">, DwarfRegNum<[355]>;
+def R356 : AMDILReg<356, "r356">, DwarfRegNum<[356]>;
+def R357 : AMDILReg<357, "r357">, DwarfRegNum<[357]>;
+def R358 : AMDILReg<358, "r358">, DwarfRegNum<[358]>;
+def R359 : AMDILReg<359, "r359">, DwarfRegNum<[359]>;
+def R360 : AMDILReg<360, "r360">, DwarfRegNum<[360]>;
+def R361 : AMDILReg<361, "r361">, DwarfRegNum<[361]>;
+def R362 : AMDILReg<362, "r362">, DwarfRegNum<[362]>;
+def R363 : AMDILReg<363, "r363">, DwarfRegNum<[363]>;
+def R364 : AMDILReg<364, "r364">, DwarfRegNum<[364]>;
+def R365 : AMDILReg<365, "r365">, DwarfRegNum<[365]>;
+def R366 : AMDILReg<366, "r366">, DwarfRegNum<[366]>;
+def R367 : AMDILReg<367, "r367">, DwarfRegNum<[367]>;
+def R368 : AMDILReg<368, "r368">, DwarfRegNum<[368]>;
+def R369 : AMDILReg<369, "r369">, DwarfRegNum<[369]>;
+def R370 : AMDILReg<370, "r370">, DwarfRegNum<[370]>;
+def R371 : AMDILReg<371, "r371">, DwarfRegNum<[371]>;
+def R372 : AMDILReg<372, "r372">, DwarfRegNum<[372]>;
+def R373 : AMDILReg<373, "r373">, DwarfRegNum<[373]>;
+def R374 : AMDILReg<374, "r374">, DwarfRegNum<[374]>;
+def R375 : AMDILReg<375, "r375">, DwarfRegNum<[375]>;
+def R376 : AMDILReg<376, "r376">, DwarfRegNum<[376]>;
+def R377 : AMDILReg<377, "r377">, DwarfRegNum<[377]>;
+def R378 : AMDILReg<378, "r378">, DwarfRegNum<[378]>;
+def R379 : AMDILReg<379, "r379">, DwarfRegNum<[379]>;
+def R380 : AMDILReg<380, "r380">, DwarfRegNum<[380]>;
+def R381 : AMDILReg<381, "r381">, DwarfRegNum<[381]>;
+def R382 : AMDILReg<382, "r382">, DwarfRegNum<[382]>;
+def R383 : AMDILReg<383, "r383">, DwarfRegNum<[383]>;
+def R384 : AMDILReg<384, "r384">, DwarfRegNum<[384]>;
+def R385 : AMDILReg<385, "r385">, DwarfRegNum<[385]>;
+def R386 : AMDILReg<386, "r386">, DwarfRegNum<[386]>;
+def R387 : AMDILReg<387, "r387">, DwarfRegNum<[387]>;
+def R388 : AMDILReg<388, "r388">, DwarfRegNum<[388]>;
+def R389 : AMDILReg<389, "r389">, DwarfRegNum<[389]>;
+def R390 : AMDILReg<390, "r390">, DwarfRegNum<[390]>;
+def R391 : AMDILReg<391, "r391">, DwarfRegNum<[391]>;
+def R392 : AMDILReg<392, "r392">, DwarfRegNum<[392]>;
+def R393 : AMDILReg<393, "r393">, DwarfRegNum<[393]>;
+def R394 : AMDILReg<394, "r394">, DwarfRegNum<[394]>;
+def R395 : AMDILReg<395, "r395">, DwarfRegNum<[395]>;
+def R396 : AMDILReg<396, "r396">, DwarfRegNum<[396]>;
+def R397 : AMDILReg<397, "r397">, DwarfRegNum<[397]>;
+def R398 : AMDILReg<398, "r398">, DwarfRegNum<[398]>;
+def R399 : AMDILReg<399, "r399">, DwarfRegNum<[399]>;
+def R400 : AMDILReg<400, "r400">, DwarfRegNum<[400]>;
+def R401 : AMDILReg<401, "r401">, DwarfRegNum<[401]>;
+def R402 : AMDILReg<402, "r402">, DwarfRegNum<[402]>;
+def R403 : AMDILReg<403, "r403">, DwarfRegNum<[403]>;
+def R404 : AMDILReg<404, "r404">, DwarfRegNum<[404]>;
+def R405 : AMDILReg<405, "r405">, DwarfRegNum<[405]>;
+def R406 : AMDILReg<406, "r406">, DwarfRegNum<[406]>;
+def R407 : AMDILReg<407, "r407">, DwarfRegNum<[407]>;
+def R408 : AMDILReg<408, "r408">, DwarfRegNum<[408]>;
+def R409 : AMDILReg<409, "r409">, DwarfRegNum<[409]>;
+def R410 : AMDILReg<410, "r410">, DwarfRegNum<[410]>;
+def R411 : AMDILReg<411, "r411">, DwarfRegNum<[411]>;
+def R412 : AMDILReg<412, "r412">, DwarfRegNum<[412]>;
+def R413 : AMDILReg<413, "r413">, DwarfRegNum<[413]>;
+def R414 : AMDILReg<414, "r414">, DwarfRegNum<[414]>;
+def R415 : AMDILReg<415, "r415">, DwarfRegNum<[415]>;
+def R416 : AMDILReg<416, "r416">, DwarfRegNum<[416]>;
+def R417 : AMDILReg<417, "r417">, DwarfRegNum<[417]>;
+def R418 : AMDILReg<418, "r418">, DwarfRegNum<[418]>;
+def R419 : AMDILReg<419, "r419">, DwarfRegNum<[419]>;
+def R420 : AMDILReg<420, "r420">, DwarfRegNum<[420]>;
+def R421 : AMDILReg<421, "r421">, DwarfRegNum<[421]>;
+def R422 : AMDILReg<422, "r422">, DwarfRegNum<[422]>;
+def R423 : AMDILReg<423, "r423">, DwarfRegNum<[423]>;
+def R424 : AMDILReg<424, "r424">, DwarfRegNum<[424]>;
+def R425 : AMDILReg<425, "r425">, DwarfRegNum<[425]>;
+def R426 : AMDILReg<426, "r426">, DwarfRegNum<[426]>;
+def R427 : AMDILReg<427, "r427">, DwarfRegNum<[427]>;
+def R428 : AMDILReg<428, "r428">, DwarfRegNum<[428]>;
+def R429 : AMDILReg<429, "r429">, DwarfRegNum<[429]>;
+def R430 : AMDILReg<430, "r430">, DwarfRegNum<[430]>;
+def R431 : AMDILReg<431, "r431">, DwarfRegNum<[431]>;
+def R432 : AMDILReg<432, "r432">, DwarfRegNum<[432]>;
+def R433 : AMDILReg<433, "r433">, DwarfRegNum<[433]>;
+def R434 : AMDILReg<434, "r434">, DwarfRegNum<[434]>;
+def R435 : AMDILReg<435, "r435">, DwarfRegNum<[435]>;
+def R436 : AMDILReg<436, "r436">, DwarfRegNum<[436]>;
+def R437 : AMDILReg<437, "r437">, DwarfRegNum<[437]>;
+def R438 : AMDILReg<438, "r438">, DwarfRegNum<[438]>;
+def R439 : AMDILReg<439, "r439">, DwarfRegNum<[439]>;
+def R440 : AMDILReg<440, "r440">, DwarfRegNum<[440]>;
+def R441 : AMDILReg<441, "r441">, DwarfRegNum<[441]>;
+def R442 : AMDILReg<442, "r442">, DwarfRegNum<[442]>;
+def R443 : AMDILReg<443, "r443">, DwarfRegNum<[443]>;
+def R444 : AMDILReg<444, "r444">, DwarfRegNum<[444]>;
+def R445 : AMDILReg<445, "r445">, DwarfRegNum<[445]>;
+def R446 : AMDILReg<446, "r446">, DwarfRegNum<[446]>;
+def R447 : AMDILReg<447, "r447">, DwarfRegNum<[447]>;
+def R448 : AMDILReg<448, "r448">, DwarfRegNum<[448]>;
+def R449 : AMDILReg<449, "r449">, DwarfRegNum<[449]>;
+def R450 : AMDILReg<450, "r450">, DwarfRegNum<[450]>;
+def R451 : AMDILReg<451, "r451">, DwarfRegNum<[451]>;
+def R452 : AMDILReg<452, "r452">, DwarfRegNum<[452]>;
+def R453 : AMDILReg<453, "r453">, DwarfRegNum<[453]>;
+def R454 : AMDILReg<454, "r454">, DwarfRegNum<[454]>;
+def R455 : AMDILReg<455, "r455">, DwarfRegNum<[455]>;
+def R456 : AMDILReg<456, "r456">, DwarfRegNum<[456]>;
+def R457 : AMDILReg<457, "r457">, DwarfRegNum<[457]>;
+def R458 : AMDILReg<458, "r458">, DwarfRegNum<[458]>;
+def R459 : AMDILReg<459, "r459">, DwarfRegNum<[459]>;
+def R460 : AMDILReg<460, "r460">, DwarfRegNum<[460]>;
+def R461 : AMDILReg<461, "r461">, DwarfRegNum<[461]>;
+def R462 : AMDILReg<462, "r462">, DwarfRegNum<[462]>;
+def R463 : AMDILReg<463, "r463">, DwarfRegNum<[463]>;
+def R464 : AMDILReg<464, "r464">, DwarfRegNum<[464]>;
+def R465 : AMDILReg<465, "r465">, DwarfRegNum<[465]>;
+def R466 : AMDILReg<466, "r466">, DwarfRegNum<[466]>;
+def R467 : AMDILReg<467, "r467">, DwarfRegNum<[467]>;
+def R468 : AMDILReg<468, "r468">, DwarfRegNum<[468]>;
+def R469 : AMDILReg<469, "r469">, DwarfRegNum<[469]>;
+def R470 : AMDILReg<470, "r470">, DwarfRegNum<[470]>;
+def R471 : AMDILReg<471, "r471">, DwarfRegNum<[471]>;
+def R472 : AMDILReg<472, "r472">, DwarfRegNum<[472]>;
+def R473 : AMDILReg<473, "r473">, DwarfRegNum<[473]>;
+def R474 : AMDILReg<474, "r474">, DwarfRegNum<[474]>;
+def R475 : AMDILReg<475, "r475">, DwarfRegNum<[475]>;
+def R476 : AMDILReg<476, "r476">, DwarfRegNum<[476]>;
+def R477 : AMDILReg<477, "r477">, DwarfRegNum<[477]>;
+def R478 : AMDILReg<478, "r478">, DwarfRegNum<[478]>;
+def R479 : AMDILReg<479, "r479">, DwarfRegNum<[479]>;
+def R480 : AMDILReg<480, "r480">, DwarfRegNum<[480]>;
+def R481 : AMDILReg<481, "r481">, DwarfRegNum<[481]>;
+def R482 : AMDILReg<482, "r482">, DwarfRegNum<[482]>;
+def R483 : AMDILReg<483, "r483">, DwarfRegNum<[483]>;
+def R484 : AMDILReg<484, "r484">, DwarfRegNum<[484]>;
+def R485 : AMDILReg<485, "r485">, DwarfRegNum<[485]>;
+def R486 : AMDILReg<486, "r486">, DwarfRegNum<[486]>;
+def R487 : AMDILReg<487, "r487">, DwarfRegNum<[487]>;
+def R488 : AMDILReg<488, "r488">, DwarfRegNum<[488]>;
+def R489 : AMDILReg<489, "r489">, DwarfRegNum<[489]>;
+def R490 : AMDILReg<490, "r490">, DwarfRegNum<[490]>;
+def R491 : AMDILReg<491, "r491">, DwarfRegNum<[491]>;
+def R492 : AMDILReg<492, "r492">, DwarfRegNum<[492]>;
+def R493 : AMDILReg<493, "r493">, DwarfRegNum<[493]>;
+def R494 : AMDILReg<494, "r494">, DwarfRegNum<[494]>;
+def R495 : AMDILReg<495, "r495">, DwarfRegNum<[495]>;
+def R496 : AMDILReg<496, "r496">, DwarfRegNum<[496]>;
+def R497 : AMDILReg<497, "r497">, DwarfRegNum<[497]>;
+def R498 : AMDILReg<498, "r498">, DwarfRegNum<[498]>;
+def R499 : AMDILReg<499, "r499">, DwarfRegNum<[499]>;
+def R500 : AMDILReg<500, "r500">, DwarfRegNum<[500]>;
+def R501 : AMDILReg<501, "r501">, DwarfRegNum<[501]>;
+def R502 : AMDILReg<502, "r502">, DwarfRegNum<[502]>;
+def R503 : AMDILReg<503, "r503">, DwarfRegNum<[503]>;
+def R504 : AMDILReg<504, "r504">, DwarfRegNum<[504]>;
+def R505 : AMDILReg<505, "r505">, DwarfRegNum<[505]>;
+def R506 : AMDILReg<506, "r506">, DwarfRegNum<[506]>;
+def R507 : AMDILReg<507, "r507">, DwarfRegNum<[507]>;
+def R508 : AMDILReg<508, "r508">, DwarfRegNum<[508]>;
+def R509 : AMDILReg<509, "r509">, DwarfRegNum<[509]>;
+def R510 : AMDILReg<510, "r510">, DwarfRegNum<[510]>;
+def R511 : AMDILReg<511, "r511">, DwarfRegNum<[511]>;
+def R512 : AMDILReg<512, "r512">, DwarfRegNum<[512]>;
+def R513 : AMDILReg<513, "r513">, DwarfRegNum<[513]>;
+def R514 : AMDILReg<514, "r514">, DwarfRegNum<[514]>;
+def R515 : AMDILReg<515, "r515">, DwarfRegNum<[515]>;
+def R516 : AMDILReg<516, "r516">, DwarfRegNum<[516]>;
+def R517 : AMDILReg<517, "r517">, DwarfRegNum<[517]>;
+def R518 : AMDILReg<518, "r518">, DwarfRegNum<[518]>;
+def R519 : AMDILReg<519, "r519">, DwarfRegNum<[519]>;
+def R520 : AMDILReg<520, "r520">, DwarfRegNum<[520]>;
+def R521 : AMDILReg<521, "r521">, DwarfRegNum<[521]>;
+def R522 : AMDILReg<522, "r522">, DwarfRegNum<[522]>;
+def R523 : AMDILReg<523, "r523">, DwarfRegNum<[523]>;
+def R524 : AMDILReg<524, "r524">, DwarfRegNum<[524]>;
+def R525 : AMDILReg<525, "r525">, DwarfRegNum<[525]>;
+def R526 : AMDILReg<526, "r526">, DwarfRegNum<[526]>;
+def R527 : AMDILReg<527, "r527">, DwarfRegNum<[527]>;
+def R528 : AMDILReg<528, "r528">, DwarfRegNum<[528]>;
+def R529 : AMDILReg<529, "r529">, DwarfRegNum<[529]>;
+def R530 : AMDILReg<530, "r530">, DwarfRegNum<[530]>;
+def R531 : AMDILReg<531, "r531">, DwarfRegNum<[531]>;
+def R532 : AMDILReg<532, "r532">, DwarfRegNum<[532]>;
+def R533 : AMDILReg<533, "r533">, DwarfRegNum<[533]>;
+def R534 : AMDILReg<534, "r534">, DwarfRegNum<[534]>;
+def R535 : AMDILReg<535, "r535">, DwarfRegNum<[535]>;
+def R536 : AMDILReg<536, "r536">, DwarfRegNum<[536]>;
+def R537 : AMDILReg<537, "r537">, DwarfRegNum<[537]>;
+def R538 : AMDILReg<538, "r538">, DwarfRegNum<[538]>;
+def R539 : AMDILReg<539, "r539">, DwarfRegNum<[539]>;
+def R540 : AMDILReg<540, "r540">, DwarfRegNum<[540]>;
+def R541 : AMDILReg<541, "r541">, DwarfRegNum<[541]>;
+def R542 : AMDILReg<542, "r542">, DwarfRegNum<[542]>;
+def R543 : AMDILReg<543, "r543">, DwarfRegNum<[543]>;
+def R544 : AMDILReg<544, "r544">, DwarfRegNum<[544]>;
+def R545 : AMDILReg<545, "r545">, DwarfRegNum<[545]>;
+def R546 : AMDILReg<546, "r546">, DwarfRegNum<[546]>;
+def R547 : AMDILReg<547, "r547">, DwarfRegNum<[547]>;
+def R548 : AMDILReg<548, "r548">, DwarfRegNum<[548]>;
+def R549 : AMDILReg<549, "r549">, DwarfRegNum<[549]>;
+def R550 : AMDILReg<550, "r550">, DwarfRegNum<[550]>;
+def R551 : AMDILReg<551, "r551">, DwarfRegNum<[551]>;
+def R552 : AMDILReg<552, "r552">, DwarfRegNum<[552]>;
+def R553 : AMDILReg<553, "r553">, DwarfRegNum<[553]>;
+def R554 : AMDILReg<554, "r554">, DwarfRegNum<[554]>;
+def R555 : AMDILReg<555, "r555">, DwarfRegNum<[555]>;
+def R556 : AMDILReg<556, "r556">, DwarfRegNum<[556]>;
+def R557 : AMDILReg<557, "r557">, DwarfRegNum<[557]>;
+def R558 : AMDILReg<558, "r558">, DwarfRegNum<[558]>;
+def R559 : AMDILReg<559, "r559">, DwarfRegNum<[559]>;
+def R560 : AMDILReg<560, "r560">, DwarfRegNum<[560]>;
+def R561 : AMDILReg<561, "r561">, DwarfRegNum<[561]>;
+def R562 : AMDILReg<562, "r562">, DwarfRegNum<[562]>;
+def R563 : AMDILReg<563, "r563">, DwarfRegNum<[563]>;
+def R564 : AMDILReg<564, "r564">, DwarfRegNum<[564]>;
+def R565 : AMDILReg<565, "r565">, DwarfRegNum<[565]>;
+def R566 : AMDILReg<566, "r566">, DwarfRegNum<[566]>;
+def R567 : AMDILReg<567, "r567">, DwarfRegNum<[567]>;
+def R568 : AMDILReg<568, "r568">, DwarfRegNum<[568]>;
+def R569 : AMDILReg<569, "r569">, DwarfRegNum<[569]>;
+def R570 : AMDILReg<570, "r570">, DwarfRegNum<[570]>;
+def R571 : AMDILReg<571, "r571">, DwarfRegNum<[571]>;
+def R572 : AMDILReg<572, "r572">, DwarfRegNum<[572]>;
+def R573 : AMDILReg<573, "r573">, DwarfRegNum<[573]>;
+def R574 : AMDILReg<574, "r574">, DwarfRegNum<[574]>;
+def R575 : AMDILReg<575, "r575">, DwarfRegNum<[575]>;
+def R576 : AMDILReg<576, "r576">, DwarfRegNum<[576]>;
+def R577 : AMDILReg<577, "r577">, DwarfRegNum<[577]>;
+def R578 : AMDILReg<578, "r578">, DwarfRegNum<[578]>;
+def R579 : AMDILReg<579, "r579">, DwarfRegNum<[579]>;
+def R580 : AMDILReg<580, "r580">, DwarfRegNum<[580]>;
+def R581 : AMDILReg<581, "r581">, DwarfRegNum<[581]>;
+def R582 : AMDILReg<582, "r582">, DwarfRegNum<[582]>;
+def R583 : AMDILReg<583, "r583">, DwarfRegNum<[583]>;
+def R584 : AMDILReg<584, "r584">, DwarfRegNum<[584]>;
+def R585 : AMDILReg<585, "r585">, DwarfRegNum<[585]>;
+def R586 : AMDILReg<586, "r586">, DwarfRegNum<[586]>;
+def R587 : AMDILReg<587, "r587">, DwarfRegNum<[587]>;
+def R588 : AMDILReg<588, "r588">, DwarfRegNum<[588]>;
+def R589 : AMDILReg<589, "r589">, DwarfRegNum<[589]>;
+def R590 : AMDILReg<590, "r590">, DwarfRegNum<[590]>;
+def R591 : AMDILReg<591, "r591">, DwarfRegNum<[591]>;
+def R592 : AMDILReg<592, "r592">, DwarfRegNum<[592]>;
+def R593 : AMDILReg<593, "r593">, DwarfRegNum<[593]>;
+def R594 : AMDILReg<594, "r594">, DwarfRegNum<[594]>;
+def R595 : AMDILReg<595, "r595">, DwarfRegNum<[595]>;
+def R596 : AMDILReg<596, "r596">, DwarfRegNum<[596]>;
+def R597 : AMDILReg<597, "r597">, DwarfRegNum<[597]>;
+def R598 : AMDILReg<598, "r598">, DwarfRegNum<[598]>;
+def R599 : AMDILReg<599, "r599">, DwarfRegNum<[599]>;
+def R600 : AMDILReg<600, "r600">, DwarfRegNum<[600]>;
+def R601 : AMDILReg<601, "r601">, DwarfRegNum<[601]>;
+def R602 : AMDILReg<602, "r602">, DwarfRegNum<[602]>;
+def R603 : AMDILReg<603, "r603">, DwarfRegNum<[603]>;
+def R604 : AMDILReg<604, "r604">, DwarfRegNum<[604]>;
+def R605 : AMDILReg<605, "r605">, DwarfRegNum<[605]>;
+def R606 : AMDILReg<606, "r606">, DwarfRegNum<[606]>;
+def R607 : AMDILReg<607, "r607">, DwarfRegNum<[607]>;
+def R608 : AMDILReg<608, "r608">, DwarfRegNum<[608]>;
+def R609 : AMDILReg<609, "r609">, DwarfRegNum<[609]>;
+def R610 : AMDILReg<610, "r610">, DwarfRegNum<[610]>;
+def R611 : AMDILReg<611, "r611">, DwarfRegNum<[611]>;
+def R612 : AMDILReg<612, "r612">, DwarfRegNum<[612]>;
+def R613 : AMDILReg<613, "r613">, DwarfRegNum<[613]>;
+def R614 : AMDILReg<614, "r614">, DwarfRegNum<[614]>;
+def R615 : AMDILReg<615, "r615">, DwarfRegNum<[615]>;
+def R616 : AMDILReg<616, "r616">, DwarfRegNum<[616]>;
+def R617 : AMDILReg<617, "r617">, DwarfRegNum<[617]>;
+def R618 : AMDILReg<618, "r618">, DwarfRegNum<[618]>;
+def R619 : AMDILReg<619, "r619">, DwarfRegNum<[619]>;
+def R620 : AMDILReg<620, "r620">, DwarfRegNum<[620]>;
+def R621 : AMDILReg<621, "r621">, DwarfRegNum<[621]>;
+def R622 : AMDILReg<622, "r622">, DwarfRegNum<[622]>;
+def R623 : AMDILReg<623, "r623">, DwarfRegNum<[623]>;
+def R624 : AMDILReg<624, "r624">, DwarfRegNum<[624]>;
+def R625 : AMDILReg<625, "r625">, DwarfRegNum<[625]>;
+def R626 : AMDILReg<626, "r626">, DwarfRegNum<[626]>;
+def R627 : AMDILReg<627, "r627">, DwarfRegNum<[627]>;
+def R628 : AMDILReg<628, "r628">, DwarfRegNum<[628]>;
+def R629 : AMDILReg<629, "r629">, DwarfRegNum<[629]>;
+def R630 : AMDILReg<630, "r630">, DwarfRegNum<[630]>;
+def R631 : AMDILReg<631, "r631">, DwarfRegNum<[631]>;
+def R632 : AMDILReg<632, "r632">, DwarfRegNum<[632]>;
+def R633 : AMDILReg<633, "r633">, DwarfRegNum<[633]>;
+def R634 : AMDILReg<634, "r634">, DwarfRegNum<[634]>;
+def R635 : AMDILReg<635, "r635">, DwarfRegNum<[635]>;
+def R636 : AMDILReg<636, "r636">, DwarfRegNum<[636]>;
+def R637 : AMDILReg<637, "r637">, DwarfRegNum<[637]>;
+def R638 : AMDILReg<638, "r638">, DwarfRegNum<[638]>;
+def R639 : AMDILReg<639, "r639">, DwarfRegNum<[639]>;
+def R640 : AMDILReg<640, "r640">, DwarfRegNum<[640]>;
+def R641 : AMDILReg<641, "r641">, DwarfRegNum<[641]>;
+def R642 : AMDILReg<642, "r642">, DwarfRegNum<[642]>;
+def R643 : AMDILReg<643, "r643">, DwarfRegNum<[643]>;
+def R644 : AMDILReg<644, "r644">, DwarfRegNum<[644]>;
+def R645 : AMDILReg<645, "r645">, DwarfRegNum<[645]>;
+def R646 : AMDILReg<646, "r646">, DwarfRegNum<[646]>;
+def R647 : AMDILReg<647, "r647">, DwarfRegNum<[647]>;
+def R648 : AMDILReg<648, "r648">, DwarfRegNum<[648]>;
+def R649 : AMDILReg<649, "r649">, DwarfRegNum<[649]>;
+def R650 : AMDILReg<650, "r650">, DwarfRegNum<[650]>;
+def R651 : AMDILReg<651, "r651">, DwarfRegNum<[651]>;
+def R652 : AMDILReg<652, "r652">, DwarfRegNum<[652]>;
+def R653 : AMDILReg<653, "r653">, DwarfRegNum<[653]>;
+def R654 : AMDILReg<654, "r654">, DwarfRegNum<[654]>;
+def R655 : AMDILReg<655, "r655">, DwarfRegNum<[655]>;
+def R656 : AMDILReg<656, "r656">, DwarfRegNum<[656]>;
+def R657 : AMDILReg<657, "r657">, DwarfRegNum<[657]>;
+def R658 : AMDILReg<658, "r658">, DwarfRegNum<[658]>;
+def R659 : AMDILReg<659, "r659">, DwarfRegNum<[659]>;
+def R660 : AMDILReg<660, "r660">, DwarfRegNum<[660]>;
+def R661 : AMDILReg<661, "r661">, DwarfRegNum<[661]>;
+def R662 : AMDILReg<662, "r662">, DwarfRegNum<[662]>;
+def R663 : AMDILReg<663, "r663">, DwarfRegNum<[663]>;
+def R664 : AMDILReg<664, "r664">, DwarfRegNum<[664]>;
+def R665 : AMDILReg<665, "r665">, DwarfRegNum<[665]>;
+def R666 : AMDILReg<666, "r666">, DwarfRegNum<[666]>;
+def R667 : AMDILReg<667, "r667">, DwarfRegNum<[667]>;
+def R668 : AMDILReg<668, "r668">, DwarfRegNum<[668]>;
+def R669 : AMDILReg<669, "r669">, DwarfRegNum<[669]>;
+def R670 : AMDILReg<670, "r670">, DwarfRegNum<[670]>;
+def R671 : AMDILReg<671, "r671">, DwarfRegNum<[671]>;
+def R672 : AMDILReg<672, "r672">, DwarfRegNum<[672]>;
+def R673 : AMDILReg<673, "r673">, DwarfRegNum<[673]>;
+def R674 : AMDILReg<674, "r674">, DwarfRegNum<[674]>;
+def R675 : AMDILReg<675, "r675">, DwarfRegNum<[675]>;
+def R676 : AMDILReg<676, "r676">, DwarfRegNum<[676]>;
+def R677 : AMDILReg<677, "r677">, DwarfRegNum<[677]>;
+def R678 : AMDILReg<678, "r678">, DwarfRegNum<[678]>;
+def R679 : AMDILReg<679, "r679">, DwarfRegNum<[679]>;
+def R680 : AMDILReg<680, "r680">, DwarfRegNum<[680]>;
+def R681 : AMDILReg<681, "r681">, DwarfRegNum<[681]>;
+def R682 : AMDILReg<682, "r682">, DwarfRegNum<[682]>;
+def R683 : AMDILReg<683, "r683">, DwarfRegNum<[683]>;
+def R684 : AMDILReg<684, "r684">, DwarfRegNum<[684]>;
+def R685 : AMDILReg<685, "r685">, DwarfRegNum<[685]>;
+def R686 : AMDILReg<686, "r686">, DwarfRegNum<[686]>;
+def R687 : AMDILReg<687, "r687">, DwarfRegNum<[687]>;
+def R688 : AMDILReg<688, "r688">, DwarfRegNum<[688]>;
+def R689 : AMDILReg<689, "r689">, DwarfRegNum<[689]>;
+def R690 : AMDILReg<690, "r690">, DwarfRegNum<[690]>;
+def R691 : AMDILReg<691, "r691">, DwarfRegNum<[691]>;
+def R692 : AMDILReg<692, "r692">, DwarfRegNum<[692]>;
+def R693 : AMDILReg<693, "r693">, DwarfRegNum<[693]>;
+def R694 : AMDILReg<694, "r694">, DwarfRegNum<[694]>;
+def R695 : AMDILReg<695, "r695">, DwarfRegNum<[695]>;
+def R696 : AMDILReg<696, "r696">, DwarfRegNum<[696]>;
+def R697 : AMDILReg<697, "r697">, DwarfRegNum<[697]>;
+def R698 : AMDILReg<698, "r698">, DwarfRegNum<[698]>;
+def R699 : AMDILReg<699, "r699">, DwarfRegNum<[699]>;
+def R700 : AMDILReg<700, "r700">, DwarfRegNum<[700]>;
+def R701 : AMDILReg<701, "r701">, DwarfRegNum<[701]>;
+def R702 : AMDILReg<702, "r702">, DwarfRegNum<[702]>;
+def R703 : AMDILReg<703, "r703">, DwarfRegNum<[703]>;
+def R704 : AMDILReg<704, "r704">, DwarfRegNum<[704]>;
+def R705 : AMDILReg<705, "r705">, DwarfRegNum<[705]>;
+def R706 : AMDILReg<706, "r706">, DwarfRegNum<[706]>;
+def R707 : AMDILReg<707, "r707">, DwarfRegNum<[707]>;
+def R708 : AMDILReg<708, "r708">, DwarfRegNum<[708]>;
+def R709 : AMDILReg<709, "r709">, DwarfRegNum<[709]>;
+def R710 : AMDILReg<710, "r710">, DwarfRegNum<[710]>;
+def R711 : AMDILReg<711, "r711">, DwarfRegNum<[711]>;
+def R712 : AMDILReg<712, "r712">, DwarfRegNum<[712]>;
+def R713 : AMDILReg<713, "r713">, DwarfRegNum<[713]>;
+def R714 : AMDILReg<714, "r714">, DwarfRegNum<[714]>;
+def R715 : AMDILReg<715, "r715">, DwarfRegNum<[715]>;
+def R716 : AMDILReg<716, "r716">, DwarfRegNum<[716]>;
+def R717 : AMDILReg<717, "r717">, DwarfRegNum<[717]>;
+def R718 : AMDILReg<718, "r718">, DwarfRegNum<[718]>;
+def R719 : AMDILReg<719, "r719">, DwarfRegNum<[719]>;
+def R720 : AMDILReg<720, "r720">, DwarfRegNum<[720]>;
+def R721 : AMDILReg<721, "r721">, DwarfRegNum<[721]>;
+def R722 : AMDILReg<722, "r722">, DwarfRegNum<[722]>;
+def R723 : AMDILReg<723, "r723">, DwarfRegNum<[723]>;
+def R724 : AMDILReg<724, "r724">, DwarfRegNum<[724]>;
+def R725 : AMDILReg<725, "r725">, DwarfRegNum<[725]>;
+def R726 : AMDILReg<726, "r726">, DwarfRegNum<[726]>;
+def R727 : AMDILReg<727, "r727">, DwarfRegNum<[727]>;
+def R728 : AMDILReg<728, "r728">, DwarfRegNum<[728]>;
+def R729 : AMDILReg<729, "r729">, DwarfRegNum<[729]>;
+def R730 : AMDILReg<730, "r730">, DwarfRegNum<[730]>;
+def R731 : AMDILReg<731, "r731">, DwarfRegNum<[731]>;
+def R732 : AMDILReg<732, "r732">, DwarfRegNum<[732]>;
+def R733 : AMDILReg<733, "r733">, DwarfRegNum<[733]>;
+def R734 : AMDILReg<734, "r734">, DwarfRegNum<[734]>;
+def R735 : AMDILReg<735, "r735">, DwarfRegNum<[735]>;
+def R736 : AMDILReg<736, "r736">, DwarfRegNum<[736]>;
+def R737 : AMDILReg<737, "r737">, DwarfRegNum<[737]>;
+def R738 : AMDILReg<738, "r738">, DwarfRegNum<[738]>;
+def R739 : AMDILReg<739, "r739">, DwarfRegNum<[739]>;
+def R740 : AMDILReg<740, "r740">, DwarfRegNum<[740]>;
+def R741 : AMDILReg<741, "r741">, DwarfRegNum<[741]>;
+def R742 : AMDILReg<742, "r742">, DwarfRegNum<[742]>;
+def R743 : AMDILReg<743, "r743">, DwarfRegNum<[743]>;
+def R744 : AMDILReg<744, "r744">, DwarfRegNum<[744]>;
+def R745 : AMDILReg<745, "r745">, DwarfRegNum<[745]>;
+def R746 : AMDILReg<746, "r746">, DwarfRegNum<[746]>;
+def R747 : AMDILReg<747, "r747">, DwarfRegNum<[747]>;
+def R748 : AMDILReg<748, "r748">, DwarfRegNum<[748]>;
+def R749 : AMDILReg<749, "r749">, DwarfRegNum<[749]>;
+def R750 : AMDILReg<750, "r750">, DwarfRegNum<[750]>;
+def R751 : AMDILReg<751, "r751">, DwarfRegNum<[751]>;
+def R752 : AMDILReg<752, "r752">, DwarfRegNum<[752]>;
+def R753 : AMDILReg<753, "r753">, DwarfRegNum<[753]>;
+def R754 : AMDILReg<754, "r754">, DwarfRegNum<[754]>;
+def R755 : AMDILReg<755, "r755">, DwarfRegNum<[755]>;
+def R756 : AMDILReg<756, "r756">, DwarfRegNum<[756]>;
+def R757 : AMDILReg<757, "r757">, DwarfRegNum<[757]>;
+def R758 : AMDILReg<758, "r758">, DwarfRegNum<[758]>;
+def R759 : AMDILReg<759, "r759">, DwarfRegNum<[759]>;
+def R760 : AMDILReg<760, "r760">, DwarfRegNum<[760]>;
+def R761 : AMDILReg<761, "r761">, DwarfRegNum<[761]>;
+def R762 : AMDILReg<762, "r762">, DwarfRegNum<[762]>;
+def R763 : AMDILReg<763, "r763">, DwarfRegNum<[763]>;
+def R764 : AMDILReg<764, "r764">, DwarfRegNum<[764]>;
+def R765 : AMDILReg<765, "r765">, DwarfRegNum<[765]>;
+def R766 : AMDILReg<766, "r766">, DwarfRegNum<[766]>;
+def R767 : AMDILReg<767, "r767">, DwarfRegNum<[767]>;
+
+// All registers between 1000 and 1024 are reserved and cannot be used
+// unless they are uncommented in this section.
+// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local IDs
+// r1020 is used to hold the frame index for local arrays
+// r1019 is used to hold the dynamic stack allocation pointer
+// r1018 is used as a temporary register for handwritten code
+// r1017 is used as a temporary register for handwritten code
+// r1016 is used as a temporary register for load/store code
+// r1015 is used as a temporary register for data segment offset
+// r1014 is used as a temporary register for store code
+// r1013 is used as the section data pointer register
+// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
+// r1009 is used as the frame pointer register
+// r999 is used as the mem register.
+// r998 is used as the return address register.
+//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>;
+//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>;
+//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>;
+//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>;
+//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>;
+//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>;
+def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>;
+def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>;
+def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>;
+def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>;
+def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>;
+def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>;
+def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>;
+def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>;
+def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>;
+def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>;
+def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>;
+def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>;
+def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>;
+def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>;
+def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>;
+def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>;
+def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>;
+def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>;
+def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>;
+def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>;
+def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>;
+def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>;
+def GPRI8 : RegisterClass<"AMDIL", [i8], 8,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2I8 : RegisterClass<"AMDIL", [v2i8], 16,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV4I8 : RegisterClass<"AMDIL", [v4i8], 32,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRI16 : RegisterClass<"AMDIL", [i16], 16,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2I16 : RegisterClass<"AMDIL", [v2i16], 32,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV4I16 : RegisterClass<"AMDIL", [v4i16], 64,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRI32 : RegisterClass<"AMDIL", [i32], 32,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRF32 : RegisterClass<"AMDIL", [f32], 32,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+// For 64 bit integer emulation, the lower 32 bits are in x
+// and the upper 32 bits are in y
+def GPRI64 : RegisterClass<"AMDIL", [i64], 64,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRF64 : RegisterClass<"AMDIL", [f64], 64,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV4F32 : RegisterClass<"AMDIL", [v4f32], 128,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV4I32 : RegisterClass<"AMDIL", [v4i32], 128,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2I32 : RegisterClass<"AMDIL", [v2i32], 64,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2F32 : RegisterClass<"AMDIL", [v2f32], 64,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2I64 : RegisterClass<"AMDIL", [v2i64], 128,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+def GPRV2F64 : RegisterClass<"AMDIL", [v2f64], 128,
+  (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)>
+{
+        let AltOrders = [(add (sequence "R%u", 1, 767))];
+        let AltOrderSelect = [{
+          return 1;
+        }];
+    }
+
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.cpp b/src/gallium/drivers/radeon/AMDILSIDevice.cpp
new file mode 100644 (file)
index 0000000..ce56098
--- /dev/null
@@ -0,0 +1,49 @@
+//===-- AMDILSIDevice.cpp - SI device implementation ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSubtarget.h"
+
+using namespace llvm;
+
+AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST)
+{
+}
+AMDILSIDevice::~AMDILSIDevice()
+{
+}
+
+size_t
+AMDILSIDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILSIDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD7XXX;
+}
+
+std::string
+AMDILSIDevice::getDataLayout() const
+{
+  return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+    "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+    "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+    "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+    "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+    "-n8:16:32:64");
+}
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.h b/src/gallium/drivers/radeon/AMDILSIDevice.h
new file mode 100644 (file)
index 0000000..69f35a0
--- /dev/null
@@ -0,0 +1,45 @@
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===---------------------------------------------------------------------===//
+// This file defines the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef _AMDILSIDEVICE_H_
+#define _AMDILSIDEVICE_H_
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+// The AMDILSIDevice is the base class for all Southern Islands series of
+// cards. It is very similar to the AMDILEvergreenDevice, with the major
+// exceptions being differences in wavefront size and hardware capabilities.
+// SI devices all use 64-wide wavefronts and also add support for signed
+// 24-bit integer operations.
+
+  class AMDILSIDevice : public AMDILEvergreenDevice {
+    public:
+      AMDILSIDevice(AMDILSubtarget*);
+      virtual ~AMDILSIDevice();
+      virtual size_t getMaxLDSSize() const;
+      virtual uint32_t getGeneration() const;
+      virtual std::string getDataLayout() const;
+    protected:
+  }; // AMDILSIDevice
+
+} // namespace llvm
+#endif // _AMDILSIDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.cpp b/src/gallium/drivers/radeon/AMDILSubtarget.cpp
new file mode 100644 (file)
index 0000000..898833d
--- /dev/null
@@ -0,0 +1,179 @@
+//===- AMDILSubtarget.cpp - AMDIL Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file implements the AMD IL specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILSubtarget.h"
+#include "AMDIL.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILKernelManager.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/SubtargetFeature.h"
+
+using namespace llvm;
+
+#define GET_SUBTARGETINFO_ENUM
+#define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
+#include "AMDILGenSubtargetInfo.inc"
+
+AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS )
+{
+  memset(CapsOverride, 0, sizeof(*CapsOverride)
+      * AMDILDeviceInfo::MaxNumberCapabilities);
+  // Default card
+  std::string GPU = "rv770";
+  GPU = CPU;
+  mIs64bit = false;
+  mVersion = 0;
+  SmallVector<StringRef, DEFAULT_VEC_SLOTS> Features;
+  SplitString(FS, Features, ",");
+  mDefaultSize[0] = 64;
+  mDefaultSize[1] = 1;
+  mDefaultSize[2] = 1;
+  std::string newFeatures = "";
+#if defined(_DEBUG) || defined(DEBUG)
+  bool useTest = false;
+#endif
+  for (size_t x = 0; x < Features.size(); ++x) {
+    if (Features[x].startswith("+mwgs")) {
+      SmallVector<StringRef, DEFAULT_VEC_SLOTS> sizes;
+      SplitString(Features[x], sizes, "-");
+      size_t mDim = ::atoi(sizes[1].data());
+      if (mDim > 3) {
+        mDim = 3;
+      }
+      for (size_t y = 0; y < mDim; ++y) {
+        mDefaultSize[y] = ::atoi(sizes[y+2].data());
+      }
+#if defined(_DEBUG) || defined(DEBUG)
+    } else if (!Features[x].compare("test")) {
+      useTest = true;
+#endif
+    } else if (Features[x].startswith("+cal")) {
+      SmallVector<StringRef, DEFAULT_VEC_SLOTS> version;
+      SplitString(Features[x], version, "=");
+      mVersion = ::atoi(version[1].data());
+    } else {
+      GPU = CPU;
+      if (x > 0) newFeatures += ',';
+      newFeatures += Features[x];
+    }
+  }
+  // If we don't have a version then set it to
+  // -1 which enables everything. This is for
+  // offline devices.
+  if (!mVersion) {
+    mVersion = (uint32_t)-1;
+  }
+  for (int x = 0; x < 3; ++x) {
+    if (!mDefaultSize[x]) {
+      mDefaultSize[x] = 1;
+    }
+  }
+#if defined(_DEBUG) || defined(DEBUG)
+  if (useTest) {
+    GPU = "kauai";
+  }
+#endif
+  ParseSubtargetFeatures(GPU, newFeatures);
+#if defined(_DEBUG) || defined(DEBUG)
+  if (useTest) {
+    GPU = "test";
+  }
+#endif
+  mDevName = GPU;
+  mDevice = getDeviceFromName(mDevName, this, mIs64bit);
+}
+AMDILSubtarget::~AMDILSubtarget()
+{
+  delete mDevice;
+}
+bool
+AMDILSubtarget::isOverride(AMDILDeviceInfo::Caps caps) const
+{
+  assert(caps < AMDILDeviceInfo::MaxNumberCapabilities &&
+      "Caps index is out of bounds!");
+  return CapsOverride[caps];
+}
+bool
+AMDILSubtarget::is64bit() const 
+{
+  return mIs64bit;
+}
+bool
+AMDILSubtarget::isTargetELF() const
+{
+  return false;
+}
+size_t
+AMDILSubtarget::getDefaultSize(uint32_t dim) const
+{
+  if (dim > 3) {
+    return 1;
+  } else {
+    return mDefaultSize[dim];
+  }
+}
+uint32_t
+AMDILSubtarget::calVersion() const
+{
+  return mVersion;
+}
+
+AMDILGlobalManager*
+AMDILSubtarget::getGlobalManager() const
+{
+  return mGM;
+}
+void
+AMDILSubtarget::setGlobalManager(AMDILGlobalManager *gm) const
+{
+  mGM = gm;
+}
+
+AMDILKernelManager*
+AMDILSubtarget::getKernelManager() const
+{
+  return mKM;
+}
+void
+AMDILSubtarget::setKernelManager(AMDILKernelManager *km) const
+{
+  mKM = km;
+}
+std::string
+AMDILSubtarget::getDataLayout() const
+{
+    if (!mDevice) {
+        return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
+                "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+                "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+                "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+                "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
+    }
+    return mDevice->getDataLayout();
+}
+
+std::string
+AMDILSubtarget::getDeviceName() const
+{
+  return mDevName;
+}
+const AMDILDevice *
+AMDILSubtarget::device() const
+{
+  return mDevice;
+}
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.h b/src/gallium/drivers/radeon/AMDILSubtarget.h
new file mode 100644 (file)
index 0000000..a4b0e34
--- /dev/null
@@ -0,0 +1,75 @@
+//=====-- AMDILSubtarget.h - Define Subtarget for the AMDIL ----*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file declares the AMDIL specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _AMDILSUBTARGET_H_
+#define _AMDILSUBTARGET_H_
+
+#include "AMDILDevice.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#include <cstdlib>
+#include <string>
+
+#define GET_SUBTARGETINFO_HEADER
+#include "AMDILGenSubtargetInfo.inc"
+
+#define MAX_CB_SIZE (1 << 16)
+namespace llvm {
+  class Module;
+  class AMDILKernelManager;
+  class AMDILGlobalManager;
+  class AMDILDevice;
+  class AMDILSubtarget : public AMDILGenSubtargetInfo {
+    private:
+      bool CapsOverride[AMDILDeviceInfo::MaxNumberCapabilities];
+      mutable AMDILGlobalManager *mGM;
+      mutable AMDILKernelManager *mKM;
+      const AMDILDevice *mDevice;
+      size_t mDefaultSize[3];
+      size_t mMinimumSize[3];
+      std::string mDevName;
+      uint32_t mVersion;
+      bool mIs64bit;
+      bool mIs32on64bit;
+    public:
+      AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS);
+      virtual ~AMDILSubtarget();
+      bool isOverride(AMDILDeviceInfo::Caps) const;
+      bool is64bit() const;
+
+      // Helper functions to simplify if statements
+      bool isTargetELF() const;
+      AMDILGlobalManager* getGlobalManager() const;
+      void setGlobalManager(AMDILGlobalManager *gm) const;
+      AMDILKernelManager* getKernelManager() const;
+      void setKernelManager(AMDILKernelManager *gm) const;
+      const AMDILDevice* device() const;
+      std::string getDataLayout() const;
+      std::string getDeviceName() const;
+      virtual size_t getDefaultSize(uint32_t dim) const;
+      // Return the version of CAL that the backend should target.
+      uint32_t calVersion() const;
+      // ParseSubtargetFeatures - Parses the feature string, setting the
+      // specified subtarget options. The definition of this function is
+      // auto-generated by tblgen.
+      void
+        ParseSubtargetFeatures(
+            llvm::StringRef CPU,
+            llvm::StringRef FS);
+
+  };
+
+} // end namespace llvm
+
+#endif // _AMDILSUBTARGET_H_
diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
new file mode 100644 (file)
index 0000000..6146dde
--- /dev/null
@@ -0,0 +1,195 @@
+//===-- AMDILTargetMachine.cpp - Define TargetMachine for AMDIL -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILTargetMachine.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDILDevices.h"
+#include "AMDILFrameLowering.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeAMDILTarget() {
+  // Register the target
+  RegisterTargetMachine<AMDILTargetMachine> X(TheAMDILTarget);
+  RegisterTargetMachine<AMDGPUTargetMachine> Y(TheAMDGPUTarget);
+}
+
+/// AMDILTargetMachine ctor -
+///
+AMDILTargetMachine::AMDILTargetMachine(const Target &T,
+    StringRef TT, StringRef CPU, StringRef FS,
+    TargetOptions Options,
+    Reloc::Model RM, CodeModel::Model CM,
+    CodeGenOpt::Level OL
+)
+:
+  LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+  Subtarget(TT, CPU, FS),
+  DataLayout(Subtarget.getDataLayout()),
+  FrameLowering(TargetFrameLowering::StackGrowsUp,
+      Subtarget.device()->getStackAlignment(), 0),
+  InstrInfo(*this), //JITInfo(*this),
+  TLInfo(*this), 
+  IntrinsicInfo(this),
+  ELFWriterInfo(false, true)
+{
+  setAsmVerbosityDefault(true);
+  setMCUseLoc(false);
+}
+
+AMDILTargetLowering*
+AMDILTargetMachine::getTargetLowering() const
+{
+  return const_cast<AMDILTargetLowering*>(&TLInfo);
+}
+
+const AMDILInstrInfo*
+AMDILTargetMachine::getInstrInfo() const
+{
+  return &InstrInfo;
+}
+const AMDILFrameLowering*
+AMDILTargetMachine::getFrameLowering() const
+{
+  return &FrameLowering;
+}
+
+const AMDILSubtarget*
+AMDILTargetMachine::getSubtargetImpl() const
+{
+  return &Subtarget;
+}
+
+const AMDILRegisterInfo*
+AMDILTargetMachine::getRegisterInfo() const
+{
+  return &InstrInfo.getRegisterInfo();
+}
+
+const TargetData*
+AMDILTargetMachine::getTargetData() const
+{
+  return &DataLayout;
+}
+
+const AMDILELFWriterInfo*
+AMDILTargetMachine::getELFWriterInfo() const
+{
+  return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+}
+
+const AMDILIntrinsicInfo*
+AMDILTargetMachine::getIntrinsicInfo() const
+{
+  return &IntrinsicInfo;
+}
+
+void
+AMDILTargetMachine::dump(llvm::raw_ostream &O)
+{
+  if (!mDebugMode) {
+    return;
+  }
+  O << ";AMDIL Target Machine State Dump: \n";
+}
+
+void
+AMDILTargetMachine::setDebug(bool debugMode)
+{
+  mDebugMode = debugMode;
+}
+
+bool
+AMDILTargetMachine::getDebug() const
+{
+  return mDebugMode;
+}
+
+namespace {
+class AMDILPassConfig : public TargetPassConfig {
+
+public:
+  AMDILPassConfig(AMDILTargetMachine *TM, PassManagerBase &PM)
+    : TargetPassConfig(TM, PM) {}
+
+  AMDILTargetMachine &getAMDILTargetMachine() const {
+    return getTM<AMDILTargetMachine>();
+  }
+
+  virtual bool addPreISel();
+  virtual bool addInstSelector();
+  virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+  virtual bool addPreEmitPass();
+};
+} // End of anonymous namespace
+
+TargetPassConfig *AMDILTargetMachine::createPassConfig(PassManagerBase &PM) {
+  return new AMDILPassConfig(this, PM);
+}
+
+bool AMDILPassConfig::addPreISel()
+{
+  return false;
+}
+
+bool AMDILPassConfig::addInstSelector()
+{
+  PM.add(createAMDILBarrierDetect(*TM));
+  PM.add(createAMDILPrintfConvert(*TM));
+  PM.add(createAMDILInlinePass(*TM));
+  PM.add(createAMDILPeepholeOpt(*TM));
+  PM.add(createAMDILISelDag(getAMDILTargetMachine()));
+  return false;
+}
+
+bool AMDILPassConfig::addPreRegAlloc()
+{
+  // At -O0 (debug builds), reduce code motion by falling back to the less
+  // aggressive source-order pre-RA scheduler.
+  if (TM->getOptLevel() == CodeGenOpt::None) {
+    llvm::RegisterScheduler::setDefault(&llvm::createSourceListDAGScheduler);
+  }
+
+  PM.add(createAMDILMachinePeephole(*TM));
+  PM.add(createAMDILPointerManager(*TM));
+  return false;
+}
+
+bool AMDILPassConfig::addPostRegAlloc() {
+  return false;  // -print-machineinstr should print after this.
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted.  This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool AMDILPassConfig::addPreEmitPass()
+{
+  PM.add(createAMDILCFGPreparationPass(*TM));
+  PM.add(createAMDILCFGStructurizerPass(*TM));
+  PM.add(createAMDILLiteralManager(*TM));
+  PM.add(createAMDILIOExpansion(*TM));
+  return true;
+}
+
diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.h b/src/gallium/drivers/radeon/AMDILTargetMachine.h
new file mode 100644 (file)
index 0000000..1c90e1c
--- /dev/null
@@ -0,0 +1,75 @@
+//===-- AMDILTargetMachine.h - Define TargetMachine for AMDIL ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file declares the AMDIL specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILTARGETMACHINE_H_
+#define AMDILTARGETMACHINE_H_
+
+#include "AMDILELFWriterInfo.h"
+#include "AMDILFrameLowering.h"
+#include "AMDILISelLowering.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILIntrinsicInfo.h"
+#include "AMDILSubtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm
+{
+    class raw_ostream;
+
+    class AMDILTargetMachine : public LLVMTargetMachine
+    {
+        private:
+        AMDILSubtarget Subtarget;
+        const TargetData DataLayout;       // Calculates type size & alignment
+        AMDILFrameLowering FrameLowering;
+        AMDILInstrInfo InstrInfo;
+        AMDILTargetLowering TLInfo;
+        AMDILIntrinsicInfo IntrinsicInfo;
+        AMDILELFWriterInfo ELFWriterInfo;
+        bool mDebugMode;
+        CodeGenOpt::Level mOptLevel;
+
+        protected:
+
+        public:
+        AMDILTargetMachine(const Target &T,
+             StringRef TT, StringRef CPU, StringRef FS,
+             TargetOptions Options,
+             Reloc::Model RM, CodeModel::Model CM,
+             CodeGenOpt::Level OL);
+
+        // Get Target/Subtarget specific information
+        virtual AMDILTargetLowering* getTargetLowering() const;
+        virtual const AMDILInstrInfo* getInstrInfo() const;
+        virtual const AMDILFrameLowering* getFrameLowering() const;
+        virtual const AMDILSubtarget* getSubtargetImpl() const;
+        virtual const AMDILRegisterInfo* getRegisterInfo() const;
+        virtual const TargetData* getTargetData() const;
+        virtual const AMDILIntrinsicInfo *getIntrinsicInfo() const;
+        virtual const AMDILELFWriterInfo *getELFWriterInfo() const;
+
+        // Pass Pipeline Configuration
+        virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+        void dump(llvm::raw_ostream &O);
+        void setDebug(bool debugMode);
+        bool getDebug() const;
+        CodeGenOpt::Level getOptLevel() const { return mOptLevel; }
+
+
+    }; // AMDILTargetMachine
+
+} // end namespace llvm
+
+#endif // AMDILTARGETMACHINE_H_
diff --git a/src/gallium/drivers/radeon/AMDILTokenDesc.td b/src/gallium/drivers/radeon/AMDILTokenDesc.td
new file mode 100644 (file)
index 0000000..b81f593
--- /dev/null
@@ -0,0 +1,120 @@
+//===-- AMDILTokenDesc.td - AMDIL IL token descriptions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+
+include "AMDILEnumeratedTypes.td"
+
+// Each token is 32 bits as specified in section 2.1 of the IL spec
+class ILToken <bits<32> n> {
+    field bits<32> _bits = n;
+}
+
+// Section 2.2.1 - IL Language Token
+class ILLang<bits<8> client_type> : ILToken<0> {
+    let _bits{0-7} = client_type;
+}
+
+// Section 2.2.2 - IL Version Token
+class ILVersion<bits<8> minor_version, bits<8> major_version, ILShader shader_type>  : ILToken<0> {
+    let _bits{0-7} = minor_version;
+    let _bits{8-15} = major_version;
+    let _bits{16-23} = shader_type.Value;
+}
+
+// Section 2.2.3 - IL Opcode Token
+class ILOpcode<ILOpCode opcode, bits<14> control, bit sec_mod_pre, bit pri_mod_pre> : ILToken<0> {
+    let _bits{0-15} = opcode.Value;
+    let _bits{16-29} = control;
+    let _bits{30} = sec_mod_pre;
+    let _bits{31} = pri_mod_pre;
+}
+
+// Section 2.2.4 - IL Destination Token
+class ILDst<AMDILReg register_num, ILRegType register_type, bit mod_pre, bits<2> relative_address, bit dimension, bit immediate_pre, bit extended> : ILToken<0> {
+    let _bits{0-15} = register_num.Value;
+    let _bits{16-21} = register_type.Value;
+    let _bits{22} = mod_pre;
+    let _bits{23-24} = relative_address;
+    let _bits{25} = dimension;
+    let _bits{26} = immediate_pre;
+    let _bits{31} = extended;
+}
+
+// Section 2.2.5 - IL Destination Modifier Token
+class ILDstMod<ILModDstComp x, ILModDstComp y, ILModDstComp z, ILModDstComp w, bit clamp, ILShiftScale shift_scale> : ILToken<0> {
+    let _bits{0-1} = x.Value;
+    let _bits{2-3} = y.Value;
+    let _bits{4-5} = z.Value;
+    let _bits{6-7} = w.Value;
+    let _bits{8} = clamp;
+    //let _bits{9-12} = shift_scale;
+}
+
+// Section 2.2.6 - IL Source Token
+class ILSrc<AMDILReg register_num, ILRegType register_type, bit mod_pre, bits<2> relative_address, bit dimension, bit immediate_pre, bit extended> : ILToken<0> {
+    let _bits{0-15} = register_num.Value;
+    let _bits{16-21} = register_type.Value;
+    let _bits{22} = mod_pre;
+    let _bits{23-24} = relative_address;
+    let _bits{25} = dimension;
+    let _bits{26} = immediate_pre;
+    let _bits{31} = extended;
+}
+
+// Section 2.2.7 - IL Source Modifier Token
+class ILSrcMod<ILComponentSelect swizzle_x, bit negate_x, ILComponentSelect swizzle_y, bit negate_y,
+               ILComponentSelect swizzle_z, bit negate_z, ILComponentSelect swizzle_w, bit negate_w,
+               bit invert, bit bias, bit x2, bit sign, bit abs, ILDivComp divComp,
+               bits<8> clamp> : ILToken<0> {
+    let _bits{0-2} = swizzle_x.Value;
+    let _bits{3} = negate_x;
+    let _bits{4-6} = swizzle_y.Value;
+    let _bits{7} = negate_y;
+    let _bits{8-10} = swizzle_z.Value;
+    let _bits{11} = negate_z;
+    let _bits{12-14} = swizzle_w.Value;
+    let _bits{15} = negate_w;
+    let _bits{16} = invert;
+    let _bits{17} = bias;
+    let _bits{18} = x2;
+    let _bits{19} = sign;
+    let _bits{20} = abs;
+    let _bits{21-23} = divComp.Value;
+    let _bits{24-31} = clamp;
+}
+
+// Section 2.2.8 - IL Relative Address Token
+class ILRelAddr<AMDILReg address_register, bit loop_relative, ILAddressing component> : ILToken<0> {
+    let _bits{0-15} = address_register.Value;
+    let _bits{16} = loop_relative;
+    let _bits{17-19} = component.Value;
+}
+
+// IL Literal Token
+class ILLiteral<bits<32> val> : ILToken<0> {
+    let _bits = val;
+}
+
+// All tokens required for a destination register
+class ILDstReg<ILDst Reg, ILDstMod Mod, ILRelAddr Rel, ILSrc Reg_Rel, ILSrcMod Reg_Rel_Mod> {
+    ILDst       reg = Reg;
+    ILDstMod    mod = Mod;
+    ILRelAddr   rel = Rel;
+    ILSrc       reg_rel = Reg_Rel;
+    ILSrcMod    reg_rel_mod = Reg_Rel_Mod;
+}
+
+// All tokens required for a source register
+class ILSrcReg<ILSrc Reg, ILSrcMod Mod, ILRelAddr Rel, ILSrc Reg_Rel, ILSrcMod Reg_Rel_Mod> {
+    ILSrc       reg = Reg;
+    ILSrcMod    mod = Mod;
+    ILRelAddr   rel = Rel;
+    ILSrc       reg_rel = Reg_Rel;
+    ILSrcMod    reg_rel_mod = Reg_Rel_Mod;
+}
+
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp b/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp
new file mode 100644 (file)
index 0000000..f2ef4eb
--- /dev/null
@@ -0,0 +1,683 @@
+//===-- AMDILUtilityFunctions.cpp - AMDIL Utility Functions       ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file provides the implementations of functions that are declared in the
+// AMDILUtilityFunctions.h file.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILUtilityFunctions.h"
+#include "AMDILISelLowering.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Type.h"
+
+#include <cstdio>
+#include <list>
+#include <queue>
+
+#define GET_OPCODE_NAME(TII, MI) \
+  TII->getName(MI->getOpcode())
+
+
+using namespace llvm;
+int64_t GET_SCALAR_SIZE(llvm::Type *A) {
+  return A->getScalarSizeInBits();
+}
+
+const TargetRegisterClass * getRegClassFromID(unsigned int ID) {
+  switch (ID) {
+  default:
+    assert(0 && "Passed in ID does not match any register classes.");
+    return NULL;
+  case AMDIL::GPRI8RegClassID:
+    return &AMDIL::GPRI8RegClass;
+  case AMDIL::GPRI16RegClassID:
+    return &AMDIL::GPRI16RegClass;
+  case AMDIL::GPRI32RegClassID:
+    return &AMDIL::GPRI32RegClass;
+  case AMDIL::GPRF32RegClassID:
+    return &AMDIL::GPRF32RegClass;
+  case AMDIL::GPRI64RegClassID:
+    return &AMDIL::GPRI64RegClass;
+  case AMDIL::GPRF64RegClassID:
+    return &AMDIL::GPRF64RegClass;
+  case AMDIL::GPRV4F32RegClassID:
+    return &AMDIL::GPRV4F32RegClass;
+  case AMDIL::GPRV4I8RegClassID:
+    return &AMDIL::GPRV4I8RegClass;
+  case AMDIL::GPRV4I16RegClassID:
+    return &AMDIL::GPRV4I16RegClass;
+  case AMDIL::GPRV4I32RegClassID:
+    return &AMDIL::GPRV4I32RegClass;
+  case AMDIL::GPRV2F32RegClassID:
+    return &AMDIL::GPRV2F32RegClass;
+  case AMDIL::GPRV2I8RegClassID:
+    return &AMDIL::GPRV2I8RegClass;
+  case AMDIL::GPRV2I16RegClassID:
+    return &AMDIL::GPRV2I16RegClass;
+  case AMDIL::GPRV2I32RegClassID:
+    return &AMDIL::GPRV2I32RegClass;
+  case AMDIL::GPRV2F64RegClassID:
+    return &AMDIL::GPRV2F64RegClass;
+  case AMDIL::GPRV2I64RegClassID:
+    return &AMDIL::GPRV2I64RegClass;
+  };
+}
+
+unsigned int getMoveInstFromID(unsigned int ID) {
+  switch (ID) {
+  default:
+    assert(0 && "Passed in ID does not match any move instructions.");
+  case AMDIL::GPRI8RegClassID:
+    return AMDIL::MOVE_i8;
+  case AMDIL::GPRI16RegClassID:
+    return AMDIL::MOVE_i16;
+  case AMDIL::GPRI32RegClassID:
+    return AMDIL::MOVE_i32;
+  case AMDIL::GPRF32RegClassID:
+    return AMDIL::MOVE_f32;
+  case AMDIL::GPRI64RegClassID:
+    return AMDIL::MOVE_i64;
+  case AMDIL::GPRF64RegClassID:
+    return AMDIL::MOVE_f64;
+  case AMDIL::GPRV4F32RegClassID:
+    return AMDIL::MOVE_v4f32;
+  case AMDIL::GPRV4I8RegClassID:
+    return AMDIL::MOVE_v4i8;
+  case AMDIL::GPRV4I16RegClassID:
+    return AMDIL::MOVE_v4i16;
+  case AMDIL::GPRV4I32RegClassID:
+    return AMDIL::MOVE_v4i32;
+  case AMDIL::GPRV2F32RegClassID:
+    return AMDIL::MOVE_v2f32;
+  case AMDIL::GPRV2I8RegClassID:
+    return AMDIL::MOVE_v2i8;
+  case AMDIL::GPRV2I16RegClassID:
+    return AMDIL::MOVE_v2i16;
+  case AMDIL::GPRV2I32RegClassID:
+    return AMDIL::MOVE_v2i32;
+  case AMDIL::GPRV2F64RegClassID:
+    return AMDIL::MOVE_v2f64;
+  case AMDIL::GPRV2I64RegClassID:
+    return AMDIL::MOVE_v2i64;
+  };
+  return -1;
+}
+
+unsigned int getPHIMoveInstFromID(unsigned int ID) {
+  switch (ID) {
+  default:
+    assert(0 && "Passed in ID does not match any move instructions.");
+  case AMDIL::GPRI8RegClassID:
+    return AMDIL::PHIMOVE_i8;
+  case AMDIL::GPRI16RegClassID:
+    return AMDIL::PHIMOVE_i16;
+  case AMDIL::GPRI32RegClassID:
+    return AMDIL::PHIMOVE_i32;
+  case AMDIL::GPRF32RegClassID:
+    return AMDIL::PHIMOVE_f32;
+  case AMDIL::GPRI64RegClassID:
+    return AMDIL::PHIMOVE_i64;
+  case AMDIL::GPRF64RegClassID:
+    return AMDIL::PHIMOVE_f64;
+  case AMDIL::GPRV4F32RegClassID:
+    return AMDIL::PHIMOVE_v4f32;
+  case AMDIL::GPRV4I8RegClassID:
+    return AMDIL::PHIMOVE_v4i8;
+  case AMDIL::GPRV4I16RegClassID:
+    return AMDIL::PHIMOVE_v4i16;
+  case AMDIL::GPRV4I32RegClassID:
+    return AMDIL::PHIMOVE_v4i32;
+  case AMDIL::GPRV2F32RegClassID:
+    return AMDIL::PHIMOVE_v2f32;
+  case AMDIL::GPRV2I8RegClassID:
+    return AMDIL::PHIMOVE_v2i8;
+  case AMDIL::GPRV2I16RegClassID:
+    return AMDIL::PHIMOVE_v2i16;
+  case AMDIL::GPRV2I32RegClassID:
+    return AMDIL::PHIMOVE_v2i32;
+  case AMDIL::GPRV2F64RegClassID:
+    return AMDIL::PHIMOVE_v2f64;
+  case AMDIL::GPRV2I64RegClassID:
+    return AMDIL::PHIMOVE_v2i64;
+  };
+  return -1;
+}
+
+const TargetRegisterClass* getRegClassFromType(unsigned int type) {
+  switch (type) {
+  default:
+    assert(0 && "Passed in type does not match any register classes.");
+  case MVT::i8:
+    return &AMDIL::GPRI8RegClass;
+  case MVT::i16:
+    return &AMDIL::GPRI16RegClass;
+  case MVT::i32:
+    return &AMDIL::GPRI32RegClass;
+  case MVT::f32:
+    return &AMDIL::GPRF32RegClass;
+  case MVT::i64:
+    return &AMDIL::GPRI64RegClass;
+  case MVT::f64:
+    return &AMDIL::GPRF64RegClass;
+  case MVT::v4f32:
+    return &AMDIL::GPRV4F32RegClass;
+  case MVT::v4i8:
+    return &AMDIL::GPRV4I8RegClass;
+  case MVT::v4i16:
+    return &AMDIL::GPRV4I16RegClass;
+  case MVT::v4i32:
+    return &AMDIL::GPRV4I32RegClass;
+  case MVT::v2f32:
+    return &AMDIL::GPRV2F32RegClass;
+  case MVT::v2i8:
+    return &AMDIL::GPRV2I8RegClass;
+  case MVT::v2i16:
+    return &AMDIL::GPRV2I16RegClass;
+  case MVT::v2i32:
+    return &AMDIL::GPRV2I32RegClass;
+  case MVT::v2f64:
+    return &AMDIL::GPRV2F64RegClass;
+  case MVT::v2i64:
+    return &AMDIL::GPRV2I64RegClass;
+  }
+}
+
+void printSDNode(const SDNode *N) {
+  printf("Opcode: %d isTargetOpcode: %d isMachineOpcode: %d\n",
+         N->getOpcode(), N->isTargetOpcode(), N->isMachineOpcode());
+  printf("Empty: %d OneUse: %d Size: %d NodeID: %d\n",
+         N->use_empty(), N->hasOneUse(), (int)N->use_size(), N->getNodeId());
+  for (unsigned int i = 0; i < N->getNumOperands(); ++i) {
+    printf("OperandNum: %d ValueCount: %d ValueType: %d\n",
+           i, N->getNumValues(), N->getValueType(0).getSimpleVT().SimpleTy);
+    printSDValue(N->getOperand(i), 0);
+  }
+}
+
+void printSDValue(const SDValue &Op, int level) {
+  printf("\nOp: %p OpCode: %d NumOperands: %d ", (void*)&Op, Op.getOpcode(),
+         Op.getNumOperands());
+  printf("IsTarget: %d IsMachine: %d ", Op.isTargetOpcode(),
+         Op.isMachineOpcode());
+  if (Op.isMachineOpcode()) {
+    printf("MachineOpcode: %d\n", Op.getMachineOpcode());
+  } else {
+    printf("\n");
+  }
+  EVT vt = Op.getValueType();
+  printf("ValueType: %d \n", vt.getSimpleVT().SimpleTy);
+  printf("UseEmpty: %d OneUse: %d\n", Op.use_empty(), Op.hasOneUse());
+  if (level) {
+    printf("Children for %d:\n", level);
+    for (unsigned int i = 0; i < Op.getNumOperands(); ++i) {
+      printf("Child %d->%d:", level, i);
+      printSDValue(Op.getOperand(i), level - 1);
+    }
+  }
+}
+
+bool isPHIMove(unsigned int opcode) {
+  switch (opcode) {
+  default:
+    return false;
+    ExpandCaseToAllTypes(AMDIL::PHIMOVE);
+    return true;
+  }
+  return false;
+}
+
+bool isMove(unsigned int opcode) {
+  switch (opcode) {
+  default:
+    return false;
+    ExpandCaseToAllTypes(AMDIL::MOVE);
+    return true;
+  }
+  return false;
+}
+
+bool isMoveOrEquivalent(unsigned int opcode) {
+  switch (opcode) {
+  default:
+    return isMove(opcode) || isPHIMove(opcode);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASCHAR);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASSHORT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASINT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASLONG);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASDOUBLE);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASFLOAT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2CHAR);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2SHORT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2INT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2FLOAT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2LONG);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2DOUBLE);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4CHAR);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4SHORT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4INT);
+    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4FLOAT);
+    case AMDIL::INTTOANY_i8:
+    case AMDIL::INTTOANY_i16:
+    case AMDIL::INTTOANY_i32:
+    case AMDIL::INTTOANY_f32:
+    case AMDIL::DLO:
+    case AMDIL::LLO:
+    case AMDIL::LLO_v2i64:
+      return true;
+  };
+  return false;
+}
+
+bool check_type(const Value *ptr, unsigned int addrspace) {
+  if (!ptr) {
+    return false;
+  }
+  // Guard against non-pointer values instead of dereferencing a failed cast.
+  const PointerType *ptrType = dyn_cast<PointerType>(ptr->getType());
+  return ptrType && ptrType->getAddressSpace() == addrspace;
+}
+
+size_t getTypeSize(Type * const T, bool dereferencePtr) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3;
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t getTypeSize(StructType * const ST, bool dereferencePtr) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr);
+  }
+  return size;
+}
+
+size_t getTypeSize(IntegerType * const IT, bool dereferencePtr) {
+  return IT ? (IT->getBitWidth() >> 3) : 0;
+}
+
+size_t getTypeSize(FunctionType * const FT, bool dereferencePtr) {
+  assert(0 && "Should not be able to calculate the size of a function type");
+  return 0;
+}
+
+size_t getTypeSize(ArrayType * const AT, bool dereferencePtr) {
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t getTypeSize(VectorType * const VT, bool dereferencePtr) {
+  return VT ? (VT->getBitWidth() >> 3) : 0;
+}
+
+size_t getTypeSize(PointerType * const PT, bool dereferencePtr) {
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT));
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
+    }
+    return size;
+  } else {
+    return 4;
+  }
+}
+
+size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr) {
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4;
+}
+
+size_t getNumElements(Type * const T) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = 1;
+    break;
+  case Type::PointerTyID:
+    size = getNumElements(dyn_cast<PointerType>(T));
+    break;
+  case Type::IntegerTyID:
+    size = getNumElements(dyn_cast<IntegerType>(T));
+    break;
+  case Type::StructTyID:
+    size = getNumElements(dyn_cast<StructType>(T));
+    break;
+  case Type::ArrayTyID:
+    size = getNumElements(dyn_cast<ArrayType>(T));
+    break;
+  case Type::FunctionTyID:
+    size = getNumElements(dyn_cast<FunctionType>(T));
+    break;
+  case Type::VectorTyID:
+    size = getNumElements(dyn_cast<VectorType>(T));
+    break;
+  };
+  return size;
+}
+
+size_t getNumElements(StructType * const ST) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end();
+       eib != eie; ++eib) {
+    curType = *eib;
+    size += getNumElements(curType);
+  }
+  return size;
+}
+
+size_t getNumElements(IntegerType * const IT) {
+  return (!IT) ? 0 : 1;
+}
+
+size_t getNumElements(FunctionType * const FT) {
+  assert(0 && "Should not be able to calculate the number of "
+         "elements of a function type");
+  return 0;
+}
+
+size_t getNumElements(ArrayType * const AT) {
+  return (!AT) ? 0
+               :  (size_t)(getNumElements(AT->getElementType()) *
+                           AT->getNumElements());
+}
+
+size_t getNumElements(VectorType * const VT) {
+  return (!VT) ? 0
+               : VT->getNumElements() * getNumElements(VT->getElementType());
+}
+
+size_t getNumElements(PointerType * const PT) {
+  size_t size = 0;
+  if (!PT) {
+    return size;
+  }
+  for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+    size += getNumElements(PT->getContainedType(x));
+  }
+  return size;
+}
+
+const llvm::Value *getBasePointerValue(const llvm::Value *V)
+{
+  if (!V) {
+    return NULL;
+  }
+  const Value *ret = NULL;
+  ValueMap<const Value *, bool> ValueBitMap;
+  std::queue<const Value *, std::list<const Value *> > ValueQueue;
+  ValueQueue.push(V);
+  while (!ValueQueue.empty()) {
+    V = ValueQueue.front();
+    if (ValueBitMap.find(V) == ValueBitMap.end()) {
+      ValueBitMap[V] = true;
+      if (isa<Argument>(V) && isa<PointerType>(V->getType())) {
+        ret = V;
+        break;
+      } else if (isa<GlobalVariable>(V)) {
+        ret = V;
+        break;
+      } else if (isa<Constant>(V)) {
+        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
+        if (CE) {
+          ValueQueue.push(CE->getOperand(0));
+        }
+      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+        ret = AI;
+        break;
+      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
+        uint32_t numOps = I->getNumOperands();
+        for (uint32_t x = 0; x < numOps; ++x) {
+          ValueQueue.push(I->getOperand(x));
+        }
+      } else {
+        // assert(0 && "Found a Value that we didn't know how to handle!");
+      }
+    }
+    ValueQueue.pop();
+  }
+  return ret;
+}
+
+const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI) {
+  const Value *moVal = NULL;
+  if (!MI->memoperands_empty()) {
+    const MachineMemOperand *memOp = (*MI->memoperands_begin());
+    moVal = memOp ? memOp->getValue() : NULL;
+    moVal = getBasePointerValue(moVal);
+  }
+  return moVal;
+}
+
+bool commaPrint(int i, llvm::raw_ostream &O) {
+  O << ":" << i;
+  return false;
+}
+
+bool isLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  if (strstr(GET_OPCODE_NAME(TII, MI), "LOADCONST")) {
+    return false;
+  }
+  return strstr(GET_OPCODE_NAME(TII, MI), "LOAD");
+}
+
+bool isSWSExtLoadInst(MachineInstr *MI)
+{
+  switch (MI->getOpcode()) {
+    default:
+      break;
+      ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD);
+      return true;
+  };
+  return false;
+}
+
+bool isExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "EXTLOAD");
+}
+
+bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "SEXTLOAD");
+}
+
+bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "AEXTLOAD");
+}
+
+bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "ZEXTLOAD");
+}
+
+bool isStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "STORE");
+}
+
+bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "TRUNCSTORE");
+}
+
+bool isAtomicInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM");
+}
+
+bool isVolatileInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
+  if (!MI->memoperands_empty()) {
+    for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(),
+        moe = MI->memoperands_end(); mob != moe; ++mob) {
+      // If there is a volatile mem operand, this is a volatile instruction.
+      if ((*mob)->isVolatile()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "GLOBAL");
+}
+bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "PRIVATE");
+}
+bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "CONSTANT")
+    || strstr(GET_OPCODE_NAME(TII, MI), "CPOOL");
+}
+bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "REGION");
+}
+bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "LOCAL");
+}
+bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "IMAGE");
+}
+bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "APPEND");
+}
+bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_R");
+}
+bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_L");
+}
+bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_G")
+    || isArenaAtomic(TII, MI);
+}
+bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
+{
+  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_A");
+}
+
+const char* getSrcSwizzle(unsigned idx) {
+  const char *srcSwizzles[]  = {
+    "", ".x000", ".0x00", ".00x0", ".000x", ".y000", ".0y00", ".00y0", ".000y", 
+    ".z000", ".0z00", ".00z0", ".000z", ".w000", ".0w00", ".00w0", ".000w",
+    ".xy00", ".00xy", ".zw00", ".00zw", ".xyz0", ".0xyz", ".xyzw", ".0000",
+    ".xxxx", ".yyyy", ".zzzz", ".wwww", ".xyxy", ".zwzw", ".xzxz", ".ywyw",
+    ".x0y0", ".0x0y", ".xy_neg(y)", "_neg(yw)", "_neg(x)", ".xy_neg(xy)",
+    "_neg(xyzw)", ".0yzw", ".x0zw", ".xy0w", ".x", ".y", ".z", ".w", ".xy",
+    ".zw"
+  };
+  assert(idx < sizeof(srcSwizzles)/sizeof(srcSwizzles[0])
+      && "Idx passed in is invalid!");
+  return srcSwizzles[idx];
+}
+const char* getDstSwizzle(unsigned idx) {
+  const char *dstSwizzles[] = {
+    "", ".x___", ".xy__", ".xyz_", ".xyzw", "._y__", "._yz_", "._yzw", ".__z_",
+    ".__zw", ".___w", ".x_zw", ".xy_w", ".x_z_", ".x__w", "._y_w", 
+  };
+  assert(idx < sizeof(dstSwizzles)/sizeof(dstSwizzles[0])
+      && "Idx passed in is invalid!");
+  return dstSwizzles[idx];
+}
+/// Helper function to get the currently set flags
+void getAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
+{
+  // We need 16 bits of information, but LLVM r127097 cut the field in half.
+  // So we have to use two different fields to store all of our information.
+  uint16_t upper = MI->getFlags() << 8;
+  uint16_t lower = MI->getAsmPrinterFlags();
+  curRes.u16all = upper | lower;
+}
+/// Helper function to clear the currently set flags and add the new flags.
+void setAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
+{
+  // We need 16 bits of information, but LLVM r127097 cut the field in half.
+  // So we have to use two different fields to store all of our information.
+  MI->clearAsmPrinterFlags();
+  MI->setFlags(0);
+  uint8_t lower = curRes.u16all & 0xFF;
+  uint8_t upper = (curRes.u16all >> 8) & 0xFF;
+  MI->setFlags(upper);
+  MI->setAsmPrinterFlag((llvm::MachineInstr::CommentFlag)lower);
+}
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
new file mode 100644 (file)
index 0000000..637c868
--- /dev/null
@@ -0,0 +1,362 @@
+//===-- AMDILUtilityFunctions.h - AMDIL Utility Functions Header --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file provides declarations for functions that are used across different
+// classes and provide various conversions or utilities to shorten the code.
+//
+//===----------------------------------------------------------------------===//
+#ifndef AMDILUTILITYFUNCTIONS_H_
+#define AMDILUTILITYFUNCTIONS_H_
+
+#include "AMDIL.h"
+#include "AMDILTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
+
+// Utility functions from ID
+//
+namespace llvm {
+class TargetRegisterClass;
+class SDValue;
+class SDNode;
+class Value;
+class Type;
+class StructType;
+class IntegerType;
+class FunctionType;
+class VectorType;
+class ArrayType;
+class PointerType;
+class OpaqueType;
+class MachineInstr;
+
+}
+enum SrcSwizzles {
+  AMDIL_SRC_SWIZZLE_DEFAULT = 0,
+  AMDIL_SRC_SWIZZLE_X000,
+  AMDIL_SRC_SWIZZLE_0X00,
+  AMDIL_SRC_SWIZZLE_00X0,
+  AMDIL_SRC_SWIZZLE_000X,
+  AMDIL_SRC_SWIZZLE_Y000,
+  AMDIL_SRC_SWIZZLE_0Y00,
+  AMDIL_SRC_SWIZZLE_00Y0,
+  AMDIL_SRC_SWIZZLE_000Y,
+  AMDIL_SRC_SWIZZLE_Z000,
+  AMDIL_SRC_SWIZZLE_0Z00,
+  AMDIL_SRC_SWIZZLE_00Z0,
+  AMDIL_SRC_SWIZZLE_000Z,
+  AMDIL_SRC_SWIZZLE_W000,
+  AMDIL_SRC_SWIZZLE_0W00,
+  AMDIL_SRC_SWIZZLE_00W0,
+  AMDIL_SRC_SWIZZLE_000W,
+  AMDIL_SRC_SWIZZLE_XY00,
+  AMDIL_SRC_SWIZZLE_00XY,
+  AMDIL_SRC_SWIZZLE_ZW00,
+  AMDIL_SRC_SWIZZLE_00ZW,
+  AMDIL_SRC_SWIZZLE_XYZ0,
+  AMDIL_SRC_SWIZZLE_0XYZ,
+  AMDIL_SRC_SWIZZLE_XYZW,
+  AMDIL_SRC_SWIZZLE_0000,
+  AMDIL_SRC_SWIZZLE_XXXX,
+  AMDIL_SRC_SWIZZLE_YYYY,
+  AMDIL_SRC_SWIZZLE_ZZZZ,
+  AMDIL_SRC_SWIZZLE_WWWW,
+  AMDIL_SRC_SWIZZLE_XYXY,
+  AMDIL_SRC_SWIZZLE_ZWZW,
+  AMDIL_SRC_SWIZZLE_XZXZ,
+  AMDIL_SRC_SWIZZLE_YWYW,
+  AMDIL_SRC_SWIZZLE_X0Y0,
+  AMDIL_SRC_SWIZZLE_0X0Y,
+  AMDIL_SRC_SWIZZLE_XY_NEGY,
+  AMDIL_SRC_SWIZZLE_NEGYW,
+  AMDIL_SRC_SWIZZLE_NEGX,
+  AMDIL_SRC_SWIZZLE_XY_NEGXY,
+  AMDIL_SRC_SWIZZLE_NEG_XYZW,
+  AMDIL_SRC_SWIZZLE_0YZW,
+  AMDIL_SRC_SWIZZLE_X0ZW,
+  AMDIL_SRC_SWIZZLE_XY0W,
+  AMDIL_SRC_SWIZZLE_X,
+  AMDIL_SRC_SWIZZLE_Y,
+  AMDIL_SRC_SWIZZLE_Z,
+  AMDIL_SRC_SWIZZLE_W,
+  AMDIL_SRC_SWIZZLE_XY,
+  AMDIL_SRC_SWIZZLE_ZW,
+  AMDIL_SRC_SWIZZLE_LAST
+};
+enum DstSwizzles {
+  AMDIL_DST_SWIZZLE_DEFAULT = 0,
+  AMDIL_DST_SWIZZLE_X___,
+  AMDIL_DST_SWIZZLE_XY__,
+  AMDIL_DST_SWIZZLE_XYZ_,
+  AMDIL_DST_SWIZZLE_XYZW,
+  AMDIL_DST_SWIZZLE__Y__,
+  AMDIL_DST_SWIZZLE__YZ_,
+  AMDIL_DST_SWIZZLE__YZW,
+  AMDIL_DST_SWIZZLE___Z_,
+  AMDIL_DST_SWIZZLE___ZW,
+  AMDIL_DST_SWIZZLE____W,
+  AMDIL_DST_SWIZZLE_X_ZW,
+  AMDIL_DST_SWIZZLE_XY_W,
+  AMDIL_DST_SWIZZLE_X_Z_,
+  AMDIL_DST_SWIZZLE_X__W,
+  AMDIL_DST_SWIZZLE__Y_W,
+  AMDIL_DST_SWIZZLE_LAST
+};
+// Function to get the correct src swizzle string from ID
+const char *getSrcSwizzle(unsigned);
+
+// Function to get the correct dst swizzle string from ID
+const char *getDstSwizzle(unsigned);
+
+const llvm::TargetRegisterClass *getRegClassFromID(unsigned int ID);
+
+unsigned int getMoveInstFromID(unsigned int ID);
+unsigned int getPHIMoveInstFromID(unsigned int ID);
+
+// Utility functions from Type.
+const llvm::TargetRegisterClass *getRegClassFromType(unsigned int type);
+unsigned int getTargetIndependentMoveFromType(unsigned int type);
+
+// Debug functions for SDNode and SDValue.
+void printSDValue(const llvm::SDValue &Op, int level);
+void printSDNode(const llvm::SDNode *N);
+
+// Functions to check if an opcode is a specific type.
+bool isMove(unsigned int opcode);
+bool isPHIMove(unsigned int opcode);
+bool isMoveOrEquivalent(unsigned int opcode);
+
+// Function to check address space
+bool check_type(const llvm::Value *ptr, unsigned int addrspace);
+
+// Group of functions that recursively calculate the size of a structure based
+// on its sub-types.
+size_t getTypeSize(llvm::Type * const T, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::StructType * const ST, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::IntegerType * const IT, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::FunctionType * const FT, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::ArrayType * const AT, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::VectorType * const VT, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::PointerType * const PT, bool dereferencePtr = false);
+size_t
+getTypeSize(llvm::OpaqueType * const OT, bool dereferencePtr = false);
+
+// Group of functions that recursively calculate the number of elements of a
+// structure based on its sub-types.
+size_t getNumElements(llvm::Type * const T);
+size_t getNumElements(llvm::StructType * const ST);
+size_t getNumElements(llvm::IntegerType * const IT);
+size_t getNumElements(llvm::FunctionType * const FT);
+size_t getNumElements(llvm::ArrayType * const AT);
+size_t getNumElements(llvm::VectorType * const VT);
+size_t getNumElements(llvm::PointerType * const PT);
+size_t getNumElements(llvm::OpaqueType * const OT);
+const llvm::Value *getBasePointerValue(const llvm::Value *V);
+const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI);
+
+
+int64_t GET_SCALAR_SIZE(llvm::Type* A);
+
+// Helper functions that check the opcode for status information
+bool isLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isSWSExtLoadInst(llvm::MachineInstr *MI);
+bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isAtomicInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isVolatileInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
+
+
+// Macros that are used to help with switch statements for various data types.
+// Unlike the second set below, these macros do not return anything.
+#define ExpandCaseTo32bitIntTypes(Instr)  \
+case Instr##_i8: \
+case Instr##_i16: \
+case Instr##_i32:
+
+#define ExpandCaseTo32bitIntTruncTypes(Instr)  \
+case Instr##_i16i8: \
+case Instr##_i32i8: \
+case Instr##_i32i16: 
+
+#define ExpandCaseToIntTypes(Instr) \
+    ExpandCaseTo32bitIntTypes(Instr) \
+case Instr##_i64:
+
+#define ExpandCaseToIntTruncTypes(Instr) \
+    ExpandCaseTo32bitIntTruncTypes(Instr) \
+case Instr##_i64i8:\
+case Instr##_i64i16:\
+case Instr##_i64i32:\
+
+#define ExpandCaseToFloatTypes(Instr) \
+    case Instr##_f32: \
+case Instr##_f64:
+
+#define ExpandCaseToFloatTruncTypes(Instr) \
+case Instr##_f64f32:
+
+#define ExpandCaseTo32bitScalarTypes(Instr) \
+    ExpandCaseTo32bitIntTypes(Instr) \
+case Instr##_f32:
+
+#define ExpandCaseToAllScalarTypes(Instr) \
+    ExpandCaseToFloatTypes(Instr) \
+ExpandCaseToIntTypes(Instr)
+
+#define ExpandCaseToAllScalarTruncTypes(Instr) \
+    ExpandCaseToFloatTruncTypes(Instr) \
+ExpandCaseToIntTruncTypes(Instr)
+
+// Vector versions of above macros
+#define ExpandCaseToVectorIntTypes(Instr) \
+    case Instr##_v2i8: \
+case Instr##_v4i8: \
+case Instr##_v2i16: \
+case Instr##_v4i16: \
+case Instr##_v2i32: \
+case Instr##_v4i32: \
+case Instr##_v2i64:
+
+#define ExpandCaseToVectorIntTruncTypes(Instr) \
+case Instr##_v2i16i8: \
+case Instr##_v4i16i8: \
+case Instr##_v2i32i8: \
+case Instr##_v4i32i8: \
+case Instr##_v2i32i16: \
+case Instr##_v4i32i16: \
+case Instr##_v2i64i8: \
+case Instr##_v2i64i16: \
+case Instr##_v2i64i32: 
+
+#define ExpandCaseToVectorFloatTypes(Instr) \
+    case Instr##_v2f32: \
+case Instr##_v4f32: \
+case Instr##_v2f64:
+
+#define ExpandCaseToVectorFloatTruncTypes(Instr) \
+case Instr##_v2f64f32:
+
+#define ExpandCaseToVectorByteTypes(Instr) \
+  case Instr##_v4i8:\
+case Instr##_v2i16: \
+case Instr##_v4i16:
+
+#define ExpandCaseToAllVectorTypes(Instr) \
+    ExpandCaseToVectorFloatTypes(Instr) \
+ExpandCaseToVectorIntTypes(Instr)
+
+#define ExpandCaseToAllVectorTruncTypes(Instr) \
+    ExpandCaseToVectorFloatTruncTypes(Instr) \
+ExpandCaseToVectorIntTruncTypes(Instr)
+
+#define ExpandCaseToAllTypes(Instr) \
+    ExpandCaseToAllVectorTypes(Instr) \
+ExpandCaseToAllScalarTypes(Instr)
+
+#define ExpandCaseToAllTruncTypes(Instr) \
+    ExpandCaseToAllVectorTruncTypes(Instr) \
+ExpandCaseToAllScalarTruncTypes(Instr)
+
+#define ExpandCaseToPackedTypes(Instr) \
+    case Instr##_v2i8: \
+    case Instr##_v4i8: \
+    case Instr##_v2i16: \
+    case Instr##_v4i16:
+
+#define ExpandCaseToByteShortTypes(Instr) \
+    case Instr##_i8: \
+    case Instr##_i16: \
+    ExpandCaseToPackedTypes(Instr)
+
+// Macros that expand into case statements with return values
+#define ExpandCaseTo32bitIntReturn(Instr, Return)  \
+case Instr##_i8: return Return##_i8;\
+case Instr##_i16: return Return##_i16;\
+case Instr##_i32: return Return##_i32;
+
+#define ExpandCaseToIntReturn(Instr, Return) \
+    ExpandCaseTo32bitIntReturn(Instr, Return) \
+case Instr##_i64: return Return##_i64;
+
+#define ExpandCaseToFloatReturn(Instr, Return) \
+    case Instr##_f32: return Return##_f32;\
+case Instr##_f64: return Return##_f64;
+
+#define ExpandCaseToAllScalarReturn(Instr, Return) \
+    ExpandCaseToFloatReturn(Instr, Return) \
+ExpandCaseToIntReturn(Instr, Return)
+
+// These macros expand to common groupings of RegClass ID's
+#define ExpandCaseTo1CompRegID \
+case AMDIL::GPRI8RegClassID: \
+case AMDIL::GPRI16RegClassID: \
+case AMDIL::GPRI32RegClassID: \
+case AMDIL::GPRF32RegClassID:
+
+#define ExpandCaseTo2CompRegID \
+    case AMDIL::GPRI64RegClassID: \
+case AMDIL::GPRF64RegClassID: \
+case AMDIL::GPRV2I8RegClassID: \
+case AMDIL::GPRV2I16RegClassID: \
+case AMDIL::GPRV2I32RegClassID: \
+case AMDIL::GPRV2F32RegClassID:
+
+// Macros that expand to case statements for specific bitlengths
+#define ExpandCaseTo8BitType(Instr) \
+    case Instr##_i8:
+
+#define ExpandCaseTo16BitType(Instr) \
+    case Instr##_v2i8: \
+case Instr##_i16:
+
+#define ExpandCaseTo32BitType(Instr) \
+    case Instr##_v4i8: \
+case Instr##_v2i16: \
+case Instr##_i32: \
+case Instr##_f32:
+
+#define ExpandCaseTo64BitType(Instr) \
+    case Instr##_v4i16: \
+case Instr##_v2i32: \
+case Instr##_v2f32: \
+case Instr##_i64: \
+case Instr##_f64:
+
+#define ExpandCaseTo128BitType(Instr) \
+    case Instr##_v4i32: \
+case Instr##_v4f32: \
+case Instr##_v2i64: \
+case Instr##_v2f64:
+
+bool commaPrint(int i, llvm::raw_ostream &O);
+/// Helper functions to get and set the current AsmPrinter flags.
+void getAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
+void setAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
+
+#endif // AMDILUTILITYFUNCTIONS_H_
diff --git a/src/gallium/drivers/radeon/AMDILVersion.td b/src/gallium/drivers/radeon/AMDILVersion.td
new file mode 100644 (file)
index 0000000..b8b0260
--- /dev/null
@@ -0,0 +1,75 @@
+//===-- AMDILVersion.td - AMDIL barrier intrinsics and instructions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===--------------------------------------------------------------------===//
+// Intrinsic operation support
+//===--------------------------------------------------------------------===//
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+def int_AMDIL_barrier   : GCCBuiltin<"barrier">,
+        BinaryIntNoRetInt;
+def int_AMDIL_barrier_global   : GCCBuiltin<"barrierGlobal">,
+      BinaryIntNoRetInt;
+def int_AMDIL_barrier_local   : GCCBuiltin<"barrierLocal">,
+      BinaryIntNoRetInt;
+def int_AMDIL_barrier_region   : GCCBuiltin<"barrierRegion">,
+      BinaryIntNoRetInt;
+def int_AMDIL_get_region_id : GCCBuiltin<"__amdil_get_region_id_int">,
+    Intrinsic<[llvm_v4i32_ty], [], []>;
+def int_AMDIL_get_region_local_id : GCCBuiltin<"__amdil_get_region_local_id_int">,
+    Intrinsic<[llvm_v4i32_ty], [], []>;
+def int_AMDIL_get_num_regions : GCCBuiltin<"__amdil_get_num_regions_int">,
+    Intrinsic<[llvm_v4i32_ty], [], []>;
+def int_AMDIL_get_region_size : GCCBuiltin<"__amdil_get_region_size_int">,
+    Intrinsic<[llvm_v4i32_ty], [], []>;
+}
+
+let isCall=1, isNotDuplicable=1 in {
+  let Predicates=[hasRegionAS] in {
+def BARRIER_EGNI : BinaryOpNoRet<IL_OP_BARRIER, (outs),
+      (ins GPRI32:$flag, GPRI32:$id),
+      "fence_threads_memory_lds_gds_gws",
+      [(int_AMDIL_barrier GPRI32:$flag, GPRI32:$id)]>;
+}
+let Predicates=[noRegionAS] in {
+def BARRIER_7XX : BinaryOpNoRet<IL_OP_BARRIER, (outs),
+      (ins GPRI32:$flag, GPRI32:$id),
+      "fence_threads_memory_lds",
+      [(int_AMDIL_barrier GPRI32:$flag, GPRI32:$id)]>;
+}
+
+def BARRIER_LOCAL : BinaryOpNoRet<IL_OP_BARRIER_LOCAL, (outs),
+      (ins GPRI32:$flag, GPRI32:$id),
+      "fence_threads_lds",
+      [(int_AMDIL_barrier_local GPRI32:$flag, GPRI32:$id)]>;
+
+def BARRIER_GLOBAL : BinaryOpNoRet<IL_OP_BARRIER_GLOBAL, (outs),
+      (ins GPRI32:$flag, GPRI32:$id),
+      "fence_threads_memory",
+      [(int_AMDIL_barrier_global GPRI32:$flag, GPRI32:$id)]>;
+
+def BARRIER_REGION : BinaryOpNoRet<IL_OP_BARRIER_REGION, (outs),
+    (ins GPRI32:$flag, GPRI32:$id),
+    "fence_threads_gds",
+    [(int_AMDIL_barrier_region GPRI32:$flag, GPRI32:$id)]>;
+
+def GET_REGION_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"),
+    [(set GPRV4I32:$dst, (int_AMDIL_get_region_id))]>;
+
+def GET_REGION_LOCAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"),
+    [(set GPRV4I32:$dst, (int_AMDIL_get_region_local_id))]>;
+
+def GET_REGION_SIZE : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[10].xyz0"),
+    [(set GPRV4I32:$dst, (int_AMDIL_get_region_size))]>;
+
+def GET_NUM_REGIONS : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
+    (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[11].xyz0"),
+    [(set GPRV4I32:$dst, (int_AMDIL_get_num_regions))]>;
+
+}
diff --git a/src/gallium/drivers/radeon/LICENSE.TXT b/src/gallium/drivers/radeon/LICENSE.TXT
new file mode 100644 (file)
index 0000000..a57de2e
--- /dev/null
@@ -0,0 +1,43 @@
+==============================================================================
+LLVM Release License
+==============================================================================
+University of Illinois/NCSA
+Open Source License
+
+Copyright (c) 2003-2012 University of Illinois at Urbana-Champaign.
+All rights reserved.
+
+Developed by:
+
+    LLVM Team
+
+    University of Illinois at Urbana-Champaign
+
+    http://llvm.org
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal with
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimers.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimers in the
+      documentation and/or other materials provided with the distribution.
+
+    * Neither the names of the LLVM Team, University of Illinois at
+      Urbana-Champaign, nor the names of its contributors may be used to
+      endorse or promote products derived from this Software without specific
+      prior written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+SOFTWARE.
diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp
new file mode 100644 (file)
index 0000000..5b62311
--- /dev/null
@@ -0,0 +1,107 @@
+//===-- MCTargetDesc/AMDILMCAsmInfo.cpp - AMDIL asm properties -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MCAsmInfo implementation for the AMDIL target (assembly syntax and directives).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILMCAsmInfo.h"
+#ifndef NULL
+#define NULL 0
+#endif
+
+using namespace llvm;
+AMDILMCAsmInfo::AMDILMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo()
+{
+  //===------------------------------------------------------------------===//
+  HasSubsectionsViaSymbols = true;
+  HasMachoZeroFillDirective = false;
+  HasMachoTBSSDirective = false;
+  HasStaticCtorDtorReferenceInStaticMode = false;
+  LinkerRequiresNonEmptyDwarfLines = true;
+  MaxInstLength = 16;
+  PCSymbol = "$";
+  SeparatorString = "\n";
+  CommentColumn = 40;
+  CommentString = ";";
+  LabelSuffix = ":";
+  GlobalPrefix = "@";
+  PrivateGlobalPrefix = ";.";
+  LinkerPrivateGlobalPrefix = "!";
+  InlineAsmStart = ";#ASMSTART";
+  InlineAsmEnd = ";#ASMEND";
+  AssemblerDialect = 0;
+  AllowQuotesInName = false;
+  AllowNameToStartWithDigit = false;
+  AllowPeriodsInName = false;
+
+  //===--- Data Emission Directives -------------------------------------===//
+  ZeroDirective = ".zero";
+  AsciiDirective = ".ascii\t";
+  AscizDirective = ".asciz\t";
+  Data8bitsDirective = ".byte\t";
+  Data16bitsDirective = ".short\t";
+  Data32bitsDirective = ".long\t";
+  Data64bitsDirective = ".quad\t";
+  GPRel32Directive = NULL;
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+  HasMicrosoftFastStdCallMangling = false;
+
+  //===--- Alignment Information ----------------------------------------===//
+  AlignDirective = ".align\t";
+  AlignmentIsInBytes = true;
+  TextAlignFillValue = 0;
+
+  //===--- Global Variable Emission Directives --------------------------===//
+  GlobalDirective = ".global";
+  ExternDirective = ".extern";
+  HasSetDirective = false;
+  HasAggressiveSymbolFolding = true;
+  LCOMMDirectiveType = LCOMM::None;
+  COMMDirectiveAlignmentIsInBytes = false;
+  HasDotTypeDotSizeDirective = false;
+  HasSingleParameterDotFile = true;
+  HasNoDeadStrip = true;
+  HasSymbolResolver = false;
+  WeakRefDirective = ".weakref\t";
+  WeakDefDirective = ".weakdef\t";
+  LinkOnceDirective = NULL;
+  HiddenVisibilityAttr = MCSA_Hidden;
+  HiddenDeclarationVisibilityAttr = MCSA_Hidden;
+  ProtectedVisibilityAttr = MCSA_Protected;
+
+  //===--- Dwarf Emission Directives -----------------------------------===//
+  HasLEB128 = true;
+  SupportsDebugInformation = true;
+  ExceptionsType = ExceptionHandling::None;
+  DwarfUsesInlineInfoSection = false;
+  DwarfSectionOffsetDirective = ".offset";
+  DwarfUsesLabelOffsetForRanges = true;
+
+  //===--- CBE Asm Translation Table -----------------------------------===//
+  AsmTransCBE = NULL;
+}
+const char*
+AMDILMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const
+{
+  switch (AS) {
+    default:
+      return NULL;
+    case 0:
+      return NULL;
+  }
+  return NULL;
+}
+
+const MCSection*
+AMDILMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const
+{
+  return NULL;
+}
diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h
new file mode 100644 (file)
index 0000000..d354b03
--- /dev/null
@@ -0,0 +1,30 @@
+//===-- MCTargetDesc/AMDILMCAsmInfo.h - AMDIL MCAsmInfo declaration -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the AMDIL implementation of the MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILMCASMINFO_H_
+#define AMDILMCASMINFO_H_
+
+#include "llvm/MC/MCAsmInfo.h"
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  class AMDILMCAsmInfo : public MCAsmInfo {
+    public:
+      explicit AMDILMCAsmInfo(const Target &T, StringRef &TT);
+      const char*
+        getDataASDirective(unsigned int Size, unsigned int AS) const;
+      const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
+  };
+} // namespace llvm
+#endif // AMDILMCASMINFO_H_
diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp
new file mode 100644 (file)
index 0000000..5e60b00
--- /dev/null
@@ -0,0 +1,66 @@
+#include "AMDILMCTargetDesc.h"
+#include "AMDILMCAsmInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+
+#define GET_INSTRINFO_MC_DESC
+#include "AMDILGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_MC_DESC
+#include "AMDILGenSubtargetInfo.inc"
+
+#define GET_REGINFO_MC_DESC
+#include "AMDILGenRegisterInfo.inc"
+
+using namespace llvm;
+
+static MCInstrInfo *createAMDILMCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAMDILMCInstrInfo(X);
+  return X;
+}
+
+static MCRegisterInfo *createAMDILMCRegisterInfo(StringRef TT) {
+  MCRegisterInfo *X = new MCRegisterInfo();
+  InitAMDILMCRegisterInfo(X, 0);
+  return X;
+}
+
+static MCSubtargetInfo *createAMDILMCSubtargetInfo(StringRef TT, StringRef CPU,
+                                                   StringRef FS) {
+  MCSubtargetInfo * X = new MCSubtargetInfo();
+  InitAMDILMCSubtargetInfo(X, TT, CPU, FS);
+  return X;
+}
+
+static MCCodeGenInfo *createAMDILMCCodeGenInfo(StringRef TT, Reloc::Model RM,
+                                               CodeModel::Model CM,
+                                               CodeGenOpt::Level OL) {
+  MCCodeGenInfo *X = new MCCodeGenInfo();
+  X->InitMCCodeGenInfo(RM, CM, OL);
+  return X;
+}
+
+extern "C" void LLVMInitializeAMDILTargetMC() {
+
+  RegisterMCAsmInfo<AMDILMCAsmInfo> X(TheAMDILTarget);
+  RegisterMCAsmInfo<AMDILMCAsmInfo> Y(TheAMDGPUTarget);
+
+  TargetRegistry::RegisterMCCodeGenInfo(TheAMDILTarget, createAMDILMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDILMCCodeGenInfo);
+
+  TargetRegistry::RegisterMCInstrInfo(TheAMDILTarget, createAMDILMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDILMCInstrInfo);
+
+  TargetRegistry::RegisterMCRegInfo(TheAMDILTarget, createAMDILMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDILMCRegisterInfo);
+
+  TargetRegistry::RegisterMCSubtargetInfo(TheAMDILTarget, createAMDILMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDILMCSubtargetInfo);
+
+}
diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h
new file mode 100644 (file)
index 0000000..370769f
--- /dev/null
@@ -0,0 +1,36 @@
+//===-- AMDILMCTargetDesc.h - AMDIL Target Descriptions -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides AMDIL specific target descriptions.
+//
+//===----------------------------------------------------------------------===//
+//
+
+#ifndef AMDILMCTARGETDESC_H
+#define AMDILMCTARGETDESC_H
+
+namespace llvm {
+class MCSubtargetInfo;
+class Target;
+
+extern Target TheAMDILTarget;
+extern Target TheAMDGPUTarget;
+
+} // End llvm namespace
+
+#define GET_REGINFO_ENUM
+#include "AMDILGenRegisterInfo.inc"
+
+#define GET_INSTRINFO_ENUM
+#include "AMDILGenInstrInfo.inc"
+
+#define GET_SUBTARGETINFO_ENUM
+#include "AMDILGenSubtargetInfo.inc"
+
+#endif // AMDILMCTARGETDESC_H
diff --git a/src/gallium/drivers/radeon/Makefile b/src/gallium/drivers/radeon/Makefile
new file mode 100644 (file)
index 0000000..807dc78
--- /dev/null
@@ -0,0 +1,77 @@
+
+TOP = ../../../..
+include $(TOP)/configs/current
+
+include Makefile.sources
+
+LIBNAME = radeon
+
+LIBRARY_INCLUDES = -I$(TOP)/include
+
+TBLGEN = $(LLVM_BINDIR)/llvm-tblgen
+
+CXXFLAGS+= $(LLVM_CXXFLAGS)
+
+include ../../Makefile.template
+
+CXXFLAGS := $(filter-out -DDEBUG, $(CXXFLAGS))
+
+tablegen = $(TBLGEN) -I $(LLVM_INCLUDEDIR) $1 $2 -o $3
+
+gen: $(GENERATED_SOURCES)
+
+SIRegisterInfo.td: SIGenRegisterInfo.pl
+       $(PERL) $^ > $@
+
+SIRegisterGetHWRegNum.inc: SIGenRegisterInfo.pl
+       $(PERL) $^ $@ > /dev/null
+
+R600ShaderPatterns.td: AMDGPUGenShaderPatterns.pl
+       $(PERL) $^ C > $@
+
+R600RegisterInfo.td: R600GenRegisterInfo.pl
+       $(PERL) $^ > $@
+
+AMDGPUInstrEnums.td: AMDGPUGenInstrEnums.pl
+       $(PERL) $^ td > $@
+
+AMDGPUInstrEnums.h.include: AMDGPUGenInstrEnums.pl
+       $(PERL) $^ h > $@
+
+AMDGPUInstrEnums.include: AMDGPUGenInstrEnums.pl
+       $(PERL) $^ inc > $@
+
+
+AMDILGenRegisterInfo.inc: *.td
+       $(call tablegen, -gen-register-info, AMDIL.td, $@)
+
+AMDILGenInstrInfo.inc: *.td
+       $(call tablegen, -gen-instr-info, AMDIL.td, $@)
+
+AMDILGenAsmWriter.inc: *.td
+       $(call tablegen, -gen-asm-writer, AMDIL.td, $@)
+
+AMDILGenDAGISel.inc: *.td
+       $(call tablegen, -gen-dag-isel, AMDIL.td, $@)
+
+AMDILGenCallingConv.inc: *.td
+       $(call tablegen, -gen-callingconv, AMDIL.td, $@)
+
+AMDILGenSubtargetInfo.inc: *.td
+       $(call tablegen, -gen-subtarget, AMDIL.td, $@)
+
+AMDILGenEDInfo.inc: *.td
+       $(call tablegen, -gen-enhanced-disassembly-info, AMDIL.td, $@)
+
+AMDILGenIntrinsics.inc: *.td
+       $(call tablegen, -gen-tgt-intrinsic, AMDIL.td, $@)
+
+AMDILGenCodeEmitter.inc: *.td
+       $(call tablegen, -gen-emitter, AMDIL.td, $@)
+
+LOADER_LIBS=$(shell llvm-config --libs bitreader asmparser)
+loader: loader.o libradeon.a
+       gcc -o loader -L/usr/local/lib $(LDFLAGS) loader.o libradeon.a $(LLVM_LIBS) $(LOADER_LIBS) -lpthread -ldl -lstdc++ -lm
+
+# FIXME: Remove when this driver is converted to automake.
+all: default
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
new file mode 100644 (file)
index 0000000..96189e7
--- /dev/null
@@ -0,0 +1,86 @@
+
+GENERATED_SOURCES := \
+       R600ShaderPatterns.td           \
+       R600RegisterInfo.td             \
+       AMDGPUInstrEnums.td             \
+       SIRegisterInfo.td               \
+       SIRegisterGetHWRegNum.inc               \
+       AMDILGenRegisterInfo.inc        \
+       AMDILGenInstrInfo.inc           \
+       AMDILGenAsmWriter.inc           \
+       AMDILGenDAGISel.inc             \
+       AMDILGenCallingConv.inc         \
+       AMDILGenSubtargetInfo.inc               \
+       AMDILGenEDInfo.inc              \
+       AMDILGenIntrinsics.inc          \
+       AMDILGenCodeEmitter.inc \
+       AMDGPUInstrEnums.h.include      \
+       AMDGPUInstrEnums.include
+
+CPP_SOURCES := \
+       AMDIL7XXDevice.cpp              \
+       AMDIL7XXIOExpansion.cpp         \
+       AMDIL789IOExpansion.cpp         \
+       AMDILAsmBackend.cpp             \
+       AMDILBarrierDetect.cpp          \
+       AMDILCFGStructurizer.cpp        \
+       AMDILDevice.cpp                 \
+       AMDILDeviceInfo.cpp             \
+       AMDILEGIOExpansion.cpp          \
+       AMDILEvergreenDevice.cpp        \
+       AMDILELFWriterInfo.cpp          \
+       AMDILFrameLowering.cpp          \
+       AMDILGlobalManager.cpp          \
+       AMDILImageExpansion.cpp         \
+       AMDILInliner.cpp                \
+       AMDILInstrInfo.cpp              \
+       AMDILIntrinsicInfo.cpp          \
+       AMDILIOExpansion.cpp            \
+       AMDILISelDAGToDAG.cpp           \
+       AMDILISelLowering.cpp           \
+       AMDILKernelManager.cpp          \
+       AMDILLiteralManager.cpp         \
+       AMDILMachineFunctionInfo.cpp    \
+       AMDILMachinePeephole.cpp        \
+       AMDILMCCodeEmitter.cpp          \
+       AMDILModuleInfo.cpp             \
+       AMDILNIDevice.cpp               \
+       AMDILPeepholeOptimizer.cpp      \
+       AMDILPointerManager.cpp         \
+       AMDILPrintfConvert.cpp          \
+       AMDILRegisterInfo.cpp           \
+       AMDILSIDevice.cpp               \
+       AMDILSubtarget.cpp              \
+       AMDILTargetMachine.cpp          \
+       AMDILUtilityFunctions.cpp       \
+       AMDGPUTargetMachine.cpp         \
+       AMDGPUISelLowering.cpp          \
+       AMDGPUConvertToISA.cpp          \
+       AMDGPULowerShaderInstructions.cpp       \
+       AMDGPUReorderPreloadInstructions.cpp    \
+       AMDGPUInstrInfo.cpp             \
+       AMDGPURegisterInfo.cpp          \
+       AMDGPUUtil.cpp                  \
+       R600CodeEmitter.cpp             \
+       R600ISelLowering.cpp            \
+       R600InstrInfo.cpp               \
+       R600KernelParameters.cpp        \
+       R600LowerInstructions.cpp       \
+       R600LowerShaderInstructions.cpp \
+       R600RegisterInfo.cpp            \
+       SIAssignInterpRegs.cpp          \
+       SICodeEmitter.cpp               \
+       SIConvertToISA.cpp              \
+       SIInstrInfo.cpp                 \
+       SIISelLowering.cpp              \
+       SILowerShaderInstructions.cpp   \
+       SIMachineFunctionInfo.cpp       \
+       SIPropagateImmReads.cpp         \
+       SIRegisterInfo.cpp              \
+       MCTargetDesc/AMDILMCAsmInfo.cpp \
+       MCTargetDesc/AMDILMCTargetDesc.cpp      \
+       TargetInfo/AMDILTargetInfo.cpp  \
+       radeon_llvm_emit.cpp
+
+C_SOURCES := \
+       radeon_setup_tgsi_llvm.c
diff --git a/src/gallium/drivers/radeon/Processors.td b/src/gallium/drivers/radeon/Processors.td
new file mode 100644 (file)
index 0000000..6d1b411
--- /dev/null
@@ -0,0 +1,28 @@
+//===-- Processors.td - AMDIL processor definitions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDIL processors supported.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+def : Proc<"rv710",      R600_EG_Itin, []>;
+def : Proc<"rv730",      R600_EG_Itin, []>;
+def : Proc<"rv770",      R600_EG_Itin, [FeatureFP64]>;
+def : Proc<"cedar",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"redwood",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"juniper",    R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cypress",    R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"barts",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"turks",      R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"caicos",     R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cayman",     R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"SI", SI_Itin, []>;
+
diff --git a/src/gallium/drivers/radeon/R600CodeEmitter.cpp b/src/gallium/drivers/radeon/R600CodeEmitter.cpp
new file mode 100644 (file)
index 0000000..d5f82cf
--- /dev/null
@@ -0,0 +1,776 @@
+//===-- R600CodeEmitter.cpp - R600 machine code emitter -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass emits R600 machine instructions as a stream of bytes that the
+// r600g driver parses and re-assembles with its r600_asm shader assembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILCodeEmitter.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILUtilityFunctions.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <stdio.h>
+
+#define SRC_BYTE_COUNT 11
+#define DST_BYTE_COUNT 5
+
+using namespace llvm;
+
+namespace {
+
+  class R600CodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter {
+
+  private:
+
+  static char ID;
+  formatted_raw_ostream &_OS;
+  const TargetMachine * TM;
+  const MachineRegisterInfo * MRI;
+  AMDILMachineFunctionInfo * MFI;
+  const R600RegisterInfo * TRI;
+  bool evergreenEncoding;
+
+  bool isReduction;
+  unsigned reductionElement;
+  bool isLast;
+
+  unsigned section_start;
+
+  public:
+
+  R600CodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
+      _OS(OS), TM(NULL), evergreenEncoding(false), isReduction(false),
+      isLast(true) { }
+
+  const char *getPassName() const { return "AMDGPU Machine Code Emitter"; }
+
+  bool runOnMachineFunction(MachineFunction &MF);
+  virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                     const MachineOperand &MO) const;
+
+  private:
+
+  void emitALUInstr(MachineInstr  &MI);
+  void emitSrc(const MachineOperand & MO);
+  void emitDst(const MachineOperand & MO);
+  void emitALU(MachineInstr &MI, unsigned numSrc);
+  void emitTexInstr(MachineInstr &MI);
+  void emitFCInstr(MachineInstr &MI);
+
+  unsigned int getHWInst(const MachineInstr &MI);
+
+  void emitNullBytes(unsigned int byteCount);
+
+  void emitByte(unsigned int byte);
+
+  void emitTwoBytes(uint32_t bytes);
+
+  void emit(uint32_t value);
+  void emit(uint64_t value);
+
+  unsigned getHWReg(unsigned regNo) const;
+
+  unsigned getElement(unsigned regNo);
+
+};
+
+} /* End anonymous namespace */
+
+#define WRITE_MASK_X 0x1
+#define WRITE_MASK_Y 0x2
+#define WRITE_MASK_Z 0x4
+#define WRITE_MASK_W 0x8
+
+enum RegElement {
+  ELEMENT_X = 0,
+  ELEMENT_Y,
+  ELEMENT_Z,
+  ELEMENT_W
+};
+
+enum InstrTypes {
+  INSTR_ALU = 0,
+  INSTR_TEX,
+  INSTR_FC,
+  INSTR_NATIVE,
+  INSTR_VTX
+};
+
+enum FCInstr {
+  FC_IF = 0,
+  FC_ELSE,
+  FC_ENDIF,
+  FC_BGNLOOP,
+  FC_ENDLOOP,
+  FC_BREAK,
+  FC_BREAK_NZ_INT,
+  FC_CONTINUE,
+  FC_BREAK_Z_INT
+};
+
+enum TextureTypes {
+  TEXTURE_1D = 1,
+  TEXTURE_2D,
+  TEXTURE_3D,
+  TEXTURE_CUBE,
+  TEXTURE_RECT,
+  TEXTURE_SHADOW1D,
+  TEXTURE_SHADOW2D,
+  TEXTURE_SHADOWRECT,
+  TEXTURE_1D_ARRAY,
+  TEXTURE_2D_ARRAY,
+  TEXTURE_SHADOW1D_ARRAY,
+  TEXTURE_SHADOW2D_ARRAY
+};
+
+char R600CodeEmitter::ID = 0;
+
+FunctionPass *llvm::createR600CodeEmitterPass(formatted_raw_ostream &OS) {
+  return new R600CodeEmitter(OS);
+}
+
+bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+
+  TM = &MF.getTarget();
+  MRI = &MF.getRegInfo();
+  MFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  TRI = static_cast<const R600RegisterInfo *>(TM->getRegisterInfo());
+  const AMDILSubtarget &STM = TM->getSubtarget<AMDILSubtarget>();
+  std::string gpu = STM.getDeviceName();
+  if (!gpu.compare(0,3, "rv7")) {
+    evergreenEncoding = false;
+  } else {
+    evergreenEncoding = true;
+  }
+  const AMDGPUTargetMachine *amdtm =
+    static_cast<const AMDGPUTargetMachine *>(&MF.getTarget());
+
+  if (amdtm->shouldDumpCode()) {
+    MF.dump();
+  }
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+     MachineBasicBlock &MBB = *BB;
+     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                       I != E; ++I) {
+          MachineInstr &MI = *I;
+          if (MI.getNumOperands() > 1 && MI.getOperand(0).isReg() && MI.getOperand(0).isDead()) {
+            continue;
+          }
+          if (isTexOp(MI.getOpcode())) {
+            emitTexInstr(MI);
+          } else if (isFCOp(MI.getOpcode())){
+            emitFCInstr(MI);
+          } else if (isReductionOp(MI.getOpcode())) {
+            isReduction = true;
+            isLast = false;
+            for (reductionElement = 0; reductionElement < 4; reductionElement++) {
+              isLast = (reductionElement == 3);
+              emitALUInstr(MI);
+            }
+            isReduction = false;
+          } else if (MI.getOpcode() == AMDIL::RETURN) {
+            continue;
+          } else {
+            switch(MI.getOpcode()) {
+            case AMDIL::RAT_WRITE_CACHELESS_eg:
+              {
+                /* XXX: Support for autoencoding 64-bit instructions was added
+                 * in LLVM 3.1.  Until we drop support for 3.0, we will use Magic
+                 * numbers for the high bits. */
+                  uint64_t high = 0x95c0100000000000;
+                  uint64_t inst = getBinaryCodeForInstr(MI);
+                  inst |= high;
+                /* Set End Of Program bit */
+                /* XXX: Need better check of end of program.  EOP should be
+                 * encoded in one of the operands of the MI, and it should be
+                 * set in a prior pass. */
+                MachineBasicBlock::iterator NextI = llvm::next(I);
+                MachineInstr &NextMI = *NextI;
+                if (NextMI.getOpcode() == AMDIL::RETURN) {
+                  inst |= (((uint64_t)1) << 53);
+                }
+                emitByte(INSTR_NATIVE);
+                emit(inst);
+                break;
+              }
+            case AMDIL::VTX_READ_eg:
+              {
+                emitByte(INSTR_VTX);
+                /* inst */
+                emitByte(0);
+
+                /* fetch_type */
+                emitByte(2);
+
+                /* buffer_id */
+                emitByte(MI.getOperand(2).getImm());
+
+                /* src_gpr */
+                emitByte(getHWReg(MI.getOperand(1).getReg()));
+
+                /* src_sel_x */
+                emitByte(TRI->getHWRegChan(MI.getOperand(1).getReg()));
+
+                /* mega_fetch_count */
+                emitByte(3);
+
+                /* dst_gpr */
+                emitByte(getHWReg(MI.getOperand(0).getReg()));
+
+                /* dst_sel_x */
+                emitByte(0);
+
+                /* dst_sel_y */
+                emitByte(7);
+
+                /* dst_sel_z */
+                emitByte(7);
+
+                /* dst_sel_w */
+                emitByte(7);
+
+                /* use_const_fields */
+                emitByte(1);
+
+                /* data_format */
+                emitByte(0);
+
+                /* num_format_all */
+                emitByte(0);
+
+                /* format_comp_all */
+                emitByte(0);
+
+                /* srf_mode_all */
+                emitByte(0);
+
+                /* offset */
+                emitByte(0);
+
+                /* endian */
+                emitByte(0);
+                break;
+              }
+
+            default:
+              emitALUInstr(MI);
+              break;
+          }
+        }
+    }
+  }
+  return false;
+}
+
+void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
+{
+
+  unsigned numOperands = MI.getNumOperands();
+
+  /* Some instructions are just placeholder instructions that represent
+   * operations that the GPU does automatically.  They should be ignored. */
+  if (isPlaceHolderOpcode(MI.getOpcode())) {
+    return;
+  }
+
+  /* We need to handle some opcodes differently */
+  switch (MI.getOpcode()) {
+    default: break;
+
+    /* Custom swizzle instructions, ignore the last two operands */
+    case AMDIL::SET_CHAN:
+      numOperands = 2;
+      break;
+
+    case AMDIL::VEXTRACT_v4f32:
+      numOperands = 2;
+      break;
+
+    /* XXX: Temp Hack */
+    case AMDIL::STORE_OUTPUT:
+      numOperands = 2;
+      break;
+  }
+
+  /* XXX Check if instruction writes a result */
+  if (numOperands < 1) {
+    return;
+  }
+  const MachineOperand dstOp = MI.getOperand(0);
+
+  /* Emit instruction type */
+  emitByte(INSTR_ALU);
+
+  unsigned int opIndex;
+  for (opIndex = 1; opIndex < numOperands; opIndex++) {
+    /* Literal constants are always stored as the last operand. */
+    if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
+      break;
+    }
+    emitSrc(MI.getOperand(opIndex));
+  }
+
+    /* Emit zeros for unused sources */
+  for ( ; opIndex < 4; opIndex++) {
+    emitNullBytes(SRC_BYTE_COUNT);
+  }
+
+  emitDst(dstOp);
+
+  emitALU(MI, numOperands - 1);
+}
+
+void R600CodeEmitter::emitSrc(const MachineOperand & MO)
+{
+  uint32_t value = 0;
+  /* Emit the source select (2 bytes).  For GPRs, this is the register index.
+   * For other potential instruction operands, (e.g. constant registers) the
+   * value of the source select is defined in the r600isa docs. */
+  if (MO.isReg()) {
+    unsigned reg = MO.getReg();
+    emitTwoBytes(getHWReg(reg));
+    if (reg == AMDIL::ALU_LITERAL_X) {
+      const MachineInstr * parent = MO.getParent();
+      unsigned immOpIndex = parent->getNumOperands() - 1;
+      MachineOperand immOp = parent->getOperand(immOpIndex);
+      if (immOp.isFPImm()) {
+        value = immOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue();
+      } else {
+        assert(immOp.isImm());
+        value = immOp.getImm();
+      }
+    }
+  } else {
+    /* XXX: Handle other operand types. */
+    emitTwoBytes(0);
+  }
+
+  /* Emit the source channel (1 byte) */
+  if (isReduction) {
+    emitByte(reductionElement);
+  } else if (MO.isReg()) {
+    const MachineInstr * parent = MO.getParent();
+    /* The source channel for EXTRACT is stored in operand 2. */
+    if (parent->getOpcode() == AMDIL::VEXTRACT_v4f32) {
+      emitByte(parent->getOperand(2).getImm());
+    } else {
+      emitByte(TRI->getHWRegChan(MO.getReg()));
+    }
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit isNegated (1 byte) */
+  if ((!(MO.getTargetFlags() & MO_FLAG_ABS))
+      && (MO.getTargetFlags() & MO_FLAG_NEG ||
+     (MO.isReg() &&
+      (MO.getReg() == AMDIL::NEG_ONE || MO.getReg() == AMDIL::NEG_HALF)))){
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* Emit isAbsolute (1 byte) */
+  if (MO.getTargetFlags() & MO_FLAG_ABS) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit relative addressing mode (1 byte) */
+  emitByte(0);
+
+  /* Emit kc_bank.  This will be adjusted later by r600_asm. */
+  emitByte(0);
+
+  /* Emit the literal value, if applicable (4 bytes).  */
+  emit(value);
+
+}
+
+void R600CodeEmitter::emitDst(const MachineOperand & MO)
+{
+  if (MO.isReg()) {
+    /* Emit the destination register index (1 byte) */
+    emitByte(getHWReg(MO.getReg()));
+
+    /* Emit the element of the destination register (1 byte)*/
+    const MachineInstr * parent = MO.getParent();
+    if (isReduction) {
+      emitByte(reductionElement);
+
+    /* The destination element for SET_CHAN is stored in the 3rd operand. */
+    } else if (parent->getOpcode() == AMDIL::SET_CHAN) {
+      emitByte(parent->getOperand(2).getImm());
+    } else if (parent->getOpcode() == AMDIL::VCREATE_v4f32) {
+      emitByte(ELEMENT_X);
+    } else {
+      emitByte(TRI->getHWRegChan(MO.getReg()));
+    }
+
+    /* Emit isClamped (1 byte) */
+    if (MO.getTargetFlags() & MO_FLAG_CLAMP) {
+      emitByte(1);
+    } else {
+      emitByte(0);
+    }
+
+    /* Emit writemask (1 byte).  */
+    if ((isReduction && reductionElement != TRI->getHWRegChan(MO.getReg()))
+         || MO.getTargetFlags() & MO_FLAG_MASK) {
+      emitByte(0);
+    } else {
+      emitByte(1);
+    }
+
+    /* XXX: Emit relative addressing mode */
+    emitByte(0);
+  } else {
+    /* XXX: Handle other operand types.  Are there any for destination regs? */
+    emitNullBytes(DST_BYTE_COUNT);
+  }
+}
+
+void R600CodeEmitter::emitALU(MachineInstr &MI, unsigned numSrc)
+{
+  /* Emit the instruction (2 bytes) */
+  emitTwoBytes(getHWInst(MI));
+
+  /* Emit isLast (for this instruction group) (1 byte) */
+  if (isLast) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+  /* Emit isOp3 (1 byte) */
+  if (numSrc == 3) {
+    emitByte(1);
+  } else {
+    emitByte(0);
+  }
+
+  /* XXX: Emit predicate (1 byte) */
+  emitByte(0);
+
+  /* XXX: Emit bank swizzle. (1 byte)  Do we need this?  It looks like
+   * r600_asm.c sets it. */
+  emitByte(0);
+
+  /* XXX: Emit bank_swizzle_force (1 byte) Not sure what this is for. */
+  emitByte(0);
+
+  /* XXX: Emit OMOD (1 byte) Not implemented. */
+  emitByte(0);
+
+  /* XXX: Emit index_mode.  I think this is for indirect addressing, so we
+   * don't need to worry about it. */
+  emitByte(0);
+}
+
+void R600CodeEmitter::emitTexInstr(MachineInstr &MI)
+{
+
+  int64_t sampler = MI.getOperand(2).getImm();
+  int64_t textureType = MI.getOperand(3).getImm();
+  unsigned opcode = MI.getOpcode();
+  unsigned srcSelect[4] = {0, 1, 2, 3};
+
+  /* Emit instruction type */
+  emitByte(INSTR_TEX);
+
+  /* Emit instruction */
+  emitByte(getHWInst(MI));
+
+  /* XXX: Emit resource id.  r600_shader.c uses sampler + 1; why? */
+  emitByte(sampler + 1 + 1);
+
+  /* Emit source register */
+  emitByte(getHWReg(MI.getOperand(1).getReg()));
+
+  /* XXX: Emit src isRelativeAddress */
+  emitByte(0);
+
+  /* Emit destination register */
+  emitByte(getHWReg(MI.getOperand(0).getReg()));
+
+  /* XXX: Emit dst isRelativeAddress */
+  emitByte(0);
+
+  /* XXX: Emit dst select */
+  emitByte(0); /* X */
+  emitByte(1); /* Y */
+  emitByte(2); /* Z */
+  emitByte(3); /* W */
+
+  /* XXX: Emit lod bias */
+  emitByte(0);
+
+  /* XXX: Emit coord types */
+  unsigned coordType[4] = {1, 1, 1, 1};
+
+  if (textureType == TEXTURE_RECT
+      || textureType == TEXTURE_SHADOWRECT) {
+    coordType[ELEMENT_X] = 0;
+    coordType[ELEMENT_Y] = 0;
+  }
+
+  if (textureType == TEXTURE_1D_ARRAY
+      || textureType == TEXTURE_SHADOW1D_ARRAY) {
+    if (opcode == AMDIL::TEX_SAMPLE_C_L || opcode == AMDIL::TEX_SAMPLE_C_LB) {
+      coordType[ELEMENT_Y] = 0;
+    } else {
+      coordType[ELEMENT_Z] = 0;
+      srcSelect[ELEMENT_Z] = ELEMENT_Y;
+    }
+  } else if (textureType == TEXTURE_2D_ARRAY
+             || textureType == TEXTURE_SHADOW2D_ARRAY) {
+    coordType[ELEMENT_Z] = 0;
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    emitByte(coordType[i]);
+  }
+
+  /* XXX: Emit offsets */
+  emitByte(0); /* X */
+  emitByte(0); /* Y */
+  emitByte(0); /* Z */
+  /* There is no OFFSET_W */
+
+  /* Emit sampler id */
+  emitByte(sampler);
+
+  /* XXX: Emit source select */
+  if ((textureType == TEXTURE_SHADOW1D
+      || textureType == TEXTURE_SHADOW2D
+      || textureType == TEXTURE_SHADOWRECT
+      || textureType == TEXTURE_SHADOW1D_ARRAY)
+      && opcode != AMDIL::TEX_SAMPLE_C_L
+      && opcode != AMDIL::TEX_SAMPLE_C_LB) {
+    srcSelect[ELEMENT_W] = ELEMENT_Z;
+  }
+
+  for (unsigned i = 0; i < 4; i++) {
+    emitByte(srcSelect[i]);
+  }
+}
+
+void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
+{
+  /* Emit instruction type */
+  emitByte(INSTR_FC);
+
+  /* Emit SRC */
+  unsigned numOperands = MI.getNumOperands();
+  if (numOperands > 0) {
+    assert(numOperands == 1);
+    emitSrc(MI.getOperand(0));
+  } else {
+    emitNullBytes(SRC_BYTE_COUNT);
+  }
+
+  /* Emit FC Instruction */
+  enum FCInstr instr;
+  switch (MI.getOpcode()) {
+  case AMDIL::BREAK_LOGICALZ_f32:
+    instr = FC_BREAK;
+    break;
+  case AMDIL::BREAK_LOGICALNZ_i32:
+    instr = FC_BREAK_NZ_INT;
+    break;
+  case AMDIL::BREAK_LOGICALZ_i32:
+    instr = FC_BREAK_Z_INT;
+    break;
+  case AMDIL::CONTINUE_LOGICALNZ_f32:
+    instr = FC_CONTINUE;
+    break;
+  /* XXX: This assumes that all IFs will be if (x != 0).  If we add
+   * optimizations this might not be the case */
+  case AMDIL::IF_LOGICALNZ_f32:
+  case AMDIL::IF_LOGICALNZ_i32:
+    instr = FC_IF;
+    break;
+  case AMDIL::IF_LOGICALZ_f32:
+    abort();
+    break;
+  case AMDIL::ELSE:
+    instr = FC_ELSE;
+    break;
+  case AMDIL::ENDIF:
+    instr = FC_ENDIF;
+    break;
+  case AMDIL::ENDLOOP:
+    instr = FC_ENDLOOP;
+    break;
+  case AMDIL::WHILELOOP:
+    instr = FC_BGNLOOP;
+    break;
+  default:
+    abort();
+    break;
+  }
+  emitByte(instr);
+}
+
+#define INSTR_FLOAT2_V(inst, hw) \
+  case AMDIL:: inst##_v4f32: \
+  case AMDIL:: inst##_v2f32: return HW_INST2(hw);
+
+#define INSTR_FLOAT2_S(inst, hw) \
+  case AMDIL:: inst##_f32: return HW_INST2(hw);
+
+#define INSTR_FLOAT2(inst, hw) \
+  INSTR_FLOAT2_V(inst, hw) \
+  INSTR_FLOAT2_S(inst, hw)
+
+unsigned int R600CodeEmitter::getHWInst(const MachineInstr &MI)
+{
+
+  /* XXX: Lower these to MOV before the code emitter. */
+  switch (MI.getOpcode()) {
+    case AMDIL::STORE_OUTPUT:
+    case AMDIL::VCREATE_v4i32:
+    case AMDIL::VCREATE_v4f32:
+    case AMDIL::VEXTRACT_v4f32:
+    case AMDIL::VINSERT_v4f32:
+    case AMDIL::LOADCONST_i32:
+    case AMDIL::LOADCONST_f32:
+    case AMDIL::MOVE_v4i32:
+    case AMDIL::SET_CHAN:
+    /* Instructions to reinterpret bits as ... */
+    case AMDIL::IL_ASINT_f32:
+    case AMDIL::IL_ASINT_i32:
+    case AMDIL::IL_ASFLOAT_f32:
+    case AMDIL::IL_ASFLOAT_i32:
+      return 0x19;
+
+  default:
+    return getBinaryCodeForInstr(MI);
+  }
+}
+
+void R600CodeEmitter::emitNullBytes(unsigned int byteCount)
+{
+  for (unsigned int i = 0; i < byteCount; i++) {
+    emitByte(0);
+  }
+}
+
+void R600CodeEmitter::emitByte(unsigned int byte)
+{
+  _OS.write((uint8_t) (byte & 0xff));
+}
+
+void R600CodeEmitter::emitTwoBytes(uint32_t bytes)
+{
+  _OS.write((uint8_t) (bytes & 0xff));
+  _OS.write((uint8_t) ((bytes >> 8) & 0xff));
+}
+
+void R600CodeEmitter::emit(uint32_t value)
+{
+  for (unsigned i = 0; i < 4; i++) {
+    _OS.write((uint8_t) ((value >> (8 * i)) & 0xff));
+  }
+}
+
+void R600CodeEmitter::emit(uint64_t value)
+{
+  for (unsigned i = 0; i < 8; i++) {
+    emitByte((value >> (8 * i)) & 0xff);
+  }
+}
+
+unsigned R600CodeEmitter::getHWReg(unsigned regNo) const
+{
+  unsigned hwReg;
+
+  hwReg = TRI->getHWRegIndex(regNo);
+  if (AMDIL::R600_CReg32RegClass.contains(regNo)) {
+    hwReg += 512;
+  }
+  return hwReg;
+}
+
+uint64_t R600CodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                            const MachineOperand &MO) const
+{
+  if (MO.isReg()) {
+    return getHWReg(MO.getReg());
+  } else {
+    return MO.getImm();
+  }
+}
+
+
+RegElement maskBitToElement(unsigned int maskBit)
+{
+  switch (maskBit) {
+    case WRITE_MASK_X: return ELEMENT_X;
+    case WRITE_MASK_Y: return ELEMENT_Y;
+    case WRITE_MASK_Z: return ELEMENT_Z;
+    case WRITE_MASK_W: return ELEMENT_W;
+    default:
+      assert(!"Invalid maskBit");
+      return ELEMENT_X;
+  }
+}
+
+unsigned int dstSwizzleToWriteMask(unsigned swizzle)
+{
+  switch(swizzle) {
+  default:
+  case AMDIL_DST_SWIZZLE_DEFAULT:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X___:
+    return WRITE_MASK_X;
+  case AMDIL_DST_SWIZZLE_XY__:
+    return WRITE_MASK_X | WRITE_MASK_Y;
+  case AMDIL_DST_SWIZZLE_XYZ_:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE_XYZW:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE__Y__:
+    return WRITE_MASK_Y;
+  case AMDIL_DST_SWIZZLE__YZ_:
+    return WRITE_MASK_Y | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE__YZW:
+    return WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE___Z_:
+    return WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE___ZW:
+    return WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE____W:
+    return WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X_ZW:
+    return WRITE_MASK_X | WRITE_MASK_Z | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_XY_W:
+    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE_X_Z_:
+    return WRITE_MASK_X | WRITE_MASK_Z;
+  case AMDIL_DST_SWIZZLE_X__W:
+    return WRITE_MASK_X | WRITE_MASK_W;
+  case AMDIL_DST_SWIZZLE__Y_W:
+    return WRITE_MASK_Y | WRITE_MASK_W;
+  }
+}
+
+#include "AMDILGenCodeEmitter.inc"
+
diff --git a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
new file mode 100644 (file)
index 0000000..396a69f
--- /dev/null
@@ -0,0 +1,171 @@
+#===-- R600GenRegisterInfo.pl - R600 register definition generator -------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===#
+#
+# This script prints the TableGen register definitions for the R600 backend
+# and writes R600HwRegInfo.include, which maps each register to its hardware
+# index and channel.
+#
+#===----------------------------------------------------------------------===#
+
+use strict;
+use warnings;
+
+use AMDGPUConstants;
+
+my $CREG_MAX = CONST_REG_COUNT - 1;
+my $TREG_MAX = TEMP_REG_COUNT - 1;
+
+print <<STRING;
+
+class R600Reg <string name> : Register<name> {
+  let Namespace = "AMDIL";
+}
+
+class R600Reg_128<string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDIL";
+  let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
+}
+
+STRING
+
+my $i;
+
+### REG DEFS ###
+
+my @creg_list = print_reg_defs(CONST_REG_COUNT * 4, "C");
+my @treg_list = print_reg_defs(TEMP_REG_COUNT * 4, "T");
+
+my @t128reg;
+my @treg_x;
+for (my $i = 0; $i < TEMP_REG_COUNT; $i++) {
+  my $name = "T$i\_XYZW";
+  print qq{def $name : R600Reg_128 <"T$i.XYZW", [T$i\_X, T$i\_Y, T$i\_Z, T$i\_W] >;\n};
+  $t128reg[$i] = $name;
+  $treg_x[$i] = "T$i\_X";
+}
+
+my $treg_string = join(",", @treg_list);
+my $creg_list = join(",", @creg_list);
+my $t128_string = join(",", @t128reg);
+my $treg_x_string = join(",", @treg_x);
+print <<STRING;
+
+class RegSet <dag s> {
+  dag set = s;
+}
+
+def ZERO : R600Reg<"0.0">;
+def HALF : R600Reg<"0.5">;
+def ONE : R600Reg<"1.0">;
+def ONE_INT : R600Reg<"1">;
+def NEG_HALF : R600Reg<"-0.5">;
+def NEG_ONE : R600Reg<"-1.0">;
+def PV_X : R600Reg<"pv.x">;
+def ALU_LITERAL_X : R600Reg<"literal.x">;
+
+def R600_CReg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $creg_list)>;
+
+def R600_TReg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $treg_string)>;
+
+def R600_TReg32_X : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $treg_x_string)>;
+
+def R600_Reg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    R600_TReg32,
+    R600_CReg32,
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
+
+def R600_Reg128 : RegisterClass<"AMDIL", [v4f32], 128, (add
+    $t128_string)>
+{
+  let SubRegClasses = [(R600_TReg32 sel_x, sel_y, sel_z, sel_w)];
+}
+
+STRING
+
+my %index_map;
+my %chan_map;
+
+for ($i = 0; $i <= $#creg_list; $i++) {
+  push(@{$index_map{get_hw_index($i)}}, $creg_list[$i]);
+  push(@{$chan_map{get_chan_str($i)}}, $creg_list[$i]);
+}
+
+for ($i = 0; $i <= $#treg_list; $i++) {
+  push(@{$index_map{get_hw_index($i)}}, $treg_list[$i]);
+  push(@{$chan_map{get_chan_str($i)}}, $treg_list[$i]);
+}
+
+for ($i = 0; $i <= $#t128reg; $i++) {
+  push(@{$index_map{$i}}, $t128reg[$i]);
+  push(@{$chan_map{'X'}}, $t128reg[$i]);
+}
+
+open(OUTFILE, ">", "R600HwRegInfo.include") or die "Can't open R600HwRegInfo.include: $!";
+
+print OUTFILE <<STRING;
+
+unsigned R600RegisterInfo::getHWRegIndexGen(unsigned reg) const
+{
+  switch(reg) {
+  default: assert(!"Unknown register"); return 0;
+STRING
+foreach my $key (keys(%index_map)) {
+  foreach my $reg (@{$index_map{$key}}) {
+    print OUTFILE "  case AMDIL::$reg:\n";
+  }
+  print OUTFILE "    return $key;\n\n";
+}
+
+print OUTFILE "  }\n}\n\n";
+
+print OUTFILE <<STRING;
+
+unsigned R600RegisterInfo::getHWRegChanGen(unsigned reg) const
+{
+  switch(reg) {
+  default: assert(!"Unknown register"); return 0;
+STRING
+
+foreach my $key (keys(%chan_map)) {
+  foreach my $reg (@{$chan_map{$key}}) {
+    print OUTFILE " case AMDIL::$reg:\n";
+  }
+  my $val;
+  if ($key eq 'X') {
+    $val = 0;
+  } elsif ($key eq 'Y') {
+    $val = 1;
+  } elsif ($key eq 'Z') {
+    $val = 2;
+  } elsif ($key eq 'W') {
+    $val = 3;
+  } else {
+    die("Unknown chan value: $key");
+  }
+  print OUTFILE "    return $val;\n\n";
+}
+
+print OUTFILE "  }\n}\n\n";
+
+sub print_reg_defs {
+  my ($count, $prefix) = @_;
+
+  my @reg_list;
+
+  for ($i = 0; $i < $count; $i++) {
+    my $hw_index = get_hw_index($i);
+    my $chan= get_chan_str($i);
+    my $name = "$prefix$hw_index\_$chan";
+    print qq{def $name : R600Reg <"$prefix$hw_index.$chan">;\n};
+    $reg_list[$i] = $name;
+  }
+  return @reg_list;
+}
+
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.cpp b/src/gallium/drivers/radeon/R600ISelLowering.cpp
new file mode 100644 (file)
index 0000000..104f4c5
--- /dev/null
@@ -0,0 +1,102 @@
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom DAG lowering for R600: work-item/work-group ID pseudo-instructions
+// become live-in register reads, and implicit kernel parameters become
+// vertex fetches from the constant buffer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo()))
+{
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
+//  setSchedulingPreference(Sched::VLIW);
+}
+
+MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const
+{
+  MachineFunction * MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  switch (MI->getOpcode()) {
+  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  /* XXX: Use helper function from AMDGPULowerShaderInstructions here */
+  case AMDIL::TGID_X:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_X);
+    break;
+  case AMDIL::TGID_Y:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_Y);
+    break;
+  case AMDIL::TGID_Z:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_Z);
+    break;
+  case AMDIL::TIDIG_X:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_X);
+    break;
+  case AMDIL::TIDIG_Y:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_Y);
+    break;
+  case AMDIL::TIDIG_Z:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_Z);
+    break;
+  case AMDIL::NGROUPS_X:
+    lowerImplicitParameter(MI, *BB, MRI, 0);
+    break;
+  case AMDIL::NGROUPS_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 1);
+    break;
+  case AMDIL::NGROUPS_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 2);
+    break;
+  case AMDIL::GLOBAL_SIZE_X:
+    lowerImplicitParameter(MI, *BB, MRI, 3);
+    break;
+  case AMDIL::GLOBAL_SIZE_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 4);
+    break;
+  case AMDIL::GLOBAL_SIZE_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 5);
+    break;
+  case AMDIL::LOCAL_SIZE_X:
+    lowerImplicitParameter(MI, *BB, MRI, 6);
+    break;
+  case AMDIL::LOCAL_SIZE_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 7);
+    break;
+  case AMDIL::LOCAL_SIZE_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 8);
+    break;
+  }
+  MI->eraseFromParent();
+  return BB;
+}
+
+void R600TargetLowering::lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineRegisterInfo & MRI, unsigned dword_offset) const
+{
+  MachineBasicBlock::iterator I = *MI;
+  unsigned offsetReg = MRI.createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+  MRI.setRegClass(MI->getOperand(0).getReg(), &AMDIL::R600_TReg32_XRegClass);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::MOV), offsetReg)
+          .addReg(AMDIL::ALU_LITERAL_X)
+          .addImm(dword_offset * 4);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::VTX_READ_eg))
+          .addOperand(MI->getOperand(0))
+          .addReg(offsetReg)
+          .addImm(0);
+}
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.h b/src/gallium/drivers/radeon/R600ISelLowering.h
new file mode 100644 (file)
index 0000000..fd26bf5
--- /dev/null
@@ -0,0 +1,40 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 DAG lowering interface definition.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600ISELLOWERING_H
+#define R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+
+class R600TargetLowering : public AMDGPUTargetLowering
+{
+public:
+  R600TargetLowering(TargetMachine &TM);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const;
+
+private:
+  const R600InstrInfo * TII;
+
+  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+      MachineRegisterInfo & MRI, unsigned dword_offset) const;
+
+};
+
+} // End namespace llvm
+
+#endif // R600ISELLOWERING_H
diff --git a/src/gallium/drivers/radeon/R600InstrFormats.td b/src/gallium/drivers/radeon/R600InstrFormats.td
new file mode 100644 (file)
index 0000000..0890eb6
--- /dev/null
@@ -0,0 +1,16 @@
+//===-- R600InstrFormats.td - R600 Instruction Formats -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Instruction format classes for the R600 backend.
+//
+//===----------------------------------------------------------------------===//
+
+
+class ALUInst <bits<10> op, dag outs, dag ins, string asm, list<dag> pattern>
+  : InstR600 <op, outs, ins, asm, pattern>
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.cpp b/src/gallium/drivers/radeon/R600InstrInfo.cpp
new file mode 100644 (file)
index 0000000..bcee89c
--- /dev/null
@@ -0,0 +1,109 @@
+//===-- R600InstrInfo.cpp - R600 Instruction Information -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 implementation of TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600RegisterInfo.h"
+
+using namespace llvm;
+
+R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
+  : AMDGPUInstrInfo(tm),
+    RI(tm, *this),
+    TM(tm)
+  { }
+
+const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const
+{
+  return RI;
+}
+
+bool R600InstrInfo::isTrig(const MachineInstr &MI) const
+{
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
+}
+
+void
+R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const
+{
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg)
+      && AMDIL::GPRI32RegClass.contains(SrcReg)) {
+    SrcReg = AMDIL::T0_X;
+  }
+  BuildMI(MBB, MI, DL, get(AMDIL::MOV), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned R600InstrInfo::getISAOpcode(unsigned opcode) const
+{
+  switch (opcode) {
+    default: return AMDGPUInstrInfo::getISAOpcode(opcode);
+    case AMDIL::CUSTOM_ADD_i32:
+      return AMDIL::ADD_INT;
+    case AMDIL::CUSTOM_XOR_i32:
+      return AMDIL::XOR_INT;
+    case AMDIL::MOVE_f32:
+    case AMDIL::MOVE_i32:
+      return AMDIL::MOV;
+    case AMDIL::SHR_i32:
+      return getLSHRop();
+  }
+}
+
+unsigned R600InstrInfo::getLSHRop() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::LSHR_r600;
+  } else {
+    return AMDIL::LSHR_eg;
+  }
+}
+
+unsigned R600InstrInfo::getMULHI_UINT() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::MULHI_UINT_r600;
+  } else {
+    return AMDIL::MULHI_UINT_eg;
+  }
+}
+
+unsigned R600InstrInfo::getMULLO_UINT() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::MULLO_UINT_r600;
+  } else {
+    return AMDIL::MULLO_UINT_eg;
+  }
+}
+
+unsigned R600InstrInfo::getRECIP_UINT() const
+{
+  const AMDILDevice * dev = TM.getSubtarget<AMDILSubtarget>().device();
+
+  if (dev->getGeneration() < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::RECIP_UINT_r600;
+  } else if (dev->getDeviceFlag() != OCL_DEVICE_CAYMAN) {
+    return AMDIL::RECIP_UINT_eg;
+  } else {
+    return AMDIL::RECIP_UINT_cm;
+  }
+}
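The getLSHRop/getMULHI_UINT/getMULLO_UINT/getRECIP_UINT helpers above all follow the same pattern: pick an opcode encoding based on the GPU generation reported by the subtarget. A minimal standalone sketch of that dispatch, with hypothetical enums standing in for the AMDILDeviceInfo generations (the two opcode values are the ones R600Instructions.td assigns to LSHR_r600 and LSHR_eg):

```cpp
#include <cassert>

// Hypothetical stand-ins for the AMDILDeviceInfo generation enum and the
// AMDIL opcode values (0x71/0x16 taken from R600Instructions.td).
enum Generation { HD4XXX = 4, HD5XXX = 5, HD6XXX = 6 };
enum Opcode { LSHR_r600 = 0x71, LSHR_eg = 0x16 };

// Mirrors R600InstrInfo::getLSHRop(): pre-Evergreen chips (HD4XXX) use the
// r600 encoding; everything newer uses the Evergreen encoding.
unsigned getLSHRop(Generation gen) {
  return gen < HD5XXX ? LSHR_r600 : LSHR_eg;
}
```

The same shape repeats for every opcode whose encoding changed between r600 and Evergreen, with Cayman adding a third branch in getRECIP_UINT.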
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.h b/src/gallium/drivers/radeon/R600InstrInfo.h
new file mode 100644 (file)
index 0000000..aedaa9f
--- /dev/null
@@ -0,0 +1,74 @@
+//===-- R600InstrInfo.h - R600 Instruction Information --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600INSTRUCTIONINFO_H_
+#define R600INSTRUCTIONINFO_H_
+
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "R600RegisterInfo.h"
+
+#include <map>
+
+namespace llvm {
+
+  struct InstrGroup {
+    unsigned amdil;
+    unsigned r600;
+    unsigned eg;
+    unsigned cayman;
+  };
+
+  class AMDGPUTargetMachine;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class R600InstrInfo : public AMDGPUInstrInfo {
+  private:
+  const R600RegisterInfo RI;
+  AMDGPUTargetMachine &TM;
+
+  public:
+  explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+
+  const R600RegisterInfo &getRegisterInfo() const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual unsigned getISAOpcode(unsigned opcode) const;
+  bool isTrig(const MachineInstr &MI) const;
+
+  unsigned getLSHRop() const;
+  unsigned getMULHI_UINT() const;
+  unsigned getMULLO_UINT() const;
+  unsigned getRECIP_UINT() const;
+
+  };
+
+} // End llvm namespace
+
+namespace R600_InstFlag {
+  enum TIF {
+    TRANS_ONLY = (1 << 0),
+    TEX = (1 << 1),
+    REDUCTION = (1 << 2),
+    FC = (1 << 3),
+    TRIG = (1 << 4),
+    OP3 = (1 << 5)
+  };
+}
+
+#endif // R600INSTRUCTIONINFO_H_
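The R600_InstFlag bits declared in this header are mirrored into the per-instruction TSFlags word by the TableGen definitions in R600Instructions.td (TSFlags{4} = Trig, TSFlags{5} = Op3), which is what lets R600InstrInfo::isTrig() test a single bit. A minimal sketch of that flag test, independent of LLVM:

```cpp
#include <cassert>
#include <cstdint>

// Same bit layout as the R600_InstFlag::TIF enum in this header.
enum TIF : uint64_t {
  TRANS_ONLY = 1 << 0,
  TEX        = 1 << 1,
  REDUCTION  = 1 << 2,
  FC         = 1 << 3,
  TRIG       = 1 << 4,
  OP3        = 1 << 5
};

// Analogous to R600InstrInfo::isTrig(): test one bit of the per-opcode
// TSFlags word that TableGen emitted for this instruction.
bool isTrig(uint64_t tsFlags) { return (tsFlags & TRIG) != 0; }
```

Keeping the enum values in sync with the TSFlags{N} assignments in the .td file is the invariant this scheme depends on.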
diff --git a/src/gallium/drivers/radeon/R600Instructions.td b/src/gallium/drivers/radeon/R600Instructions.td
new file mode 100644 (file)
index 0000000..913e27f
--- /dev/null
@@ -0,0 +1,931 @@
+//===-- R600Instructions.td - R600 Instruction definitions ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+
+class InstR600 <bits<32> inst, dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<32> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+
+  let Inst = inst;
+  let Namespace = "AMDIL";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+}
+
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst <outs, ins, asm, pattern>
+{
+  field bits<64> Inst;
+
+  let Namespace = "AMDIL";
+}
+
+def MEMri : Operand<iPTRAny> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+
+class R600_ALU {
+
+  bits<7> DST_GPR = 0;
+  bits<9> SRC0_SEL = 0;
+  bits<1> SRC0_NEG = 0;
+  bits<9> SRC1_SEL = 0;
+  bits<1> SRC1_NEG = 0;
+  bits<1> CLAMP = 0;
+  
+}
+
+
+class R600_1OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src, variable_ops),
+          !strconcat(opName, " $dst, $src"),
+          pattern,
+          itin
+  >;
+
+class R600_2OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src0, R600_Reg32:$src1, variable_ops),
+          !strconcat(opName, " $dst, $src0, $src1"),
+          pattern,
+          itin
+  >;
+
+class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2, variable_ops),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
+          pattern,
+          itin>{
+
+    let Op3 = 1;
+  }
+
+class R600_REDUCTION <bits<32> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          ins,
+          asm,
+          pattern,
+          itin
+
+  >;
+
+class R600_TEX <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg128:$dst),
+          (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
+          pattern,
+          itin
+  >;
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || TType == 11 || TType == 12;
+  }]
+>;
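TEX_SHADOW above is a PatLeaf that matches the texture-target immediates carrying a shadow comparison. The same predicate, written as a plain function (the specific target numbering is copied from the PatLeaf body above, not independently verified against the TGSI headers):

```cpp
#include <cassert>
#include <cstdint>

// Same test as the TEX_SHADOW PatLeaf: texture targets 6-8, 11 and 12
// are the shadow sampler types in this numbering.
bool isShadowTexTarget(uint32_t tType) {
  return (tType >= 6 && tType <= 8) || tType == 11 || tType == 12;
}
```

This is the check that steers shadow samples toward the TEX_SAMPLE_C* opcodes defined later in this file.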
+
+class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
+                 string asm> :
+    InstR600ISA <outs, ins, asm, []>
+{
+  bits<7>  RW_GPR;
+  bits<7>  INDEX_GPR;
+  bits<4>  RAT_ID;
+
+  bits<2>  RIM;
+  bits<2>  TYPE;
+  bits<1>  RW_REL;
+  bits<2>  ELEM_SIZE;
+
+  bits<12> ARRAY_SIZE;
+  bits<4>  COMP_MASK;
+  bits<4>  BURST_COUNT;
+  bits<1>  VPM;
+  bits<1>  EOP;
+  bits<1>  MARK;
+  bits<1>  BARRIER;
+
+  /* CF_ALLOC_EXPORT_WORD0_RAT */
+  let Inst{3-0}   = RAT_ID;
+  let Inst{9-4}   = rat_inst;
+  let Inst{10}    = 0; /* Reserved */
+  let Inst{12-11} = RIM;
+  let Inst{14-13} = TYPE;
+  let Inst{21-15} = RW_GPR;
+  let Inst{22}    = RW_REL;
+  let Inst{29-23} = INDEX_GPR;
+  let Inst{31-30} = ELEM_SIZE;
+
+  /* CF_ALLOC_EXPORT_WORD1_BUF */
+/* XXX: We can't have auto encoding of 64-bit instructions until LLVM 3.1 :( */
+/*
+  let Inst{43-32} = ARRAY_SIZE;
+  let Inst{47-44} = COMP_MASK;
+  let Inst{51-48} = BURST_COUNT;
+  let Inst{52}    = VPM;
+  let Inst{53}    = EOP;
+  let Inst{61-54} = cf_inst;
+  let Inst{62}    = MARK;
+  let Inst{63}    = BARRIER;
+*/
+}
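The `let Inst{...}` assignments above pack the CF_ALLOC_EXPORT_WORD0_RAT fields into the low 32 bits of the instruction word. A sketch of the same packing in plain C++, with the field widths and positions copied from the let statements (bit 10 is reserved and left zero):

```cpp
#include <cassert>
#include <cstdint>

// Packs CF_ALLOC_EXPORT_WORD0_RAT the same way the let Inst{...}
// assignments in EG_CF_RAT do.
uint32_t packRatWord0(uint32_t ratId, uint32_t ratInst, uint32_t rim,
                      uint32_t type, uint32_t rwGpr, uint32_t rwRel,
                      uint32_t indexGpr, uint32_t elemSize) {
  uint32_t w = 0;
  w |= (ratId    & 0xF)  << 0;   // Inst{3-0}
  w |= (ratInst  & 0x3F) << 4;   // Inst{9-4}
  w |= (rim      & 0x3)  << 11;  // Inst{12-11}, bit 10 reserved
  w |= (type     & 0x3)  << 13;  // Inst{14-13}
  w |= (rwGpr    & 0x7F) << 15;  // Inst{21-15}
  w |= (rwRel    & 0x1)  << 22;  // Inst{22}
  w |= (indexGpr & 0x7F) << 23;  // Inst{29-23}
  w |= (elemSize & 0x3)  << 30;  // Inst{31-30}
  return w;
}
```

WORD1 cannot be auto-encoded the same way yet, per the XXX note above, because LLVM 3.0 cannot statically set bits above 31.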
+
+/*
+def store_global : PatFrag<(ops node:$value, node:$ptr),
+                           (store node:$value, node:$ptr),
+                           [{
+                            const Value *Src;
+                            const PointerType *PT;
+                            if ((Src = cast<StoreSDNode>(N)->getSrcValue()) &&
+                                (PT = dyn_cast<PointerType>(Src->getType()))) {
+                              return PT->getAddressSpace() == 1;
+                            }
+                            return false;
+                           }]>;
+
+*/
+
+def load_param : PatFrag<(ops node:$ptr),
+                         (load node:$ptr),
+                          [{
+                           /* XXX: Always accept for now; the address-space
+                            * check below is never reached. */
+                           return true;
+                           const Value *Src = cast<LoadSDNode>(N)->getSrcValue();
+                           if (Src) {
+                                PointerType * PT = dyn_cast<PointerType>(Src->getType());
+                                return PT && PT->getAddressSpace() == AMDILAS::PARAM_I_ADDRESS;
+                           }
+                           return false;
+                          }]>;
+
+//class EG_CF <bits<32> inst, string asm> :
+//    InstR600 <inst, (outs), (ins), asm, []>;
+
+/* XXX: We will use this when we emit the real ISA.
+  bits<24> ADDR = 0;
+  bits<3> JTS = 0;
+
+  bits<3> PC = 0;
+  bits<5> CF_CONST = 0;
+  bits<2> COND = 0;
+  bits<6> COUNT = 0;
+  bits<1> VPM = 0;
+  bits<1> EOP = 0;
+  bits<8> CF_INST = 0;
+  bits<1> WQM = 0;
+  bits<1> B = 0;
+
+  let Inst{23-0} = ADDR;
+  let Inst{26-24} = JTS;
+  let Inst{34-32} = PC;
+  let Inst{39-35} = CF_CONST;
+  let Inst{41-40} = COND;
+  let Inst{47-42} = COUNT;
+  let Inst{52} = VPM;
+  let Inst{53} = EOP;
+  let Inst{61-54} = CF_INST;
+  let Inst{62} = WQM;
+  let Inst{63} = B;
+//}
+*/
+def isR600 : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDILDeviceInfo::HD4XXX">;
+def isEG : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDILDeviceInfo::HD5XXX && "
+                            "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+def isCayman : Predicate<"Subtarget.device()"
+                            "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
+def isEGorCayman : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDILDeviceInfo::HD5XXX">;
+
+def isR600toCayman : Predicate<
+                     "Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX">;
+
+
+let Predicates = [isR600toCayman] in { 
+
+/* ------------------------------------------- */
+/* Common Instructions R600, R700, Evergreen, Cayman */
+/* ------------------------------------------- */
+let Gen = AMDGPUGen.R600_CAYMAN  in {
+
+def ADD : R600_2OP <
+  0x0, "ADD",
+  [(set R600_Reg32:$dst, (fadd R600_Reg32:$src0, R600_Reg32:$src1))] > {
+  let AMDILOp = AMDILInst.ADD_f32;
+}
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP <
+  0x1, "MUL NON-IEEE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_mul R600_Reg32:$src0, R600_Reg32:$src1))]
+>;
+
+def MUL_IEEE : R600_2OP <
+  0x2, "MUL_IEEE",
+  [(set R600_Reg32:$dst, (fmul R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MUL_IEEE_f32;
+}
+
+def MAX : R600_2OP <
+  0x3, "MAX",
+  [(set R600_Reg32:$dst, (int_AMDIL_max R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MAX_f32;
+}
+
+def MIN : R600_2OP <
+  0x4, "MIN",
+  [(set R600_Reg32:$dst, (int_AMDIL_min R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MIN_f32;
+}
+
+/* For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+ * so some of the instruction names don't match the asm string.
+ * XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+ */
+
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_seq R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FEQ;
+}
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sgt R600_Reg32:$src0, R600_Reg32:$src1))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sge R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FGE;
+}
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sne R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FNE;
+}
+
+def FRACT : R600_1OP <
+  0x10, "FRACT",
+  []> {
+  let AMDILOp = AMDILInst.FRAC_f32;
+}
+
+def TRUNC : R600_1OP <
+  0x11, "TRUNC",
+  [(set R600_Reg32:$dst, (int_AMDGPU_trunc R600_Reg32:$src))]
+>;
+
+def FLOOR : R600_1OP <
+  0x14, "FLOOR",
+  [(set R600_Reg32:$dst, (int_AMDGPU_floor R600_Reg32:$src))]
+>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+def KILLGT : R600_2OP <
+  0x2D, "KILLGT",
+  []
+>;
+
+def AND_INT : R600_2OP <
+  0x30, "AND_INT",
+  []> {
+  let AMDILOp = AMDILInst.AND_i32;
+}
+
+def XOR_INT : R600_2OP <
+  0x32, "XOR_INT",
+  []
+>;
+
+def ADD_INT : R600_2OP <
+  0x34, "ADD_INT",
+  []> {
+  let AMDILOp = AMDILInst.ADD_i32;
+}
+
+def SUB_INT : R600_2OP <
+  0x35, "SUB_INT",
+  []
+>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  []> {
+  let AMDILOp = AMDILInst.IEQ;
+}
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  []
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  []> {
+  let AMDILOp = AMDILInst.IGE;
+}
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  []> {
+  let AMDILOp = AMDILInst.INE;
+}
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  []> {
+  let AMDILOp = AMDILInst.UGT;
+}
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  []> {
+  let AMDILOp = AMDILInst.UGE;
+}
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  []
+>;
+
+/* Texture instructions */
+
+def TEX_SAMPLE : R600_TEX <
+  0x10, "TEX_SAMPLE",
+  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C : R600_TEX <
+  0x18, "TEX_SAMPLE_C",
+  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_L : R600_TEX <
+  0x11, "TEX_SAMPLE_L",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_L : R600_TEX <
+  0x19, "TEX_SAMPLE_C_L",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_LB : R600_TEX <
+  0x12, "TEX_SAMPLE_LB",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_LB : R600_TEX <
+  0x1A, "TEX_SAMPLE_C_LB",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_G : R600_TEX <
+  0x14, "TEX_SAMPLE_G",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_G : R600_TEX <
+  0x1C, "TEX_SAMPLE_C_G",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+} // End Gen R600_CAYMAN
+
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+/* Helper classes for common instructions */
+
+class MUL_LIT_Common <bits<32> inst> : R600_3OP <
+  inst, "MUL_LIT",
+  []
+>;
+
+class MULADD_Common <bits<32> inst> : R600_3OP <
+  inst, "MULADD",
+  []> {
+  let AMDILOp = AMDILInst.MAD_f32;
+}
+
+class CNDE_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDE",
+  []> {
+  let AMDILOp = AMDILInst.CMOVLOG_f32;
+}
+
+class CNDGT_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDGT",
+  []
+>;
+  
+class CNDGE_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDGE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_cndlt R600_Reg32:$src0, R600_Reg32:$src2, R600_Reg32:$src1))]
+>;
+
+class DOT4_Common <bits<32> inst> : R600_REDUCTION <
+  inst,
+  (ins R600_Reg128:$src0, R600_Reg128:$src1),
+  "DOT4 $dst, $src0, $src1",
+  [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
+>;
+
+class EXP_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "EXP_IEEE",
+  []> {
+  let AMDILOp = AMDILInst.EXP_f32;
+}
+
+class FLT_TO_INT_Common <bits<32> inst> : R600_1OP <
+  inst, "FLT_TO_INT", []> {
+  let AMDILOp = AMDILInst.FTOI;
+}
+
+class INT_TO_FLT_Common <bits<32> inst> : R600_1OP <
+  inst, "INT_TO_FLT", []> {
+  let AMDILOp = AMDILInst.ITOF;
+}
+
+class LOG_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "LOG_CLAMPED",
+  []
+>;
+
+class LOG_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "LOG_IEEE",
+  []> {
+  let AMDILOp = AMDILInst.LOG_f32;
+}
+
+class LSHL_Common <bits<32> inst> : R600_2OP <
+  inst, "LSHL",
+  []> {
+  let AMDILOp = AMDILInst.SHL_i32;
+}
+
+class LSHR_Common <bits<32> inst> : R600_2OP <
+  inst, "LSHR",
+  []> {
+  let AMDILOp = AMDILInst.USHR_i32;
+}
+
+class MULHI_INT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULHI_INT",
+  []> {
+  let AMDILOp = AMDILInst.SMULHI_i32;
+}
+
+class MULHI_UINT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULHI_UINT",
+  []
+>;
+
+class MULLO_INT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULLO_INT",
+  []> {
+  let AMDILOp = AMDILInst.SMUL_i32;
+}
+
+class MULLO_UINT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULLO_UINT",
+  []
+>;
+
+class RECIP_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_CLAMPED",
+  []
+>;
+
+class RECIP_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_IEEE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_rcp R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.RSQ_f32;
+}
+
+class RECIP_UINT_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_UINT",
+  []
+>;
+
+class RECIPSQRT_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIPSQRT_CLAMPED",
+  [(set R600_Reg32:$dst, (int_AMDGPU_rsq R600_Reg32:$src))]
+>;
+
+class RECIPSQRT_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIPSQRT_IEEE",
+  []
+>;
+
+class SIN_Common <bits<32> inst> : R600_1OP <
+  inst, "SIN",
+  []>{
+  let AMDILOp = AMDILInst.SIN_f32;
+  let Trig = 1;
+}
+
+class COS_Common <bits<32> inst> : R600_1OP <
+  inst, "COS",
+  []> {
+  let AMDILOp = AMDILInst.COS_f32;
+  let Trig = 1;
+}
+
+/* Helper patterns for complex intrinsics */
+/* -------------------------------------- */
+
+class DIV_Common <InstR600 recip_ieee> : Pat<
+  (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+
+class LRP_Common <InstR600 muladd> : Pat <
+  (int_AMDGPU_lrp R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  (muladd R600_Reg32:$src0, R600_Reg32:$src1, (MUL (SUB_f32 ONE, R600_Reg32:$src0), R600_Reg32:$src2))
+>;
+
+class SSG_Common <InstR600 cndgt, InstR600 cndge> : Pat <
+  (int_AMDGPU_ssg R600_Reg32:$src),
+  (cndgt R600_Reg32:$src, (f32 ONE), (cndge R600_Reg32:$src, (f32 ZERO), (f32 NEG_ONE)))
+>;
+
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
+  (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
+  (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
+>;
+
+/* ---------------------- */
+/* R600 / R700 Only Instructions */
+/* ---------------------- */
+
+let Predicates = [isR600] in {
+
+let Gen = AMDGPUGen.R600 in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  def DOT4_r600 : DOT4_Common<0x50>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x77>;
+
+} // End AMDGPUGen.R600
+
+  def DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def LRP_r600 : LRP_Common<MULADD_r600>;
+  def POW_r600 : POW_Common<LOG_IEEE_r600, EXP_IEEE_r600, MUL, GPRF32>;
+  def SSG_r600 : SSG_Common<CNDGT_r600, CNDGE_r600>;
+  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
+
+}
+
+/* ----------------- */
+/* R700+ Trig helper */
+/* ----------------- */
+
+/*
+class TRIG_HELPER_r700 <InstR600 trig_inst>: Pat <
+  (trig_inst R600_Reg32:$src),
+  (trig_inst (fmul R600_Reg32:$src, (PI)))
+>;
+*/
+
+/* ---------------------- */
+/* Evergreen Instructions */
+/* ---------------------- */
+
+
+let Predicates = [isEG] in {
+
+let Gen = AMDGPUGen.EG in {
+
+def RAT_WRITE_CACHELESS_eg :
+    EG_CF_RAT <0x57, 0x2, (outs), (ins R600_TReg32_X:$rw_gpr,
+                                   R600_TReg32_X:$index_gpr, i32imm:$rat_id), "">
+{
+/*
+  let Inst{3-0}   = RAT_ID;
+  let Inst{21-15} = RW_GPR;
+  let Inst{29-23} = INDEX_GPR;
+  /* Property of the UAV */
+  let Inst{31-30} = ELEM_SIZE;
+*/
+  let RIM         = 0;
+  /* XXX: Have a separate instruction for non-indexed writes. */
+  let TYPE        = 1;
+  let RW_REL      = 0;
+  let ELEM_SIZE   = 0;
+
+/*
+  let ARRAY_SIZE  = 0;
+  let COMP_MASK   = 1;
+  let BURST_COUNT = 0;
+  let VPM         = 0;
+  let EOP         = 0;
+  let MARK        = 0;
+  let BARRIER     = 1;
+*/
+}
+
+def VTX_READ_eg : InstR600ISA < (outs R600_TReg32_X:$dst),
+                                (ins R600_TReg32_X:$src, i32imm:$buffer_id),
+                                "VTX_READ_eg $dst, $src", []>
+{
+/*
+  bits<7> DST_GPR;
+  bits<7> SRC_GPR;
+  bits<8> BUFFER_ID;
+*/
+  /* If any of the fields below need to be calculated at compile time, add
+   * an ins operand for them and move them to the list of operands above. */
+
+  /* XXX: This instruction is manually encoded, so none of these values are
+   * used.
+   */
+/*
+  bits<5> VC_INST          = 0; //VC_INST_FETCH
+  bits<2> FETCH_TYPE       = 2;
+  bits<1> FETCH_WHOLE_QUAD = 1;
+  bits<1> SRC_REL          = 0;
+  bits<2> SRC_SEL_X        = 0;
+  bits<6> MEGA_FETCH_COUNT = 4;
+*/
+/*
+
+  bits<1> DST_REL          = 0;
+  bits<3> DST_SEL_X        = 0;
+  bits<3> DST_SEL_Y        = 7; //Masked
+  bits<3> DST_SEL_Z        = 7; //Masked
+  bits<3> DST_SEL_W        = 7; //Masked
+  bits<1> USE_CONST_FIELDS = 1; //Masked
+  bits<6> DATA_FORMAT      = 0;
+  bits<2> NUM_FORMAT_ALL   = 0;
+  bits<1> FORMAT_COMP_ALL  = 0;
+  bits<1> SRF_MODE_ALL     = 0;
+*/
+
+/*
+  let Inst{4-0}   = VC_INST;
+  let Inst{6-5}   = FETCH_TYPE;
+  let Inst{7}     = FETCH_WHOLE_QUAD;
+  let Inst{15-8}  = BUFFER_ID;
+  let Inst{22-16} = SRC_GPR;
+  let Inst{23}    = SRC_REL;
+  let Inst{25-24} = SRC_SEL_X;
+  let Inst{31-26} = MEGA_FETCH_COUNT;
+*/
+  /* DST_GPR is OK to leave uncommented, because LLVM 3.0 only prevents you
+   * from statically setting bits > 31.  This field will be set by
+   * getMachineValueOp which can set bits > 31.
+   */
+//  let Inst{38-32} = DST_GPR;
+
+  /* XXX: Uncomment for LLVM 3.1 which supports 64-bit instructions */
+
+/*
+  let Inst{39}    = DST_REL;
+  let Inst{40}    = 0; //Reserved
+  let Inst{43-41} = DST_SEL_X;
+  let Inst{46-44} = DST_SEL_Y;
+  let Inst{49-47} = DST_SEL_Z;
+  let Inst{52-50} = DST_SEL_W;
+  let Inst{53}    = USE_CONST_FIELDS;
+  let Inst{59-54} = DATA_FORMAT;
+  let Inst{61-60} = NUM_FORMAT_ALL;
+  let Inst{62}    = FORMAT_COMP_ALL;
+  let Inst{63}    = SRF_MODE_ALL;
+*/
+}
+
+
+
+} // End AMDGPUGen.EG
+/* XXX: Need to convert PTR to rat_id */
+/*
+def : Pat <(store_global (f32 R600_Reg32:$value), node:$ptr),
+           (RAT_WRITE_CACHELESS_eg (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)),
+                                                  (f32 R600_Reg32:$value),
+                                                   sel_x),
+                                    (f32 ZERO), 0, R600_Reg32:$ptr)>;
+*/
+
+class VTX_Param_Read_Pattern <ValueType vt> : Pat <
+    (vt (load_param ADDRParam:$mem)),
+    (VTX_READ_eg (i32 R600_Reg32:$mem), 0)>;
+
+def : VTX_Param_Read_Pattern <f32>;
+def : VTX_Param_Read_Pattern <i32>;
+
+} // End isEG Predicate
+
+/* ------------------------------- */
+/* Evergreen / Cayman Instructions */
+/* ------------------------------- */
+
+let Predicates = [isEGorCayman] in {
+  
+class TRIG_eg <InstR600 trig, Intrinsic intr> : Pat<
+  (intr R600_Reg32:$src),
+  (trig (MUL (MOV (LOADCONST_i32 CONST.TWO_PI_INV)), R600_Reg32:$src))
+>;
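The TRIG_eg pattern above pre-scales the angle by CONST.TWO_PI_INV before emitting the hardware SIN/COS op, which implies the Evergreen trig units consume a normalized angle rather than radians. A sketch of the equivalent math, assuming (per the pattern, not a verified hardware spec) that the unit computes sin(2*pi*x):

```cpp
#include <cassert>
#include <cmath>

const double PI = 3.14159265358979323846;

// Hypothetical model of the hardware unit: it consumes a normalized
// angle (fractions of a full turn), not radians.
double hwSin(double turns) { return std::sin(2.0 * PI * turns); }

// What the TRIG_eg pattern emits: multiply the radian angle by
// TWO_PI_INV (1 / (2*pi)) before feeding it to the hardware op.
double sinRadians(double x) {
  const double TWO_PI_INV = 1.0 / (2.0 * PI);
  return hwSin(x * TWO_PI_INV);
}
```

The r600-side equivalent is the commented-out TRIG_HELPER_r700 class earlier in this file, which uses a different scale factor.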
+
+let Gen = AMDGPUGen.EG_CAYMAN in {
+
+  def MULADD_eg : MULADD_Common<0x14>;
+  def LSHR_eg : LSHR_Common<0x16>;
+  def LSHL_eg : LSHL_Common<0x17>;
+  def CNDE_eg : CNDE_Common<0x19>;
+  def CNDGT_eg : CNDGT_Common<0x1A>;
+  def CNDGE_eg : CNDGE_Common<0x1B>;
+  def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
+  def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50>;
+  def EXP_IEEE_eg : EXP_IEEE_Common<0x81>;
+  def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
+  def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
+  def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
+  def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>;
+  def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>;
+  def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
+  def SIN_eg : SIN_Common<0x8D>;
+  def COS_eg : COS_Common<0x8E>;
+  def MULLO_INT_eg : MULLO_INT_Common<0x8F>;
+  def MULHI_INT_eg : MULHI_INT_Common<0x90>;
+  def MULLO_UINT_eg : MULLO_UINT_Common<0x91>;
+  def MULHI_UINT_eg : MULHI_UINT_Common<0x92>;
+  def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
+  def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
+  def DOT4_eg : DOT4_Common<0xBE>;
+
+} // End AMDGPUGen.EG_CAYMAN
+
+  def DIV_eg : DIV_Common<RECIP_IEEE_eg>;
+  def LRP_eg : LRP_Common<MULADD_eg>;
+  def POW_eg : POW_Common<LOG_IEEE_eg, EXP_IEEE_eg, MUL, GPRF32>;
+  def SSG_eg : SSG_Common<CNDGT_eg, CNDGE_eg>;
+  def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>;
+
+  def : TRIG_eg <SIN_eg, int_AMDGPU_sin>;
+  def : TRIG_eg <COS_eg, int_AMDGPU_cos>;
+
+}
+
+let Predicates = [isCayman] in {
+
+let Gen = AMDGPUGen.CAYMAN in {
+
+  /* XXX: I'm not sure if this opcode is correct. */
+  def RECIP_UINT_cm : RECIP_UINT_Common<0x77>;
+
+} // End AMDGPUGen.CAYMAN
+
+} // End isCayman
+
+/* Other Instructions */
+
+let isCodeGenOnly = 1 in {
+/*
+  def SWIZZLE : AMDGPUShaderInst <
+    (outs GPRV4F32:$dst),
+    (ins GPRV4F32:$src0, i32imm:$src1),
+    "SWIZZLE $dst, $src0, $src1",
+    [(set GPRV4F32:$dst, (int_AMDGPU_swizzle GPRV4F32:$src0, imm:$src1))]
+  >;
+*/
+
+  def LAST : AMDGPUShaderInst <
+    (outs),
+    (ins),
+    "LAST",
+    []
+  >;
+
+  def GET_CHAN : AMDGPUShaderInst <
+    (outs R600_Reg32:$dst),
+    (ins R600_Reg128:$src0, i32imm:$src1),
+    "GET_CHAN $dst, $src0, $src1",
+    []
+  >;
+
+  def SET_CHAN : AMDGPUShaderInst <
+    (outs R600_Reg128:$dst),
+    (ins R600_Reg32:$src0, i32imm:$src1),
+    "SET_CHAN $dst, $src0, $src1",
+    []
+  >;
+
+  def MULLIT : AMDGPUShaderInst <
+    (outs R600_Reg128:$dst),
+    (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+    "MULLIT $dst, $src0, $src1, $src2",
+    [(set R600_Reg128:$dst, (int_AMDGPU_mullit R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))]
+  >;
+
+let usesCustomInserter = 1, isPseudo = 1 in {
+
+class R600PreloadInst <string asm, Intrinsic intr> : AMDGPUInst <
+  (outs R600_TReg32:$dst),
+  (ins),
+  asm,
+  [(set R600_TReg32:$dst, (intr))]
+>;
+
+def TGID_X : R600PreloadInst <"TGID_X", int_r600_read_tgid_x>;
+def TGID_Y : R600PreloadInst <"TGID_Y", int_r600_read_tgid_y>;
+def TGID_Z : R600PreloadInst <"TGID_Z", int_r600_read_tgid_z>;
+
+def TIDIG_X : R600PreloadInst <"TIDIG_X", int_r600_read_tidig_x>;
+def TIDIG_Y : R600PreloadInst <"TIDIG_Y", int_r600_read_tidig_y>;
+def TIDIG_Z : R600PreloadInst <"TIDIG_Z", int_r600_read_tidig_z>;
+
+def NGROUPS_X : R600PreloadInst <"NGROUPS_X", int_r600_read_ngroups_x>;
+def NGROUPS_Y : R600PreloadInst <"NGROUPS_Y", int_r600_read_ngroups_y>;
+def NGROUPS_Z : R600PreloadInst <"NGROUPS_Z", int_r600_read_ngroups_z>;
+
+def GLOBAL_SIZE_X : R600PreloadInst <"GLOBAL_SIZE_X",
+                                     int_r600_read_global_size_x>;
+def GLOBAL_SIZE_Y : R600PreloadInst <"GLOBAL_SIZE_Y",
+                                     int_r600_read_global_size_y>;
+def GLOBAL_SIZE_Z : R600PreloadInst <"GLOBAL_SIZE_Z",
+                                     int_r600_read_global_size_z>;
+
+def LOCAL_SIZE_X : R600PreloadInst <"LOCAL_SIZE_X",
+                                    int_r600_read_local_size_x>;
+def LOCAL_SIZE_Y : R600PreloadInst <"LOCAL_SIZE_Y",
+                                    int_r600_read_local_size_y>;
+def LOCAL_SIZE_Z : R600PreloadInst <"LOCAL_SIZE_Z",
+                                    int_r600_read_local_size_z>;
+
+} // End usesCustomInserter = 1, isPseudo = 1
+
+} // End isCodeGenOnly = 1
+
+
+
+include "R600ShaderPatterns.td"
+
+// We need this pattern to avoid having real registers in PHI nodes.
+// For some reason this pattern only works when it comes after the other
+// instruction defs.
+def : Pat <
+  (int_R600_load_input imm:$src),
+  (LOAD_INPUT imm:$src)
+>;
+
+} // End isR600toCayman Predicate
diff --git a/src/gallium/drivers/radeon/R600Intrinsics.td b/src/gallium/drivers/radeon/R600Intrinsics.td
new file mode 100644 (file)
index 0000000..8038fee
--- /dev/null
@@ -0,0 +1,40 @@
+//===-- R600Intrinsics.td - R600 Intrinsic definitions --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+let TargetPrefix = "R600", isTarget = 1 in {
+  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
+
+let TargetPrefix = "r600", isTarget = 1 in {
+
+class R600ReadPreloadRegisterIntrinsic<string name>
+  : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>,
+    GCCBuiltin<name>;
+
+multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> {
+  def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>;
+  def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>;
+  def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>;
+}
+
+defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_global_size">;
+defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_local_size">;
+defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_ngroups">;
+defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_tgid">;
+defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
+                                       "__builtin_r600_read_tidig">;
+} // End TargetPrefix = "r600"
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.cpp b/src/gallium/drivers/radeon/R600KernelParameters.cpp
new file mode 100644 (file)
index 0000000..3fdf48a
--- /dev/null
@@ -0,0 +1,503 @@
+//===-- R600KernelParameters.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include <llvm-c/Core.h>
+#include "R600KernelParameters.h"
+#include "R600OpenCLUtils.h"
+#include "llvm/Constants.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/IRBuilder.h"
+#include "llvm/Support/TypeBuilder.h"
+// #include "llvm/CodeGen/Function.h"
+
+namespace AMDILAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, // Address space for private memory.
+  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, // Address space for constant memory.
+  LOCAL_ADDRESS    = 3, // Address space for local memory.
+  REGION_ADDRESS   = 4, // Address space for region memory.
+  ADDRESS_NONE     = 5, // Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, // Address space for directly addressable parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, // Address space for indirectly addressable parameter memory (VTX1)
+  LAST_ADDRESS     = 8
+};
+}
+
+
+#include <map>
+#include <set>
+
+using namespace llvm;
+using namespace std;
+
+#define CONSTANT_CACHE_SIZE_DW 127
+
+class R600KernelParameters : public llvm::FunctionPass
+{
+  const llvm::TargetData * TD;
+  LLVMContext* Context;
+  Module *mod;
+  
+  struct param
+  {
+    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0), indirect(false), specialID(0) {}
+    
+    llvm::Value* val;
+    llvm::Value* ptr_val;
+    int offset_in_dw;
+    int size_in_dw;
+
+    bool indirect;
+    
+    string specialType;
+    int specialID;
+    
+    int end() { return offset_in_dw + size_in_dw; }
+    /* The first 9 dwords are reserved for the grid sizes. */
+    int get_rat_offset() { return 9 + offset_in_dw; }
+  };
+
+  std::vector<param> params;
+
+  int getLastSpecialID(const string& TypeName);
+  
+  int getListSize();
+  void AddParam(llvm::Argument* arg);
+  int calculateArgumentSize(llvm::Argument* arg);
+  void RunAna(llvm::Function* fun);
+  void Replace(llvm::Function* fun);
+  bool isIndirect(Value* val, set<Value*>& visited);
+  void Propagate(llvm::Function* fun);
+  void Propagate(llvm::Value* v, const llvm::Twine& name, bool indirect = false);
+  Value* ConstantRead(Function* fun, param& p);
+  Value* handleSpecial(Function* fun, param& p);
+  bool isSpecialType(Type*);
+  string getSpecialTypeName(Type*);
+public:
+  static char ID;
+  R600KernelParameters() : FunctionPass(ID) {};
+  R600KernelParameters(const llvm::TargetData* TD) : FunctionPass(ID), TD(TD) {}
+  bool runOnFunction (llvm::Function &F);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  const char *getPassName() const;
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+};
+
+char R600KernelParameters::ID = 0;
+
+static RegisterPass<R600KernelParameters> X("kerparam", "OpenCL Kernel Parameter conversion", false, false);
+
+int R600KernelParameters::getLastSpecialID(const string& TypeName)
+{
+  int lastID = -1;
+  
+  for (vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    if (i->specialType == TypeName)
+    {
+      lastID = i->specialID;
+    }
+  }
+
+  return lastID;
+}
+
+int R600KernelParameters::getListSize()
+{
+  if (params.size() == 0)
+  {
+    return 0;
+  }
+
+  return params.back().end();
+}
+
+bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
+{
+  if (isa<LoadInst>(val))
+  {
+    return false;
+  }
+
+  if (isa<IntegerType>(val->getType()))
+  {
+    assert(0 and "Internal error");
+    return false;
+  }
+
+  if (visited.count(val))
+  {
+    return false;
+  }
+
+  visited.insert(val);
+  
+  if (isa<GetElementPtrInst>(val))
+  {
+    GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(val);
+    GetElementPtrInst::op_iterator i = GEP->op_begin();
+
+    for (i++; i != GEP->op_end(); i++)
+    {
+      if (!isa<Constant>(*i))
+      {
+        return true;
+      }
+    }
+  }
+  
+  for (Value::use_iterator i = val->use_begin(); i != val->use_end(); i++)
+  {
+    Value* v2 = dyn_cast<Value>(*i);
+
+    if (v2)
+    {
+      if (isIndirect(v2, visited))
+      {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+void R600KernelParameters::AddParam(llvm::Argument* arg)
+{
+  param p;
+  
+  p.val = dyn_cast<Value>(arg);
+  p.offset_in_dw = getListSize();
+  p.size_in_dw = calculateArgumentSize(arg);
+
+  if (isa<PointerType>(arg->getType()) and arg->hasByValAttr())
+  {
+    set<Value*> visited;
+    p.indirect = isIndirect(p.val, visited);
+  }
+  
+  params.push_back(p);
+}
+
+int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
+{
+  Type* t = arg->getType();
+
+  if (arg->hasByValAttr() and dyn_cast<PointerType>(t))
+  {
+    t = dyn_cast<PointerType>(t)->getElementType();
+  }
+  
+  int store_size_in_dw = (TD->getTypeStoreSize(t) + 3)/4;
+
+  assert(store_size_in_dw);
+  
+  return store_size_in_dw;
+}
+
+
+void R600KernelParameters::RunAna(llvm::Function* fun)
+{
+  assert(isOpenCLKernel(fun));
+
+  for (Function::arg_iterator i = fun->arg_begin(); i != fun->arg_end(); i++)
+  {
+    AddParam(i);
+  }
+
+}
+
+void R600KernelParameters::Replace(llvm::Function* fun)
+{
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    Value *new_val;
+
+    if (isSpecialType(i->val->getType()))
+    {
+      new_val = handleSpecial(fun, *i);
+    }
+    else
+    {
+      new_val = ConstantRead(fun, *i);
+    }
+    if (new_val)
+    {
+      i->val->replaceAllUsesWith(new_val);
+    }   
+  }
+}
+
+void R600KernelParameters::Propagate(llvm::Function* fun)
+{
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
+  {
+    if (i->ptr_val)
+    {
+      Propagate(i->ptr_val, i->val->getName(), i->indirect);
+    }
+  }
+}
+
+void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
+{
+  LoadInst* load = dyn_cast<LoadInst>(v);
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v);
+  
+  unsigned addrspace; 
+
+  if (indirect)
+  {
+    addrspace = AMDILAS::PARAM_I_ADDRESS;
+  }
+  else
+  {
+    addrspace = AMDILAS::PARAM_D_ADDRESS;
+  }
+
+  if (GEP and GEP->getType()->getAddressSpace() != addrspace)
+  {
+    Value* op = GEP->getPointerOperand();
+
+    if (dyn_cast<PointerType>(op->getType())->getAddressSpace() != addrspace)
+    {
+      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(op->getType())->getElementType(), addrspace), name, dyn_cast<Instruction>(v));
+    }
+
+    vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
+    
+    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name, dyn_cast<Instruction>(v));
+    GEP2->setIsInBounds(GEP->isInBounds());
+    v = dyn_cast<Value>(GEP2);
+    GEP->replaceAllUsesWith(GEP2);
+    GEP->eraseFromParent();
+    load = NULL;
+  }
+  
+  if (load)
+  {
+    if (load->getPointerAddressSpace() != addrspace) ///normally at this point we have the right address space
+    {
+      Value *orig_ptr = load->getPointerOperand();
+      PointerType *orig_ptr_type = dyn_cast<PointerType>(orig_ptr->getType());
+      
+      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(), addrspace);
+
+      Value* new_ptr = orig_ptr;
+      
+      if (orig_ptr->getType() != new_ptr_type)
+      {
+        new_ptr = new BitCastInst(orig_ptr, new_ptr_type, "prop_cast", load);
+      }
+      
+      Value* new_load = new LoadInst(new_ptr, name, load);
+      load->replaceAllUsesWith(new_load);
+      load->eraseFromParent();
+    }
+    
+    return;
+  }
+
+  vector<User*> users(v->use_begin(), v->use_end());
+  
+  for (int i = 0; i < int(users.size()); i++)
+  {
+    Value* v2 = dyn_cast<Value>(users[i]);
+    
+    if (v2)
+    {
+      Propagate(v2, name, indirect);
+    }
+  }
+}
+
+Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
+{
+  assert(fun->front().begin() != fun->front().end());
+  
+  Instruction *first_inst = fun->front().begin();
+  IRBuilder <> builder (first_inst);
+  /* The first 3 dwords are reserved for the dimension info */
+
+  if (!p.val->hasNUsesOrMore(1))
+  {
+    return NULL;
+  }
+  unsigned addrspace;
+
+  if (p.indirect)
+  {
+    addrspace = AMDILAS::PARAM_I_ADDRESS;
+  }
+  else
+  {
+    addrspace = AMDILAS::PARAM_D_ADDRESS;
+  }
+  
+  Argument *arg = dyn_cast<Argument>(p.val);
+  Type * argType = p.val->getType();
+  PointerType * argPtrType = dyn_cast<PointerType>(p.val->getType());
+  
+  if (argPtrType and arg->hasByValAttr())
+  {
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(Type::getInt32Ty(*Context), addrspace));
+    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr, ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName(), first_inst);
+    param_ptr = new BitCastInst(param_ptr, PointerType::get(argPtrType->getElementType(), addrspace), arg->getName(), first_inst);
+    p.ptr_val = param_ptr;
+    return param_ptr;
+  }
+  else
+  {
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(argType, addrspace));
+    
+    Value* param_ptr = builder.CreateGEP(param_addr_space_ptr,
+             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName());
+    
+    Value* param_value = builder.CreateLoad(param_ptr, arg->getName());
+    
+    return param_value;
+  }
+}
+
+Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
+{
+  string name = getSpecialTypeName(p.val->getType());
+  int ID;
+
+  assert(!name.empty());
+  
+  if (name == "image2d_t" or name == "image3d_t")
+  {
+    int lastID = max(getLastSpecialID("image2d_t"), getLastSpecialID("image3d_t"));
+    
+    if (lastID == -1)
+    {
+      ID = 2; ///ID0 and ID1 are used internally by the driver
+    }
+    else
+    {
+      ID = lastID + 1;
+    }
+  }
+  else if (name == "sampler_t")
+  {
+    int lastID = getLastSpecialID("sampler_t");
+
+    if (lastID == -1)
+    {
+      ID = 0;
+    }
+    else
+    {
+      ID = lastID + 1;
+    }    
+  }
+  else
+  {
+    ///TODO: give some error message
+    return NULL;
+  }
+    
+  p.specialType = name;
+  p.specialID = ID;
+
+  Instruction *first_inst = fun->front().begin();
+
+  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context), p.specialID), p.val->getType(), "resourceID", first_inst);
+}
+
+
+bool R600KernelParameters::isSpecialType(Type* t)
+{
+  return !getSpecialTypeName(t).empty();
+}
+
+string R600KernelParameters::getSpecialTypeName(Type* t)
+{
+  PointerType *pt = dyn_cast<PointerType>(t);
+  StructType *st = NULL;
+
+  if (pt)
+  {
+    st = dyn_cast<StructType>(pt->getElementType());
+  }
+
+  if (st)
+  {
+    string prefix = "struct.opencl_builtin_type_";
+    
+    string name = st->getName().str();
+
+    if (name.substr(0, prefix.length()) == prefix)
+    {
+      return name.substr(prefix.length(), name.length());
+    }
+  }
+
+  return "";
+}
+
+
+bool R600KernelParameters::runOnFunction (Function &F)
+{
+  if (!isOpenCLKernel(&F))
+  {
+    return false;
+  }
+
+//  F.dump();
+  
+  RunAna(&F);
+  Replace(&F);
+  Propagate(&F);
+  
+   mod->dump();
+  return false;
+}
+
+void R600KernelParameters::getAnalysisUsage(AnalysisUsage &AU) const
+{
+//   AU.addRequired<FunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
+
+const char *R600KernelParameters::getPassName() const
+{
+  return "OpenCL Kernel parameter conversion to memory";
+}
+
+bool R600KernelParameters::doInitialization(Module &M)
+{
+  Context = &M.getContext();
+  mod = &M;
+  
+  return false;
+}
+
+bool R600KernelParameters::doFinalization(Module &M)
+{
+  return false;
+}
+
+llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD)
+{
+  FunctionPass *p = new R600KernelParameters(TD);
+  
+  return p;
+}
+
+
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.h b/src/gallium/drivers/radeon/R600KernelParameters.h
new file mode 100644 (file)
index 0000000..904a469
--- /dev/null
@@ -0,0 +1,28 @@
+//===-- R600KernelParameters.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef KERNELPARAMETERS_H
+#define KERNELPARAMETERS_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Value.h"
+
+#include <vector>
+
+llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD);
+
+
+#endif
diff --git a/src/gallium/drivers/radeon/R600LowerInstructions.cpp b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
new file mode 100644 (file)
index 0000000..b9f9c7c
--- /dev/null
@@ -0,0 +1,546 @@
+//===-- R600LowerInstructions.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILRegisterInfo.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+
+namespace {
+  class R600LowerInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    AMDILMachineFunctionInfo * MFI;
+    const R600InstrInfo * TII;
+    MachineRegisterInfo * MRI;
+
+    void lowerFLT(MachineInstr &MI);
+
+    void calcAddress(const MachineOperand &ptrOp,
+                     const MachineOperand &indexOp,
+                     unsigned indexReg,
+                     MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I) const;
+
+    void divMod(MachineInstr &MI,
+                  MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  bool div = true) const;
+
+  public:
+    R600LowerInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm),
+      TII(static_cast<const R600InstrInfo*>(tm.getInstrInfo())),
+      MRI(NULL)
+      { }
+
+    const char *getPassName() const { return "R600 Lower Instructions"; }
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char R600LowerInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerInstructionsPass(TargetMachine &tm) {
+  return new R600LowerInstructionsPass(tm);
+}
+
+bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+  MFI = MF.getInfo<AMDILMachineFunctionInfo>();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+
+      MachineInstr &MI = *I;
+      switch(MI.getOpcode()) {
+      case AMDIL::FLT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FGE))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::ABS_i32:
+        {
+          unsigned setgt = MRI->createVirtualRegister(
+                           &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT),
+                  setgt)
+                  .addOperand(MI.getOperand(1))
+                  .addReg(AMDIL::ZERO);
+
+          unsigned add_int = MRI->createVirtualRegister(
+                             &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT),
+                  add_int)
+                  .addReg(setgt)
+                  .addOperand(MI.getOperand(1));
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::XOR_INT))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(setgt)
+                  .addReg(add_int);
+
+          break;
+        }
+
+      /* XXX: We could propagate the ABS flag to all of the uses of Operand0 and
+       * remove the ABS instruction.*/
+      case AMDIL::FABS_f32:
+      case AMDIL::ABS_f32:
+        MI.getOperand(1).addTargetFlag(MO_FLAG_ABS);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::MOVE_f32))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::BINARY_OR_f32:
+        {
+        unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp0)
+                .addOperand(MI.getOperand(1));
+        unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp1)
+                .addOperand(MI.getOperand(2));
+        unsigned tmp2 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::BINARY_OR_i32), tmp2)
+                .addReg(tmp0)
+                .addReg(tmp1);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::ITOF), MI.getOperand(0).getReg())
+                .addReg(tmp2);
+        break;
+        }
+      case AMDIL::CMOVLOG_f32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(MI.getOpcode()))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CMOVLOG_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CLAMP_f32:
+        {
+          MachineOperand lowOp = MI.getOperand(2);
+          MachineOperand highOp = MI.getOperand(3);
+        if (lowOp.isReg() && highOp.isReg()
+            && lowOp.getReg() == AMDIL::ZERO && highOp.getReg() == AMDIL::ONE) {
+          MI.getOperand(0).addTargetFlag(MO_FLAG_CLAMP);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+        } else {
+          /* XXX: Handle other cases */
+          abort();
+        }
+        break;
+        }
+
+      case AMDIL::UDIV_i32:
+        divMod(MI, MBB, I);
+        break;
+
+      /* XXX: Figure out the semantics of DIV_INF_f32 and make sure this is OK */
+/*      case AMDIL::DIV_INF_f32:
+        {
+          unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRF32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TM.getInstrInfo()->get(AMDIL::RECIP_CLAMPED), tmp0)
+                  .addOperand(MI.getOperand(2));
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TM.getInstrInfo()->get(AMDIL::MUL_IEEE_f32))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(tmp0)
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+*/        /* XXX: This is an optimization */
+
+      case AMDIL::GLOBALLOAD_f32:
+      case AMDIL::GLOBALLOAD_i32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Calculate the address within the VTX buffer */
+          calcAddress(ptrOperand, indexOperand, indexReg, MBB, I);
+
+          /* Make sure the VTX_READ_eg writes to the X channel */
+          MRI->setRegClass(MI.getOperand(0).getReg(),
+                          &AMDIL::R600_TReg32_XRegClass);
+
+          /* Add the VTX_READ_eg instruction */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::VTX_READ_eg))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(indexReg)
+                  .addImm(1);
+          break;
+        }
+
+      case AMDIL::GLOBALSTORE_i32:
+      case AMDIL::GLOBALSTORE_f32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned rwReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+          unsigned byteIndexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned shiftReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Move the store value to the correct register class */
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY), rwReg)
+                  .addOperand(MI.getOperand(0));
+
+          /* Calculate the address in the RAT */
+          calcAddress(ptrOperand, indexOperand, byteIndexReg, MBB, I);
+
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV), shiftReg)
+                  .addReg(AMDIL::ALU_LITERAL_X)
+                  .addImm(2);
+
+          /* XXX: Check GPU family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::LSHR_eg), indexReg)
+                 .addReg(byteIndexReg)
+                 .addReg(shiftReg);
+
+          /* XXX: Check GPU Family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(AMDIL::RAT_WRITE_CACHELESS_eg))
+                  .addReg(rwReg)
+                  .addReg(indexReg)
+                  .addImm(0);
+          break;
+        }
+      case AMDIL::ILT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGT_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+      case AMDIL::LOADCONST_f32:
+      case AMDIL::LOADCONST_i32:
+        {
+          bool canInline = false;
+          unsigned inlineReg;
+          MachineOperand & dstOp = MI.getOperand(0);
+          MachineOperand & immOp = MI.getOperand(1);
+          if (immOp.isFPImm()) {
+            const ConstantFP * cfp = immOp.getFPImm();
+            if (cfp->isZero()) {
+              canInline = true;
+              inlineReg = AMDIL::ZERO;
+            } else if (cfp->isExactlyValue(1.0f)) {
+              canInline = true;
+              inlineReg = AMDIL::ONE;
+            } else if (cfp->isExactlyValue(0.5f)) {
+              canInline = true;
+              inlineReg = AMDIL::HALF;
+            }
+          }
+
+          if (canInline) {
+            MachineOperand * use = dstOp.getNextOperandForReg();
+            /* The lowering operation for CLAMP needs to have the immediates
+             * as operands, so we must propagate them. */
+            while (use) {
+              MachineOperand * next = use->getNextOperandForReg();
+              if (use->getParent()->getOpcode() == AMDIL::CLAMP_f32) {
+                use->setReg(inlineReg);
+              }
+              use = next;
+            }
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY))
+                    .addOperand(dstOp)
+                    .addReg(inlineReg);
+          } else {
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                    .addOperand(dstOp)
+                    .addReg(AMDIL::ALU_LITERAL_X)
+                    .addOperand(immOp);
+          }
+          break;
+        }
+
+      case AMDIL::MASK_WRITE:
+      {
+        unsigned maskedRegister = MI.getOperand(0).getReg();
+        assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+        MachineInstr * defInstr = MRI->getVRegDef(maskedRegister);
+        MachineOperand * def = defInstr->findRegisterDefOperand(maskedRegister);
+        def->addTargetFlag(MO_FLAG_MASK);
+        break;
+      }
+
+      case AMDIL::VEXTRACT_v4f32:
+        MI.getOperand(2).setImm(MI.getOperand(2).getImm() - 1);
+        continue;
+
+      case AMDIL::NEGATE_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT))
+                .addOperand(MI.getOperand(0))
+                .addReg(AMDIL::ZERO)
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::NEG_f32:
+        {
+            MI.getOperand(1).addTargetFlag(MO_FLAG_NEG);
+            BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(TII->getISAOpcode(AMDIL::MOV)))
+            .addOperand(MI.getOperand(0))
+            .addOperand(MI.getOperand(1));
+          break;
+        }
+
+      case AMDIL::SUB_f32:
+        {
+          MI.getOperand(2).addTargetFlag(MO_FLAG_NEG);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TII->get(TII->getISAOpcode(AMDIL::ADD_f32)))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addOperand(MI.getOperand(2));
+          break;
+        }
+
+      case AMDIL::VINSERT_v4f32:
+        {
+
+          int64_t swz = MI.getOperand(4).getImm();
+          int64_t chan;
+          switch (swz) {
+          case (1 << 0):
+            chan = 0;
+            break;
+          case (1 << 8):
+            chan = 1;
+            break;
+          case (1 << 16):
+            chan = 2;
+            break;
+          case (1 << 24):
+            chan = 3;
+            break;
+          default:
+            chan = 0;
+            fprintf(stderr, "swizzle: %ld\n", swz);
+            abort();
+            break;
+          }
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                          TM.getInstrInfo()->get(AMDIL::SET_CHAN))
+                  .addOperand(MI.getOperand(1))
+                  .addOperand(MI.getOperand(2))
+                  .addImm(chan);
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                                      TM.getInstrInfo()->get(AMDIL::COPY))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+
+      default:
+        continue;
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+void R600LowerInstructionsPass::calcAddress(const MachineOperand &ptrOp,
+                                            const MachineOperand &indexOp,
+                                            unsigned indexReg,
+                                            MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I) const
+{
+  /* Optimize the case where the indexOperand is 0 */
+  if (indexOp.isImm() && indexOp.getImm() == 0) {
+    assert(ptrOp.isReg());
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(AMDIL::COPY), indexReg)
+            .addOperand(ptrOp);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+                    TII->get(AMDIL::ADD_INT), indexReg)
+            .addOperand(indexOp)
+            .addOperand(ptrOp);
+  }
+}
+
+/* Mostly copied from tgsi_divmod() in r600_shader.c */
+void R600LowerInstructionsPass::divMod(MachineInstr &MI,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       bool div) const
+{
+  unsigned dst = MI.getOperand(0).getReg();
+  MachineOperand &numerator = MI.getOperand(1);
+  MachineOperand &denominator = MI.getOperand(2);
+  /* rcp = RECIP(denominator) = 2^32 / denominator + e
+   * e is rounding error */
+  unsigned rcp = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getRECIP_UINT()), rcp)
+          .addOperand(denominator);
+
+  /* rcp_lo = lo(rcp * denominator) */
+  unsigned rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()), rcp_lo)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  /* rcp_hi = HI (rcp * denominator) */
+  unsigned rcp_hi = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), rcp_hi)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  unsigned neg_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), neg_rcp_lo)
+          .addReg(AMDIL::ZERO)
+          .addReg(rcp_lo);
+
+  unsigned abs_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), abs_rcp_lo)
+          .addReg(rcp_hi)
+          .addReg(neg_rcp_lo)
+          .addReg(rcp_lo);
+
+  unsigned e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), e)
+          .addReg(abs_rcp_lo)
+          .addReg(rcp);
+
+  unsigned rcp_plus_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), rcp_plus_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  unsigned rcp_sub_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), rcp_sub_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  /* tmp0 = rcp_hi == 0 ? rcp_plus_e : rcp_sub_e */
+  unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), tmp0)
+          .addReg(rcp_hi)
+          .addReg(rcp_plus_e)
+          .addReg(rcp_sub_e);
+
+  unsigned q = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), q)
+          .addReg(tmp0)
+          .addOperand(numerator);
+
+  /* num_sub_r = q * denominator */
+  unsigned num_sub_r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()),
+          num_sub_r)
+          .addReg(q)
+          .addOperand(denominator);
+
+  unsigned r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), r)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned r_ge_den = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_den)
+          .addReg(r)
+          .addOperand(denominator);
+
+  unsigned r_ge_zero = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_zero)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::AND_INT), tmp1)
+          .addReg(r_ge_den)
+          .addReg(r_ge_zero);
+
+  unsigned val0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned val1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned result = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  if (div) {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val0)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val1)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(q)
+            .addReg(val0);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val0)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val1)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(r)
+            .addReg(val0);
+  }
+
+  /* XXX: Do we need to set to MAX_INT if denominator is 0? */
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), dst)
+          .addReg(r_ge_zero)
+          .addReg(val1)
+          .addReg(result);
+}
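For reference, the CNDE_INT network at the end of this lowering sequence is the standard one-step correction of a quotient guess that may be off by one in either direction. A scalar C++ sketch of the quotient (div) path follows; the remainder path is analogous with r ± den. The helper name is hypothetical, not driver code, and it assumes CNDE_INT selects its first source operand when the condition is zero:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the correction network above (hypothetical helper, not
// driver code).  Given a quotient guess q that is off by at most one, the
// two SETGE_INT results pick between q - 1, q, and q + 1.
static uint32_t correct_quotient(uint32_t num, uint32_t den, uint32_t q) {
  uint32_t num_sub_r = q * den;       // MULLO_UINT
  uint32_t r = num - num_sub_r;       // SUB_INT (wraps when q is too big)
  bool r_ge_den = r >= den;           // SETGE_INT: guess was one too small
  bool r_ge_zero = num >= num_sub_r;  // SETGE_INT: subtraction did not wrap
  if (!r_ge_zero)
    return q - 1;                     // outer CNDE_INT selects val1
  return r_ge_den ? q + 1 : q;        // inner CNDE_INT selects val0 or q
}
```

With den = 3 and num = 10, guesses of 2, 3, and 4 all collapse to the true quotient 3.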
diff --git a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp b/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
new file mode 100644 (file)
index 0000000..394ee70
--- /dev/null
@@ -0,0 +1,143 @@
+//===-- R600LowerShaderInstructions.cpp - Lower shader pseudo-instructions ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPULowerShaderInstructions.h"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class R600LowerShaderInstructionsPass : public MachineFunctionPass,
+        public AMDGPULowerShaderInstructionsPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+    void lowerEXPORT_REG_FAKE(MachineInstr &MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+    void lowerLOAD_INPUT(MachineInstr & MI);
+    bool lowerSTORE_OUTPUT(MachineInstr & MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+
+  public:
+    R600LowerShaderInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+      bool runOnMachineFunction(MachineFunction &MF);
+
+      const char *getPassName() const { return "R600 Lower Shader Instructions"; }
+    };
+} /* End anonymous namespace */
+
+char R600LowerShaderInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerShaderInstructionsPass(TargetMachine &tm) {
+    return new R600LowerShaderInstructionsPass(tm);
+}
+
+#define INSTR_CASE_FLOAT_V(inst) \
+  case AMDIL:: inst##_v4f32: \
+
+#define INSTR_CASE_FLOAT_S(inst) \
+  case AMDIL:: inst##_f32:
+
+#define INSTR_CASE_FLOAT(inst) \
+  INSTR_CASE_FLOAT_V(inst) \
+  INSTR_CASE_FLOAT_S(inst)
+bool R600LowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+      MachineInstr &MI = *I;
+      bool deleteInstr = false;
+      switch (MI.getOpcode()) {
+
+      default: break;
+
+      case AMDIL::RESERVE_REG:
+      case AMDIL::EXPORT_REG:
+        deleteInstr = true;
+        break;
+
+      case AMDIL::LOAD_INPUT:
+        lowerLOAD_INPUT(MI);
+        deleteInstr = true;
+        break;
+
+      case AMDIL::STORE_OUTPUT:
+        deleteInstr = lowerSTORE_OUTPUT(MI, MBB, I);
+        break;
+
+      }
+
+      ++I;
+
+      if (deleteInstr) {
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  return false;
+}
+
+/* The goal of this function is to replace the virtual destination register of
+ * a LOAD_INPUT instruction with the physical input register it will be
+ * preloaded into.
+ * XXX: I don't think this is the right way to assign physical registers,
+ * but I'm not sure of another way to do this.
+ */
+void R600LowerShaderInstructionsPass::lowerLOAD_INPUT(MachineInstr &MI)
+{
+  MachineOperand &dst = MI.getOperand(0);
+  MachineOperand &arg = MI.getOperand(1);
+  int64_t inputIndex = arg.getImm();
+  const TargetRegisterClass * inputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = inputClass->getRegister(inputIndex);
+  unsigned dstReg = dst.getReg();
+
+  preloadRegister(MI.getParent()->getParent(), TM.getInstrInfo(), newRegister,
+                  dstReg);
+}
+
+bool R600LowerShaderInstructionsPass::lowerSTORE_OUTPUT(MachineInstr &MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
+{
+  MachineOperand &valueOp = MI.getOperand(1);
+  MachineOperand &indexOp = MI.getOperand(2);
+  unsigned valueReg = valueOp.getReg();
+  int64_t outputIndex = indexOp.getImm();
+  const TargetRegisterClass * outputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = outputClass->getRegister(outputIndex);
+
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::COPY),
+                  newRegister)
+                  .addReg(valueReg);
+
+  if (!MRI->isLiveOut(newRegister))
+    MRI->addLiveOut(newRegister);
+
+  return true;
+
+}
diff --git a/src/gallium/drivers/radeon/R600OpenCLUtils.h b/src/gallium/drivers/radeon/R600OpenCLUtils.h
new file mode 100644 (file)
index 0000000..91e41d6
--- /dev/null
@@ -0,0 +1,49 @@
+//===-- R600OpenCLUtils.h - Helpers for identifying OpenCL kernels -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+#ifndef OPENCLUTILS_H
+#define OPENCLUTILS_H
+
+#include "llvm/Function.h"
+
+#include <llvm/Module.h>
+
+static bool isOpenCLKernel(const llvm::Function* fun)
+{
+  llvm::Module *mod = const_cast<llvm::Function*>(fun)->getParent();
+  llvm::NamedMDNode * md = mod->getOrInsertNamedMetadata("opencl.kernels");
+
+  if (!md || !md->getNumOperands())
+  {
+    return false;
+  }
+
+  for (int i = 0; i < int(md->getNumOperands()); i++)
+  {
+    if (!md->getOperand(i) || !md->getOperand(i)->getOperand(0))
+    {
+      continue;
+    }
+
+    assert(md->getOperand(i)->getNumOperands() == 1);
+
+    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+#endif
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.cpp b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
new file mode 100644 (file)
index 0000000..96507b1
--- /dev/null
@@ -0,0 +1,102 @@
+//===-- R600RegisterInfo.cpp - R600 implementation of AMDGPURegisterInfo -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPURegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
+
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AMDIL::ZERO);
+  Reserved.set(AMDIL::HALF);
+  Reserved.set(AMDIL::ONE);
+  Reserved.set(AMDIL::ONE_INT);
+  Reserved.set(AMDIL::NEG_HALF);
+  Reserved.set(AMDIL::NEG_ONE);
+  Reserved.set(AMDIL::PV_X);
+  Reserved.set(AMDIL::ALU_LITERAL_X);
+
+  for (TargetRegisterClass::iterator I = AMDIL::R600_CReg32RegClass.begin(),
+                        E = AMDIL::R600_CReg32RegClass.end(); I != E; ++I) {
+    Reserved.set(*I);
+  }
+
+  for (MachineFunction::const_iterator BB = MF.begin(),
+                                 BB_E = MF.end(); BB != BB_E; ++BB) {
+    const MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+                                                                  I != E; ++I) {
+      const MachineInstr &MI = *I;
+      if (MI.getOpcode() == AMDIL::RESERVE_REG) {
+        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) {
+          Reserved.set(MI.getOperand(0).getReg());
+        }
+      }
+    }
+  }
+  return Reserved;
+}
+
+const TargetRegisterClass *
+R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const
+{
+  switch (rc->getID()) {
+  case AMDIL::GPRV4F32RegClassID:
+  case AMDIL::GPRV4I32RegClassID:
+    return &AMDIL::R600_Reg128RegClass;
+  case AMDIL::GPRF32RegClassID:
+  case AMDIL::GPRI32RegClassID:
+    return &AMDIL::R600_Reg32RegClass;
+  default: return rc;
+  }
+}
+
+unsigned R600RegisterInfo::getHWRegIndex(unsigned reg) const
+{
+  switch(reg) {
+  case AMDIL::ZERO: return 248;
+  case AMDIL::ONE:
+  case AMDIL::NEG_ONE: return 249;
+  case AMDIL::ONE_INT: return 250;
+  case AMDIL::HALF:
+  case AMDIL::NEG_HALF: return 252;
+  case AMDIL::ALU_LITERAL_X: return 253;
+  default: return getHWRegIndexGen(reg);
+  }
+}
+
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const
+{
+  switch(reg) {
+  case AMDIL::ZERO:
+  case AMDIL::ONE:
+  case AMDIL::ONE_INT:
+  case AMDIL::NEG_ONE:
+  case AMDIL::HALF:
+  case AMDIL::NEG_HALF:
+  case AMDIL::ALU_LITERAL_X:
+    return 0;
+  default: return getHWRegChanGen(reg);
+  }
+}
+
+#include "R600HwRegInfo.include"
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.h b/src/gallium/drivers/radeon/R600RegisterInfo.h
new file mode 100644 (file)
index 0000000..95a44f9
--- /dev/null
@@ -0,0 +1,44 @@
+//===-- R600RegisterInfo.h - R600 register info interface -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600REGISTERINFO_H_
+#define R600REGISTERINFO_H_
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDILRegisterInfo.h"
+
+namespace llvm {
+
+  class R600TargetMachine;
+  class TargetInstrInfo;
+
+  struct R600RegisterInfo : public AMDGPURegisterInfo
+  {
+    AMDGPUTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+    virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+    virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass * rc) const;
+    unsigned getHWRegIndex(unsigned reg) const;
+    unsigned getHWRegChan(unsigned reg) const;
+private:
+    unsigned getHWRegChanGen(unsigned reg) const;
+    unsigned getHWRegIndexGen(unsigned reg) const;
+  };
+} // End namespace llvm
+
+#endif // R600REGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/R600Schedule.td b/src/gallium/drivers/radeon/R600Schedule.td
new file mode 100644 (file)
index 0000000..c6b1ca6
--- /dev/null
@@ -0,0 +1,34 @@
+//===-- R600Schedule.td - R600 instruction itineraries -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+
+def R600_EG_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>
+  ]
+>;
diff --git a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
new file mode 100644 (file)
index 0000000..b0bdf70
--- /dev/null
@@ -0,0 +1,110 @@
+//===-- SIAssignInterpRegs.cpp - Assign SI interpolation registers -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SIAssignInterpRegsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIAssignInterpRegsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "SI Assign interpolation registers"; }
+  };
+} // End anonymous namespace
+
+char SIAssignInterpRegsPass::ID = 0;
+
+#define INTERP_VALUES 16
+
+struct interp_info {
+  bool enabled;
+  unsigned regs[3];
+  unsigned reg_count;
+};
+
+
+FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
+  return new SIAssignInterpRegsPass(tm);
+}
+
+bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF)
+{
+
+  struct interp_info InterpUse[INTERP_VALUES] = {
+    {false, {AMDIL::PERSP_SAMPLE_I, AMDIL::PERSP_SAMPLE_J}, 2},
+    {false, {AMDIL::PERSP_CENTER_I, AMDIL::PERSP_CENTER_J}, 2},
+    {false, {AMDIL::PERSP_CENTROID_I, AMDIL::PERSP_CENTROID_J}, 2},
+    {false, {AMDIL::PERSP_I_W, AMDIL::PERSP_J_W, AMDIL::PERSP_1_W}, 3},
+    {false, {AMDIL::LINEAR_SAMPLE_I, AMDIL::LINEAR_SAMPLE_J}, 2},
+    {false, {AMDIL::LINEAR_CENTER_I, AMDIL::LINEAR_CENTER_J}, 2},
+    {false, {AMDIL::LINEAR_CENTROID_I, AMDIL::LINEAR_CENTROID_J}, 2},
+    {false, {AMDIL::LINE_STIPPLE_TEX_COORD}, 1},
+    {false, {AMDIL::POS_X_FLOAT}, 1},
+    {false, {AMDIL::POS_Y_FLOAT}, 1},
+    {false, {AMDIL::POS_Z_FLOAT}, 1},
+    {false, {AMDIL::POS_W_FLOAT}, 1},
+    {false, {AMDIL::FRONT_FACE}, 1},
+    {false, {AMDIL::ANCILLARY}, 1},
+    {false, {AMDIL::SAMPLE_COVERAGE}, 1},
+    {false, {AMDIL::POS_FIXED_PT}, 1}
+  };
+
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  /* First pass, mark the interpolation values that are used. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                               reg_idx++) {
+      InterpUse[interp_idx].enabled |=
+                            !MRI.use_empty(InterpUse[interp_idx].regs[reg_idx]);
+    }
+  }
+
+  unsigned used_vgprs = 0;
+
+  /* Second pass, replace with VGPRs. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    if (!InterpUse[interp_idx].enabled) {
+      continue;
+    }
+    MFI->spi_ps_input_addr |= (1 << interp_idx);
+
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                  reg_idx++, used_vgprs++) {
+      unsigned new_reg = AMDIL::VReg_32RegisterClass->getRegister(used_vgprs);
+      unsigned virt_reg = MRI.createVirtualRegister(AMDIL::VReg_32RegisterClass);
+      MRI.replaceRegWith(InterpUse[interp_idx].regs[reg_idx], virt_reg);
+      AMDGPU::utilAddLiveIn(&MF, MRI, TM.getInstrInfo(), new_reg, virt_reg);
+    }
+  }
+
+  return false;
+}
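The two passes above can be modeled in isolation: pass one marks an interpolation value enabled when any of its registers is used, pass two ORs bit i into spi_ps_input_addr and hands out VGPRs to the enabled entries starting from VGPR0. A standalone sketch, with hypothetical names that are not part of the driver:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Standalone model of the two-pass assignment above (hypothetical helper).
// Each entry is (enabled, register count); returns the SPI_PS_INPUT_ADDR
// enable mask and the number of VGPRs consumed, packed from VGPR0 upward.
static std::pair<uint32_t, unsigned>
assign_interp(const std::vector<std::pair<bool, unsigned>> &use) {
  uint32_t spi_ps_input_addr = 0;
  unsigned used_vgprs = 0;
  for (unsigned i = 0; i < use.size(); i++) {
    if (!use[i].first)
      continue;
    spi_ps_input_addr |= 1u << i;  // bit i enables interpolation value i
    used_vgprs += use[i].second;   // its registers occupy the next VGPRs
  }
  return std::make_pair(spi_ps_input_addr, used_vgprs);
}
```

For example, enabling PERSP_SAMPLE (2 regs) and PERSP_PULL_MODEL-style 3-reg entries while skipping one in between yields mask bits 0 and 2 and five VGPRs.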
diff --git a/src/gallium/drivers/radeon/SICodeEmitter.cpp b/src/gallium/drivers/radeon/SICodeEmitter.cpp
new file mode 100644 (file)
index 0000000..0553f0e
--- /dev/null
@@ -0,0 +1,274 @@
+//===-- SICodeEmitter.cpp - SI machine code emitter -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILCodeEmitter.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <stdio.h>
+
+#define LITERAL_REG 255
+#define VGPR_BIT(src_idx) (1 << (8 * (src_idx)))
+using namespace llvm;
+
+namespace {
+
+  class SICodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter {
+
+  private:
+    static char ID;
+    formatted_raw_ostream &_OS;
+    const TargetMachine *TM;
+    void emitState(MachineFunction & MF);
+    void emitInstr(MachineInstr &MI);
+
+    void outputBytes(uint64_t value, unsigned bytes);
+    unsigned GPRAlign(const MachineInstr &MI, unsigned OpNo, unsigned shift)
+                                                                      const;
+
+  public:
+    SICodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
+        _OS(OS), TM(NULL) { }
+    const char *getPassName() const { return "SI Code Emitter"; }
+    bool runOnMachineFunction(MachineFunction &MF);
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                       const MachineOperand &MO) const;
+    virtual unsigned GPR4AlignEncode(const MachineInstr  &MI, unsigned OpNo)
+                                                                      const;
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const;
+  };
+}
+
+char SICodeEmitter::ID = 0;
+
+FunctionPass *llvm::createSICodeEmitterPass(formatted_raw_ostream &OS) {
+  return new SICodeEmitter(OS);
+}
+
+void SICodeEmitter::emitState(MachineFunction & MF)
+{
+  unsigned maxSGPR = 0;
+  unsigned maxVGPR = 0;
+  bool VCCUsed = false;
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        MachineOperand & MO = MI.getOperand(op_idx);
+        unsigned maxUsed;
+        unsigned width = 0;
+        bool isSGPR = false;
+        unsigned reg;
+        unsigned hwReg;
+        if (!MO.isReg()) {
+          continue;
+        }
+        reg = MO.getReg();
+        if (reg == AMDIL::VCC) {
+          VCCUsed = true;
+          continue;
+        }
+        if (AMDIL::SReg_32RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 1;
+        } else if (AMDIL::VReg_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDIL::SReg_64RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 2;
+        } else if (AMDIL::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDIL::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDIL::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDIL::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else {
+          assert(!"Unknown register class");
+        }
+        hwReg = RI->getHWRegNum(reg);
+        maxUsed = ((hwReg + 1) * width) - 1;
+        if (isSGPR) {
+          maxSGPR = maxUsed > maxSGPR ? maxUsed : maxSGPR;
+        } else {
+          maxVGPR = maxUsed > maxVGPR ? maxUsed : maxVGPR;
+        }
+      }
+    }
+  }
+  if (VCCUsed) {
+    maxSGPR += 2;
+  }
+  outputBytes(maxSGPR + 1, 4);
+  outputBytes(maxVGPR + 1, 4);
+  outputBytes(MFI->spi_ps_input_addr, 4);
+}
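emitState() derives the SGPR/VGPR program sizes from the highest 32-bit register index any operand touches. A minimal model of that arithmetic, using hypothetical helpers; it assumes getHWRegNum() returns an index in units of the class width, which is what the (hwReg + 1) * width - 1 expression implies:

```cpp
#include <cassert>

// Highest 32-bit register index touched by an operand whose class is
// `width` registers wide and whose hardware index is hwReg
// (hypothetical helper modeling emitState() above).
static unsigned last_index(unsigned hwReg, unsigned width) {
  return (hwReg + 1) * width - 1;
}

// Convert the highest used SGPR index into the count written to the
// shader header; VCC reserves two additional SGPRs when used.
static unsigned sgpr_count(unsigned maxSGPR, bool vccUsed) {
  if (vccUsed)
    maxSGPR += 2;
  return maxSGPR + 1;  // indices are zero-based, the header wants a count
}
```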
+
+bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
+{
+  MF.dump();
+  TM = &MF.getTarget();
+  const AMDGPUInstrInfo * TII =
+                        static_cast<const AMDGPUInstrInfo*>(TM->getInstrInfo());
+
+  emitState(MF);
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (!TII->isRegPreload(MI) && MI.getOpcode() != AMDIL::KILL
+          && MI.getOpcode() != AMDIL::RETURN) {
+        emitInstr(MI);
+      }
+    }
+  }
+  return false;
+}
+
+void SICodeEmitter::emitInstr(MachineInstr &MI)
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+
+  uint64_t hwInst = getBinaryCodeForInstr(MI);
+
+  if ((hwInst & 0xffffffff) == 0xffffffff) {
+    fprintf(stderr, "Unsupported Instruction: \n");
+    MI.dump();
+    abort();
+  }
+
+//  hwInst |= SII->getBinaryCode(MI);
+
+  unsigned bytes = SII->getEncodingBytes(MI);
+  outputBytes(hwInst, bytes);
+}
+
+uint64_t SICodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                          const MachineOperand &MO) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+
+  switch(MO.getType()) {
+  case MachineOperand::MO_Register:
+    return RI->getBinaryCode(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MO.getImm();
+
+  case MachineOperand::MO_FPImmediate:
+    /* XXX: Not all instructions can use inline literals */
+    /* XXX: We should make sure this is a 32-bit constant */
+    return LITERAL_REG | (MO.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue() << 32);
+  default:
+    llvm_unreachable("Encoding of this operand type is not supported yet.");
+    break;
+  }
+}
+
+unsigned SICodeEmitter::GPRAlign(const MachineInstr &MI, unsigned OpNo,
+    unsigned shift) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  unsigned regCode = RI->getHWRegNum(MI.getOperand(OpNo).getReg());
+  return regCode >> shift;
+}
+
+unsigned SICodeEmitter::GPR4AlignEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 2);
+}
+
+unsigned SICodeEmitter::GPR2AlignEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 1);
+}
+
+uint64_t SICodeEmitter::i32LiteralEncode(const MachineInstr &MI,
+    unsigned OpNo) const
+{
+  return LITERAL_REG | (MI.getOperand(OpNo).getImm() << 32);
+}
+
+/* Set the "VGPR" bit for VOP args that can take either a VGPR or a SGPR.
+ * XXX: It would be nice if we could handle this without a PostEncode function.
+ */
+uint64_t SICodeEmitter::VOPPostEncode(const MachineInstr &MI,
+    uint64_t Value) const
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+  unsigned encodingType = SII->getEncodingType(MI);
+  unsigned numSrcOps;
+  unsigned vgprBitOffset;
+
+  if (encodingType == SIInstrEncodingType::VOP3) {
+    numSrcOps = 3;
+    vgprBitOffset = 32;
+  } else {
+    numSrcOps = 1;
+    vgprBitOffset = 0;
+  }
+
+  /* Add one to skip over the destination reg operand. */
+  for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) {
+    if (!MI.getOperand(opIdx).isReg()) {
+      continue;
+    }
+    unsigned reg = MI.getOperand(opIdx).getReg();
+    if (AMDIL::VReg_32RegClass.contains(reg)
+        || AMDIL::VReg_64RegClass.contains(reg)) {
+      Value |= ((uint64_t)VGPR_BIT(opIdx)) << vgprBitOffset;
+    }
+  }
+  return Value;
+}
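The bit manipulation here is compact enough to be easy to misread; a standalone model (hypothetical helper, not driver code): source operand src_idx gets a "this operand is a VGPR" flag at bit 8 * src_idx, shifted up by 32 for the wider VOP3 encoding.

```cpp
#include <cassert>
#include <cstdint>

// Model of the VGPR flag placement in VOPPostEncode() above
// (hypothetical helper).  VGPR_BIT(src_idx) == 1 << (8 * src_idx);
// VOP3 encodings carry the flags in the upper 32 bits.
static uint64_t vgpr_flag(unsigned src_idx, bool vop3) {
  uint64_t bit = 1ull << (8 * src_idx);
  return vop3 ? bit << 32 : bit;
}
```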
+
+
+void SICodeEmitter::outputBytes(uint64_t value, unsigned bytes)
+{
+  for (unsigned i = 0; i < bytes; i++) {
+    _OS.write((uint8_t) ((value >> (8 * i)) & 0xff));
+  }
+}
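outputBytes() streams each encoded word least-significant byte first. The same split, modeled as a standalone function for clarity (hypothetical helper, not driver code):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Standalone model of outputBytes() above (hypothetical helper): emit
// `bytes` bytes of `value`, least-significant byte first.
static std::vector<uint8_t> to_bytes(uint64_t value, unsigned bytes) {
  std::vector<uint8_t> out;
  for (unsigned i = 0; i < bytes; i++)
    out.push_back((uint8_t)((value >> (8 * i)) & 0xff));
  return out;
}
```

A 4-byte emission of 0x12345678 produces 0x78, 0x56, 0x34, 0x12 in that order.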
diff --git a/src/gallium/drivers/radeon/SIConvertToISA.cpp b/src/gallium/drivers/radeon/SIConvertToISA.cpp
new file mode 100644 (file)
index 0000000..44e6539
--- /dev/null
@@ -0,0 +1,89 @@
+//===-- SIConvertToISA.cpp - Lower AMDIL pseudo-instructions to SI ISA ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SIConvertToISAPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    void convertVCREATE_v4f32(MachineInstr &MI, MachineBasicBlock::iterator I,
+                              MachineBasicBlock &MBB, MachineFunction &MF);
+
+  public:
+    SIConvertToISAPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char SIConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createSIConvertToISAPass(TargetMachine &tm) {
+  return new SIConvertToISAPass(tm);
+}
+
+bool SIConvertToISAPass::runOnMachineFunction(MachineFunction &MF)
+{
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      default: continue;
+      case AMDIL::VCREATE_v4f32: convertVCREATE_v4f32(MI, I, MBB, MF);
+
+      }
+      MI.removeFromParent();
+    }
+  }
+  return false;
+}
+
+void SIConvertToISAPass::convertVCREATE_v4f32(MachineInstr &MI,
+    MachineBasicBlock::iterator I, MachineBasicBlock &MBB, MachineFunction &MF)
+{
+  MachineInstrBuilder implicitDef;
+  MachineInstrBuilder insertSubreg;
+  MachineRegisterInfo & MRI = MF.getRegInfo();
+  unsigned tmp = MRI.createVirtualRegister(&AMDIL::VReg_128RegClass);
+
+  implicitDef = BuildMI(MF, MBB.findDebugLoc(I),
+                        TM.getInstrInfo()->get(AMDIL::IMPLICIT_DEF), tmp);
+
+  MRI.setRegClass(MI.getOperand(1).getReg(), &AMDIL::VReg_32RegClass);
+  insertSubreg = BuildMI(MF, MBB.findDebugLoc(I),
+                        TM.getInstrInfo()->get(AMDIL::INSERT_SUBREG))
+                        .addOperand(MI.getOperand(0))
+                        .addReg(tmp)
+                        .addOperand(MI.getOperand(1))
+                        .addImm(AMDIL::sel_x);
+
+  MBB.insert(I, implicitDef);
+  MBB.insert(I, insertSubreg);
+}
diff --git a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
new file mode 100644 (file)
index 0000000..644daa1
--- /dev/null
@@ -0,0 +1,278 @@
+#===-- SIGenRegisterInfo.pl - Generate SI register definitions -------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===----------------------------------------------------------------------===#
+#
+# TODO: Add full description
+#
+#===----------------------------------------------------------------------===#
+
+
+use strict;
+use warnings;
+
+my $SGPR_COUNT = 104;
+my $VGPR_COUNT = 256;
+
+my $SGPR_MAX_IDX = $SGPR_COUNT - 1;
+my $VGPR_MAX_IDX = $VGPR_COUNT - 1;
+
+my $INDEX_FILE = defined($ARGV[0]) ? $ARGV[0] : '';
+
+print <<STRING;
+
+let Namespace = "AMDIL" in {
+  def low : SubRegIndex;
+  def high : SubRegIndex;
+
+  def sub0 : SubRegIndex;
+  def sub1 : SubRegIndex;
+  def sub2 : SubRegIndex;
+  def sub3 : SubRegIndex;
+  def sub4 : SubRegIndex;
+  def sub5 : SubRegIndex;
+  def sub6 : SubRegIndex;
+  def sub7 : SubRegIndex;
+}
+
+class SIReg <string n> : Register<n> {
+  let Namespace = "AMDIL";
+}
+
+class SI_64 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDIL";
+  let SubRegIndices = [low, high];
+}
+
+class SI_128 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDIL";
+  let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
+}
+
+class SI_256 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDIL";
+  let SubRegIndices = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
+}
+
+class SGPR_32 <bits<8> num, string name> : SIReg<name> {
+  field bits<8> Num;
+
+  let Num = num;
+}
+
+
+class VGPR_32 <bits<9> num, string name, Register gprf32_alias> : SIReg<name> {
+  field bits<9> Num;
+
+  let Num = num;
+  let Aliases = [gprf32_alias];
+}
+
+class SGPR_64 <bits<8> num, string name, list<Register> subregs> :
+    SI_64 <name, subregs>;
+
+class VGPR_64 <bits<9> num, string name, list<Register> subregs> :
+    SI_64 <name, subregs>;
+
+class SGPR_128 <bits<8> num, string name, list<Register> subregs> :
+    SI_128 <name, subregs>;
+
+class VGPR_128 <bits<9> num, string name, list<Register> subregs> :
+    SI_128 <name, subregs>;
+
+class SGPR_256 <bits<8> num, string name, list<Register> subregs> :
+    SI_256 <name, subregs>;
+
+def VCC : SIReg<"VCC">;
+def SCC : SIReg<"SCC">;
+def SREG_LIT_0 : SIReg <"S LIT 0">;
+
+def M0 : SIReg <"M0">;
+
+//Interpolation registers
+
+def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
+def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
+def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
+def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
+def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
+def PERSP_CENTROID_J : SIReg <"PERSP_CENTROID_J">;
+def PERSP_I_W : SIReg <"PERSP_I_W">;
+def PERSP_J_W : SIReg <"PERSP_J_W">;
+def PERSP_1_W : SIReg <"PERSP_1_W">;
+def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
+def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
+def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
+def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
+def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
+def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
+def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
+def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
+def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
+def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
+def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
+def FRONT_FACE : SIReg <"FRONT_FACE">;
+def ANCILLARY : SIReg <"ANCILLARY">;
+def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
+def POS_FIXED_PT : SIReg <"POS_FIXED_PT">;
+
+STRING
+
+# 32-bit registers
+
+my @SGPR;
+for (my $i = 0; $i < $SGPR_COUNT; $i++) {
+  print "def SGPR$i : SGPR_32 <$i, \"SGPR$i\">;\n";
+  $SGPR[$i] = "SGPR$i";
+}
+
+my @VGPR;
+my @GPRF32;
+for (my $i = 0; $i < $VGPR_COUNT; $i++) {
+  my $gprf32_num = $i + 1;
+  my $gprf32_name = "R$gprf32_num";
+  print "def VGPR$i : VGPR_32 <$i, \"VGPR$i\", $gprf32_name>;\n";
+  $VGPR[$i] = "VGPR$i";
+  $GPRF32[$i] = $gprf32_name;
+}
+
+print <<STRING;
+
+def SReg_32 : RegisterClass<"AMDIL", [f32, i32], 32,
+    (add (sequence "SGPR%u", 0, $SGPR_MAX_IDX),  SREG_LIT_0, M0)
+>;
+
+def VReg_32 : RegisterClass<"AMDIL", [f32, i32], 32,
+    (add (sequence "VGPR%u", 0, $VGPR_MAX_IDX),
+    PERSP_SAMPLE_I, PERSP_SAMPLE_J,
+    PERSP_CENTER_I, PERSP_CENTER_J,
+    PERSP_CENTROID_I, PERSP_CENTROID_J,
+    PERSP_I_W, PERSP_J_W, PERSP_1_W,
+    LINEAR_SAMPLE_I, LINEAR_SAMPLE_J,
+    LINEAR_CENTER_I, LINEAR_CENTER_J,
+    LINEAR_CENTROID_I, LINEAR_CENTROID_J,
+    LINE_STIPPLE_TEX_COORD,
+    POS_X_FLOAT,
+    POS_Y_FLOAT,
+    POS_Z_FLOAT,
+    POS_W_FLOAT,
+    FRONT_FACE,
+    ANCILLARY,
+    SAMPLE_COVERAGE,
+    POS_FIXED_PT
+    )
+>;
+
+def AllReg_32 : RegisterClass<"AMDIL", [f32, i32], 32,
+    (add VReg_32,
+         SReg_32,
+         (sequence "R%u", 1, $VGPR_COUNT))
+>;
+
+def CCReg : RegisterClass<"AMDIL", [f32], 32, (add VCC, SCC)>;
+
+STRING
+
+my @subregs_64 = ('low', 'high');
+my @subregs_128 = ('sel_x', 'sel_y', 'sel_z', 'sel_w');
+my @subregs_256 = ('sub0', 'sub1', 'sub2', 'sub3', 'sub4', 'sub5', 'sub6', 'sub7');
+
+my @SGPR64 = print_sgpr_class(64, \@subregs_64, ('i64', 'iPTRAny'));
+my @SGPR128 = print_sgpr_class(128, \@subregs_128, ('v4f32'));
+my @SGPR256 = print_sgpr_class(256, \@subregs_256, ('v8i32'));
+
+my @VGPR64 = print_vgpr_class(64, \@subregs_64, ('i64'));
+my @VGPR128 = print_vgpr_class(128, \@subregs_128, ('v4f32'));
+
+
+my $sgpr64_list = join(',', @SGPR64);
+my $vgpr64_list = join(',', @VGPR64);
+print <<STRING;
+
+def AllReg_64 : RegisterClass<"AMDIL", [f64, i64], 64,
+    (add $sgpr64_list, $vgpr64_list)
+>;
+
+STRING
+
+if ($INDEX_FILE ne '') {
+  open(my $fh, ">", $INDEX_FILE);
+  my %hw_values;
+
+  for (my $i = 0; $i <= $#SGPR; $i++) {
+    push (@{$hw_values{$i}}, $SGPR[$i]);
+  }
+
+  for (my $i = 0; $i <= $#SGPR64; $i++) {
+    push (@{$hw_values{$i * 2}}, $SGPR64[$i]);
+  }
+
+  for (my $i = 0; $i <= $#SGPR128; $i++) {
+    push (@{$hw_values{$i * 4}}, $SGPR128[$i]);
+  }
+
+  for (my $i = 0; $i <= $#SGPR256; $i++) {
+    push (@{$hw_values{$i * 8}}, $SGPR256[$i]);
+  }
+
+  for (my $i = 0; $i <= $#VGPR; $i++) {
+    push (@{$hw_values{$i}}, $VGPR[$i]);
+  }
+  for (my $i = 0; $i <= $#VGPR64; $i++) {
+    push (@{$hw_values{$i * 2}}, $VGPR64[$i]);
+  }
+
+  for (my $i = 0; $i <= $#VGPR128; $i++) {
+    push (@{$hw_values{$i * 4}}, $VGPR128[$i]);
+  }
+
+
+  print $fh "unsigned SIRegisterInfo::getHWRegNum(unsigned reg) const\n{\n  switch(reg) {\n";
+  for my $key (keys(%hw_values)) {
+    my @names = @{$hw_values{$key}};
+    for my $regname (@names) {
+      print $fh "  case AMDIL::$regname:\n"
+    }
+    print $fh "    return $key;\n";
+  }
+  print $fh "  default: return 0;\n  }\n}\n"
+}
+
+
+
+
+sub print_sgpr_class {
+  my ($reg_width, $sub_reg_ref, @types) = @_;
+  return print_reg_class('SReg', 'SGPR', $reg_width, $SGPR_COUNT, $sub_reg_ref, @types);
+}
+
+sub print_vgpr_class {
+  my ($reg_width, $sub_reg_ref, @types) = @_;
+  return print_reg_class('VReg', 'VGPR', $reg_width, $VGPR_COUNT, $sub_reg_ref, @types);
+}
+
+sub print_reg_class {
+  my ($class_prefix, $reg_prefix, $reg_width, $reg_count, $sub_reg_ref, @types) = @_;
+  my @registers;
+  my $component_count = $reg_width / 32;
+
+  for (my $i = 0; $i < $reg_count; $i += $component_count) {
+    my $reg_name = $reg_prefix . $i . '_' . $reg_width;
+    my @sub_regs;
+    for (my $idx = 0; $idx < $component_count; $idx++) {
+      my $sub_idx = $i + $idx;
+      push(@sub_regs, $reg_prefix . $sub_idx);
+    }
+    print "def $reg_name : $reg_prefix\_$reg_width <$i, \"$reg_name\", [ ", join(',', @sub_regs) , "]>;\n";
+    push (@registers, $reg_name);
+  }
+  my $reg_list = join(', ', @registers);
+
+  print "def $class_prefix\_$reg_width : RegisterClass<\"AMDIL\", [" . join (', ', @types) . "], $reg_width,\n  (add $reg_list)\n>{\n";
+  print "  let SubRegClasses = [($class_prefix\_", ($reg_width / $component_count) , ' ', join(', ', @{$sub_reg_ref}), ")];\n}\n";
+  return @registers;
+}
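SIGenRegisterInfo.pl builds each wider register out of consecutive 32-bit components: a `regWidth`-bit register starts at every `regWidth/32`-th index, and the hardware number emitted into the generated getHWRegNum() is the index of the register's first 32-bit component. A minimal C++ sketch of that grouping and numbering scheme (the helper names here are illustrative, not part of the generator):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Mirror of print_reg_class: a regWidth-bit register is built from
// regWidth/32 consecutive 32-bit components, so groups start at indices
// 0, C, 2C, ... and are named like "SGPR6_64".
std::vector<std::string> regClassNames(const std::string &prefix,
                                       unsigned regCount, unsigned regWidth) {
  unsigned components = regWidth / 32;
  std::vector<std::string> names;
  for (unsigned i = 0; i < regCount; i += components)
    names.push_back(prefix + std::to_string(i) + "_" +
                    std::to_string(regWidth));
  return names;
}

// Mirror of the generated getHWRegNum(): the k-th register of a class
// whose members span `components` 32-bit slots starts at slot k * components.
unsigned hwRegNum(unsigned k, unsigned components) {
  return k * components;
}
```

For example, 104 SGPRs yield 52 64-bit pairs, and the fourth pair (index 3) is SGPR6_64 with hardware number 6, matching the `$i * 2` key used when the index file is written.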
diff --git a/src/gallium/drivers/radeon/SIISelLowering.cpp b/src/gallium/drivers/radeon/SIISelLowering.cpp
new file mode 100644 (file)
index 0000000..1a4b47e
--- /dev/null
@@ -0,0 +1,151 @@
+//===-- SIISelLowering.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIISelLowering.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+SITargetLowering::SITargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo()))
+{
+  addRegisterClass(MVT::v4f32, &AMDIL::VReg_128RegClass);
+  addRegisterClass(MVT::f32, &AMDIL::VReg_32RegClass);
+
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
+}
+
+MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const
+{
+  const struct TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
+  MachineBasicBlock::iterator I = MI;
+
+  if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
+    AppendS_WAITCNT(MI, *BB, llvm::next(I));
+  }
+
+  switch (MI->getOpcode()) {
+  default:
+    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  case AMDIL::SI_INTERP:
+    LowerSI_INTERP(MI, *BB, I, MRI);
+    break;
+  case AMDIL::SI_INTERP_CONST:
+    LowerSI_INTERP_CONST(MI, *BB, I);
+    break;
+  case AMDIL::SI_V_CNDLT:
+    LowerSI_V_CNDLT(MI, *BB, I, MRI);
+    break;
+  case AMDIL::USE_SGPR_32:
+  case AMDIL::USE_SGPR_64:
+    lowerUSE_SGPR(MI, BB->getParent(), MRI);
+    MI->eraseFromParent();
+    break;
+  case AMDIL::VS_LOAD_BUFFER_INDEX:
+    addLiveIn(MI, BB->getParent(), MRI, TII, AMDIL::VGPR0);
+    MI->eraseFromParent();
+    break;
+  }
+  return BB;
+}
+
+void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I) const
+{
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_WAITCNT))
+          .addImm(0);
+}
+
+void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
+{
+  unsigned tmp = MRI.createVirtualRegister(&AMDIL::VReg_32RegClass);
+  MachineOperand dst = MI->getOperand(0);
+  MachineOperand iReg = MI->getOperand(1);
+  MachineOperand jReg = MI->getOperand(2);
+  MachineOperand attr_chan = MI->getOperand(3);
+  MachineOperand attr = MI->getOperand(4);
+  MachineOperand params = MI->getOperand(5);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_MOV_B32))
+          .addReg(AMDIL::M0)
+          .addOperand(params);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_P1_F32), tmp)
+          .addOperand(iReg)
+          .addOperand(attr_chan)
+          .addOperand(attr);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_P2_F32))
+          .addOperand(dst)
+          .addReg(tmp)
+          .addOperand(jReg)
+          .addOperand(attr_chan)
+          .addOperand(attr);
+
+  MI->eraseFromParent();
+}
+
+void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI,
+    MachineBasicBlock &BB, MachineBasicBlock::iterator I) const
+{
+  MachineOperand dst = MI->getOperand(0);
+  MachineOperand attr_chan = MI->getOperand(1);
+  MachineOperand attr = MI->getOperand(2);
+  MachineOperand params = MI->getOperand(3);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_MOV_B32))
+          .addReg(AMDIL::M0)
+          .addOperand(params);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_MOV_F32))
+          .addOperand(dst)
+          .addOperand(attr_chan)
+          .addOperand(attr);
+
+  MI->eraseFromParent();
+}
+
+void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const
+{
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_CMP_LT_F32_e32))
+          .addOperand(MI->getOperand(1))
+          .addReg(AMDIL::SREG_LIT_0);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_CNDMASK_B32))
+          .addOperand(MI->getOperand(0))
+          .addOperand(MI->getOperand(2))
+          .addOperand(MI->getOperand(3));
+
+  MI->eraseFromParent();
+}
+
+void SITargetLowering::lowerUSE_SGPR(MachineInstr *MI,
+    MachineFunction * MF, MachineRegisterInfo & MRI) const
+{
+  const struct TargetInstrInfo * TII = getTargetMachine().getInstrInfo();
+  unsigned dstReg = MI->getOperand(0).getReg();
+  int64_t newIndex = MI->getOperand(1).getImm();
+  const TargetRegisterClass * dstClass = MRI.getRegClass(dstReg);
+
+  unsigned newReg = dstClass->getRegister(newIndex);
+  addLiveIn(MI, MF, MRI, TII, newReg); 
+}
+
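LowerSI_INTERP above expands SI_INTERP into a V_INTERP_P1_F32 / V_INTERP_P2_F32 pair through a temporary VGPR. A scalar model of what that pair computes, assuming the usual parameter layout of a base value P0 plus per-vertex deltas P10 and P20 (a sketch for illustration, not driver code):

```cpp
#include <cassert>

// V_INTERP_P1_F32: tmp = p10 * i + p0  (folds in the i barycentric coordinate)
float interpP1(float p0, float p10, float i) { return p10 * i + p0; }

// V_INTERP_P2_F32: dst = p20 * j + tmp (folds j into the P1 result)
float interpP2(float tmp, float p20, float j) { return p20 * j + tmp; }

// Together the two instructions evaluate p0 + i*p10 + j*p20, i.e. the
// attribute value at barycentric coordinates (i, j).
float interpolate(float p0, float p10, float p20, float i, float j) {
  return interpP2(interpP1(p0, p10, i), p20, j);
}
```

This also shows why the S_MOV_B32 into M0 precedes both instructions: VINTRP reads the attribute parameters it interpolates via M0.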
diff --git a/src/gallium/drivers/radeon/SIISelLowering.h b/src/gallium/drivers/radeon/SIISelLowering.h
new file mode 100644 (file)
index 0000000..e7a79f8
--- /dev/null
@@ -0,0 +1,44 @@
+//===-- SIISelLowering.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SIISELLOWERING_H
+#define SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+class SITargetLowering : public AMDGPUTargetLowering
+{
+  const SIInstrInfo * TII;
+
+  void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I) const;
+  void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I) const;
+  void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void lowerUSE_SGPR(MachineInstr *MI, MachineFunction * MF,
+                     MachineRegisterInfo & MRI) const;
+public:
+  SITargetLowering(TargetMachine &tm);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+                                              MachineBasicBlock * BB) const;
+};
+
+} // End namespace llvm
+
+#endif //SIISELLOWERING_H
diff --git a/src/gallium/drivers/radeon/SIInstrFormats.td b/src/gallium/drivers/radeon/SIInstrFormats.td
new file mode 100644 (file)
index 0000000..caf9b0e
--- /dev/null
@@ -0,0 +1,128 @@
+//===-- SIInstrFormats.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+class VOP3_32 <bits<9> op, string opName, list<dag> pattern>
+  : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, AllReg_32:$src1, AllReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
+
+class VOP3_64 <bits<9> op, string opName, list<dag> pattern>
+  : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, AllReg_64:$src1, AllReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>;
+
+
+class SOP1_32 <bits<8> op, string opName, list<dag> pattern>
+  : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>;
+
+class SOP1_64 <bits<8> op, string opName, list<dag> pattern>
+  : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>;
+
+class SOP2_32 <bits<7> op, string opName, list<dag> pattern>
+  : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
+
+class SOP2_64 <bits<7> op, string opName, list<dag> pattern>
+  : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
+
+class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
+                   string opName, list<dag> pattern> : 
+  VOP1 <
+    op, (outs vrc:$dst), (ins arc:$src0), opName, pattern
+  >;
+
+multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern,
+                    bits<16> amdil = AMDILInst.NONE> {
+
+  let AMDILOp = amdil in {
+    def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>;
+  }
+
+  def _e64 : VOP3_32 <
+    {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> {
+
+  def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>;
+
+  def _e64 : VOP3_64 <
+    {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
+                   string opName, list<dag> pattern> :
+  VOP2 <
+    op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern
+  >;
+
+multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern,
+                    bits<16> amdil = AMDILInst.NONE> {
+
+  let AMDILOp = amdil in {
+    def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>;
+  }
+
+  def _e64 : VOP3_32 <
+    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> {
+  def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>;
+
+  def _e64 : VOP3_64 <
+    {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+class SOPK_32 <bits<5> op, string opName, list<dag> pattern>
+  : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>;
+
+class SOPK_64 <bits<5> op, string opName, list<dag> pattern>
+  : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>;
+
+class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
+                 string opName, list<dag> pattern> :
+  VOPC <
+    op, (outs), (ins arc:$src0, vrc:$src1), opName, pattern
+  >;
+
+multiclass VOPC_32 <bits<8> op, string opName, list<dag> pattern> {
+
+  def _e32 : VOPC_Helper <op, VReg_32, AllReg_32, opName, pattern>;
+
+  def _e64 : VOP3_32 <
+    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> {
+
+  def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>;
+
+  def _e64 : VOP3_64 <
+    {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
+    opName, []
+  >;
+}
+
+class SOPC_32 <bits<7> op, string opName, list<dag> pattern>
+  : SOPC <op, (outs CCReg:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>;
+
+class SOPC_64 <bits<7> op, string opName, list<dag> pattern>
+  : SOPC <op, (outs CCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>;
+
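The _e64 definitions above reuse the VOP3 encoding, building its 9-bit opcode by prefixing fixed bits to the 32-bit opcode: {1, 1, op{6-0}} for VOP1, {1, 0, 0, op{5-0}} for VOP2, and {0, op{7-0}} for VOPC. The same bit arithmetic written out as plain C++ (helper names are illustrative):

```cpp
#include <cassert>

// VOP1 _e64: {1, 1, op{6-0}}  ->  0b11 in bits 8-7, opcode in bits 6-0.
unsigned vop3FromVOP1(unsigned op) { return 0x180 | (op & 0x7f); }

// VOP2 _e64: {1, 0, 0, op{5-0}}  ->  bit 8 set, opcode in bits 5-0.
unsigned vop3FromVOP2(unsigned op) { return 0x100 | (op & 0x3f); }

// VOPC _e64: {0, op{7-0}}  ->  the VOPC opcode is carried over unchanged.
unsigned vop3FromVOPC(unsigned op) { return op & 0xff; }
```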
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.cpp b/src/gallium/drivers/radeon/SIInstrInfo.cpp
new file mode 100644 (file)
index 0000000..6f92e96
--- /dev/null
@@ -0,0 +1,173 @@
+//===-- SIInstrInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+
+SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
+  : AMDGPUInstrInfo(tm),
+    RI(tm, *this),
+    TM(tm)
+    { }
+
+const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const
+{
+  return RI;
+}
+
+void
+SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const
+{
+  BuildMI(MBB, MI, DL, get(AMDIL::V_MOV_B32_e32), DestReg)
+   .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned SIInstrInfo::getEncodingType(const MachineInstr &MI) const
+{
+  return get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK;
+}
+
+unsigned SIInstrInfo::getEncodingBytes(const MachineInstr &MI) const
+{
+
+  /* Instructions with literal constants are expanded to 64 bits, and
+   * the constant is stored in bits [63:32] */
+  for (unsigned i = 0; i < MI.getNumOperands(); i++) {
+    if (MI.getOperand(i).getType() == MachineOperand::MO_FPImmediate) {
+      return 8;
+    }
+  }
+
+  /* This instruction always has a literal */
+  if (MI.getOpcode() == AMDIL::S_MOV_IMM_I32) {
+    return 8;
+  }
+
+  unsigned encoding_type = getEncodingType(MI);
+  switch (encoding_type) {
+    case SIInstrEncodingType::EXP:
+    case SIInstrEncodingType::LDS:
+    case SIInstrEncodingType::MUBUF:
+    case SIInstrEncodingType::MTBUF:
+    case SIInstrEncodingType::MIMG:
+    case SIInstrEncodingType::VOP3:
+      return 8;
+    default:
+      return 4;
+  }
+}
+
+MachineInstr * SIInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
+    DebugLoc DL) const
+{
+
+  switch (MI.getOpcode()) {
+    default: break;
+    case AMDIL::ABS_f32: return convertABS_f32(MI, MF, DL);
+    case AMDIL::CLAMP_f32: return convertCLAMP_f32(MI, MF, DL);
+  }
+
+  MachineInstr * newMI = AMDGPUInstrInfo::convertToISA(MI, MF, DL);
+  const MCInstrDesc &newDesc = get(newMI->getOpcode());
+
+  /* If this instruction was converted to a VOP3, we need to add the extra
+   * operands for abs, clamp, omod, and negate. */
+  if (getEncodingType(*newMI) == SIInstrEncodingType::VOP3
+      && newMI->getNumOperands() < newDesc.getNumOperands()) {
+    MachineInstrBuilder builder(newMI);
+    for (unsigned op_idx = newMI->getNumOperands();
+                  op_idx < newDesc.getNumOperands(); op_idx++) {
+      builder.addImm(0);
+    }
+  }
+  return newMI;
+}
+
+unsigned SIInstrInfo::getISAOpcode(unsigned AMDILopcode) const
+{
+  switch (AMDILopcode) {
+  case AMDIL::MAD_f32: return AMDIL::V_MAD_LEGACY_F32;
+  default: return AMDGPUInstrInfo::getISAOpcode(AMDILopcode);
+  }
+}
+
+MachineInstr * SIInstrInfo::convertABS_f32(MachineInstr & absInstr,
+    MachineFunction &MF, DebugLoc DL) const
+{
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineOperand &dst = absInstr.getOperand(0);
+
+  /* Convert the destination register to the VReg_32 class */
+  if (TargetRegisterInfo::isVirtualRegister(dst.getReg())) {
+    MRI.setRegClass(dst.getReg(), AMDIL::VReg_32RegisterClass);
+  }
+
+  return BuildMI(MF, DL, get(AMDIL::V_MOV_B32_e64))
+                 .addOperand(absInstr.getOperand(0))
+                 .addOperand(absInstr.getOperand(1))
+                /* VSRC1-2 are unused, but we still need to fill all the
+                 * operand slots, so we just reuse the VSRC0 operand */
+                 .addOperand(absInstr.getOperand(1))
+                 .addOperand(absInstr.getOperand(1))
+                 .addImm(1) // ABS
+                 .addImm(0) // CLAMP
+                 .addImm(0) // OMOD
+                 .addImm(0); // NEG
+}
+
+MachineInstr * SIInstrInfo::convertCLAMP_f32(MachineInstr & clampInstr,
+    MachineFunction &MF, DebugLoc DL) const
+{
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  /* XXX: HACK assume that low == zero and high == one for now until
+   * we have a way to propagate the immediates. */
+
+/*
+  uint32_t zero = (uint32_t)APFloat(0.0f).bitcastToAPInt().getZExtValue();
+  uint32_t one = (uint32_t)APFloat(1.0f).bitcastToAPInt().getZExtValue();
+  uint32_t low = clampInstr.getOperand(2).getImm();
+  uint32_t high = clampInstr.getOperand(3).getImm();
+*/
+//  if (low == zero && high == one) {
+  
+  /* Convert the destination register to the VReg_32 class */
+  if (TargetRegisterInfo::isVirtualRegister(clampInstr.getOperand(0).getReg())) {
+    MRI.setRegClass(clampInstr.getOperand(0).getReg(),
+                    AMDIL::VReg_32RegisterClass);
+  }
+  return BuildMI(MF, DL, get(AMDIL::V_MOV_B32_e64))
+           .addOperand(clampInstr.getOperand(0))
+           .addOperand(clampInstr.getOperand(1))
+          /* VSRC1-2 are unused, but we still need to fill all the
+           * operand slots, so we just reuse the VSRC0 operand */
+           .addOperand(clampInstr.getOperand(1))
+           .addOperand(clampInstr.getOperand(1))
+           .addImm(0) // ABS
+           .addImm(1) // CLAMP
+           .addImm(0) // OMOD
+           .addImm(0); // NEG
+//  } else {
+    /* XXX: Handle other cases */
+//    abort();
+//  }
+}
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.h b/src/gallium/drivers/radeon/SIInstrInfo.h
new file mode 100644 (file)
index 0000000..bd76c3f
--- /dev/null
@@ -0,0 +1,95 @@
+//===-- SIInstrInfo.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIINSTRINFO_H
+#define SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+  class SIInstrInfo : public AMDGPUInstrInfo {
+  private:
+    const SIRegisterInfo RI;
+    AMDGPUTargetMachine &TM;
+
+    MachineInstr * convertABS_f32(MachineInstr & absInstr, MachineFunction &MF,
+                                  DebugLoc DL) const;
+
+    MachineInstr * convertCLAMP_f32(MachineInstr & clampInstr,
+                                    MachineFunction &MF, DebugLoc DL) const;
+
+  public:
+    explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+
+    const SIRegisterInfo &getRegisterInfo() const;
+
+    virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+    unsigned getEncodingType(const MachineInstr &MI) const;
+
+    unsigned getEncodingBytes(const MachineInstr &MI) const;
+
+    uint64_t getBinaryCode(const MachineInstr &MI, bool encodOpcode = false) const;
+
+    virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF,
+                                        DebugLoc DL) const;
+
+    virtual unsigned getISAOpcode(unsigned AMDILopcode) const;
+
+  };
+
+} // End namespace llvm
+
+/* These must be kept in sync with SIInstructions.td and also the
+ * InstrEncodingInfo array in SIInstrInfo.cpp.
+ *
+ * NOTE: This enum is only used to identify the encoding type within LLVM;
+ * the actual encoding type that is part of the instruction format is
+ * different.
+ */
+namespace SIInstrEncodingType {
+  enum Encoding {
+    EXP = 0,
+    LDS = 1,
+    MIMG = 2,
+    MTBUF = 3,
+    MUBUF = 4,
+    SMRD = 5,
+    SOP1 = 6,
+    SOP2 = 7,
+    SOPC = 8,
+    SOPK = 9,
+    SOPP = 10,
+    VINTRP = 11,
+    VOP1 = 12,
+    VOP2 = 13,
+    VOP3 = 14,
+    VOPC = 15
+  };
+}
+
+#define SI_INSTR_FLAGS_ENCODING_MASK 0xf
+
+namespace SIInstrFlags {
+  enum Flags {
+    /* First 4 bits are the instruction encoding */
+    NEED_WAIT = 1 << 4
+  };
+}
+
+#endif //SIINSTRINFO_H
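The flag layout above packs the SIInstrEncodingType value into the low four TSFlags bits and NEED_WAIT into bit 4; SIInstrInfo::getEncodingType and getEncodingBytes key off those bits. A self-contained sketch of that decoding, with the constants copied from the header and the 8-byte set taken from getEncodingBytes (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

const uint64_t ENCODING_MASK = 0xf;   // SI_INSTR_FLAGS_ENCODING_MASK
const uint64_t NEED_WAIT = 1 << 4;    // SIInstrFlags::NEED_WAIT

// getEncodingType: the encoding lives in the low four TSFlags bits.
unsigned encodingType(uint64_t tsFlags) { return tsFlags & ENCODING_MASK; }

// EmitInstrWithCustomInserter tests this bit before appending S_WAITCNT.
bool needsWaitcnt(uint64_t tsFlags) { return (tsFlags & NEED_WAIT) != 0; }

// The 64-bit encodings from getEncodingBytes(); everything else is 32-bit.
unsigned encodingBytes(unsigned type) {
  switch (type) {
  case 0:  // EXP
  case 1:  // LDS
  case 2:  // MIMG
  case 3:  // MTBUF
  case 4:  // MUBUF
  case 14: // VOP3
    return 8;
  default:
    return 4;
  }
}
```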
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.td b/src/gallium/drivers/radeon/SIInstrInfo.td
new file mode 100644 (file)
index 0000000..ffa18d0
--- /dev/null
@@ -0,0 +1,472 @@
+//===-- SIInstrInfo.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+
+class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<4> EncodingType = 0;
+  field bits<1> NeedWait = 0;
+
+  let TSFlags{3-0} = EncodingType;
+  let TSFlags{4} = NeedWait;
+
+}
+
+class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<32> Inst;
+}
+
+class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+}
+
+class GPR4Align <RegisterClass rc> : Operand <vAny> {
+  let EncoderMethod = "GPR4AlignEncode";
+  let MIOperandInfo = (ops rc:$reg); 
+}
+
+class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> {
+  let EncoderMethod = "GPR2AlignEncode";
+  let MIOperandInfo = (ops rc:$reg);
+}
+
+def i32Literal : Operand <i32> {
+  let EncoderMethod = "i32LiteralEncode";
+}
+
+def EXP : Enc64<
+  (outs),
+  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+  [] > {
+
+  bits<4> EN;
+  bits<6> TGT;
+  bits<1> COMPR;
+  bits<1> DONE;
+  bits<1> VM;
+  bits<8> VSRC0;
+  bits<8> VSRC1;
+  bits<8> VSRC2;
+  bits<8> VSRC3;
+
+  let Inst{3-0} = EN;
+  let Inst{9-4} = TGT;
+  let Inst{10} = COMPR;
+  let Inst{11} = DONE;
+  let Inst{12} = VM;
+  let Inst{31-26} = 0x3e;
+  let Inst{39-32} = VSRC0;
+  let Inst{47-40} = VSRC1;
+  let Inst{55-48} = VSRC2;
+  let Inst{63-56} = VSRC3;
+  let EncodingType = 0; //SIInstrEncodingType::EXP
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+}
+
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<4> DMASK;
+  bits<1> UNORM;
+  bits<1> GLC;
+  bits<1> DA;
+  bits<1> R128;
+  bits<1> TFE;
+  bits<1> LWE;
+  bits<1> SLC;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<5> SSAMP; 
+
+  let Inst{11-8} = DMASK;
+  let Inst{12} = UNORM;
+  let Inst{13} = GLC;
+  let Inst{14} = DA;
+  let Inst{15} = R128;
+  let Inst{16} = TFE;
+  let Inst{17} = LWE;
+  let Inst{24-18} = op;
+  let Inst{25} = SLC;
+  let Inst{31-26} = 0x3c;
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{57-53} = SSAMP;
+
+  let EncodingType = 2; //SIInstrEncodingType::MIMG
+
+}
+
+class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<4> DFMT;
+  bits<3> NFMT;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{18-16} = op;
+  let Inst{22-19} = DFMT;
+  let Inst{25-23} = NFMT;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+  let EncodingType = 3; //SIInstrEncodingType::MTBUF
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+  let neverHasSideEffects = 1;
+}
+
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<1> LDS;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{16} = LDS;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+  let EncodingType = 4; //SIInstrEncodingType::MUBUF
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+  let neverHasSideEffects = 1;
+}
+
+class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> OFFSET;
+  bits<6> SBASE;
+  bits<1> IMM = 0; // Determined by subclasses
+  
+  let Inst{7-0} = OFFSET;
+  let Inst{8} = IMM;
+  let Inst{14-9} = SBASE;
+  let Inst{21-15} = SDST;
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+  let EncodingType = 5; //SIInstrEncodingType::SMRD
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+}
+
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = op;
+  let Inst{22-16} = SDST;
+  let Inst{31-23} = 0x17d; //encoding;
+  let EncodingType = 6; //SIInstrEncodingType::SOP1
+}
+
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+  
+  bits<7> SDST;
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = SDST;
+  let Inst{29-23} = op;
+  let Inst{31-30} = 0x2; // encoding
+  let EncodingType = 7; // SIInstrEncodingType::SOP2  
+}
+
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+  Enc32<outs, ins, asm, pattern> {
+
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17e; // encoding
+  let EncodingType = 8; // SIInstrEncodingType::SOPC
+}
+
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits <7> SDST;
+  bits <16> SIMM16;
+  
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = SDST;
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+  let EncodingType = 9; // SIInstrEncodingType::SOPK
+}
+
+class SOPP <bits<7> op, dag ins, string asm> : Enc32 <
+  (outs),
+  ins,
+  asm,
+  [] > {
+
+  bits <16> SIMM16;
+
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17f; // encoding
+  let EncodingType = 10; // SIInstrEncodingType::SOPP
+}
+
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<8> VSRC;
+  bits<2> ATTRCHAN;
+  bits<6> ATTR;
+
+  let Inst{7-0} = VSRC;
+  let Inst{9-8} = ATTRCHAN;
+  let Inst{15-10} = ATTR;
+  let Inst{17-16} = op;
+  let Inst{25-18} = VDST;
+  let Inst{31-26} = 0x32; // encoding
+  let EncodingType = 11; // SIInstrEncodingType::VINTRP
+
+  let Uses = [M0];
+}
+
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = op;
+  let Inst{24-17} = VDST;
+  let Inst{31-25} = 0x3f; //encoding
+  
+  let EncodingType = 12; // SIInstrEncodingType::VOP1
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<8> VSRC1;
+  
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = VDST;
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; //encoding
+  
+  let EncodingType = 13; // SIInstrEncodingType::VOP2
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<9> SRC1;
+  bits<9> SRC2;
+  bits<3> ABS; 
+  bits<1> CLAMP;
+  bits<2> OMOD;
+  bits<3> NEG;
+
+  let Inst{7-0} = VDST;
+  let Inst{10-8} = ABS;
+  let Inst{11} = CLAMP;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = SRC0;
+  let Inst{49-41} = SRC1;
+  let Inst{58-50} = SRC2;
+  let Inst{60-59} = OMOD;
+  let Inst{63-61} = NEG;
+  
+  let EncodingType = 14; // SIInstrEncodingType::VOP3
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+class VOPC <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e;
+  let EncodingType = 15; //SIInstrEncodingType::VOPC
+  let PostEncoderMethod = "VOPPostEncode";
+
+  let Defs = [VCC];
+}
+
+class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
+  op,
+  (outs VReg_128:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr,
+       GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
+  asm,
+  []
+>; 
+
+class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
+       i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayLoad = 1;
+}
+
+class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayLoad = 1;
+}
+
+class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs),
+  (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+   i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
+   GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayStore = 1;
+}
+
+/*XXX: We should be able to infer the imm bit based on the arg types */
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
+
+  def _SGPR : SMRD <
+              op,
+              (outs dstClass:$dst),
+              (ins SReg_32:$offset, GPR2Align<SReg_64,i64>:$sbase),
+              asm,
+              []
+  > {
+    let IMM = 0;
+  }
+
+  def _IMM : SMRD <
+              op,
+              (outs dstClass:$dst),
+              (ins i32imm:$offset, GPR2Align<SReg_64,i64>:$sbase),
+              asm,
+              []
+  > {
+    let IMM = 1;
+  }
+}
+
+class SIOperand <ValueType vt, dag opInfo>: Operand <vt> {
+  let EncoderMethod = "encodeOperand";
+  let MIOperandInfo = opInfo;
+}
+
+def IMM8bit : ImmLeaf <
+  i32,
+  [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}]
+>;
+
+def IMM12bit : ImmLeaf <
+  i16,
+  [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}]
+>;
+
+include "SIInstrFormats.td"
+
+def LOAD_CONST : AMDGPUShaderInst <
+  (outs GPRF32:$dst),
+  (ins i32imm:$src),
+  "LOAD_CONST $dst, $src",
+  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+>;
+
+include "SIInstructions.td"
diff --git a/src/gallium/drivers/radeon/SIInstructions.td b/src/gallium/drivers/radeon/SIInstructions.td
new file mode 100644 (file)
index 0000000..003d3d0
--- /dev/null
@@ -0,0 +1,962 @@
+//===-- SIInstructions.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+def isSI : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDILDeviceInfo::HD7XXX">;
+
+let Predicates = [isSI] in {
+let Gen = AMDGPUGen.SI in {
+
+def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
+def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
+def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
+def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
+def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
+////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
+////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
+////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
+////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
+//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
+//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
+def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
+def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>;
+def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>;
+def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>;
+def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>;
+def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>;
+def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>;
+////def S_ANDN2_SAVEEXEC_B64 : SOP1_ANDN2 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>;
+////def S_ORN2_SAVEEXEC_B64 : SOP1_ORN2 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>;
+def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>;
+def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>;
+def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>;
+def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>;
+def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>;
+def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>;
+def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>;
+def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>;
+def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
+//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>;
+def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
+def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
+def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
+def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
+def S_CMPK_EQ_I32 : SOPK_32 <0x00000003, "S_CMPK_EQ_I32", []>;
+def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>;
+def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>;
+def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>;
+def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>;
+def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>;
+def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>;
+def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>;
+def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>;
+def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>;
+def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
+def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
+def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
+def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
+def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
+def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
+//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
+//def EXP : EXP_ <0x00000000, "EXP", []>;
+defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>;
+defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>;
+defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>;
+defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>;
+defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>;
+defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>;
+defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>;
+defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>;
+defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>;
+defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>;
+defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>;
+defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>;
+defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>;
+defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>;
+defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>;
+defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>;
+defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>;
+defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>;
+defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>;
+defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>;
+defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>;
+defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>;
+defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>;
+defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>;
+defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>;
+defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>;
+defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>;
+defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>;
+defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>;
+defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>;
+defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>;
+defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>;
+defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>;
+defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>;
+defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>;
+defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>;
+defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>;
+defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>;
+defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>;
+defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>;
+defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>;
+defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>;
+defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>;
+defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>;
+defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>;
+defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>;
+defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>;
+defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>;
+defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>;
+defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>;
+defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>;
+defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>;
+defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>;
+defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>;
+defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>;
+defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>;
+defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>;
+defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>;
+defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>;
+defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>;
+defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>;
+defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>;
+defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>;
+defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>;
+defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>;
+defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>;
+defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>;
+defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>;
+defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>;
+defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>;
+defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>;
+defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>;
+defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>;
+defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>;
+defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>;
+defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>;
+defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>;
+defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>;
+defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>;
+defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>;
+defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>;
+defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>;
+defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>;
+defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>;
+defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>;
+defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>;
+defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>;
+defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>;
+defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>;
+defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>;
+defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>;
+defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>;
+defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>;
+defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>;
+defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>;
+defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>;
+defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>;
+defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>;
+defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>;
+defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>;
+defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>;
+defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>;
+defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>;
+defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>;
+defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>;
+defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>;
+defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>;
+defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>;
+defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>;
+defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>;
+defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>;
+defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>;
+defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>;
+defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>;
+defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>;
+defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>;
+defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>;
+defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>;
+defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>;
+defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>;
+defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>;
+defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>;
+defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>;
+defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>;
+defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>;
+defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>;
+defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>;
+defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>;
+defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>;
+defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>;
+defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>;
+defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>;
+defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>;
+defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>;
+defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>;
+defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>;
+defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>;
+defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>;
+defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>;
+defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>;
+defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>;
+defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>;
+defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>;
+defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>;
+defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>;
+defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>;
+defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>;
+defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>;
+defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>;
+defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>;
+defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>;
+defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>;
+defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>;
+defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>;
+defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>;
+defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>;
+defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>;
+//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
+//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
+//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
+def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
+//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
+//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
+//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
+//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
+//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
+//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
+//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
+//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>;
+//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>;
+//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>;
+//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
+//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
+//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>;
+//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>;
+//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
+//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
+//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
+//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
+//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>;
+//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>;
+//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>;
+//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>;
+//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>;
+//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>;
+//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>;
+//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>;
+//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>;
+//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>;
+//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>;
+//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>;
+//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>;
+//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>;
+//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>;
+//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>;
+//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>;
+//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>;
+//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>;
+//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>;
+//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>;
+//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>;
+//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>;
+//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>;
+//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>;
+//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>;
+//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>;
+//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>;
+//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>;
+//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
+//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
+//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
+//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
+//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
+def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
+//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
+//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
+//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
+
+let mayLoad = 0, neverHasSideEffects = 1 in {
+
+defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>;
+//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>;
+//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>;
+//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>;
+//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>;
+//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>;
+//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>;
+//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>;
+
+} // End mayLoad, neverHasSideEffects
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
+//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>;
+//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
+//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
+//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
+//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>;
+//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>;
+//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
+//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
+//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
+//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>;
+//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
+//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
+//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
+//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>;
+//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>;
+//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>;
+//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>;
+//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>;
+//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>;
+//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>;
+//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>;
+//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>;
+//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>;
+//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>;
+//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
+//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
+//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
+def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">;
+//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
+//def IMAGE_SAMPLE_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_D", 0x00000022>;
+//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
+//def IMAGE_SAMPLE_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_L", 0x00000024>;
+//def IMAGE_SAMPLE_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_B", 0x00000025>;
+//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
+//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
+//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>;
+//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
+//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
+//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
+//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>;
+//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>;
+//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
+//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
+//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
+//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>;
+//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>;
+//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>;
+//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>;
+//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>;
+//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>;
+//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>;
+//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>;
+//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>;
+//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>;
+//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>;
+//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>;
+//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>;
+//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>;
+//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>;
+//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>;
+//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>;
+//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>;
+//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>;
+//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>;
+//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>;
+//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>;
+//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>;
+//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>;
+//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>;
+//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>;
+//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>;
+//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>;
+//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>;
+//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>;
+//def IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>;
+//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>;
+//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>;
+//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>;
+//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>;
+//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>;
+//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>;
+//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>;
+//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>;
+//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>;
+//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>;
+//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>;
+//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>;
+//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>;
+//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>;
+//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>;
+//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>;
+//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
+//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
+//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
+
+let neverHasSideEffects = 1 in {
+defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", [], AMDILInst.MOVE_f32>;
+}  // End neverHasSideEffects
+defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
+//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
+//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
+//defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", []>;
+//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>;
+//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+//defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", []>;
+defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
+////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>;
+//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>;
+//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
+//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
+//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
+//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
+//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
+//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
+//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
+//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
+//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
+//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
+//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
+defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", []>;
+defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>;
+defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>;
+defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", []>;
+defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", []>;
+defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", []>;
+defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
+defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>;
+defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
+defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
+defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", []>;
+defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_LEGACY_F32 : VOP1_32 <
+  0x0000002d, "V_RSQ_LEGACY_F32",
+  [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))]
+>;
+defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
+defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
+defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
+defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
+defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
+defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
+defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
+defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
+defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
+defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>;
+defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>;
+defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>;
+defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>;
+//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>;
+defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>;
+defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>;
+//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>;
+defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>;
+//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>;
+defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
+defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
+defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
+
+def V_INTERP_P1_F32 : VINTRP <
+  0x00000000,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr),
+  "V_INTERP_P1_F32",
+  []
+>;
+
+def V_INTERP_P2_F32 : VINTRP <
+  0x00000001,
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr),
+  "V_INTERP_P2_F32",
+  []> {
+
+  let Constraints = "$src0 = $dst";
+  let DisableEncoding = "$src0";
+
+}
+
+def V_INTERP_MOV_F32 : VINTRP <
+  0x00000002,
+  (outs VReg_32:$dst),
+  (ins i32imm:$attr_chan, i32imm:$attr),
+  "V_INTERP_MOV_F32",
+  []> {
+  let VSRC = 0;
+}
+
+//def V_INTERP_MOV_F32 : VINTRP_32 <0x00000002, "V_INTERP_MOV_F32", []>;
+//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM"> {
+  let SIMM16 = 0;
+  let isTerminator = 1;
+}
+//def S_BRANCH : SOPP_ <0x00000002, "S_BRANCH", []>;
+//def S_CBRANCH_SCC0 : SOPP_SCC0 <0x00000004, "S_CBRANCH_SCC0", []>;
+//def S_CBRANCH_SCC1 : SOPP_SCC1 <0x00000005, "S_CBRANCH_SCC1", []>;
+//def S_CBRANCH_VCCZ : SOPP_ <0x00000006, "S_CBRANCH_VCCZ", []>;
+//def S_CBRANCH_VCCNZ : SOPP_ <0x00000007, "S_CBRANCH_VCCNZ", []>;
+//def S_CBRANCH_EXECZ : SOPP_ <0x00000008, "S_CBRANCH_EXECZ", []>;
+//def S_CBRANCH_EXECNZ : SOPP_ <0x00000009, "S_CBRANCH_EXECNZ", []>;
+//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
+def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16">;
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>;
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+
+/* XXX: No VOP3 version of this instruction yet */
+def V_CNDMASK_B32 : VOP2_Helper <
+  0x00000000, VReg_32, AllReg_32, "V_CNDMASK_B32", []> {
+  let VDST = 0;
+  let Uses = [VCC];
+}
+defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
+defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
+
+defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", [], AMDILInst.ADD_f32>;
+
+defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>;
+defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>;
+defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>;
+defm V_MUL_LEGACY_F32 : VOP2_32 <
+  0x00000007, "V_MUL_LEGACY_F32",
+  [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))]
+>;
+defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", []>;
+//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
+//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
+//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
+//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
+defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", []>;
+
+defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", [],
+                                 AMDILInst.MAX_f32>;
+defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
+defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
+defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>;
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
+defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
+defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>;
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>;
+defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>;
+defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
+defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
+defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
+//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
+//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", []>;
+defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", []>;
+defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>;
+defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>;
+defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>;
+defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>;
+defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>;
+////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>;
+////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>;
+////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>;
+////def V_CVT_PKRTZ_F16_F32 : VOP2_F16 <0x0000002f, "V_CVT_PKRTZ_F16_F32", []>;
+////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
+////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+let neverHasSideEffects = 1 in {
+
+def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
+def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
+//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
+//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
+
+} // End neverHasSideEffects
+def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
+def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
+def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
+def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
+def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
+def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
+//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
+def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
+def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
+////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
+////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
+////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
+////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>;
+////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>;
+////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>;
+////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>;
+////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>;
+////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>;
+//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
+//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
+//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
+def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
+////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
+def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
+def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
+def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
+def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
+def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
+def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
+def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
+def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
+def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
+def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
+def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
+def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
+def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
+def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
+def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
+//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
+//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
+//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
+def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>;
+def S_CSELECT_B32 : SOP2_32 <0x0000000a, "S_CSELECT_B32", []>;
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>;
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", []>;
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>;
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>;
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
+////def S_ANDN2_B32 : SOP2_ANDN2 <0x00000014, "S_ANDN2_B32", []>;
+////def S_ANDN2_B64 : SOP2_ANDN2 <0x00000015, "S_ANDN2_B64", []>;
+////def S_ORN2_B32 : SOP2_ORN2 <0x00000016, "S_ORN2_B32", []>;
+////def S_ORN2_B64 : SOP2_ORN2 <0x00000017, "S_ORN2_B64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
+def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
+def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
+def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+def V_MOV_IMM : VOP1 <
+  0x1,
+  (outs VReg_32:$dst),
+  (ins f32imm:$src0),
+  "V_MOV_IMM",
+   []
+>;
+
+def S_MOV_IMM_I32 : SOP1 <
+  0x3,
+  (outs SReg_32:$dst),
+  (ins i32Literal:$src0),
+  "S_MOV_IMM",
+  [] > {
+  let neverHasSideEffects = 1;
+}
+
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+def SET_M0 : InstSI <
+  (outs SReg_32:$dst),
+  (ins i32imm:$src0),
+  "SET_M0",
+  [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
+>;
+
+def CONFIG_WRITE : InstSI <
+  (outs i32imm:$reg),
+  (ins i32imm:$val),
+  "CONFIG_WRITE $reg, $val",
+  [] > {
+  field bits<32> Inst = 0;
+}
+
+let usesCustomInserter = 1 in {
+
+def SI_V_CNDLT : InstSI <
+       (outs VReg_32:$dst),
+       (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
+       "SI_V_CNDLT $dst, $src0, $src1, $src2",
+       [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))]
+>;
+
+def SI_INTERP : InstSI <
+  (outs VReg_32:$dst),
+  (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
+  "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
+  []
+>;
+
+def SI_INTERP_CONST : InstSI <
+  (outs VReg_32:$dst),
+  (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
+  "SI_INTERP_CONST $dst, $attr_chan, $attr, $params",
+  [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan,
+                                                 imm:$attr, SReg_32:$params))]
+>;
+
+
+def USE_SGPR_32 : InstSI <
+  (outs SReg_32:$dst),
+  (ins i32imm:$src0),
+  "USE_SGPR_32",
+  [(set SReg_32:$dst, (int_SI_use_sgpr imm:$src0))]
+
+> {
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+def USE_SGPR_64 : InstSI <
+  (outs SReg_64:$dst),
+  (ins i32imm:$src0),
+  "USE_SGPR_64",
+  [(set SReg_64:$dst, (int_SI_use_sgpr imm:$src0))]
+
+> {
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+def VS_LOAD_BUFFER_INDEX : InstSI <
+  (outs VReg_32:$dst),
+  (ins),
+  "VS_LOAD_BUFFER_INDEX",
+  [(set VReg_32:$dst, (int_SI_vs_load_buffer_index))]> {
+
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+} // end usesCustomInserter
+
+} // end isCodeGenOnly, isPseudo
+
+} // end Gen = AMDGPUGen.SI
+
+/* int_SI_vs_load_input */
+def : Pat<
+  (int_SI_vs_load_input SReg_64:$tlst_sgpr, IMM8bit:$t_offset, IMM12bit:$attr_offset,
+                        VReg_32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
+                          VReg_32:$buf_idx_vgpr,
+                          (S_LOAD_DWORDX4_IMM imm:$t_offset, SReg_64:$tlst_sgpr),
+                          0, 0, (i32 SREG_LIT_0))
+>;
+
+/* int_SI_load_const */
+
+def : Pat <
+  (int_SI_load_const SReg_64:$const_ptr, IMM8bit:$offset),
+  (S_LOAD_DWORD_IMM imm:$offset, SReg_64:$const_ptr)
+>;
+
+
+/* XXX: Complete this pattern with some form of a scalar move immediate */
+/*
+def : Pat <
+  (int_SI_load_const SReg_64:$const_ptr, imm:$offset),
+  (S_LOAD_DWORD_SGPR imm:$offset, SReg_64:$const_ptr)
+>;
+*/
+
+/* int_SI_export */
+def : Pat <
+  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
+                 VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
+>;
+
+/* int_SI_sample */
+def : Pat <
+  (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_64:$rsrc, imm:$rsrc_offset,
+                 SReg_64:$sampler, imm:$sampler_offset),
+  (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
+                (S_LOAD_DWORDX8_IMM imm:$rsrc_offset, SReg_64:$rsrc), /* Resource */
+                (S_LOAD_DWORDX4_IMM imm:$sampler_offset, SReg_64:$sampler)) /* Sampler */
+>;
+
+
+/* Extract element pattern */
+class Extract_Element <ValueType sub_type, ValueType vec_type,
+                     RegisterClass vec_class, int sub_idx, 
+                     SubRegIndex sub_reg>: Pat<
+  (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
+  (EXTRACT_SUBREG vec_class:$src, sub_reg)
+>;
+
+def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>;
+def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>;
+def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>;
+def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>;
+
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      RegisterClass elem_class, RegisterClass vec_class,
+                      int sub_idx, SubRegIndex sub_reg> : Pat <
+
+  (vec_type (vector_insert (vec_type vec_class:$vec),
+                           (elem_type elem_class:$elem), sub_idx)),
+  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
+>;
+
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>;
+
+/*
+def : Pat<
+  (int_SI_vs_load_buffer_index),
+  (COPY_TO_REGCLASS (f32 VGPR0), VReg_32)
+>;
+*/
+
+/********** ===================== **********/
+/********** Interpolation Patterns **********/
+/********** ===================== **********/
+
+def : Pat <
+  (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
+  (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
+             imm:$attr, SReg_32:$params)
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+/* llvm.AMDGPU.pow */
+/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
+
+} // End isSI predicate
diff --git a/src/gallium/drivers/radeon/SIIntrinsics.td b/src/gallium/drivers/radeon/SIIntrinsics.td
new file mode 100644 (file)
index 0000000..e3014e1
--- /dev/null
@@ -0,0 +1,34 @@
+//===-- SIIntrinsics.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+
+  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  /* XXX: We may need a separate intrinsic here for loading integer values */
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+  def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], []>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], []> ;
+
+  def int_SI_sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>;
+  def int_SI_use_sgpr : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], [IntrNoMem]>;
+
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
+  class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+  def int_SI_fs_interp_linear_center : Interp;
+  def int_SI_fs_interp_constant : Interp;
+}
diff --git a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp b/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp
new file mode 100644 (file)
index 0000000..5d49d88
--- /dev/null
@@ -0,0 +1,90 @@
+//===-- SILowerShaderInstructions.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPULowerShaderInstructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SILowerShaderInstructionsPass : public MachineFunctionPass,
+      public AMDGPULowerShaderInstructionsPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SILowerShaderInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "SI Lower Shader Instructions"; }
+
+    void lowerRETURN(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
+    void lowerSET_M0(MachineInstr &MI, MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I);
+  };
+} /* End anonymous namespace */
+
+char SILowerShaderInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createSILowerShaderInstructionsPass(TargetMachine &tm) {
+    return new SILowerShaderInstructionsPass(tm);
+}
+
+bool SILowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+      switch (MI.getOpcode()) {
+      case AMDIL::RETURN:
+        lowerRETURN(MBB, I);
+        break;
+      case AMDIL::SET_M0:
+        lowerSET_M0(MI, MBB, I);
+        break;
+      default: continue;
+      }
+      MI.removeFromParent();
+    }
+  }
+
+  return false;
+}
+
+void SILowerShaderInstructionsPass::lowerRETURN(MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I)
+{
+  const TargetInstrInfo * TII = TM.getInstrInfo();
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_ENDPGM));
+}
+
+void SILowerShaderInstructionsPass::lowerSET_M0(MachineInstr &MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
+{
+  const TargetInstrInfo * TII = TM.getInstrInfo();
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_MOV_IMM_I32))
+          .addReg(AMDIL::M0)
+          .addOperand(MI.getOperand(1));
+}
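A note on the instruction-removal loop in runOnMachineFunction above: the next iterator must be captured before the current instruction is deleted, since deleting it invalidates the iterator that points at it. A minimal standalone sketch of the same pattern over a plain `std::list` (the helper is illustrative, not part of the driver):

```cpp
#include <cassert>
#include <list>

// Remove every even element, advancing via a saved "next" iterator so the
// erase never invalidates the iterator we continue from -- the same pattern
// the lowering pass uses when deleting matched instructions.
inline int erase_evens(std::list<int> &l) {
    int erased = 0;
    for (std::list<int>::iterator i = l.begin(); i != l.end(); ) {
        std::list<int>::iterator next = i;
        ++next;                       // capture the successor before any erase
        if (*i % 2 == 0) {
            l.erase(i);               // only 'i' is invalidated, not 'next'
            ++erased;
        }
        i = next;                     // resume from the saved successor
    }
    return erased;
}
```

The same discipline applies whether the container is a `std::list` or a `MachineBasicBlock`: compute the successor first, mutate second.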
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
new file mode 100644 (file)
index 0000000..a69353a
--- /dev/null
@@ -0,0 +1,62 @@
+//===-- SIMachineFunctionInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIMachineFunctionInfo.h"
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+
+SIMachineFunctionInfo::SIMachineFunctionInfo()
+  : AMDILMachineFunctionInfo(),
+    spi_ps_input_addr(0)
+  { }
+
+SIMachineFunctionInfo::SIMachineFunctionInfo(MachineFunction &MF)
+  : AMDILMachineFunctionInfo(MF),
+    spi_ps_input_addr(0)
+  { }
+
+
+namespace {
+  class SIInitMachineFunctionInfoPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIInitMachineFunctionInfoPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+  };
+} // End anonymous namespace
+
+char SIInitMachineFunctionInfoPass::ID = 0;
+
+FunctionPass *llvm::createSIInitMachineFunctionInfoPass(TargetMachine &tm) {
+  return new SIInitMachineFunctionInfoPass(tm);
+}
+
+/* A MachineFunction's MachineFunctionInfo is initialized in the first call to
+ * getInfo().  We need to initialize it as an SIMachineFunctionInfo object
+ * before any of the AMDIL passes run, otherwise it will be an
+ * AMDILMachineFunctionInfo object and we won't be able to use it.
+ */
+bool SIInitMachineFunctionInfoPass::runOnMachineFunction(MachineFunction &MF)
+{
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  (void)MFI; /* Only the side effect of creating the SI subclass matters. */
+  return false;
+}
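The comment above explains why pass ordering matters here: MachineFunction::getInfo() constructs its info object lazily on the first call, and that first call fixes the concrete type for the rest of compilation. A simplified sketch of that first-call-wins behavior (the class names and cache are stand-ins, not the LLVM implementation):

```cpp
#include <cassert>
#include <cstring>

// Stand-ins for AMDILMachineFunctionInfo and SIMachineFunctionInfo.
struct BaseInfo {
    virtual ~BaseInfo() {}
    virtual const char *name() const { return "AMDILMachineFunctionInfo"; }
};
struct SIInfo : BaseInfo {
    const char *name() const { return "SIMachineFunctionInfo"; }
};

// First-call-wins cache, analogous to MachineFunction::getInfo<T>(): the
// first caller decides the dynamic type; later calls only return the cached
// object.  That is why the init pass above must run before any AMDIL pass.
struct InfoCache {
    BaseInfo *info;
    InfoCache() : info(0) {}
    ~InfoCache() { delete info; }
    template <typename T> T *getInfo() {
        if (!info)
            info = new T();           // constructed exactly once, here
        return static_cast<T *>(info);
    }
};
```

If the base type were requested first, a later `getInfo<SIInfo>()` would cast the base object to the derived type, which is exactly the failure mode the comment warns about.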
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
new file mode 100644 (file)
index 0000000..66feee9
--- /dev/null
@@ -0,0 +1,36 @@
+//===-- SIMachineFunctionInfo.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef _SIMACHINEFUNCTIONINFO_H_
+#define _SIMACHINEFUNCTIONINFO_H_
+
+#include "AMDILMachineFunctionInfo.h"
+
+namespace llvm {
+
+class SIMachineFunctionInfo : public AMDILMachineFunctionInfo {
+
+  private:
+
+  public:
+    SIMachineFunctionInfo();
+    SIMachineFunctionInfo(MachineFunction &MF);
+    unsigned spi_ps_input_addr;
+
+};
+
+} // End namespace llvm
+
+
+#endif //_SIMACHINEFUNCTIONINFO_H_
diff --git a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
new file mode 100644 (file)
index 0000000..e9b51b0
--- /dev/null
@@ -0,0 +1,71 @@
+//===-- SIPropagateImmReads.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class SIPropagateImmReadsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIPropagateImmReadsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+  };
+} /* End anonymous namespace */
+
+char SIPropagateImmReadsPass::ID = 0;
+
+FunctionPass *llvm::createSIPropagateImmReadsPass(TargetMachine &tm) {
+  return new SIPropagateImmReadsPass(tm);
+}
+
+bool SIPropagateImmReadsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  AMDILMachineFunctionInfo * MFI = MF.getInfo<AMDILMachineFunctionInfo>();
+  const SIInstrInfo * TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+
+      switch (MI.getOpcode()) {
+      case AMDIL::LOADCONST_f32:
+      case AMDIL::LOADCONST_i32:
+        break;
+      default:
+        continue;
+      }
+
+      /* XXX: Create and use S_MOV_IMM for SREGs */
+      BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::V_MOV_IMM))
+          .addOperand(MI.getOperand(0))
+          .addOperand(MI.getOperand(1));
+
+      MI.eraseFromParent();
+    }
+  }
+
+  return false;
+}
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.cpp b/src/gallium/drivers/radeon/SIRegisterInfo.cpp
new file mode 100644 (file)
index 0000000..da2ec36
--- /dev/null
@@ -0,0 +1,66 @@
+//===-- SIRegisterInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "SIRegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPUUtil.h"
+
+using namespace llvm;
+
+SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPURegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
+
+BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+unsigned SIRegisterInfo::getBinaryCode(unsigned reg) const
+{
+  switch (reg) {
+    case AMDIL::M0: return 124;
+    case AMDIL::SREG_LIT_0: return 128;
+    default: return getHWRegNum(reg);
+  }
+}
+
+bool SIRegisterInfo::isBaseRegClass(unsigned regClassID) const
+{
+  switch (regClassID) {
+  default: return true;
+  case AMDIL::AllReg_32RegClassID:
+  case AMDIL::AllReg_64RegClassID:
+    return false;
+  }
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const
+{
+  switch (rc->getID()) {
+  case AMDIL::GPRF32RegClassID:
+    return &AMDIL::VReg_32RegClass;
+  case AMDIL::GPRV4F32RegClassID:
+  case AMDIL::GPRV4I32RegClassID:
+    return &AMDIL::VReg_128RegClass;
+  default: return rc;
+  }
+}
+
+#include "SIRegisterGetHWRegNum.inc"
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.h b/src/gallium/drivers/radeon/SIRegisterInfo.h
new file mode 100644 (file)
index 0000000..c797e3c
--- /dev/null
@@ -0,0 +1,46 @@
+//===-- SIRegisterInfo.h - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIREGISTERINFO_H_
+#define SIREGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class TargetInstrInfo;
+
+  struct SIRegisterInfo : public AMDGPURegisterInfo
+  {
+    AMDGPUTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+    virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+    virtual unsigned getBinaryCode(unsigned reg) const;
+
+    virtual bool isBaseRegClass(unsigned regClassID) const;
+
+    virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass * rc) const;
+
+    unsigned getHWRegNum(unsigned reg) const;
+
+  };
+
+} // End namespace llvm
+
+#endif // SIREGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/SISchedule.td b/src/gallium/drivers/radeon/SISchedule.td
new file mode 100644 (file)
index 0000000..9e99268
--- /dev/null
@@ -0,0 +1,15 @@
+//===-- SISchedule.td - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+def SI_Itin : ProcessorItineraries <[], [], []>;
diff --git a/src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp b/src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp
new file mode 100644 (file)
index 0000000..5dee0cb
--- /dev/null
@@ -0,0 +1,32 @@
+//===-- TargetInfo/AMDILTargetInfo.cpp - TODO: Add brief description -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDIL.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// The target for the AMDIL backend
+Target llvm::TheAMDILTarget;
+
+/// The target for the AMDGPU backend
+Target llvm::TheAMDGPUTarget;
+
+/// Extern function to initialize the targets for the AMDIL backend
+extern "C" void LLVMInitializeAMDILTargetInfo() {
+  RegisterTarget<Triple::amdil, false>
+    IL(TheAMDILTarget, "amdil", "ATI graphics cards");
+
+  RegisterTarget<Triple::r600, false>
+    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+}
diff --git a/src/gallium/drivers/radeon/loader.cpp b/src/gallium/drivers/radeon/loader.cpp
new file mode 100644 (file)
index 0000000..5b46cad
--- /dev/null
@@ -0,0 +1,34 @@
+
+#include "radeon_llvm.h"
+
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/IRReader.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <stdio.h>
+
+#include <llvm-c/Core.h>
+
+using namespace llvm;
+
+static cl::opt<std::string>
+InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-"));
+
+
+
+int main(int argc, char ** argv)
+{
+       unsigned char * bytes;
+       unsigned byte_count;
+
+       std::auto_ptr<Module> M;
+       LLVMContext &Context = getGlobalContext();
+       SMDiagnostic Err;
+       cl::ParseCommandLineOptions(argc, argv, "llvm system compiler\n");
+       M.reset(ParseIRFile(InputFilename, Err, Context));
+
+       Module * mod = M.get();
+       if (!mod) {
+               fprintf(stderr, "Failed to parse the input bitcode\n");
+               return 1;
+       }
+
+       radeon_llvm_compile(wrap(mod), &bytes, &byte_count, "SI", 1);
+
+       return 0;
+}
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
new file mode 100644 (file)
index 0000000..14c9ecb
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2011 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Tom Stellard <thomas.stellard@amd.com>
+ *
+ */
+
+#ifndef LLVM_GPU_H
+#define LLVM_GPU_H
+
+#include <llvm-c/Core.h>
+#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_tgsi.h"
+
+#define RADEON_LLVM_MAX_INPUTS (16 * 4)
+#define RADEON_LLVM_MAX_OUTPUTS (16 * 4)
+#define RADEON_LLVM_MAX_BRANCH_DEPTH 16
+#define RADEON_LLVM_MAX_LOOP_DEPTH 16
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct radeon_llvm_branch {
+       LLVMBasicBlockRef endif_block;
+       LLVMBasicBlockRef if_block;
+       LLVMBasicBlockRef else_block;
+       unsigned has_else;
+};
+
+struct radeon_llvm_loop {
+       LLVMBasicBlockRef loop_block;
+       LLVMBasicBlockRef endloop_block;
+};
+
+struct radeon_llvm_context {
+
+       struct lp_build_tgsi_soa_context soa;
+
+       /*=== Front end configuration ===*/
+
+       /* Special Intrinsics */
+
+       /** Write to an output register: float store_output(float, i32) */
+       const char * store_output_intr;
+
+       /** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32)
+        * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value
+        * in 2 bits per channel:
+        * Swizzle{0-1} = X Channel
+        * Swizzle{2-3} = Y Channel
+        * Swizzle{4-5} = Z Channel
+        * Swizzle{6-7} = W Channel
+        */
+       const char * swizzle_intr;
+
+       /* Instructions that are not described by any of the TGSI opcodes. */
+
+       /** This function is responsible for initializing the inputs array and will be
+         * called once for each input declared in the TGSI shader.
+         */
+       void (*load_input)(struct radeon_llvm_context *,
+                       unsigned input_index,
+                       const struct tgsi_full_declaration *decl);
+
+
+       /** User data to use with the callbacks */
+       void * userdata;
+
+       /** This array contains the input values for the shader.  Typically these
+         * values will be in the form of a target intrinsic that will inform the
+         * backend how to load the actual inputs to the shader.
+         */
+       LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
+       LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
+       unsigned output_reg_count;
+
+       unsigned reserved_reg_count;
+       /*=== Private Members ===*/
+
+       struct radeon_llvm_branch branch[RADEON_LLVM_MAX_BRANCH_DEPTH];
+       struct radeon_llvm_loop loop[RADEON_LLVM_MAX_LOOP_DEPTH];
+
+       unsigned branch_depth;
+       unsigned loop_depth;
+
+
+       LLVMValueRef main_fn;
+
+       struct gallivm_state gallivm;
+};
+
+unsigned radeon_llvm_compile(
+       LLVMModuleRef M,
+       unsigned char ** bytes,
+       unsigned * byte_count,
+       const char * gpu_family,
+       unsigned dump);
+
+void radeon_llvm_context_init(struct radeon_llvm_context * ctx);
+
+void radeon_llvm_dispose(struct radeon_llvm_context * ctx);
+
+inline static struct radeon_llvm_context * radeon_llvm_context(
+       struct lp_build_tgsi_context * bld_base)
+{
+       return (struct radeon_llvm_context*)bld_base;
+}
+
+unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan);
+
+void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* LLVM_GPU_H */
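The `swizzle_intr` comment in this header documents packing four TGSI channel selectors into one integer, two bits per channel, low bits first. As a sanity check of that layout, a small standalone sketch (`pack_swizzle` and `swizzle_channel` are illustrative helpers, not part of the header):

```cpp
#include <cassert>

// Pack four 2-bit channel selectors (0=X, 1=Y, 2=Z, 3=W) into one word,
// low bits first: bits 0-1 select the X result, bits 2-3 the Y result,
// bits 4-5 the Z result, bits 6-7 the W result.
inline unsigned pack_swizzle(unsigned x, unsigned y, unsigned z, unsigned w) {
    return (x & 3) | ((y & 3) << 2) | ((z & 3) << 4) | ((w & 3) << 6);
}

// Extract the source-channel selector for one destination channel (0..3).
inline unsigned swizzle_channel(unsigned swizzle, unsigned chan) {
    return (swizzle >> (2 * chan)) & 3;
}
```

With this layout the identity swizzle `.xyzw` encodes as 0xE4, the usual convention for 2-bit-per-channel swizzle fields.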
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
new file mode 100644 (file)
index 0000000..1bc6a15
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2011 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Tom Stellard <thomas.stellard@amd.com>
+ *
+ */
+#include "radeon_llvm.h"
+
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+#include <llvm/PassManager.h>
+#include <llvm/ADT/Triple.h>
+#include <llvm/Support/FormattedStream.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/IRReader.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include <llvm/Transforms/Scalar.h>
+
+#include <llvm-c/Target.h>
+
+#include <iostream>
+#include <stdlib.h>
+#include <stdio.h>
+
+using namespace llvm;
+
+#ifndef EXTERNAL_LLVM
+extern "C" {
+
+void LLVMInitializeAMDILTargetMC(void);
+void LLVMInitializeAMDILTarget(void);
+void LLVMInitializeAMDILTargetInfo(void);
+}
+#endif
+
+/**
+ * Compile an LLVM module to machine code.
+ *
+ * @param bytes This function allocates memory for the byte stream, it is the
+ * caller's responsibility to free it.
+ */
+extern "C" unsigned
+radeon_llvm_compile(LLVMModuleRef M, unsigned char ** bytes,
+                 unsigned * byte_count, const char * gpu_family,
+                 unsigned dump) {
+
+#if HAVE_LLVM > 0x0300
+   Triple AMDGPUTriple(sys::getDefaultTargetTriple());
+#else
+   Triple AMDGPUTriple(sys::getHostTriple());
+#endif
+
+
+#ifdef EXTERNAL_LLVM
+   /* XXX: Can we just initialize the AMDGPU target here? */
+   InitializeAllTargets();
+   InitializeAllTargetMCs();
+#else
+   LLVMInitializeAMDILTargetInfo();
+   LLVMInitializeAMDILTarget();
+   LLVMInitializeAMDILTargetMC();
+#endif
+   std::string err;
+   const Target * AMDGPUTarget = TargetRegistry::lookupTarget("r600", err);
+   fprintf(stderr, "%s\n", err.c_str());
+   if(!AMDGPUTarget) {
+      fprintf(stderr, "Can't find target\n");
+      return 1;
+   }
+
+   Triple::ArchType Arch = Triple::getArchTypeForLLVMName("r600");
+   if (Arch == Triple::UnknownArch) {
+      fprintf(stderr, "Unknown Arch\n");
+   }
+   AMDGPUTriple.setArch(Arch);
+
+   Module * mod = unwrap(M);
+#if HAVE_LLVM > 0x0300
+   TargetOptions TO;
+#endif
+
+   std::auto_ptr<TargetMachine> tm(AMDGPUTarget->createTargetMachine(
+                     AMDGPUTriple.getTriple(), gpu_family, "" /* Features */,
+#if HAVE_LLVM > 0x0300
+                     TO, Reloc::Default, CodeModel::Default,
+                     CodeGenOpt::Default
+#else
+                     Reloc::Default, CodeModel::Default
+#endif
+                     ));
+   TargetMachine &AMDGPUTargetMachine = *tm.get();
+   /* XXX: Use TargetMachine.Options in 3.0 */
+   if (dump) {
+      mod->dump();
+   }
+   PassManager PM;
+   PM.add(new TargetData(*AMDGPUTargetMachine.getTargetData()));
+   PM.add(createPromoteMemoryToRegisterPass());
+   AMDGPUTargetMachine.setAsmVerbosityDefault(true);
+
+   std::string CodeString;
+   raw_string_ostream oStream(CodeString);
+   formatted_raw_ostream out(oStream);
+
+   /* Optional extra parameter true / false to disable verify */
+   if (AMDGPUTargetMachine.addPassesToEmitFile(PM, out, TargetMachine::CGFT_AssemblyFile,
+#if HAVE_LLVM <= 0x0300
+                                               CodeGenOpt::Default,
+#endif
+                                               true)){
+      fprintf(stderr, "AddingPasses failed.\n");
+      return 1;
+   }
+   PM.run(*mod);
+
+   out.flush();
+   std::string &data = oStream.str();
+
+   *bytes = (unsigned char*)malloc(data.length() * sizeof(unsigned char));
+   memcpy(*bytes, data.c_str(), data.length() * sizeof(unsigned char));
+   *byte_count = data.length();
+
+   return 0;
+}
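radeon_llvm_emit.cpp keys several code paths on checks like `HAVE_LLVM > 0x0300`. In Mesa this macro encodes the LLVM version as `(major << 8) | minor`, so 0x0300 is LLVM 3.0 and plain integer comparison orders versions correctly. A quick sketch of that encoding (the helper name is made up for illustration):

```cpp
#include <cassert>

// Mesa-style LLVM version code: major in the high byte, minor in the low
// byte, e.g. 3.0 -> 0x0300, 3.1 -> 0x0301.  Ordinary unsigned comparison
// then matches chronological order of releases.
inline unsigned llvm_version_code(unsigned major, unsigned minor) {
    return (major << 8) | minor;
}
```

This is why a single `>` test is enough to separate the 3.1 TargetOptions-based API from the older 3.0 one.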
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
new file mode 100644 (file)
index 0000000..62de9da
--- /dev/null
@@ -0,0 +1,660 @@
+/*
+ * Copyright 2011 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors: Tom Stellard <thomas.stellard@amd.com>
+ *
+ */
+#include "radeon_llvm.h"
+
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_gather.h"
+#include "gallivm/lp_bld_flow.h"
+#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_swizzle.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_math.h"
+#include "util/u_debug.h"
+
+#include <llvm-c/Transforms/Scalar.h>
+
+static struct radeon_llvm_loop * get_current_loop(struct radeon_llvm_context * ctx)
+{
+       return ctx->loop_depth > 0 ? ctx->loop + (ctx->loop_depth - 1) : NULL;
+}
+
+static struct radeon_llvm_branch * get_current_branch(
+       struct radeon_llvm_context * ctx)
+{
+       return ctx->branch_depth > 0 ?
+                       ctx->branch + (ctx->branch_depth - 1) : NULL;
+}
+
+unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan)
+{
+       return (index * 4) + chan;
+}
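radeon_llvm_reg_index_soa() above flattens a (register index, channel) pair into a single scalar slot, four consecutive slots per TGSI register. The same mapping in standalone form, with inverse helpers added for illustration (`soa_reg`/`soa_chan` are not in the source):

```cpp
#include <cassert>

// Flatten a TGSI (register, channel) pair into one scalar SoA slot:
// register N occupies slots 4N..4N+3, one per channel (X, Y, Z, W).
inline unsigned soa_index(unsigned reg, unsigned chan) {
    return reg * 4 + chan;
}

// Inverse mapping, useful when walking the flat inputs[]/outputs[] arrays.
inline unsigned soa_reg(unsigned index)  { return index / 4; }
inline unsigned soa_chan(unsigned index) { return index % 4; }
```

This is the indexing scheme behind `inputs[RADEON_LLVM_MAX_INPUTS]` in radeon_llvm.h, where the bound is 16 registers times 4 channels.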
+
+static void radeon_llvm_fetch_args_2_reverse_soa(
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       assert(emit_data->info->num_src == 2);
+       emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                       1, emit_data->chan);
+       emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                       0, emit_data->chan);
+       emit_data->arg_count = 2;
+       emit_data->dst_type = LLVMTypeOf(emit_data->args[0]);
+}
+
+static LLVMValueRef emit_swizzle(
+       struct lp_build_tgsi_context * bld_base,
+        LLVMValueRef value,
+       unsigned swizzle_x,
+       unsigned swizzle_y,
+       unsigned swizzle_z,
+       unsigned swizzle_w)
+{
+       unsigned char swizzles[4];
+       swizzles[0] = swizzle_x;
+       swizzles[1] = swizzle_y;
+       swizzles[2] = swizzle_z;
+       swizzles[3] = swizzle_w;
+
+
+       return lp_build_swizzle_aos(&bld_base->base, value, swizzles);
+}
+
+static LLVMValueRef
+emit_array_index(
+       struct lp_build_tgsi_soa_context *bld,
+       const struct tgsi_full_src_register *reg,
+       unsigned swizzle)
+{
+       struct gallivm_state * gallivm = bld->bld_base.base.gallivm;
+
+       LLVMValueRef addr = LLVMBuildLoad(gallivm->builder,
+                       bld->addr[reg->Indirect.Index][swizzle], "");
+       LLVMValueRef offset = lp_build_const_int32(gallivm, reg->Register.Index);
+       LLVMValueRef hw_index = LLVMBuildAdd(gallivm->builder, addr, offset, "");
+       LLVMValueRef soa_index = LLVMBuildMul(gallivm->builder, hw_index,
+                       lp_build_const_int32(gallivm, 4), "");
+       LLVMValueRef array_index = LLVMBuildAdd(gallivm->builder, soa_index,
+                       lp_build_const_int32(gallivm, swizzle), "");
+
+       return array_index;
+}
+
+static LLVMValueRef
+emit_fetch_immediate(
+       struct lp_build_tgsi_context *bld_base,
+       const struct tgsi_full_src_register *reg,
+       enum tgsi_opcode_type type,
+       unsigned swizzle)
+{
+       struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+       return bld->immediates[reg->Register.Index][swizzle];
+}
+
+static LLVMValueRef
+emit_fetch_input(
+       struct lp_build_tgsi_context *bld_base,
+       const struct tgsi_full_src_register *reg,
+       enum tgsi_opcode_type type,
+       unsigned swizzle)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       if (swizzle == ~0) {
+               LLVMValueRef values[TGSI_NUM_CHANNELS] = { 0 };
+               unsigned chan;
+               for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+                       values[chan] = ctx->inputs[radeon_llvm_reg_index_soa(
+                                               reg->Register.Index, chan)];
+               }
+               return lp_build_gather_values(bld_base->base.gallivm, values,
+                                               TGSI_NUM_CHANNELS);
+       } else {
+               return ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+       }
+}
+
+static LLVMValueRef
+emit_fetch_temporary(
+       struct lp_build_tgsi_context *bld_base,
+       const struct tgsi_full_src_register *reg,
+       enum tgsi_opcode_type type,
+       unsigned swizzle)
+{
+       struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+       if (reg->Register.Indirect) {
+               LLVMValueRef array_index = emit_array_index(bld, reg, swizzle);
+               LLVMValueRef ptr = LLVMBuildGEP(builder, bld->temps_array, &array_index,
+                                               1, "");
+               return LLVMBuildLoad(builder, ptr, "");
+       } else {
+               LLVMValueRef temp_ptr;
+               temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
+               return LLVMBuildLoad(builder, temp_ptr, "");
+       }
+}
+
+static LLVMValueRef
+emit_fetch_output(
+       struct lp_build_tgsi_context *bld_base,
+       const struct tgsi_full_src_register *reg,
+       enum tgsi_opcode_type type,
+       unsigned swizzle)
+{
+       struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+       LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+        if (reg->Register.Indirect) {
+               LLVMValueRef array_index = emit_array_index(bld, reg, swizzle);
+               LLVMValueRef ptr = LLVMBuildGEP(builder, bld->outputs_array, &array_index,
+                                               1, "");
+               return LLVMBuildLoad(builder, ptr, "");
+       } else {
+               LLVMValueRef temp_ptr;
+               temp_ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle);
+               return LLVMBuildLoad(builder, temp_ptr, "");
+        }
+}
+
+static void emit_declaration(
+       struct lp_build_tgsi_context * bld_base,
+       const struct tgsi_full_declaration *decl)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       switch(decl->Declaration.File) {
+       case TGSI_FILE_ADDRESS:
+       {
+                unsigned idx;
+               for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
+                       unsigned chan;
+                       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+                                ctx->soa.addr[idx][chan] = lp_build_alloca(
+                                       &ctx->gallivm,
+                                       ctx->soa.bld_base.uint_bld.elem_type, "");
+                       }
+               }
+               break;
+       }
+
+       case TGSI_FILE_TEMPORARY:
+               lp_emit_declaration_soa(bld_base, decl);
+               break;
+
+       case TGSI_FILE_INPUT:
+       {
+               unsigned idx;
+               for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
+                       ctx->load_input(ctx, idx, decl);
+               }
+       }
+       break;
+
+       case TGSI_FILE_OUTPUT:
+       {
+               unsigned idx;
+               for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
+                       unsigned chan;
+                       assert(idx < RADEON_LLVM_MAX_OUTPUTS);
+                       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+                               ctx->soa.outputs[idx][chan] = lp_build_alloca(&ctx->gallivm,
+                                       ctx->soa.bld_base.base.elem_type, "");
+                       }
+               }
+
+               ctx->output_reg_count = MAX2(ctx->output_reg_count,
+                                                        decl->Range.Last + 1);
+               break;
+       }
+
+       default:
+               break;
+       }
+}
+
+static void
+emit_store(
+       struct lp_build_tgsi_context * bld_base,
+       const struct tgsi_full_instruction * inst,
+       const struct tgsi_opcode_info * info,
+       LLVMValueRef dst[4])
+{
+       struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
+       struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
+       struct lp_build_context base = bld->bld_base.base;
+       const struct tgsi_full_dst_register *reg = &inst->Dst[0];
+       LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
+       LLVMValueRef temp_ptr;
+       unsigned chan, chan_index;
+       boolean is_vec_store = FALSE;
+       if (dst[0]) {
+               LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0]));
+               is_vec_store = (k == LLVMVectorTypeKind);
+       }
+
+       if (is_vec_store) {
+               LLVMValueRef values[4] = {};
+               TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan) {
+                       LLVMValueRef index = lp_build_const_int32(gallivm, chan);
+			values[chan] = LLVMBuildExtractElement(gallivm->builder,
+							dst[0], index, "");
+               }
+               bld_base->emit_store(bld_base, inst, info, values);
+               return;
+       }
+
+       TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+               LLVMValueRef value = dst[chan_index];
+
+               if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
+                       struct lp_build_emit_data clamp_emit_data;
+
+                       memset(&clamp_emit_data, 0, sizeof(clamp_emit_data));
+                       clamp_emit_data.arg_count = 3;
+                       clamp_emit_data.args[0] = value;
+                       clamp_emit_data.args[2] = base.one;
+
+                       switch(inst->Instruction.Saturate) {
+                       case TGSI_SAT_ZERO_ONE:
+                               clamp_emit_data.args[1] = base.zero;
+                               break;
+                       case TGSI_SAT_MINUS_PLUS_ONE:
+                               clamp_emit_data.args[1] = LLVMConstReal(
+                                               base.elem_type, -1.0f);
+                               break;
+                       default:
+                               assert(0);
+                       }
+                       value = lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP,
+                                               &clamp_emit_data);
+               }
+
+               switch(reg->Register.File) {
+               case TGSI_FILE_OUTPUT:
+                       temp_ptr = bld->outputs[reg->Register.Index][chan_index];
+                       break;
+
+               case TGSI_FILE_TEMPORARY:
+                       temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index);
+                       break;
+
+               default:
+                       return;
+               }
+               LLVMBuildStore(builder, value, temp_ptr);
+       }
+}
+
+static void bgnloop_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       LLVMBasicBlockRef loop_block;
+       LLVMBasicBlockRef endloop_block;
+       endloop_block = LLVMAppendBasicBlockInContext(gallivm->context,
+                                               ctx->main_fn, "ENDLOOP");
+       loop_block = LLVMInsertBasicBlockInContext(gallivm->context,
+                                               endloop_block, "LOOP");
+       LLVMBuildBr(gallivm->builder, loop_block);
+       LLVMPositionBuilderAtEnd(gallivm->builder, loop_block);
+       ctx->loop_depth++;
+       ctx->loop[ctx->loop_depth - 1].loop_block = loop_block;
+       ctx->loop[ctx->loop_depth - 1].endloop_block = endloop_block;
+}
+
+static void brk_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       struct radeon_llvm_loop * current_loop = get_current_loop(ctx);
+
+       LLVMBuildBr(gallivm->builder, current_loop->endloop_block);
+}
+
+static void cont_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       struct radeon_llvm_loop * current_loop = get_current_loop(ctx);
+
+       LLVMBuildBr(gallivm->builder, current_loop->loop_block);
+}
+
+static void else_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       struct radeon_llvm_branch * current_branch = get_current_branch(ctx);
+       LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder);
+
+       /* We need to add a terminator to the current block if the previous
+        * instruction was an ENDIF. Example:
+        * IF
+        *   [code]
+        *   IF
+        *     [code]
+        *   ELSE
+        *    [code]
+        *   ENDIF <--
+        * ELSE <--
+        *   [code]
+        * ENDIF
+        */
+
+       if (current_block != current_branch->if_block) {
+               LLVMBuildBr(gallivm->builder, current_branch->endif_block);
+       }
+       if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) {
+               LLVMBuildBr(gallivm->builder, current_branch->endif_block);
+       }
+       current_branch->has_else = 1;
+       LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block);
+}
+
+static void endif_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       struct radeon_llvm_branch * current_branch = get_current_branch(ctx);
+       LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder);
+
+       /* If we have consecutive ENDIF instructions, then the first ENDIF
+        * will not have a terminator, so we need to add one. */
+       if (current_block != current_branch->if_block
+                       && current_block != current_branch->else_block
+                       && !LLVMGetBasicBlockTerminator(current_block)) {
+
+		LLVMBuildBr(gallivm->builder, current_branch->endif_block);
+       }
+       if (!LLVMGetBasicBlockTerminator(current_branch->else_block)) {
+               LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block);
+               LLVMBuildBr(gallivm->builder, current_branch->endif_block);
+       }
+
+       if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) {
+               LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->if_block);
+               LLVMBuildBr(gallivm->builder, current_branch->endif_block);
+       }
+
+       LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->endif_block);
+       ctx->branch_depth--;
+}
+
+static void endloop_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       struct radeon_llvm_loop * current_loop = get_current_loop(ctx);
+
+       if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(gallivm->builder))) {
+		LLVMBuildBr(gallivm->builder, current_loop->loop_block);
+       }
+
+       LLVMPositionBuilderAtEnd(gallivm->builder, current_loop->endloop_block);
+       ctx->loop_depth--;
+}
+
+static void if_emit(
+       const struct lp_build_tgsi_action * action,
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+       LLVMValueRef cond;
+       LLVMBasicBlockRef if_block, else_block, endif_block;
+       cond = LLVMBuildFCmp(gallivm->builder, LLVMRealOEQ, emit_data->args[0],
+                                                       bld_base->base.one, "");
+
+       endif_block = LLVMAppendBasicBlockInContext(gallivm->context,
+                                               ctx->main_fn, "ENDIF");
+       if_block = LLVMInsertBasicBlockInContext(gallivm->context,
+                                               endif_block, "IF");
+       else_block = LLVMInsertBasicBlockInContext(gallivm->context,
+                                               endif_block, "ELSE");
+       LLVMBuildCondBr(gallivm->builder, cond, if_block, else_block);
+       LLVMPositionBuilderAtEnd(gallivm->builder, if_block);
+
+       ctx->branch_depth++;
+       ctx->branch[ctx->branch_depth - 1].endif_block = endif_block;
+       ctx->branch[ctx->branch_depth - 1].if_block = if_block;
+       ctx->branch[ctx->branch_depth - 1].else_block = else_block;
+       ctx->branch[ctx->branch_depth - 1].has_else = 0;
+}
+
+static void tex_fetch_args(
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       /* XXX: lp_build_swizzle_aos() was failing with wrong arg types,
+        * when we used CHAN_ALL.  We should be able to get this to work,
+        * but for now we will swizzle it ourselves
+       emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                0, CHAN_ALL);
+
+       */
+
+       LLVMValueRef coords[4];
+       unsigned chan;
+       for (chan = 0; chan < 4; chan++) {
+               coords[chan] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, chan);
+       }
+
+       emit_data->arg_count = 1;
+       emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+                                               coords, 4);
+       emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+}
+
+void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
+{
+       struct lp_type type;
+       LLVMTypeRef main_fn_type;
+       LLVMBasicBlockRef main_fn_body;
+
+       /* Initialize the gallivm object:
+        * We are only using the module, context, and builder fields of this struct.
+        * This should be enough for us to be able to pass our gallivm struct to the
+        * helper functions in the gallivm module.
+        */
+       memset(&ctx->gallivm, 0, sizeof (ctx->gallivm));
+       memset(&ctx->soa, 0, sizeof(ctx->soa));
+       ctx->gallivm.context = LLVMContextCreate();
+       ctx->gallivm.module = LLVMModuleCreateWithNameInContext("tgsi",
+                                               ctx->gallivm.context);
+       ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context);
+
+       /* Setup the module */
+       main_fn_type = LLVMFunctionType(LLVMVoidTypeInContext(ctx->gallivm.context),
+                                        NULL, 0, 0);
+       ctx->main_fn = LLVMAddFunction(ctx->gallivm.module, "main", main_fn_type);
+       main_fn_body = LLVMAppendBasicBlockInContext(ctx->gallivm.context,
+                       ctx->main_fn, "main_body");
+	LLVMPositionBuilderAtEnd(ctx->gallivm.builder, main_fn_body);
+
+       ctx->store_output_intr = "llvm.AMDGPU.store.output.";
+       ctx->swizzle_intr = "llvm.AMDGPU.swizzle";
+       struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base;
+
+	/* XXX: We need to revisit this. I think the correct way to do this is
+	 * to use length = 4 here and use the elem_bld for everything. */
+       type.floating = TRUE;
+       type.sign = TRUE;
+       type.width = 32;
+       type.length = 1;
+
+       lp_build_context_init(&bld_base->base, &ctx->gallivm, type);
+       lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type));
+
+       bld_base->soa = 1;
+       bld_base->emit_store = emit_store;
+       bld_base->emit_swizzle = emit_swizzle;
+       bld_base->emit_declaration = emit_declaration;
+       bld_base->emit_immediate = lp_emit_immediate_soa;
+
+       bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
+       bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
+       bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch_temporary;
+       bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch_output;
+
+       /* Allocate outputs */
+       ctx->soa.outputs = ctx->outputs;
+
+       /* XXX: Is there a better way to initialize all this ? */
+
+       lp_set_default_actions(bld_base);
+
+       bld_base->op_actions[TGSI_OPCODE_ABS].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.AMDIL.fabs.";
+       bld_base->op_actions[TGSI_OPCODE_ARL].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_ARL].intr_name = "llvm.AMDGPU.arl";
+       bld_base->op_actions[TGSI_OPCODE_BGNLOOP].emit = bgnloop_emit;
+       bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit;
+       bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit;
+       bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp.";
+       bld_base->op_actions[TGSI_OPCODE_CMP].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt";
+       bld_base->op_actions[TGSI_OPCODE_COS].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.AMDGPU.cos";
+       bld_base->op_actions[TGSI_OPCODE_DDX].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
+       bld_base->op_actions[TGSI_OPCODE_DDY].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
+       bld_base->op_actions[TGSI_OPCODE_DIV].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_DIV].intr_name = "llvm.AMDGPU.div";
+       bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
+       bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit;
+       bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit;
+       bld_base->op_actions[TGSI_OPCODE_EX2].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp.";
+       bld_base->op_actions[TGSI_OPCODE_FLR].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.AMDGPU.floor";
+       bld_base->op_actions[TGSI_OPCODE_FRC].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction.";
+       bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit;
+       bld_base->op_actions[TGSI_OPCODE_KIL].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_KIL].intr_name = "llvm.AMDGPU.kill";
+       bld_base->op_actions[TGSI_OPCODE_KILP].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_KILP].intr_name = "llvm.AMDGPU.kilp";
+       bld_base->op_actions[TGSI_OPCODE_LG2].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.AMDIL.log.";
+       bld_base->op_actions[TGSI_OPCODE_LRP].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_LRP].intr_name = "llvm.AMDGPU.lrp";
+       bld_base->op_actions[TGSI_OPCODE_MIN].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.AMDIL.min.";
+       bld_base->op_actions[TGSI_OPCODE_MAD].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_MAD].intr_name = "llvm.AMDIL.mad.";
+       bld_base->op_actions[TGSI_OPCODE_MAX].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.AMDIL.max.";
+       bld_base->op_actions[TGSI_OPCODE_MUL].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_MUL].intr_name = "llvm.AMDGPU.mul";
+       bld_base->op_actions[TGSI_OPCODE_POW].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.AMDGPU.pow";
+       bld_base->op_actions[TGSI_OPCODE_RCP].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_RCP].intr_name = "llvm.AMDGPU.rcp";
+       bld_base->op_actions[TGSI_OPCODE_SSG].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SSG].intr_name = "llvm.AMDGPU.ssg";
+       bld_base->op_actions[TGSI_OPCODE_SGE].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge.";
+       bld_base->op_actions[TGSI_OPCODE_SEQ].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SEQ].intr_name = "llvm.AMDGPU.seq";
+       bld_base->op_actions[TGSI_OPCODE_SLE].fetch_args = radeon_llvm_fetch_args_2_reverse_soa;
+       bld_base->op_actions[TGSI_OPCODE_SLE].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SLE].intr_name = "llvm.AMDGPU.sge";
+       bld_base->op_actions[TGSI_OPCODE_SLT].fetch_args = radeon_llvm_fetch_args_2_reverse_soa;
+       bld_base->op_actions[TGSI_OPCODE_SLT].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SLT].intr_name = "llvm.AMDGPU.sgt";
+       bld_base->op_actions[TGSI_OPCODE_SNE].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SNE].intr_name = "llvm.AMDGPU.sne";
+       bld_base->op_actions[TGSI_OPCODE_SGT].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SGT].intr_name = "llvm.AMDGPU.sgt";
+       bld_base->op_actions[TGSI_OPCODE_SIN].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.AMDGPU.sin";
+       bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
+       bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
+       bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
+       bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
+       bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = tex_fetch_args;
+       bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
+       bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
+       bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
+       bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
+       bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = lp_build_tgsi_intrinsic;
+       bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
+
+       bld_base->rsq_action.emit = lp_build_tgsi_intrinsic;
+       bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq";
+}
+
+void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx)
+{
+       struct gallivm_state * gallivm = ctx->soa.bld_base.base.gallivm;
+	/* End the main function with a return. */
+       LLVMBuildRetVoid(gallivm->builder);
+
+       /* Create the pass manager */
+       ctx->gallivm.passmgr = LLVMCreateFunctionPassManagerForModule(
+                                                       gallivm->module);
+
+       /* This pass should eliminate all the load and store instructions */
+       LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr);
+
+       /* Add some optimization passes */
+       LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
+       LLVMAddCFGSimplificationPass(gallivm->passmgr);
+
+	/* Run the passes */
+       LLVMRunFunctionPassManager(gallivm->passmgr, ctx->main_fn);
+
+       LLVMDisposeBuilder(gallivm->builder);
+       LLVMDisposePassManager(gallivm->passmgr);
+
+}
+
+void radeon_llvm_dispose(struct radeon_llvm_context * ctx)
+{
+       LLVMDisposeModule(ctx->soa.bld_base.base.gallivm->module);
+       LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context);
+}
diff --git a/src/gallium/drivers/radeonsi/Android.mk b/src/gallium/drivers/radeonsi/Android.mk
new file mode 100644 (file)
index 0000000..f7e01a3
--- /dev/null
@@ -0,0 +1,38 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
+# Copyright (C) 2010-2011 LunarG Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_C_INCLUDES :=
+
+LOCAL_MODULE := libmesa_pipe_radeonsi
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/radeonsi/Makefile b/src/gallium/drivers/radeonsi/Makefile
new file mode 100644 (file)
index 0000000..90f6f47
--- /dev/null
@@ -0,0 +1,24 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = radeonsi
+
+LIBRARY_INCLUDES = \
+       -I$(TOP)/include \
+       -I$(TOP)/src/gallium/drivers/radeon/
+
+
+# get C_SOURCES
+include Makefile.sources
+
+LIBRADEON = $(TOP)/src/gallium/drivers/radeon/libradeon.a
+
+EXTRA_OBJECTS = \
+       $(LIBRADEON)
+
+CFLAGS+=$(RADEON_CFLAGS)
+
+include ../../Makefile.template
+
+# FIXME: Remove when this driver is converted to automake.
+all: default
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
new file mode 100644 (file)
index 0000000..394cfe9
--- /dev/null
@@ -0,0 +1,13 @@
+C_SOURCES := \
+       r600_blit.c \
+       r600_buffer.c \
+       r600_hw_context.c \
+       radeonsi_pipe.c \
+       r600_query.c \
+       r600_resource.c \
+       radeonsi_shader.c \
+       r600_texture.c \
+       evergreen_hw_context.c \
+       evergreen_state.c \
+       r600_translate.c \
+       r600_state_common.c
diff --git a/src/gallium/drivers/radeonsi/SConscript b/src/gallium/drivers/radeonsi/SConscript
new file mode 100644 (file)
index 0000000..f2d2bec
--- /dev/null
@@ -0,0 +1,17 @@
+Import('*')
+
+env = env.Clone()
+
+env.Append(CPPPATH = [
+    '#/include',
+    '#/src/gallium/drivers/radeon',
+])
+
+radeonsi = env.ConvenienceLibrary(
+    target = 'radeonsi',
+    source = env.ParseSourceList('Makefile.sources', 'C_SOURCES')
+    )
+
+env.Alias('radeonsi', radeonsi)
+
+Export('radeonsi')
diff --git a/src/gallium/drivers/radeonsi/evergreen_hw_context.c b/src/gallium/drivers/radeonsi/evergreen_hw_context.c
new file mode 100644 (file)
index 0000000..549673f
--- /dev/null
@@ -0,0 +1,561 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#include "r600.h"
+#include "r600_hw_context_priv.h"
+#include "radeonsi_pipe.h"
+#include "sid.h"
+#include "util/u_memory.h"
+#include <errno.h>
+
+#define GROUP_FORCE_NEW_BLOCK  0
+
+static const struct r600_reg si_config_reg_list[] = {
+       {R_0088B0_VGT_VTX_VECT_EJECT_REG, REG_FLAG_FLUSH_CHANGE},
+       {R_0088C8_VGT_ESGS_RING_SIZE, REG_FLAG_FLUSH_CHANGE},
+       {R_0088CC_VGT_GSVS_RING_SIZE, REG_FLAG_FLUSH_CHANGE},
+       {R_008958_VGT_PRIMITIVE_TYPE, 0},
+       {R_008A14_PA_CL_ENHANCE, REG_FLAG_FLUSH_CHANGE},
+       {R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE},
+       {R_00913C_SPI_CONFIG_CNTL_1, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE},
+};
+
+static const struct r600_reg si_context_reg_list[] = {
+       {R_028000_DB_RENDER_CONTROL, 0},
+       {R_028004_DB_COUNT_CONTROL, 0},
+       {R_028008_DB_DEPTH_VIEW, 0},
+       {R_02800C_DB_RENDER_OVERRIDE, 0},
+       {R_028010_DB_RENDER_OVERRIDE2, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028014_DB_HTILE_DATA_BASE, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028020_DB_DEPTH_BOUNDS_MIN, 0},
+       {R_028024_DB_DEPTH_BOUNDS_MAX, 0},
+       {R_028028_DB_STENCIL_CLEAR, 0},
+       {R_02802C_DB_DEPTH_CLEAR, 0},
+       {R_028030_PA_SC_SCREEN_SCISSOR_TL, 0},
+       {R_028034_PA_SC_SCREEN_SCISSOR_BR, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_02803C_DB_DEPTH_INFO, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028040_DB_Z_INFO, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028044_DB_STENCIL_INFO, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028048_DB_Z_READ_BASE, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_02804C_DB_STENCIL_READ_BASE, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028050_DB_Z_WRITE_BASE, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028054_DB_STENCIL_WRITE_BASE, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028058_DB_DEPTH_SIZE, 0},
+       {R_02805C_DB_DEPTH_SLICE, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028080_TA_BC_BASE_ADDR, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028200_PA_SC_WINDOW_OFFSET, 0},
+       {R_028204_PA_SC_WINDOW_SCISSOR_TL, 0},
+       {R_028208_PA_SC_WINDOW_SCISSOR_BR, 0},
+       {R_02820C_PA_SC_CLIPRECT_RULE, 0},
+       {R_028210_PA_SC_CLIPRECT_0_TL, 0},
+       {R_028214_PA_SC_CLIPRECT_0_BR, 0},
+       {R_028218_PA_SC_CLIPRECT_1_TL, 0},
+       {R_02821C_PA_SC_CLIPRECT_1_BR, 0},
+       {R_028220_PA_SC_CLIPRECT_2_TL, 0},
+       {R_028224_PA_SC_CLIPRECT_2_BR, 0},
+       {R_028228_PA_SC_CLIPRECT_3_TL, 0},
+       {R_02822C_PA_SC_CLIPRECT_3_BR, 0},
+       {R_028230_PA_SC_EDGERULE, 0},
+       {R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0},
+       {R_028238_CB_TARGET_MASK, 0},
+       {R_02823C_CB_SHADER_MASK, 0},
+       {R_028240_PA_SC_GENERIC_SCISSOR_TL, 0},
+       {R_028244_PA_SC_GENERIC_SCISSOR_BR, 0},
+       {R_028250_PA_SC_VPORT_SCISSOR_0_TL, 0},
+       {R_028254_PA_SC_VPORT_SCISSOR_0_BR, 0},
+       {R_0282D0_PA_SC_VPORT_ZMIN_0, 0},
+       {R_0282D4_PA_SC_VPORT_ZMAX_0, 0},
+       {R_028350_PA_SC_RASTER_CONFIG, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028400_VGT_MAX_VTX_INDX, 0},
+       {R_028404_VGT_MIN_VTX_INDX, 0},
+       {R_028408_VGT_INDX_OFFSET, 0},
+       {R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, 0},
+       {R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028414_CB_BLEND_RED, 0},
+       {R_028418_CB_BLEND_GREEN, 0},
+       {R_02841C_CB_BLEND_BLUE, 0},
+       {R_028420_CB_BLEND_ALPHA, 0},
+       {R_028430_DB_STENCILREFMASK, 0},
+       {R_028434_DB_STENCILREFMASK_BF, 0},
+       {R_02843C_PA_CL_VPORT_XSCALE_0, 0},
+       {R_028440_PA_CL_VPORT_XOFFSET_0, 0},
+       {R_028444_PA_CL_VPORT_YSCALE_0, 0},
+       {R_028448_PA_CL_VPORT_YOFFSET_0, 0},
+       {R_02844C_PA_CL_VPORT_ZSCALE_0, 0},
+       {R_028450_PA_CL_VPORT_ZOFFSET_0, 0},
+       {R_0285BC_PA_CL_UCP_0_X, 0},
+       {R_0285C0_PA_CL_UCP_0_Y, 0},
+       {R_0285C4_PA_CL_UCP_0_Z, 0},
+       {R_0285C8_PA_CL_UCP_0_W, 0},
+       {R_0285CC_PA_CL_UCP_1_X, 0},
+       {R_0285D0_PA_CL_UCP_1_Y, 0},
+       {R_0285D4_PA_CL_UCP_1_Z, 0},
+       {R_0285D8_PA_CL_UCP_1_W, 0},
+       {R_0285DC_PA_CL_UCP_2_X, 0},
+       {R_0285E0_PA_CL_UCP_2_Y, 0},
+       {R_0285E4_PA_CL_UCP_2_Z, 0},
+       {R_0285E8_PA_CL_UCP_2_W, 0},
+       {R_0285EC_PA_CL_UCP_3_X, 0},
+       {R_0285F0_PA_CL_UCP_3_Y, 0},
+       {R_0285F4_PA_CL_UCP_3_Z, 0},
+       {R_0285F8_PA_CL_UCP_3_W, 0},
+       {R_0285FC_PA_CL_UCP_4_X, 0},
+       {R_028600_PA_CL_UCP_4_Y, 0},
+       {R_028604_PA_CL_UCP_4_Z, 0},
+       {R_028608_PA_CL_UCP_4_W, 0},
+       {R_02860C_PA_CL_UCP_5_X, 0},
+       {R_028610_PA_CL_UCP_5_Y, 0},
+       {R_028614_PA_CL_UCP_5_Z, 0},
+       {R_028618_PA_CL_UCP_5_W, 0},
+       {R_028644_SPI_PS_INPUT_CNTL_0, 0},
+       {R_028648_SPI_PS_INPUT_CNTL_1, 0},
+       {R_02864C_SPI_PS_INPUT_CNTL_2, 0},
+       {R_028650_SPI_PS_INPUT_CNTL_3, 0},
+       {R_028654_SPI_PS_INPUT_CNTL_4, 0},
+       {R_028658_SPI_PS_INPUT_CNTL_5, 0},
+       {R_02865C_SPI_PS_INPUT_CNTL_6, 0},
+       {R_028660_SPI_PS_INPUT_CNTL_7, 0},
+       {R_028664_SPI_PS_INPUT_CNTL_8, 0},
+       {R_028668_SPI_PS_INPUT_CNTL_9, 0},
+       {R_02866C_SPI_PS_INPUT_CNTL_10, 0},
+       {R_028670_SPI_PS_INPUT_CNTL_11, 0},
+       {R_028674_SPI_PS_INPUT_CNTL_12, 0},
+       {R_028678_SPI_PS_INPUT_CNTL_13, 0},
+       {R_02867C_SPI_PS_INPUT_CNTL_14, 0},
+       {R_028680_SPI_PS_INPUT_CNTL_15, 0},
+       {R_028684_SPI_PS_INPUT_CNTL_16, 0},
+       {R_028688_SPI_PS_INPUT_CNTL_17, 0},
+       {R_02868C_SPI_PS_INPUT_CNTL_18, 0},
+       {R_028690_SPI_PS_INPUT_CNTL_19, 0},
+       {R_028694_SPI_PS_INPUT_CNTL_20, 0},
+       {R_028698_SPI_PS_INPUT_CNTL_21, 0},
+       {R_02869C_SPI_PS_INPUT_CNTL_22, 0},
+       {R_0286A0_SPI_PS_INPUT_CNTL_23, 0},
+       {R_0286A4_SPI_PS_INPUT_CNTL_24, 0},
+       {R_0286A8_SPI_PS_INPUT_CNTL_25, 0},
+       {R_0286AC_SPI_PS_INPUT_CNTL_26, 0},
+       {R_0286B0_SPI_PS_INPUT_CNTL_27, 0},
+       {R_0286B4_SPI_PS_INPUT_CNTL_28, 0},
+       {R_0286B8_SPI_PS_INPUT_CNTL_29, 0},
+       {R_0286BC_SPI_PS_INPUT_CNTL_30, 0},
+       {R_0286C0_SPI_PS_INPUT_CNTL_31, 0},
+       {R_0286C4_SPI_VS_OUT_CONFIG, 0},
+       {R_0286CC_SPI_PS_INPUT_ENA, 0},
+       {R_0286D0_SPI_PS_INPUT_ADDR, 0},
+       {R_0286D4_SPI_INTERP_CONTROL_0, 0},
+       {R_0286D8_SPI_PS_IN_CONTROL, 0},
+       {R_0286E0_SPI_BARYC_CNTL, 0},
+       {R_02870C_SPI_SHADER_POS_FORMAT, 0},
+       {R_028710_SPI_SHADER_Z_FORMAT, 0},
+       {R_028714_SPI_SHADER_COL_FORMAT, 0},
+       {R_028780_CB_BLEND0_CONTROL, 0},
+       {R_028784_CB_BLEND1_CONTROL, 0},
+       {R_028788_CB_BLEND2_CONTROL, 0},
+       {R_02878C_CB_BLEND3_CONTROL, 0},
+       {R_028790_CB_BLEND4_CONTROL, 0},
+       {R_028794_CB_BLEND5_CONTROL, 0},
+       {R_028798_CB_BLEND6_CONTROL, 0},
+       {R_02879C_CB_BLEND7_CONTROL, 0},
+       {R_0287D4_PA_CL_POINT_X_RAD, 0},
+       {R_0287D8_PA_CL_POINT_Y_RAD, 0},
+       {R_0287DC_PA_CL_POINT_SIZE, 0},
+       {R_0287E0_PA_CL_POINT_CULL_RAD, 0},
+       {R_028800_DB_DEPTH_CONTROL, 0},
+       {R_028804_DB_EQAA, 0},
+       {R_028808_CB_COLOR_CONTROL, 0},
+       {R_02880C_DB_SHADER_CONTROL, 0},
+       {R_028810_PA_CL_CLIP_CNTL, 0},
+       {R_028814_PA_SU_SC_MODE_CNTL, 0},
+       {R_028818_PA_CL_VTE_CNTL, 0},
+       {R_02881C_PA_CL_VS_OUT_CNTL, 0},
+       {R_028820_PA_CL_NANINF_CNTL, 0},
+       {R_028824_PA_SU_LINE_STIPPLE_CNTL, 0},
+       {R_028828_PA_SU_LINE_STIPPLE_SCALE, 0},
+       {R_02882C_PA_SU_PRIM_FILTER_CNTL, 0},
+       {R_028A00_PA_SU_POINT_SIZE, 0},
+       {R_028A04_PA_SU_POINT_MINMAX, 0},
+       {R_028A08_PA_SU_LINE_CNTL, 0},
+       {R_028A0C_PA_SC_LINE_STIPPLE, 0},
+       {R_028A10_VGT_OUTPUT_PATH_CNTL, 0},
+       {R_028A14_VGT_HOS_CNTL, 0},
+       {R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0},
+       {R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0},
+       {R_028A20_VGT_HOS_REUSE_DEPTH, 0},
+       {R_028A24_VGT_GROUP_PRIM_TYPE, 0},
+       {R_028A28_VGT_GROUP_FIRST_DECR, 0},
+       {R_028A2C_VGT_GROUP_DECR, 0},
+       {R_028A30_VGT_GROUP_VECT_0_CNTL, 0},
+       {R_028A34_VGT_GROUP_VECT_1_CNTL, 0},
+       {R_028A38_VGT_GROUP_VECT_0_FMT_CNTL, 0},
+       {R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL, 0},
+       {R_028A40_VGT_GS_MODE, 0},
+       {R_028A48_PA_SC_MODE_CNTL_0, 0},
+       {R_028A4C_PA_SC_MODE_CNTL_1, 0},
+       {R_028A50_VGT_ENHANCE, 0},
+       {R_028A54_VGT_GS_PER_ES, 0},
+       {R_028A58_VGT_ES_PER_GS, 0},
+       {R_028A5C_VGT_GS_PER_VS, 0},
+       {R_028A60_VGT_GSVS_RING_OFFSET_1, 0},
+       {R_028A64_VGT_GSVS_RING_OFFSET_2, 0},
+       {R_028A68_VGT_GSVS_RING_OFFSET_3, 0},
+       {R_028A6C_VGT_GS_OUT_PRIM_TYPE, 0},
+       {R_028A70_IA_ENHANCE, 0},
+       {R_028A84_VGT_PRIMITIVEID_EN, 0},
+       {R_028A8C_VGT_PRIMITIVEID_RESET, 0},
+       {R_028AA0_VGT_INSTANCE_STEP_RATE_0, 0},
+       {R_028AA4_VGT_INSTANCE_STEP_RATE_1, 0},
+       {R_028AA8_IA_MULTI_VGT_PARAM, 0},
+       {R_028AAC_VGT_ESGS_RING_ITEMSIZE, 0},
+       {R_028AB0_VGT_GSVS_RING_ITEMSIZE, 0},
+       {R_028AB4_VGT_REUSE_OFF, 0},
+       {R_028AB8_VGT_VTX_CNT_EN, 0},
+       {R_028ABC_DB_HTILE_SURFACE, 0},
+       {R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0},
+       {R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0},
+       {R_028AC8_DB_PRELOAD_CONTROL, 0},
+       {R_028B54_VGT_SHADER_STAGES_EN, 0},
+       {R_028B70_DB_ALPHA_TO_MASK, 0},
+       {R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, 0},
+       {R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 0},
+       {R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, 0},
+       {R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, 0},
+       {R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, 0},
+       {R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, 0},
+       {R_028B94_VGT_STRMOUT_CONFIG, 0},
+       {R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0},
+       {R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0},
+       {R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0},
+       {R_028BDC_PA_SC_LINE_CNTL, 0},
+       {R_028BE0_PA_SC_AA_CONFIG, 0},
+       {R_028BE4_PA_SU_VTX_CNTL, 0},
+       {R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 0},
+       {R_028BEC_PA_CL_GB_VERT_DISC_ADJ, 0},
+       {R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, 0},
+       {R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, 0},
+       {R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0},
+       {R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, 0},
+       {R_028C00_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2, 0},
+       {R_028C04_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3, 0},
+       {R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0},
+       {R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, 0},
+       {R_028C10_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2, 0},
+       {R_028C14_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3, 0},
+       {R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0},
+       {R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, 0},
+       {R_028C20_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2, 0},
+       {R_028C24_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3, 0},
+       {R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0},
+       {R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, 0},
+       {R_028C30_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2, 0},
+       {R_028C34_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3, 0},
+       {R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 0},
+       {R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028C60_CB_COLOR0_BASE, REG_FLAG_NEED_BO},
+       {R_028C64_CB_COLOR0_PITCH, 0},
+       {R_028C68_CB_COLOR0_SLICE, 0},
+       {R_028C6C_CB_COLOR0_VIEW, 0},
+       {R_028C70_CB_COLOR0_INFO, REG_FLAG_NEED_BO},
+       {R_028C74_CB_COLOR0_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028C9C_CB_COLOR1_BASE, REG_FLAG_NEED_BO},
+       {R_028CA0_CB_COLOR1_PITCH, 0},
+       {R_028CA4_CB_COLOR1_SLICE, 0},
+       {R_028CA8_CB_COLOR1_VIEW, 0},
+       {R_028CAC_CB_COLOR1_INFO, REG_FLAG_NEED_BO},
+       {R_028CB0_CB_COLOR1_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028CD8_CB_COLOR2_BASE, REG_FLAG_NEED_BO},
+       {R_028CDC_CB_COLOR2_PITCH, 0},
+       {R_028CE0_CB_COLOR2_SLICE, 0},
+       {R_028CE4_CB_COLOR2_VIEW, 0},
+       {R_028CE8_CB_COLOR2_INFO, REG_FLAG_NEED_BO},
+       {R_028CEC_CB_COLOR2_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028D14_CB_COLOR3_BASE, REG_FLAG_NEED_BO},
+       {R_028D18_CB_COLOR3_PITCH, 0},
+       {R_028D1C_CB_COLOR3_SLICE, 0},
+       {R_028D20_CB_COLOR3_VIEW, 0},
+       {R_028D24_CB_COLOR3_INFO, REG_FLAG_NEED_BO},
+       {R_028D28_CB_COLOR3_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028D50_CB_COLOR4_BASE, REG_FLAG_NEED_BO},
+       {R_028D54_CB_COLOR4_PITCH, 0},
+       {R_028D58_CB_COLOR4_SLICE, 0},
+       {R_028D5C_CB_COLOR4_VIEW, 0},
+       {R_028D60_CB_COLOR4_INFO, REG_FLAG_NEED_BO},
+       {R_028D64_CB_COLOR4_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028D8C_CB_COLOR5_BASE, REG_FLAG_NEED_BO},
+       {R_028D90_CB_COLOR5_PITCH, 0},
+       {R_028D94_CB_COLOR5_SLICE, 0},
+       {R_028D98_CB_COLOR5_VIEW, 0},
+       {R_028D9C_CB_COLOR5_INFO, REG_FLAG_NEED_BO},
+       {R_028DA0_CB_COLOR5_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028DC8_CB_COLOR6_BASE, REG_FLAG_NEED_BO},
+       {R_028DCC_CB_COLOR6_PITCH, 0},
+       {R_028DD0_CB_COLOR6_SLICE, 0},
+       {R_028DD4_CB_COLOR6_VIEW, 0},
+       {R_028DD8_CB_COLOR6_INFO, REG_FLAG_NEED_BO},
+       {R_028DDC_CB_COLOR6_ATTRIB, REG_FLAG_NEED_BO},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_028E04_CB_COLOR7_BASE, REG_FLAG_NEED_BO},
+       {R_028E08_CB_COLOR7_PITCH, 0},
+       {R_028E0C_CB_COLOR7_SLICE, 0},
+       {R_028E10_CB_COLOR7_VIEW, 0},
+       {R_028E14_CB_COLOR7_INFO, REG_FLAG_NEED_BO},
+       {R_028E18_CB_COLOR7_ATTRIB, REG_FLAG_NEED_BO},
+};
+
+static const struct r600_reg si_sh_reg_list[] = {
+       {R_00B020_SPI_SHADER_PGM_LO_PS, REG_FLAG_NEED_BO},
+       {R_00B024_SPI_SHADER_PGM_HI_PS, REG_FLAG_NEED_BO},
+       {R_00B028_SPI_SHADER_PGM_RSRC1_PS, 0},
+       {R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B030_SPI_SHADER_USER_DATA_PS_0, REG_FLAG_NEED_BO},
+       {R_00B034_SPI_SHADER_USER_DATA_PS_1, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B038_SPI_SHADER_USER_DATA_PS_2, REG_FLAG_NEED_BO},
+       {R_00B03C_SPI_SHADER_USER_DATA_PS_3, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B040_SPI_SHADER_USER_DATA_PS_4, REG_FLAG_NEED_BO},
+       {R_00B044_SPI_SHADER_USER_DATA_PS_5, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B048_SPI_SHADER_USER_DATA_PS_6, REG_FLAG_NEED_BO},
+       {R_00B04C_SPI_SHADER_USER_DATA_PS_7, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B050_SPI_SHADER_USER_DATA_PS_8, REG_FLAG_NEED_BO},
+       {R_00B054_SPI_SHADER_USER_DATA_PS_9, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B058_SPI_SHADER_USER_DATA_PS_10, REG_FLAG_NEED_BO},
+       {R_00B05C_SPI_SHADER_USER_DATA_PS_11, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B060_SPI_SHADER_USER_DATA_PS_12, REG_FLAG_NEED_BO},
+       {R_00B064_SPI_SHADER_USER_DATA_PS_13, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B068_SPI_SHADER_USER_DATA_PS_14, REG_FLAG_NEED_BO},
+       {R_00B06C_SPI_SHADER_USER_DATA_PS_15, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B120_SPI_SHADER_PGM_LO_VS, REG_FLAG_NEED_BO},
+       {R_00B124_SPI_SHADER_PGM_HI_VS, REG_FLAG_NEED_BO},
+       {R_00B128_SPI_SHADER_PGM_RSRC1_VS, 0},
+       {R_00B12C_SPI_SHADER_PGM_RSRC2_VS, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B130_SPI_SHADER_USER_DATA_VS_0, REG_FLAG_NEED_BO},
+       {R_00B134_SPI_SHADER_USER_DATA_VS_1, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B138_SPI_SHADER_USER_DATA_VS_2, REG_FLAG_NEED_BO},
+       {R_00B13C_SPI_SHADER_USER_DATA_VS_3, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B140_SPI_SHADER_USER_DATA_VS_4, REG_FLAG_NEED_BO},
+       {R_00B144_SPI_SHADER_USER_DATA_VS_5, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B148_SPI_SHADER_USER_DATA_VS_6, REG_FLAG_NEED_BO},
+       {R_00B14C_SPI_SHADER_USER_DATA_VS_7, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B150_SPI_SHADER_USER_DATA_VS_8, REG_FLAG_NEED_BO},
+       {R_00B154_SPI_SHADER_USER_DATA_VS_9, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B158_SPI_SHADER_USER_DATA_VS_10, REG_FLAG_NEED_BO},
+       {R_00B15C_SPI_SHADER_USER_DATA_VS_11, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B160_SPI_SHADER_USER_DATA_VS_12, REG_FLAG_NEED_BO},
+       {R_00B164_SPI_SHADER_USER_DATA_VS_13, 0},
+       {GROUP_FORCE_NEW_BLOCK, 0},
+       {R_00B168_SPI_SHADER_USER_DATA_VS_14, REG_FLAG_NEED_BO},
+       {R_00B16C_SPI_SHADER_USER_DATA_VS_15, 0},
+};
+
+int si_context_init(struct r600_context *ctx)
+{
+       int r;
+
+       LIST_INITHEAD(&ctx->active_query_list);
+
+       /* init dirty list */
+       LIST_INITHEAD(&ctx->dirty);
+       LIST_INITHEAD(&ctx->enable_list);
+
+       ctx->range = calloc(NUM_RANGES, sizeof(struct r600_range));
+       if (!ctx->range) {
+               r = -ENOMEM;
+               goto out_err;
+       }
+
+       /* add blocks */
+       r = r600_context_add_block(ctx, si_config_reg_list,
+                                  Elements(si_config_reg_list), PKT3_SET_CONFIG_REG, SI_CONFIG_REG_OFFSET);
+       if (r)
+               goto out_err;
+       r = r600_context_add_block(ctx, si_context_reg_list,
+                                  Elements(si_context_reg_list), PKT3_SET_CONTEXT_REG, SI_CONTEXT_REG_OFFSET);
+       if (r)
+               goto out_err;
+       r = r600_context_add_block(ctx, si_sh_reg_list,
+                                  Elements(si_sh_reg_list), PKT3_SET_SH_REG, SI_SH_REG_OFFSET);
+       if (r)
+               goto out_err;
+
+
+       /* PS SAMPLER */
+       /* VS SAMPLER */
+
+       /* PS SAMPLER BORDER */
+       /* VS SAMPLER BORDER */
+
+       /* PS RESOURCES */
+       /* VS RESOURCES */
+
+       ctx->cs = ctx->ws->cs_create(ctx->ws);
+
+       r600_init_cs(ctx);
+       ctx->max_db = 8;
+       return 0;
+out_err:
+       r600_context_fini(ctx);
+       return r;
+}
+
+static inline void evergreen_context_ps_partial_flush(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+
+       if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
+               return;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+
+       ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
+}
+
+void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       unsigned ndwords = 7;
+       uint32_t *pm4;
+       uint64_t va;
+
+       if (draw->indices) {
+               ndwords = 11;
+       }
+       if (ctx->num_cs_dw_queries_suspend)
+               ndwords += 6;
+
+       /* when increasing ndwords, bump the max limit too */
+       assert(ndwords <= R600_MAX_DRAW_CS_DWORDS);
+
+       /* queries need some special values
+        * (this is non-zero if any query is active) */
+       if (ctx->num_cs_dw_queries_suspend) {
+               pm4 = &cs->buf[cs->cdw];
+               pm4[0] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+               pm4[1] = (R_028004_DB_COUNT_CONTROL - SI_CONTEXT_REG_OFFSET) >> 2;
+               pm4[2] = S_028004_PERFECT_ZPASS_COUNTS(1);
+               pm4[3] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+               pm4[4] = (R_02800C_DB_RENDER_OVERRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
+               pm4[5] = draw->db_render_override | S_02800C_NOOP_CULL_DISABLE(1);
+               cs->cdw += 6;
+               ndwords -= 6;
+       }
+
+       /* draw packet */
+       pm4 = &cs->buf[cs->cdw];
+       pm4[0] = PKT3(PKT3_INDEX_TYPE, 0, ctx->predicate_drawing);
+       pm4[1] = draw->vgt_index_type;
+       pm4[2] = PKT3(PKT3_NUM_INSTANCES, 0, ctx->predicate_drawing);
+       pm4[3] = draw->vgt_num_instances;
+       if (draw->indices) {
+               va = r600_resource_va(&ctx->screen->screen, (void*)draw->indices);
+               va += draw->indices_bo_offset;
+               pm4[4] = PKT3(PKT3_DRAW_INDEX, 3, ctx->predicate_drawing);
+               pm4[5] = va;
+               pm4[6] = (va >> 32UL) & 0xFF;
+               pm4[7] = draw->vgt_num_indices;
+               pm4[8] = draw->vgt_draw_initiator;
+               pm4[9] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing);
+               pm4[10] = r600_context_bo_reloc(ctx, draw->indices, RADEON_USAGE_READ);
+       } else {
+               pm4[4] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, ctx->predicate_drawing);
+               pm4[5] = draw->vgt_num_indices;
+               pm4[6] = draw->vgt_draw_initiator;
+       }
+       cs->cdw += ndwords;
+}
+
+void evergreen_flush_vgt_streamout(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+       cs->buf[cs->cdw++] = (R_0084FC_CP_STRMOUT_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
+       cs->buf[cs->cdw++] = 0;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0);
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+       cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+       cs->buf[cs->cdw++] = R_0084FC_CP_STRMOUT_CNTL >> 2;  /* register */
+       cs->buf[cs->cdw++] = 0;
+       cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
+       cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
+       cs->buf[cs->cdw++] = 4; /* poll interval */
+}
+
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+
+       if (buffer_enable_bit) {
+               cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+               cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2;
+               cs->buf[cs->cdw++] = S_028B94_STREAMOUT_0_EN(1);
+
+               cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+               cs->buf[cs->cdw++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2;
+               cs->buf[cs->cdw++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit);
+       } else {
+               cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+               cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2;
+               cs->buf[cs->cdw++] = S_028B94_STREAMOUT_0_EN(0);
+       }
+}
diff --git a/src/gallium/drivers/radeonsi/evergreen_state.c b/src/gallium/drivers/radeonsi/evergreen_state.c
new file mode 100644 (file)
index 0000000..5049c7b
--- /dev/null
@@ -0,0 +1,2169 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* TODO:
+ *     - fix mask for depth control & cull for query
+ */
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_framebuffer.h"
+#include "pipebuffer/pb_buffer.h"
+#include "r600.h"
+#include "sid.h"
+#include "r600_resource.h"
+#include "radeonsi_pipe.h"
+
+static uint32_t si_translate_blend_function(int blend_func)
+{
+       switch (blend_func) {
+       case PIPE_BLEND_ADD:
+               return V_028780_COMB_DST_PLUS_SRC;
+       case PIPE_BLEND_SUBTRACT:
+               return V_028780_COMB_SRC_MINUS_DST;
+       case PIPE_BLEND_REVERSE_SUBTRACT:
+               return V_028780_COMB_DST_MINUS_SRC;
+       case PIPE_BLEND_MIN:
+               return V_028780_COMB_MIN_DST_SRC;
+       case PIPE_BLEND_MAX:
+               return V_028780_COMB_MAX_DST_SRC;
+       default:
+               R600_ERR("Unknown blend function %d\n", blend_func);
+               assert(0);
+               break;
+       }
+       return 0;
+}
+
+static uint32_t si_translate_blend_factor(int blend_fact)
+{
+       switch (blend_fact) {
+       case PIPE_BLENDFACTOR_ONE:
+               return V_028780_BLEND_ONE;
+       case PIPE_BLENDFACTOR_SRC_COLOR:
+               return V_028780_BLEND_SRC_COLOR;
+       case PIPE_BLENDFACTOR_SRC_ALPHA:
+               return V_028780_BLEND_SRC_ALPHA;
+       case PIPE_BLENDFACTOR_DST_ALPHA:
+               return V_028780_BLEND_DST_ALPHA;
+       case PIPE_BLENDFACTOR_DST_COLOR:
+               return V_028780_BLEND_DST_COLOR;
+       case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+               return V_028780_BLEND_SRC_ALPHA_SATURATE;
+       case PIPE_BLENDFACTOR_CONST_COLOR:
+               return V_028780_BLEND_CONSTANT_COLOR;
+       case PIPE_BLENDFACTOR_CONST_ALPHA:
+               return V_028780_BLEND_CONSTANT_ALPHA;
+       case PIPE_BLENDFACTOR_ZERO:
+               return V_028780_BLEND_ZERO;
+       case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+               return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
+       case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+               return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
+       case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+               return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
+       case PIPE_BLENDFACTOR_INV_DST_COLOR:
+               return V_028780_BLEND_ONE_MINUS_DST_COLOR;
+       case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+               return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
+       case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+               return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
+       case PIPE_BLENDFACTOR_SRC1_COLOR:
+               return V_028780_BLEND_SRC1_COLOR;
+       case PIPE_BLENDFACTOR_SRC1_ALPHA:
+               return V_028780_BLEND_SRC1_ALPHA;
+       case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+               return V_028780_BLEND_INV_SRC1_COLOR;
+       case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+               return V_028780_BLEND_INV_SRC1_ALPHA;
+       default:
+               R600_ERR("Unsupported blend factor %d\n", blend_fact);
+               assert(0);
+               break;
+       }
+       return 0;
+}
+
+#if 0
+static uint32_t r600_translate_stencil_op(int s_op)
+{
+       switch (s_op) {
+       case PIPE_STENCIL_OP_KEEP:
+               return V_028800_STENCIL_KEEP;
+       case PIPE_STENCIL_OP_ZERO:
+               return V_028800_STENCIL_ZERO;
+       case PIPE_STENCIL_OP_REPLACE:
+               return V_028800_STENCIL_REPLACE;
+       case PIPE_STENCIL_OP_INCR:
+               return V_028800_STENCIL_INCR;
+       case PIPE_STENCIL_OP_DECR:
+               return V_028800_STENCIL_DECR;
+       case PIPE_STENCIL_OP_INCR_WRAP:
+               return V_028800_STENCIL_INCR_WRAP;
+       case PIPE_STENCIL_OP_DECR_WRAP:
+               return V_028800_STENCIL_DECR_WRAP;
+       case PIPE_STENCIL_OP_INVERT:
+               return V_028800_STENCIL_INVERT;
+       default:
+               R600_ERR("Unknown stencil op %d\n", s_op);
+               assert(0);
+               break;
+       }
+       return 0;
+}
+#endif
+
+static uint32_t si_translate_fill(uint32_t func)
+{
+       switch(func) {
+       case PIPE_POLYGON_MODE_FILL:
+               return V_028814_X_DRAW_TRIANGLES;
+       case PIPE_POLYGON_MODE_LINE:
+               return V_028814_X_DRAW_LINES;
+       case PIPE_POLYGON_MODE_POINT:
+               return V_028814_X_DRAW_POINTS;
+       default:
+               assert(0);
+               return V_028814_X_DRAW_POINTS;
+       }
+}
+
+/* Gallium's PIPE_FUNC_* values match the hardware compare encoding, so they pass through unchanged. */
+static uint32_t si_translate_ds_func(int func)
+{
+       return func;
+}
+
+static unsigned si_tex_wrap(unsigned wrap)
+{
+       switch (wrap) {
+       default:
+       case PIPE_TEX_WRAP_REPEAT:
+               return V_008F30_SQ_TEX_WRAP;
+       case PIPE_TEX_WRAP_CLAMP:
+               return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
+       case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+               return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
+       case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+               return V_008F30_SQ_TEX_CLAMP_BORDER;
+       case PIPE_TEX_WRAP_MIRROR_REPEAT:
+               return V_008F30_SQ_TEX_MIRROR;
+       case PIPE_TEX_WRAP_MIRROR_CLAMP:
+               return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
+       case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+               return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
+       case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+               return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
+       }
+}
+
+static unsigned si_tex_filter(unsigned filter)
+{
+       switch (filter) {
+       default:
+       case PIPE_TEX_FILTER_NEAREST:
+               return V_008F38_SQ_TEX_XY_FILTER_POINT;
+       case PIPE_TEX_FILTER_LINEAR:
+               return V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
+       }
+}
+
+static unsigned si_tex_mipfilter(unsigned filter)
+{
+       switch (filter) {
+       case PIPE_TEX_MIPFILTER_NEAREST:
+               return V_008F38_SQ_TEX_Z_FILTER_POINT;
+       case PIPE_TEX_MIPFILTER_LINEAR:
+               return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
+       default:
+       case PIPE_TEX_MIPFILTER_NONE:
+               return V_008F38_SQ_TEX_Z_FILTER_NONE;
+       }
+}
+
+static unsigned si_tex_compare(unsigned compare)
+{
+       switch (compare) {
+       default:
+       case PIPE_FUNC_NEVER:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
+       case PIPE_FUNC_LESS:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
+       case PIPE_FUNC_EQUAL:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
+       case PIPE_FUNC_LEQUAL:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
+       case PIPE_FUNC_GREATER:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
+       case PIPE_FUNC_NOTEQUAL:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
+       case PIPE_FUNC_GEQUAL:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
+       case PIPE_FUNC_ALWAYS:
+               return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
+       }
+}
+
+static unsigned si_tex_dim(unsigned dim)
+{
+       switch (dim) {
+       default:
+       case PIPE_TEXTURE_1D:
+               return V_008F1C_SQ_RSRC_IMG_1D;
+       case PIPE_TEXTURE_1D_ARRAY:
+               return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
+       case PIPE_TEXTURE_2D:
+       case PIPE_TEXTURE_RECT:
+               return V_008F1C_SQ_RSRC_IMG_2D;
+       case PIPE_TEXTURE_2D_ARRAY:
+               return V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+       case PIPE_TEXTURE_3D:
+               return V_008F1C_SQ_RSRC_IMG_3D;
+       case PIPE_TEXTURE_CUBE:
+               return V_008F1C_SQ_RSRC_IMG_CUBE;
+       }
+}
+
+static uint32_t si_translate_dbformat(enum pipe_format format)
+{
+       switch (format) {
+       case PIPE_FORMAT_Z16_UNORM:
+               return V_028040_Z_16;
+       case PIPE_FORMAT_Z24X8_UNORM:
+       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+               return V_028040_Z_24; /* XXX no longer supported on SI */
+       case PIPE_FORMAT_Z32_FLOAT:
+       case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+               return V_028040_Z_32_FLOAT;
+       default:
+               return ~0U;
+       }
+}
+
+static uint32_t si_translate_colorswap(enum pipe_format format)
+{
+       switch (format) {
+       /* 8-bit buffers. */
+       case PIPE_FORMAT_L4A4_UNORM:
+       case PIPE_FORMAT_A4R4_UNORM:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_A8_UNORM:
+       case PIPE_FORMAT_A8_UINT:
+       case PIPE_FORMAT_A8_SINT:
+       case PIPE_FORMAT_R4A4_UNORM:
+               return V_028C70_SWAP_ALT_REV;
+       case PIPE_FORMAT_I8_UNORM:
+       case PIPE_FORMAT_L8_UNORM:
+       case PIPE_FORMAT_I8_UINT:
+       case PIPE_FORMAT_I8_SINT:
+       case PIPE_FORMAT_L8_UINT:
+       case PIPE_FORMAT_L8_SINT:
+       case PIPE_FORMAT_L8_SRGB:
+       case PIPE_FORMAT_R8_UNORM:
+       case PIPE_FORMAT_R8_SNORM:
+       case PIPE_FORMAT_R8_UINT:
+       case PIPE_FORMAT_R8_SINT:
+               return V_028C70_SWAP_STD;
+
+       /* 16-bit buffers. */
+       case PIPE_FORMAT_B5G6R5_UNORM:
+               return V_028C70_SWAP_STD_REV;
+
+       case PIPE_FORMAT_B5G5R5A1_UNORM:
+       case PIPE_FORMAT_B5G5R5X1_UNORM:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_B4G4R4A4_UNORM:
+       case PIPE_FORMAT_B4G4R4X4_UNORM:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_Z16_UNORM:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_L8A8_UNORM:
+       case PIPE_FORMAT_L8A8_UINT:
+       case PIPE_FORMAT_L8A8_SINT:
+       case PIPE_FORMAT_L8A8_SRGB:
+               return V_028C70_SWAP_ALT;
+       case PIPE_FORMAT_R8G8_UNORM:
+       case PIPE_FORMAT_R8G8_UINT:
+       case PIPE_FORMAT_R8G8_SINT:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_R16_UNORM:
+       case PIPE_FORMAT_R16_UINT:
+       case PIPE_FORMAT_R16_SINT:
+       case PIPE_FORMAT_R16_FLOAT:
+               return V_028C70_SWAP_STD;
+
+       /* 32-bit buffers. */
+       case PIPE_FORMAT_A8B8G8R8_SRGB:
+               return V_028C70_SWAP_STD_REV;
+       case PIPE_FORMAT_B8G8R8A8_SRGB:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_B8G8R8A8_UNORM:
+       case PIPE_FORMAT_B8G8R8X8_UNORM:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_A8R8G8B8_UNORM:
+       case PIPE_FORMAT_X8R8G8B8_UNORM:
+               return V_028C70_SWAP_ALT_REV;
+       case PIPE_FORMAT_R8G8B8A8_SNORM:
+       case PIPE_FORMAT_R8G8B8A8_UNORM:
+       case PIPE_FORMAT_R8G8B8A8_SSCALED:
+       case PIPE_FORMAT_R8G8B8A8_USCALED:
+       case PIPE_FORMAT_R8G8B8A8_SINT:
+       case PIPE_FORMAT_R8G8B8A8_UINT:
+       case PIPE_FORMAT_R8G8B8X8_UNORM:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_A8B8G8R8_UNORM:
+       case PIPE_FORMAT_X8B8G8R8_UNORM:
+       /* case PIPE_FORMAT_R8SG8SB8UX8U_NORM: */
+               return V_028C70_SWAP_STD_REV;
+
+       case PIPE_FORMAT_Z24X8_UNORM:
+       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_X8Z24_UNORM:
+       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_R10G10B10A2_UNORM:
+       case PIPE_FORMAT_R10G10B10X2_SNORM:
+       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+               return V_028C70_SWAP_STD;
+
+       case PIPE_FORMAT_B10G10R10A2_UNORM:
+       case PIPE_FORMAT_B10G10R10A2_UINT:
+               return V_028C70_SWAP_ALT;
+
+       case PIPE_FORMAT_R11G11B10_FLOAT:
+       case PIPE_FORMAT_R32_FLOAT:
+       case PIPE_FORMAT_R32_UINT:
+       case PIPE_FORMAT_R32_SINT:
+       case PIPE_FORMAT_Z32_FLOAT:
+       case PIPE_FORMAT_R16G16_FLOAT:
+       case PIPE_FORMAT_R16G16_UNORM:
+       case PIPE_FORMAT_R16G16_UINT:
+       case PIPE_FORMAT_R16G16_SINT:
+               return V_028C70_SWAP_STD;
+
+       /* 64-bit buffers. */
+       case PIPE_FORMAT_R32G32_FLOAT:
+       case PIPE_FORMAT_R32G32_UINT:
+       case PIPE_FORMAT_R32G32_SINT:
+       case PIPE_FORMAT_R16G16B16A16_UNORM:
+       case PIPE_FORMAT_R16G16B16A16_SNORM:
+       case PIPE_FORMAT_R16G16B16A16_USCALED:
+       case PIPE_FORMAT_R16G16B16A16_SSCALED:
+       case PIPE_FORMAT_R16G16B16A16_UINT:
+       case PIPE_FORMAT_R16G16B16A16_SINT:
+       case PIPE_FORMAT_R16G16B16A16_FLOAT:
+       case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+
+       /* 128-bit buffers. */
+       case PIPE_FORMAT_R32G32B32A32_FLOAT:
+       case PIPE_FORMAT_R32G32B32A32_SNORM:
+       case PIPE_FORMAT_R32G32B32A32_UNORM:
+       case PIPE_FORMAT_R32G32B32A32_SSCALED:
+       case PIPE_FORMAT_R32G32B32A32_USCALED:
+       case PIPE_FORMAT_R32G32B32A32_SINT:
+       case PIPE_FORMAT_R32G32B32A32_UINT:
+               return V_028C70_SWAP_STD;
+       default:
+               R600_ERR("unsupported colorswap format %d\n", format);
+               return ~0U;
+       }
+       return ~0U;
+}
+
+static uint32_t si_translate_colorformat(enum pipe_format format)
+{
+       switch (format) {
+       /* 8-bit buffers. */
+       case PIPE_FORMAT_A8_UNORM:
+       case PIPE_FORMAT_A8_UINT:
+       case PIPE_FORMAT_A8_SINT:
+       case PIPE_FORMAT_I8_UNORM:
+       case PIPE_FORMAT_I8_UINT:
+       case PIPE_FORMAT_I8_SINT:
+       case PIPE_FORMAT_L8_UNORM:
+       case PIPE_FORMAT_L8_UINT:
+       case PIPE_FORMAT_L8_SINT:
+       case PIPE_FORMAT_L8_SRGB:
+       case PIPE_FORMAT_R8_UNORM:
+       case PIPE_FORMAT_R8_SNORM:
+       case PIPE_FORMAT_R8_UINT:
+       case PIPE_FORMAT_R8_SINT:
+               return V_028C70_COLOR_8;
+
+       /* 16-bit buffers. */
+       case PIPE_FORMAT_B5G6R5_UNORM:
+               return V_028C70_COLOR_5_6_5;
+
+       case PIPE_FORMAT_B5G5R5A1_UNORM:
+       case PIPE_FORMAT_B5G5R5X1_UNORM:
+               return V_028C70_COLOR_1_5_5_5;
+
+       case PIPE_FORMAT_B4G4R4A4_UNORM:
+       case PIPE_FORMAT_B4G4R4X4_UNORM:
+               return V_028C70_COLOR_4_4_4_4;
+
+       case PIPE_FORMAT_L8A8_UNORM:
+       case PIPE_FORMAT_L8A8_UINT:
+       case PIPE_FORMAT_L8A8_SINT:
+       case PIPE_FORMAT_L8A8_SRGB:
+       case PIPE_FORMAT_R8G8_UNORM:
+       case PIPE_FORMAT_R8G8_UINT:
+       case PIPE_FORMAT_R8G8_SINT:
+               return V_028C70_COLOR_8_8;
+
+       case PIPE_FORMAT_Z16_UNORM:
+       case PIPE_FORMAT_R16_UNORM:
+       case PIPE_FORMAT_R16_UINT:
+       case PIPE_FORMAT_R16_SINT:
+       case PIPE_FORMAT_R16_FLOAT:
+       case PIPE_FORMAT_R16G16_FLOAT:
+               return V_028C70_COLOR_16;
+
+       /* 32-bit buffers. */
+       case PIPE_FORMAT_A8B8G8R8_SRGB:
+       case PIPE_FORMAT_A8B8G8R8_UNORM:
+       case PIPE_FORMAT_A8R8G8B8_UNORM:
+       case PIPE_FORMAT_B8G8R8A8_SRGB:
+       case PIPE_FORMAT_B8G8R8A8_UNORM:
+       case PIPE_FORMAT_B8G8R8X8_UNORM:
+       case PIPE_FORMAT_R8G8B8A8_SNORM:
+       case PIPE_FORMAT_R8G8B8A8_UNORM:
+       case PIPE_FORMAT_R8G8B8X8_UNORM:
+       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
+       case PIPE_FORMAT_X8B8G8R8_UNORM:
+       case PIPE_FORMAT_X8R8G8B8_UNORM:
+       case PIPE_FORMAT_R8G8B8_UNORM:
+       case PIPE_FORMAT_R8G8B8A8_SSCALED:
+       case PIPE_FORMAT_R8G8B8A8_USCALED:
+       case PIPE_FORMAT_R8G8B8A8_SINT:
+       case PIPE_FORMAT_R8G8B8A8_UINT:
+               return V_028C70_COLOR_8_8_8_8;
+
+       case PIPE_FORMAT_R10G10B10A2_UNORM:
+       case PIPE_FORMAT_R10G10B10X2_SNORM:
+       case PIPE_FORMAT_B10G10R10A2_UNORM:
+       case PIPE_FORMAT_B10G10R10A2_UINT:
+       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+               return V_028C70_COLOR_2_10_10_10;
+
+       case PIPE_FORMAT_Z24X8_UNORM:
+       case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+               return V_028C70_COLOR_8_24;
+
+       case PIPE_FORMAT_X8Z24_UNORM:
+       case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+               return V_028C70_COLOR_24_8;
+
+       case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+               return V_028C70_COLOR_X24_8_32_FLOAT;
+
+       case PIPE_FORMAT_R32_FLOAT:
+       case PIPE_FORMAT_Z32_FLOAT:
+               return V_028C70_COLOR_32;
+
+       case PIPE_FORMAT_R16G16_SSCALED:
+       case PIPE_FORMAT_R16G16_UNORM:
+       case PIPE_FORMAT_R16G16_UINT:
+       case PIPE_FORMAT_R16G16_SINT:
+               return V_028C70_COLOR_16_16;
+
+       case PIPE_FORMAT_R11G11B10_FLOAT:
+               return V_028C70_COLOR_10_11_11;
+
+       /* 64-bit buffers. */
+       case PIPE_FORMAT_R16G16B16_USCALED:
+       case PIPE_FORMAT_R16G16B16_SSCALED:
+       case PIPE_FORMAT_R16G16B16A16_UINT:
+       case PIPE_FORMAT_R16G16B16A16_SINT:
+       case PIPE_FORMAT_R16G16B16A16_USCALED:
+       case PIPE_FORMAT_R16G16B16A16_SSCALED:
+       case PIPE_FORMAT_R16G16B16A16_UNORM:
+       case PIPE_FORMAT_R16G16B16A16_SNORM:
+       case PIPE_FORMAT_R16G16B16_FLOAT:
+       case PIPE_FORMAT_R16G16B16A16_FLOAT:
+               return V_028C70_COLOR_16_16_16_16;
+
+       case PIPE_FORMAT_R32G32_FLOAT:
+       case PIPE_FORMAT_R32G32_USCALED:
+       case PIPE_FORMAT_R32G32_SSCALED:
+       case PIPE_FORMAT_R32G32_SINT:
+       case PIPE_FORMAT_R32G32_UINT:
+               return V_028C70_COLOR_32_32;
+
+       /* 128-bit buffers. */
+       case PIPE_FORMAT_R32G32B32A32_SNORM:
+       case PIPE_FORMAT_R32G32B32A32_UNORM:
+       case PIPE_FORMAT_R32G32B32A32_SSCALED:
+       case PIPE_FORMAT_R32G32B32A32_USCALED:
+       case PIPE_FORMAT_R32G32B32A32_SINT:
+       case PIPE_FORMAT_R32G32B32A32_UINT:
+       case PIPE_FORMAT_R32G32B32A32_FLOAT:
+               return V_028C70_COLOR_32_32_32_32;
+
+       /* YUV buffers. */
+       case PIPE_FORMAT_UYVY:
+       case PIPE_FORMAT_YUYV:
+       /* 96-bit buffers. */
+       case PIPE_FORMAT_R32G32B32_FLOAT:
+       /* 8-bit buffers. */
+       case PIPE_FORMAT_L4A4_UNORM:
+       case PIPE_FORMAT_R4A4_UNORM:
+       case PIPE_FORMAT_A4R4_UNORM:
+       default:
+               return ~0U; /* Unsupported. */
+       }
+}
+
+static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
+{
+       if (R600_BIG_ENDIAN) {
+               switch(colorformat) {
+               /* 8-bit buffers. */
+               case V_028C70_COLOR_8:
+                       return V_028C70_ENDIAN_NONE;
+
+               /* 16-bit buffers. */
+               case V_028C70_COLOR_5_6_5:
+               case V_028C70_COLOR_1_5_5_5:
+               case V_028C70_COLOR_4_4_4_4:
+               case V_028C70_COLOR_16:
+               case V_028C70_COLOR_8_8:
+                       return V_028C70_ENDIAN_8IN16;
+
+               /* 32-bit buffers. */
+               case V_028C70_COLOR_8_8_8_8:
+               case V_028C70_COLOR_2_10_10_10:
+               case V_028C70_COLOR_8_24:
+               case V_028C70_COLOR_24_8:
+               case V_028C70_COLOR_16_16:
+                       return V_028C70_ENDIAN_8IN32;
+
+               /* 64-bit buffers. */
+               case V_028C70_COLOR_16_16_16_16:
+                       return V_028C70_ENDIAN_8IN16;
+
+               case V_028C70_COLOR_32_32:
+                       return V_028C70_ENDIAN_8IN32;
+
+               /* 128-bit buffers. */
+               case V_028C70_COLOR_32_32_32_32:
+                       return V_028C70_ENDIAN_8IN32;
+               default:
+                       return V_028C70_ENDIAN_NONE; /* Unsupported. */
+               }
+       } else {
+               return V_028C70_ENDIAN_NONE;
+       }
+}
+
+static uint32_t si_translate_texformat(struct pipe_screen *screen,
+                                      enum pipe_format format,
+                                      const struct util_format_description *desc,
+                                      int first_non_void)
+{
+       boolean uniform = TRUE;
+       int i;
+
+       /* Colorspace (return non-RGB formats directly). */
+       switch (desc->colorspace) {
+       /* Depth stencil formats */
+       case UTIL_FORMAT_COLORSPACE_ZS:
+               switch (format) {
+               case PIPE_FORMAT_Z16_UNORM:
+                       return V_008F14_IMG_DATA_FORMAT_16;
+               case PIPE_FORMAT_X24S8_UINT:
+               case PIPE_FORMAT_Z24X8_UNORM:
+               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+                       return V_008F14_IMG_DATA_FORMAT_24_8;
+               case PIPE_FORMAT_S8X24_UINT:
+               case PIPE_FORMAT_X8Z24_UNORM:
+               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+                       return V_008F14_IMG_DATA_FORMAT_8_24;
+               case PIPE_FORMAT_S8_UINT:
+                       return V_008F14_IMG_DATA_FORMAT_8;
+               case PIPE_FORMAT_Z32_FLOAT:
+                       return V_008F14_IMG_DATA_FORMAT_32;
+               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+                       return V_008F14_IMG_DATA_FORMAT_X24_8_32;
+               default:
+                       goto out_unknown;
+               }
+
+       case UTIL_FORMAT_COLORSPACE_YUV:
+               goto out_unknown; /* TODO */
+
+       case UTIL_FORMAT_COLORSPACE_SRGB:
+               break;
+
+       default:
+               break;
+       }
+
+       /* TODO compressed formats */
+
+       if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+               return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
+       } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+               return V_008F14_IMG_DATA_FORMAT_10_11_11;
+       }
+
+       /* R8G8Bx_SNORM - TODO CxV8U8 */
+
+       /* See whether the components are of the same size. */
+       for (i = 1; i < desc->nr_channels; i++) {
+               uniform = uniform && desc->channel[0].size == desc->channel[i].size;
+       }
+
+       /* Non-uniform formats. */
+       if (!uniform) {
+               switch(desc->nr_channels) {
+               case 3:
+                       if (desc->channel[0].size == 5 &&
+                           desc->channel[1].size == 6 &&
+                           desc->channel[2].size == 5) {
+                               return V_008F14_IMG_DATA_FORMAT_5_6_5;
+                       }
+                       goto out_unknown;
+               case 4:
+                       if (desc->channel[0].size == 5 &&
+                           desc->channel[1].size == 5 &&
+                           desc->channel[2].size == 5 &&
+                           desc->channel[3].size == 1) {
+                               return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
+                       }
+                       if (desc->channel[0].size == 10 &&
+                           desc->channel[1].size == 10 &&
+                           desc->channel[2].size == 10 &&
+                           desc->channel[3].size == 2) {
+                               return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
+                       }
+                       goto out_unknown;
+               }
+               goto out_unknown;
+       }
+
+       if (first_non_void < 0 || first_non_void > 3)
+               goto out_unknown;
+
+       /* Uniform formats. */
+       switch (desc->channel[first_non_void].size) {
+       case 4:
+               switch (desc->nr_channels) {
+               case 2:
+                       return V_008F14_IMG_DATA_FORMAT_4_4;
+               case 4:
+                       return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
+               }
+               break;
+       case 8:
+               switch (desc->nr_channels) {
+               case 1:
+                       return V_008F14_IMG_DATA_FORMAT_8;
+               case 2:
+                       return V_008F14_IMG_DATA_FORMAT_8_8;
+               case 4:
+                       return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
+               }
+               break;
+       case 16:
+               switch (desc->nr_channels) {
+               case 1:
+                       return V_008F14_IMG_DATA_FORMAT_16;
+               case 2:
+                       return V_008F14_IMG_DATA_FORMAT_16_16;
+               case 4:
+                       return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
+               }
+               break;
+       case 32:
+               switch (desc->nr_channels) {
+               case 1:
+                       return V_008F14_IMG_DATA_FORMAT_32;
+               case 2:
+                       return V_008F14_IMG_DATA_FORMAT_32_32;
+               case 3:
+                       return V_008F14_IMG_DATA_FORMAT_32_32_32;
+               case 4:
+                       return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
+               }
+       }
+
+out_unknown:
+       /* R600_ERR("Unable to handle texformat %d %s\n", format, util_format_name(format)); */
+       return ~0U;
+}
+
+static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
+{
+       return si_translate_texformat(screen, format, util_format_description(format),
+                                     util_format_get_first_non_void_channel(format)) != ~0U;
+}
+
+uint32_t si_translate_vertexformat(struct pipe_screen *screen,
+                                  enum pipe_format format,
+                                  const struct util_format_description *desc,
+                                  int first_non_void)
+{
+       uint32_t result = si_translate_texformat(screen, format, desc, first_non_void);
+
+       if (result == V_008F0C_BUF_DATA_FORMAT_INVALID ||
+           result > V_008F0C_BUF_DATA_FORMAT_32_32_32_32)
+               result = ~0U;
+
+       return result;
+}
+
+static bool si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format)
+{
+       return si_translate_vertexformat(screen, format, util_format_description(format),
+                                        util_format_get_first_non_void_channel(format)) != ~0U;
+}
+
+static bool r600_is_colorbuffer_format_supported(enum pipe_format format)
+{
+       return si_translate_colorformat(format) != ~0U &&
+               si_translate_colorswap(format) != ~0U;
+}
+
+static bool r600_is_zs_format_supported(enum pipe_format format)
+{
+       return si_translate_dbformat(format) != ~0U;
+}
+
+boolean si_is_format_supported(struct pipe_screen *screen,
+                              enum pipe_format format,
+                              enum pipe_texture_target target,
+                              unsigned sample_count,
+                              unsigned usage)
+{
+       unsigned retval = 0;
+
+       if (target >= PIPE_MAX_TEXTURE_TYPES) {
+               R600_ERR("radeonsi: unsupported texture type %d\n", target);
+               return FALSE;
+       }
+
+       if (!util_format_is_supported(format, usage))
+               return FALSE;
+
+       /* Multisample */
+       if (sample_count > 1)
+               return FALSE;
+
+       if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+           si_is_sampler_format_supported(screen, format)) {
+               retval |= PIPE_BIND_SAMPLER_VIEW;
+       }
+
+       if ((usage & (PIPE_BIND_RENDER_TARGET |
+                     PIPE_BIND_DISPLAY_TARGET |
+                     PIPE_BIND_SCANOUT |
+                     PIPE_BIND_SHARED)) &&
+           r600_is_colorbuffer_format_supported(format)) {
+               retval |= usage &
+                         (PIPE_BIND_RENDER_TARGET |
+                          PIPE_BIND_DISPLAY_TARGET |
+                          PIPE_BIND_SCANOUT |
+                          PIPE_BIND_SHARED);
+       }
+
+       if ((usage & PIPE_BIND_DEPTH_STENCIL) &&
+           r600_is_zs_format_supported(format)) {
+               retval |= PIPE_BIND_DEPTH_STENCIL;
+       }
+
+       if ((usage & PIPE_BIND_VERTEX_BUFFER) &&
+           si_is_vertex_format_supported(screen, format)) {
+               retval |= PIPE_BIND_VERTEX_BUFFER;
+       }
+
+       if (usage & PIPE_BIND_TRANSFER_READ)
+               retval |= PIPE_BIND_TRANSFER_READ;
+       if (usage & PIPE_BIND_TRANSFER_WRITE)
+               retval |= PIPE_BIND_TRANSFER_WRITE;
+
+       return retval == usage;
+}
+
+static void evergreen_set_blend_color(struct pipe_context *ctx,
+                                       const struct pipe_blend_color *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+
+       if (rstate == NULL)
+               return;
+
+       rstate->id = R600_PIPE_STATE_BLEND_COLOR;
+       r600_pipe_state_add_reg(rstate, R_028414_CB_BLEND_RED, fui(state->color[0]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028418_CB_BLEND_GREEN, fui(state->color[1]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02841C_CB_BLEND_BLUE, fui(state->color[2]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028420_CB_BLEND_ALPHA, fui(state->color[3]), NULL, 0);
+
+       free(rctx->states[R600_PIPE_STATE_BLEND_COLOR]);
+       rctx->states[R600_PIPE_STATE_BLEND_COLOR] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void *evergreen_create_blend_state(struct pipe_context *ctx,
+                                       const struct pipe_blend_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_blend *blend = CALLOC_STRUCT(r600_pipe_blend);
+       struct r600_pipe_state *rstate;
+       uint32_t color_control, target_mask;
+       /* FIXME there can be more than 8 framebuffers */
+       unsigned blend_cntl[8];
+
+       if (blend == NULL) {
+               return NULL;
+       }
+
+       rstate = &blend->rstate;
+
+       rstate->id = R600_PIPE_STATE_BLEND;
+
+       target_mask = 0;
+       color_control = S_028808_MODE(V_028808_CB_NORMAL);
+       if (state->logicop_enable) {
+               color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
+       } else {
+               color_control |= S_028808_ROP3(0xcc);
+       }
+       /* We pretend 8 buffers are used; CB_SHADER_MASK will disable the unused ones. */
+       if (state->independent_blend_enable) {
+               for (int i = 0; i < 8; i++) {
+                       target_mask |= (state->rt[i].colormask << (4 * i));
+               }
+       } else {
+               for (int i = 0; i < 8; i++) {
+                       target_mask |= (state->rt[0].colormask << (4 * i));
+               }
+       }
+       blend->cb_target_mask = target_mask;
+
+       r600_pipe_state_add_reg(rstate, R_028808_CB_COLOR_CONTROL,
+                               color_control, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, ~0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1, ~0, NULL, 0);
+
+       for (int i = 0; i < 8; i++) {
+               /* state->rt entries > 0 are only valid if independent blending is enabled */
+               const int j = state->independent_blend_enable ? i : 0;
+
+               unsigned eqRGB = state->rt[j].rgb_func;
+               unsigned srcRGB = state->rt[j].rgb_src_factor;
+               unsigned dstRGB = state->rt[j].rgb_dst_factor;
+               unsigned eqA = state->rt[j].alpha_func;
+               unsigned srcA = state->rt[j].alpha_src_factor;
+               unsigned dstA = state->rt[j].alpha_dst_factor;
+
+               blend_cntl[i] = 0;
+               if (!state->rt[j].blend_enable)
+                       continue;
+
+               blend_cntl[i] |= S_028780_ENABLE(1);
+               blend_cntl[i] |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
+               blend_cntl[i] |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
+               blend_cntl[i] |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));
+
+               if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
+                       blend_cntl[i] |= S_028780_SEPARATE_ALPHA_BLEND(1);
+                       blend_cntl[i] |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
+                       blend_cntl[i] |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
+                       blend_cntl[i] |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
+               }
+       }
+       for (int i = 0; i < 8; i++) {
+               r600_pipe_state_add_reg(rstate, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl[i], NULL, 0);
+       }
+
+       return rstate;
+}
+
+static void *evergreen_create_dsa_state(struct pipe_context *ctx,
+                                  const struct pipe_depth_stencil_alpha_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_dsa *dsa = CALLOC_STRUCT(r600_pipe_dsa);
+       unsigned db_depth_control, alpha_test_control, alpha_ref;
+       unsigned db_render_override, db_render_control;
+       struct r600_pipe_state *rstate;
+
+       if (dsa == NULL) {
+               return NULL;
+       }
+
+       dsa->valuemask[0] = state->stencil[0].valuemask;
+       dsa->valuemask[1] = state->stencil[1].valuemask;
+       dsa->writemask[0] = state->stencil[0].writemask;
+       dsa->writemask[1] = state->stencil[1].writemask;
+
+       rstate = &dsa->rstate;
+
+       rstate->id = R600_PIPE_STATE_DSA;
+       db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) |
+               S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
+               S_028800_ZFUNC(state->depth.func);
+
+       /* stencil */
+       if (state->stencil[0].enabled) {
+               db_depth_control |= S_028800_STENCIL_ENABLE(1);
+               db_depth_control |= S_028800_STENCILFUNC(si_translate_ds_func(state->stencil[0].func));
+               //db_depth_control |= S_028800_STENCILFAIL(r600_translate_stencil_op(state->stencil[0].fail_op));
+               //db_depth_control |= S_028800_STENCILZPASS(r600_translate_stencil_op(state->stencil[0].zpass_op));
+               //db_depth_control |= S_028800_STENCILZFAIL(r600_translate_stencil_op(state->stencil[0].zfail_op));
+
+               if (state->stencil[1].enabled) {
+                       db_depth_control |= S_028800_BACKFACE_ENABLE(1);
+                       db_depth_control |= S_028800_STENCILFUNC_BF(si_translate_ds_func(state->stencil[1].func));
+                       //db_depth_control |= S_028800_STENCILFAIL_BF(r600_translate_stencil_op(state->stencil[1].fail_op));
+                       //db_depth_control |= S_028800_STENCILZPASS_BF(r600_translate_stencil_op(state->stencil[1].zpass_op));
+                       //db_depth_control |= S_028800_STENCILZFAIL_BF(r600_translate_stencil_op(state->stencil[1].zfail_op));
+               }
+       }
+
+       /* alpha */
+       alpha_test_control = 0;
+       alpha_ref = 0;
+       if (state->alpha.enabled) {
+               //alpha_test_control = S_028410_ALPHA_FUNC(state->alpha.func);
+               //alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1);
+               alpha_ref = fui(state->alpha.ref_value);
+       }
+       dsa->alpha_ref = alpha_ref;
+
+       /* misc */
+       db_render_control = 0;
+       db_render_override = S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE) |
+               S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
+               S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
+       /* TODO db_render_override depends on query */
+       r600_pipe_state_add_reg(rstate, R_028020_DB_DEPTH_BOUNDS_MIN, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028024_DB_DEPTH_BOUNDS_MAX, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, 0x3F800000, NULL, 0);
+       //r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02800C_DB_RENDER_OVERRIDE, db_render_override, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AC8_DB_PRELOAD_CONTROL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028B70_DB_ALPHA_TO_MASK, 0x0000AA00, NULL, 0);
+       dsa->db_render_override = db_render_override;
+
+       return rstate;
+}
+
+static void *evergreen_create_rs_state(struct pipe_context *ctx,
+                                       const struct pipe_rasterizer_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer);
+       struct r600_pipe_state *rstate;
+       unsigned tmp;
+       unsigned prov_vtx = 1, polygon_dual_mode;
+       unsigned clip_rule;
+       float psize_min, psize_max;
+
+       if (rs == NULL) {
+               return NULL;
+       }
+
+       polygon_dual_mode = (state->fill_front != PIPE_POLYGON_MODE_FILL ||
+                               state->fill_back != PIPE_POLYGON_MODE_FILL);
+
+       if (state->flatshade_first)
+               prov_vtx = 0;
+
+       rstate = &rs->rstate;
+       rs->flatshade = state->flatshade;
+       rs->sprite_coord_enable = state->sprite_coord_enable;
+       rs->pa_sc_line_stipple = state->line_stipple_enable ?
+                               S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
+                               S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0;
+       rs->pa_su_sc_mode_cntl =
+               S_028814_PROVOKING_VTX_LAST(prov_vtx) |
+               S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+               S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+               S_028814_FACE(!state->front_ccw) |
+               S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
+               S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
+               S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
+               S_028814_POLY_MODE(polygon_dual_mode) |
+               S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
+               S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back));
+       rs->pa_cl_clip_cntl =
+               S_028810_PS_UCP_MODE(3) |
+               S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
+               S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip) |
+               S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+       rs->pa_cl_vs_out_cntl =
+               S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
+               S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex);
+
+       clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
+
+       /* offset */
+       rs->offset_units = state->offset_units;
+       rs->offset_scale = state->offset_scale * 12.0f;
+
+       rstate->id = R600_PIPE_STATE_RASTERIZER;
+       tmp = S_0286D4_FLAT_SHADE_ENA(state->flatshade);
+       if (state->sprite_coord_enable) {
+               tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
+                       S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+                       S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+                       S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+                       S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1);
+               if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) {
+                       tmp |= S_0286D4_PNT_SPRITE_TOP_1(1);
+               }
+       }
+       r600_pipe_state_add_reg(rstate, R_0286D4_SPI_INTERP_CONTROL_0, tmp, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, NULL, 0);
+       /* point size 12.4 fixed point */
+       tmp = (unsigned)(state->point_size * 8.0);
+       r600_pipe_state_add_reg(rstate, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp), NULL, 0);
+
+       if (state->point_size_per_vertex) {
+               psize_min = util_get_min_point_size(state);
+               psize_max = 8192;
+       } else {
+               /* Force the point size to be as if the vertex output was disabled. */
+               psize_min = state->point_size;
+               psize_max = state->point_size;
+       }
+       /* Divide by two, because 0.5 = 1 pixel. */
+       r600_pipe_state_add_reg(rstate, R_028A04_PA_SU_POINT_MINMAX,
+                               S_028A04_MIN_SIZE(r600_pack_float_12p4(psize_min/2)) |
+                               S_028A04_MAX_SIZE(r600_pack_float_12p4(psize_max/2)),
+                               NULL, 0);
+
+       /* line width 12.4 fixed point */
+       tmp = (unsigned)(state->line_width * 8.0);
+       r600_pipe_state_add_reg(rstate, R_028A08_PA_SU_LINE_CNTL, S_028A08_WIDTH(tmp), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A48_PA_SC_MODE_CNTL_0,
+                               S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable),
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028BDC_PA_SC_LINE_CNTL, 0x00000400, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BE4_PA_SU_VTX_CNTL,
+                               S_028BE4_PIX_CENTER(state->gl_rasterization_rules),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 0x3F800000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, 0x3F800000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, 0x3F800000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, 0x3F800000, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, NULL, 0);
+       return rstate;
+}
+
+static void *si_create_sampler_state(struct pipe_context *ctx,
+                                    const struct pipe_sampler_state *state)
+{
+       struct si_pipe_sampler_state *rstate = CALLOC_STRUCT(si_pipe_sampler_state);
+       union util_color uc;
+       unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 2 : 0;
+       unsigned border_color_type;
+
+       if (rstate == NULL) {
+               return NULL;
+       }
+
+       util_pack_color(state->border_color.f, PIPE_FORMAT_B8G8R8A8_UNORM, &uc);
+       switch (uc.ui) {
+       case 0x000000FF: /* opaque black */
+               border_color_type = 0;
+               break;
+       case 0x00000000: /* transparent black */
+               border_color_type = 1;
+               break;
+       case 0xFFFFFFFF: /* white */
+               border_color_type = 1;
+               break;
+       default: /* Use border color pointer */
+               border_color_type = 3;
+       }
+
+       rstate->val[0] = si_tex_wrap(state->wrap_s) |
+                        si_tex_wrap(state->wrap_t) << 3 |
+                        si_tex_wrap(state->wrap_r) << 6 |
+                        (state->max_anisotropy & 0x7) << 9 | /* XXX */
+                        si_tex_compare(state->compare_func) << 12 |
+                        !state->normalized_coords << 15 |
+                        aniso_flag_offset << 16 | /* XXX */
+                        !state->seamless_cube_map << 28 |
+                        si_tex_mipfilter(state->min_mip_filter) << 29;
+       rstate->val[1] = S_FIXED(CLAMP(state->min_lod, 0, 15), 8) |
+                        S_FIXED(CLAMP(state->max_lod, 0, 15), 8) << 12;
+       rstate->val[2] = S_FIXED(CLAMP(state->lod_bias, -16, 16), 8) |
+                        si_tex_filter(state->mag_img_filter) << 20 |
+                        si_tex_filter(state->min_img_filter) << 22;
+       rstate->val[3] = border_color_type << 30;
+
+#if 0
+       if (border_color_type == 3) {
+               r600_pipe_state_add_reg_noblock(rstate, R_00A404_TD_PS_SAMPLER0_BORDER_RED, fui(state->border_color.f[0]), NULL, 0);
+               r600_pipe_state_add_reg_noblock(rstate, R_00A408_TD_PS_SAMPLER0_BORDER_GREEN, fui(state->border_color.f[1]), NULL, 0);
+               r600_pipe_state_add_reg_noblock(rstate, R_00A40C_TD_PS_SAMPLER0_BORDER_BLUE, fui(state->border_color.f[2]), NULL, 0);
+               r600_pipe_state_add_reg_noblock(rstate, R_00A410_TD_PS_SAMPLER0_BORDER_ALPHA, fui(state->border_color.f[3]), NULL, 0);
+       }
+#endif
+       return rstate;
+}
+
+static void si_delete_sampler_state(struct pipe_context *ctx,
+                                   void *state)
+{
+       free(state);
+}
+
+static unsigned si_map_swizzle(unsigned swizzle)
+{
+       switch (swizzle) {
+       case UTIL_FORMAT_SWIZZLE_Y:
+               return V_008F1C_SQ_SEL_Y;
+       case UTIL_FORMAT_SWIZZLE_Z:
+               return V_008F1C_SQ_SEL_Z;
+       case UTIL_FORMAT_SWIZZLE_W:
+               return V_008F1C_SQ_SEL_W;
+       case UTIL_FORMAT_SWIZZLE_0:
+               return V_008F1C_SQ_SEL_0;
+       case UTIL_FORMAT_SWIZZLE_1:
+               return V_008F1C_SQ_SEL_1;
+       default: /* UTIL_FORMAT_SWIZZLE_X */
+               return V_008F1C_SQ_SEL_X;
+       }
+}
+
+static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_context *ctx,
+                                                       struct pipe_resource *texture,
+                                                       const struct pipe_sampler_view *state)
+{
+       struct si_pipe_sampler_view *view = CALLOC_STRUCT(si_pipe_sampler_view);
+       struct r600_resource_texture *tmp = (struct r600_resource_texture*)texture;
+       const struct util_format_description *desc = util_format_description(state->format);
+       unsigned format, num_format, endian;
+       uint32_t pitch = 0;
+       unsigned char state_swizzle[4], swizzle[4], array_mode = 0, tile_type = 0;
+       unsigned height, depth;
+       int first_non_void;
+       uint64_t va;
+
+       if (view == NULL)
+               return NULL;
+
+       /* initialize base object */
+       view->base = *state;
+       view->base.texture = NULL;
+       pipe_reference(NULL, &texture->reference);
+       view->base.texture = texture;
+       view->base.reference.count = 1;
+       view->base.context = ctx;
+
+       state_swizzle[0] = state->swizzle_r;
+       state_swizzle[1] = state->swizzle_g;
+       state_swizzle[2] = state->swizzle_b;
+       state_swizzle[3] = state->swizzle_a;
+       util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
+
+       first_non_void = util_format_get_first_non_void_channel(state->format);
+       switch (desc->channel[first_non_void].type) {
+       case UTIL_FORMAT_TYPE_FLOAT:
+               num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
+               break;
+       case UTIL_FORMAT_TYPE_FIXED:
+               num_format = V_008F14_IMG_NUM_FORMAT_USCALED; /* XXX */
+               break;
+       case UTIL_FORMAT_TYPE_SIGNED:
+               num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
+               break;
+       case UTIL_FORMAT_TYPE_UNSIGNED:
+       default:
+               num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
+       }
+
+       format = si_translate_texformat(ctx->screen, state->format, desc, first_non_void);
+       if (format == ~0) {
+               format = 0;
+       }
+
+       if (tmp->depth && !tmp->is_flushing_texture) {
+               r600_texture_depth_flush(ctx, texture, TRUE);
+               tmp = tmp->flushed_depth_texture;
+       }
+
+       endian = si_colorformat_endian_swap(format);
+
+       height = texture->height0;
+       depth = texture->depth0;
+
+       pitch = align(tmp->pitch_in_blocks[0] *
+                     util_format_get_blockwidth(state->format), 8);
+       array_mode = tmp->array_mode[0];
+       tile_type = tmp->tile_type;
+
+       if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
+               height = 1;
+               depth = texture->array_size;
+       } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) {
+               depth = texture->array_size;
+       }
+
+       va = r600_resource_va(ctx->screen, texture);
+       view->state[0] = (va + tmp->offset[0]) >> 8;
+       view->state[1] = ((va + tmp->offset[0]) >> 40) & 0xff;
+       view->state[1] |= (S_008F14_DATA_FORMAT(format) |
+                          S_008F14_NUM_FORMAT(num_format));
+       view->state[2] = (S_008F18_WIDTH(texture->width0 - 1) |
+                         S_008F18_HEIGHT(height - 1));
+       view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+                         S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+                         S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+                         S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+                         S_008F1C_BASE_LEVEL(state->u.tex.first_level) |
+                         S_008F1C_LAST_LEVEL(state->u.tex.last_level) |
+                         S_008F1C_TYPE(si_tex_dim(texture->target)));
+       view->state[4] = (S_008F20_DEPTH(depth - 1) |
+                         S_008F20_PITCH((pitch / 8) - 1));
+       view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
+                         S_008F24_LAST_ARRAY(state->u.tex.last_layer));
+       view->state[6] = 0;
+       view->state[7] = 0;
+
+       return &view->base;
+}
+
+static void evergreen_set_vs_sampler_view(struct pipe_context *ctx, unsigned count,
+                                       struct pipe_sampler_view **views)
+{
+}
+
+static void evergreen_set_ps_sampler_view(struct pipe_context *ctx, unsigned count,
+                                       struct pipe_sampler_view **views)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views;
+       struct r600_pipe_state *rstate = &rctx->ps_samplers.rstate;
+       struct r600_resource *bo;
+       int i;
+       int has_depth = 0;
+       uint64_t va;
+       char *ptr;
+
+       if (!count)
+               goto out;
+
+       r600_inval_texture_cache(rctx);
+
+       bo = (struct r600_resource*)
+               pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE,
+                                  count * sizeof(resource[0]->state));
+       ptr = rctx->ws->buffer_map(bo->buf, rctx->cs, PIPE_TRANSFER_WRITE);
+
+       for (i = 0; i < count; i++, ptr += sizeof(resource[0]->state)) {
+               pipe_sampler_view_reference(
+                       (struct pipe_sampler_view **)&rctx->ps_samplers.views[i],
+                       views[i]);
+
+               if (resource[i]) {
+                       if (((struct r600_resource_texture *)resource[i]->base.texture)->depth)
+                               has_depth = 1;
+
+                       memcpy(ptr, resource[i]->state, sizeof(resource[0]->state));
+               } else {
+                       memset(ptr, 0, sizeof(resource[0]->state));
+               }
+       }
+
+       rctx->ws->buffer_unmap(bo->buf);
+
+       for (i = count; i < NUM_TEX_UNITS; i++) {
+               if (rctx->ps_samplers.views[i])
+                       pipe_sampler_view_reference((struct pipe_sampler_view **)&rctx->ps_samplers.views[i], NULL);
+       }
+
+       va = r600_resource_va(ctx->screen, (void *)bo);
+       r600_pipe_state_add_reg(rstate, R_00B040_SPI_SHADER_USER_DATA_PS_4, va, bo, RADEON_USAGE_READ);
+       r600_pipe_state_add_reg(rstate, R_00B044_SPI_SHADER_USER_DATA_PS_5, va >> 32, NULL, 0);
+       r600_context_pipe_state_set(rctx, rstate);
+
+out:
+       rctx->have_depth_texture = has_depth;
+       rctx->ps_samplers.n_views = count;
+}
+
+static void evergreen_bind_ps_sampler(struct pipe_context *ctx, unsigned count, void **states)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct si_pipe_sampler_state **rstates = (struct si_pipe_sampler_state **)states;
+       struct r600_pipe_state *rstate = &rctx->ps_samplers.rstate;
+       struct r600_resource *bo;
+       uint64_t va;
+       char *ptr;
+       int i;
+
+       if (!count)
+               goto out;
+
+       r600_inval_texture_cache(rctx);
+
+       bo = (struct r600_resource*)
+               pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE,
+                                  count * sizeof(rstates[0]->val));
+       ptr = rctx->ws->buffer_map(bo->buf, rctx->cs, PIPE_TRANSFER_WRITE);
+
+       for (i = 0; i < count; i++, ptr += sizeof(rstates[0]->val)) {
+               memcpy(ptr, rstates[i]->val, sizeof(rstates[0]->val));
+       }
+
+       rctx->ws->buffer_unmap(bo->buf);
+
+       va = r600_resource_va(ctx->screen, (void *)bo);
+       r600_pipe_state_add_reg(rstate, R_00B038_SPI_SHADER_USER_DATA_PS_2, va, bo, RADEON_USAGE_READ);
+       r600_pipe_state_add_reg(rstate, R_00B03C_SPI_SHADER_USER_DATA_PS_3, va >> 32, NULL, 0);
+       r600_context_pipe_state_set(rctx, rstate);
+
+out:
+       rctx->ps_samplers.n_samplers = count;
+}
+
+static void evergreen_bind_vs_sampler(struct pipe_context *ctx, unsigned count, void **states)
+{
+}
+
+static void evergreen_set_clip_state(struct pipe_context *ctx,
+                               const struct pipe_clip_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+
+       if (rstate == NULL)
+               return;
+
+       rctx->clip = *state;
+       rstate->id = R600_PIPE_STATE_CLIP;
+       for (int i = 0; i < 6; i++) {
+               r600_pipe_state_add_reg(rstate,
+                                       R_0285BC_PA_CL_UCP_0_X + i * 16,
+                                       fui(state->ucp[i][0]), NULL, 0);
+               r600_pipe_state_add_reg(rstate,
+                                       R_0285C0_PA_CL_UCP_0_Y + i * 16,
+                                       fui(state->ucp[i][1]), NULL, 0);
+               r600_pipe_state_add_reg(rstate,
+                                       R_0285C4_PA_CL_UCP_0_Z + i * 16,
+                                       fui(state->ucp[i][2]), NULL, 0);
+               r600_pipe_state_add_reg(rstate,
+                                       R_0285C8_PA_CL_UCP_0_W + i * 16,
+                                       fui(state->ucp[i][3]), NULL, 0);
+       }
+
+       free(rctx->states[R600_PIPE_STATE_CLIP]);
+       rctx->states[R600_PIPE_STATE_CLIP] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void evergreen_set_polygon_stipple(struct pipe_context *ctx,
+                                        const struct pipe_poly_stipple *state)
+{
+}
+
+static void evergreen_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
+{
+}
+
+static void evergreen_set_scissor_state(struct pipe_context *ctx,
+                                       const struct pipe_scissor_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+       uint32_t tl, br;
+
+       if (rstate == NULL)
+               return;
+
+       rstate->id = R600_PIPE_STATE_SCISSOR;
+       tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny);
+       br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
+       r600_pipe_state_add_reg(rstate,
+                               R_028210_PA_SC_CLIPRECT_0_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028214_PA_SC_CLIPRECT_0_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028218_PA_SC_CLIPRECT_1_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_02821C_PA_SC_CLIPRECT_1_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028220_PA_SC_CLIPRECT_2_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028224_PA_SC_CLIPRECT_2_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028228_PA_SC_CLIPRECT_3_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_02822C_PA_SC_CLIPRECT_3_BR, br,
+                               NULL, 0);
+
+       free(rctx->states[R600_PIPE_STATE_SCISSOR]);
+       rctx->states[R600_PIPE_STATE_SCISSOR] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void evergreen_set_viewport_state(struct pipe_context *ctx,
+                                       const struct pipe_viewport_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+
+       if (rstate == NULL)
+               return;
+
+       rctx->viewport = *state;
+       rstate->id = R600_PIPE_STATE_VIEWPORT;
+       r600_pipe_state_add_reg(rstate, R_0282D0_PA_SC_VPORT_ZMIN_0, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_0282D4_PA_SC_VPORT_ZMAX_0, 0x3F800000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028350_PA_SC_RASTER_CONFIG, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2]), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028818_PA_CL_VTE_CNTL, 0x0000043F, NULL, 0);
+
+       free(rctx->states[R600_PIPE_STATE_VIEWPORT]);
+       rctx->states[R600_PIPE_STATE_VIEWPORT] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void evergreen_cb(struct r600_context *rctx, struct r600_pipe_state *rstate,
+                       const struct pipe_framebuffer_state *state, int cb)
+{
+       struct r600_resource_texture *rtex;
+       struct r600_surface *surf;
+       unsigned level = state->cbufs[cb]->u.tex.level;
+       unsigned pitch, slice;
+       unsigned color_info;
+       unsigned format, swap, ntype, endian;
+       uint64_t offset;
+       unsigned tile_type;
+       const struct util_format_description *desc;
+       int i;
+       unsigned blend_clamp = 0, blend_bypass = 0;
+
+       surf = (struct r600_surface *)state->cbufs[cb];
+       rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture;
+
+       if (rtex->depth)
+               rctx->have_depth_fb = TRUE;
+
+       if (rtex->depth && !rtex->is_flushing_texture) {
+               r600_texture_depth_flush(&rctx->context, state->cbufs[cb]->texture, TRUE);
+               rtex = rtex->flushed_depth_texture;
+       }
+
+       /* XXX: DX10+ hardware most likely doesn't need any offset hacks. */
+       offset = r600_texture_get_offset(rtex,
+                                        level, state->cbufs[cb]->u.tex.first_layer);
+       pitch = rtex->pitch_in_blocks[level] / 8 - 1;
+       slice = rtex->pitch_in_blocks[level] * surf->aligned_height / 64 - 1;
+       desc = util_format_description(surf->base.format);
+       for (i = 0; i < 4; i++) {
+               if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+                       break;
+               }
+       }
+       if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) {
+               ntype = V_028C70_NUMBER_FLOAT;
+       } else {
+               ntype = V_028C70_NUMBER_UNORM;
+               if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
+                       ntype = V_028C70_NUMBER_SRGB;
+               else if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+                       if (desc->channel[i].normalized)
+                               ntype = V_028C70_NUMBER_SNORM;
+                       else if (desc->channel[i].pure_integer)
+                               ntype = V_028C70_NUMBER_SINT;
+               } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+                       if (desc->channel[i].normalized)
+                               ntype = V_028C70_NUMBER_UNORM;
+                       else if (desc->channel[i].pure_integer)
+                               ntype = V_028C70_NUMBER_UINT;
+               }
+       }
+
+       format = si_translate_colorformat(surf->base.format);
+       swap = si_translate_colorswap(surf->base.format);
+       if (rtex->resource.b.b.b.usage == PIPE_USAGE_STAGING) {
+               endian = V_028C70_ENDIAN_NONE;
+       } else {
+               endian = si_colorformat_endian_swap(format);
+       }
+
+       /* blend clamp should be set for all NORM/SRGB types */
+       if (ntype == V_028C70_NUMBER_UNORM ||
+           ntype == V_028C70_NUMBER_SNORM ||
+           ntype == V_028C70_NUMBER_SRGB)
+               blend_clamp = 1;
+
+       /* Set blend bypass according to the docs for SINT/UINT and
+        * the 8_24/24_8 COLOR variants.
+        */
+       if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
+           format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
+           format == V_028C70_COLOR_X24_8_32_FLOAT) {
+               blend_clamp = 0;
+               blend_bypass = 1;
+       }
+
+       color_info = S_028C70_FORMAT(format) |
+               S_028C70_COMP_SWAP(swap) |
+               //S_028C70_ARRAY_MODE(rtex->array_mode[level]) |
+               S_028C70_BLEND_CLAMP(blend_clamp) |
+               S_028C70_BLEND_BYPASS(blend_bypass) |
+               S_028C70_NUMBER_TYPE(ntype) |
+               S_028C70_ENDIAN(endian);
+
+       color_info |= S_028C70_LINEAR_GENERAL(1);
+
+       rctx->alpha_ref_dirty = true;
+
+       offset += r600_resource_va(rctx->context.screen, state->cbufs[cb]->texture);
+       offset >>= 8;
+
+       /* FIXME: handle enabling of CBs beyond BASE8, which have a different offset. */
+       r600_pipe_state_add_reg(rstate,
+                               R_028C60_CB_COLOR0_BASE + cb * 0x3C,
+                               offset, &rtex->resource, RADEON_USAGE_READWRITE);
+       r600_pipe_state_add_reg(rstate,
+                               R_028C64_CB_COLOR0_PITCH + cb * 0x3C,
+                               S_028C64_TILE_MAX(pitch),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028C68_CB_COLOR0_SLICE + cb * 0x3C,
+                               S_028C68_TILE_MAX(slice),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028C6C_CB_COLOR0_VIEW + cb * 0x3C,
+                               0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028C70_CB_COLOR0_INFO + cb * 0x3C,
+                               color_info, &rtex->resource, RADEON_USAGE_READWRITE);
+       r600_pipe_state_add_reg(rstate,
+                               R_028C74_CB_COLOR0_ATTRIB + cb * 0x3C,
+                               0,
+                               &rtex->resource, RADEON_USAGE_READWRITE);
+}
+
+static void evergreen_db(struct r600_context *rctx, struct r600_pipe_state *rstate,
+                        const struct pipe_framebuffer_state *state)
+{
+       struct r600_resource_texture *rtex;
+       struct r600_surface *surf;
+       unsigned level, first_layer, pitch, slice, format, array_mode;
+       uint64_t offset;
+
+       if (state->zsbuf == NULL) {
+               r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO, 0, NULL, 0);
+               r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO, 0, NULL, 0);
+               return;
+       }
+
+       surf = (struct r600_surface *)state->zsbuf;
+       level = surf->base.u.tex.level;
+       rtex = (struct r600_resource_texture*)surf->base.texture;
+
+       /* XXX remove this once tiling is properly supported */
+       array_mode = 0;/*rtex->array_mode[level] ? rtex->array_mode[level] :
+                        V_028C70_ARRAY_1D_TILED_THIN1;*/
+
+       first_layer = surf->base.u.tex.first_layer;
+       offset = r600_texture_get_offset(rtex, level, first_layer);
+       pitch = rtex->pitch_in_blocks[level] / 8 - 1;
+       slice = rtex->pitch_in_blocks[level] * surf->aligned_height / 64 - 1;
+       format = si_translate_dbformat(rtex->real_format);
+
+       offset += r600_resource_va(rctx->context.screen, surf->base.texture);
+       offset >>= 8;
+
+       r600_pipe_state_add_reg(rstate, R_028048_DB_Z_READ_BASE,
+                               offset, &rtex->resource, RADEON_USAGE_READWRITE);
+       r600_pipe_state_add_reg(rstate, R_028050_DB_Z_WRITE_BASE,
+                               offset, &rtex->resource, RADEON_USAGE_READWRITE);
+       r600_pipe_state_add_reg(rstate, R_028008_DB_DEPTH_VIEW, 0x00000000, NULL, 0);
+
+       if (rtex->stencil) {
+               uint64_t stencil_offset =
+                       r600_texture_get_offset(rtex->stencil, level, first_layer);
+
+               stencil_offset += r600_resource_va(rctx->context.screen, (void*)rtex->stencil);
+               stencil_offset >>= 8;
+
+               r600_pipe_state_add_reg(rstate, R_02804C_DB_STENCIL_READ_BASE,
+                                       stencil_offset, &rtex->stencil->resource, RADEON_USAGE_READWRITE);
+               r600_pipe_state_add_reg(rstate, R_028054_DB_STENCIL_WRITE_BASE,
+                                       stencil_offset, &rtex->stencil->resource, RADEON_USAGE_READWRITE);
+               r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO,
+                                       1, NULL, 0);
+       } else {
+               r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO,
+                                       0, NULL, 0);
+       }
+
+       r600_pipe_state_add_reg(rstate, R_02803C_DB_DEPTH_INFO, 0x1, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO,
+                               /*S_028040_ARRAY_MODE(array_mode) |*/ S_028040_FORMAT(format),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028058_DB_DEPTH_SIZE,
+                               S_028058_PITCH_TILE_MAX(pitch),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_02805C_DB_DEPTH_SLICE,
+                               S_02805C_SLICE_TILE_MAX(slice),
+                               NULL, 0);
+}
+
+static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
+                                       const struct pipe_framebuffer_state *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+       uint32_t shader_mask, tl, br;
+       int tl_x, tl_y, br_x, br_y;
+
+       if (rstate == NULL)
+               return;
+
+       r600_flush_framebuffer(rctx, false);
+
+       /* unreference old buffer and reference new one */
+       rstate->id = R600_PIPE_STATE_FRAMEBUFFER;
+
+       util_copy_framebuffer_state(&rctx->framebuffer, state);
+
+       /* build states */
+       rctx->have_depth_fb = 0;
+       rctx->nr_cbufs = state->nr_cbufs;
+       for (int i = 0; i < state->nr_cbufs; i++) {
+               evergreen_cb(rctx, rstate, state, i);
+       }
+       evergreen_db(rctx, rstate, state);
+
+       shader_mask = 0;
+       for (int i = 0; i < state->nr_cbufs; i++) {
+               shader_mask |= 0xf << (i * 4);
+       }
+       tl_x = 0;
+       tl_y = 0;
+       br_x = state->width;
+       br_y = state->height;
+#if 0 /* These shouldn't be necessary on SI, see PA_SC_ENHANCE register */
+       /* EG hw workaround */
+       if (br_x == 0)
+               tl_x = 1;
+       if (br_y == 0)
+               tl_y = 1;
+       /* cayman hw workaround */
+       if (rctx->chip_class == CAYMAN) {
+               if (br_x == 1 && br_y == 1)
+                       br_x = 2;
+       }
+#endif
+       tl = S_028240_TL_X(tl_x) | S_028240_TL_Y(tl_y);
+       br = S_028244_BR_X(br_x) | S_028244_BR_Y(br_y);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_028240_PA_SC_GENERIC_SCISSOR_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028244_PA_SC_GENERIC_SCISSOR_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028250_PA_SC_VPORT_SCISSOR_0_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028254_PA_SC_VPORT_SCISSOR_0_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028030_PA_SC_SCREEN_SCISSOR_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028034_PA_SC_SCREEN_SCISSOR_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028204_PA_SC_WINDOW_SCISSOR_TL, tl,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028208_PA_SC_WINDOW_SCISSOR_BR, br,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028200_PA_SC_WINDOW_OFFSET, 0x00000000,
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028230_PA_SC_EDGERULE, 0xAAAAAAAA,
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_02823C_CB_SHADER_MASK,
+                               shader_mask, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028BE0_PA_SC_AA_CONFIG,
+                               0x00000000, NULL, 0);
+
+       free(rctx->states[R600_PIPE_STATE_FRAMEBUFFER]);
+       rctx->states[R600_PIPE_STATE_FRAMEBUFFER] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+
+       if (state->zsbuf) {
+               cayman_polygon_offset_update(rctx);
+       }
+}
+
+void cayman_init_state_functions(struct r600_context *rctx)
+{
+       rctx->context.create_blend_state = evergreen_create_blend_state;
+       rctx->context.create_depth_stencil_alpha_state = evergreen_create_dsa_state;
+       rctx->context.create_fs_state = si_create_shader_state;
+       rctx->context.create_rasterizer_state = evergreen_create_rs_state;
+       rctx->context.create_sampler_state = si_create_sampler_state;
+       rctx->context.create_sampler_view = evergreen_create_sampler_view;
+       rctx->context.create_vertex_elements_state = si_create_vertex_elements;
+       rctx->context.create_vs_state = si_create_shader_state;
+       rctx->context.bind_blend_state = r600_bind_blend_state;
+       rctx->context.bind_depth_stencil_alpha_state = r600_bind_dsa_state;
+       rctx->context.bind_fragment_sampler_states = evergreen_bind_ps_sampler;
+       rctx->context.bind_fs_state = r600_bind_ps_shader;
+       rctx->context.bind_rasterizer_state = r600_bind_rs_state;
+       rctx->context.bind_vertex_elements_state = r600_bind_vertex_elements;
+       rctx->context.bind_vertex_sampler_states = evergreen_bind_vs_sampler;
+       rctx->context.bind_vs_state = r600_bind_vs_shader;
+       rctx->context.delete_blend_state = r600_delete_state;
+       rctx->context.delete_depth_stencil_alpha_state = r600_delete_state;
+       rctx->context.delete_fs_state = r600_delete_ps_shader;
+       rctx->context.delete_rasterizer_state = r600_delete_rs_state;
+       rctx->context.delete_sampler_state = si_delete_sampler_state;
+       rctx->context.delete_vertex_elements_state = r600_delete_vertex_element;
+       rctx->context.delete_vs_state = r600_delete_vs_shader;
+       rctx->context.set_blend_color = evergreen_set_blend_color;
+       rctx->context.set_clip_state = evergreen_set_clip_state;
+       rctx->context.set_constant_buffer = r600_set_constant_buffer;
+       rctx->context.set_fragment_sampler_views = evergreen_set_ps_sampler_view;
+       rctx->context.set_framebuffer_state = evergreen_set_framebuffer_state;
+       rctx->context.set_polygon_stipple = evergreen_set_polygon_stipple;
+       rctx->context.set_sample_mask = evergreen_set_sample_mask;
+       rctx->context.set_scissor_state = evergreen_set_scissor_state;
+       rctx->context.set_stencil_ref = r600_set_pipe_stencil_ref;
+       rctx->context.set_vertex_buffers = r600_set_vertex_buffers;
+       rctx->context.set_index_buffer = r600_set_index_buffer;
+       rctx->context.set_vertex_sampler_views = evergreen_set_vs_sampler_view;
+       rctx->context.set_viewport_state = evergreen_set_viewport_state;
+       rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
+       rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
+       rctx->context.texture_barrier = r600_texture_barrier;
+       rctx->context.create_stream_output_target = r600_create_so_target;
+       rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+       rctx->context.set_stream_output_targets = r600_set_so_targets;
+}
+
+void si_init_config(struct r600_context *rctx)
+{
+       struct r600_pipe_state *rstate = &rctx->config;
+       unsigned tmp;
+
+       r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL_1, 0x0, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028A10_VGT_OUTPUT_PATH_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A14_VGT_HOS_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A20_VGT_HOS_REUSE_DEPTH, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A24_VGT_GROUP_PRIM_TYPE, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A28_VGT_GROUP_FIRST_DECR, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A2C_VGT_GROUP_DECR, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A30_VGT_GROUP_VECT_0_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A34_VGT_GROUP_VECT_1_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A38_VGT_GROUP_VECT_0_FMT_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A40_VGT_GS_MODE, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A84_VGT_PRIMITIVEID_EN, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028B94_VGT_STRMOUT_CONFIG, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AA8_IA_MULTI_VGT_PARAM, S_028AA8_SWITCH_ON_EOP(1) | S_028AA8_PARTIAL_VS_WAVE_ON(1) | S_028AA8_PRIMGROUP_SIZE(63), NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AB4_VGT_REUSE_OFF, 0x00000000, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028AB8_VGT_VTX_CNT_EN, 0x0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL, 0x0, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028B54_VGT_SHADER_STAGES_EN, 0, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210, NULL, 0);
+       r600_pipe_state_add_reg(rstate, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98, NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_028804_DB_EQAA, 0x110000, NULL, 0);
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+void cayman_polygon_offset_update(struct r600_context *rctx)
+{
+       struct r600_pipe_state state;
+
+       state.id = R600_PIPE_STATE_POLYGON_OFFSET;
+       state.nregs = 0;
+       if (rctx->rasterizer && rctx->framebuffer.zsbuf) {
+               float offset_units = rctx->rasterizer->offset_units;
+               unsigned offset_db_fmt_cntl = 0, depth;
+
+               switch (rctx->framebuffer.zsbuf->texture->format) {
+               case PIPE_FORMAT_Z24X8_UNORM:
+               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+                       depth = -24;
+                       offset_units *= 2.0f;
+                       break;
+               case PIPE_FORMAT_Z32_FLOAT:
+               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+                       depth = -23;
+                       offset_units *= 1.0f;
+                       offset_db_fmt_cntl |= S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
+                       break;
+               case PIPE_FORMAT_Z16_UNORM:
+                       depth = -16;
+                       offset_units *= 4.0f;
+                       break;
+               default:
+                       return;
+               }
+               /* FIXME: some of these registers can be computed from the cso. */
+               offset_db_fmt_cntl |= S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(depth);
+               r600_pipe_state_add_reg(&state,
+                               R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE,
+                               fui(rctx->rasterizer->offset_scale), NULL, 0);
+               r600_pipe_state_add_reg(&state,
+                               R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET,
+                               fui(offset_units), NULL, 0);
+               r600_pipe_state_add_reg(&state,
+                               R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE,
+                               fui(rctx->rasterizer->offset_scale), NULL, 0);
+               r600_pipe_state_add_reg(&state,
+                               R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET,
+                               fui(offset_units), NULL, 0);
+               r600_pipe_state_add_reg(&state,
+                               R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
+                               offset_db_fmt_cntl, NULL, 0);
+               r600_context_pipe_state_set(rctx, &state);
+       }
+}
+
+void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = &shader->rstate;
+       struct r600_shader *rshader = &shader->shader;
+       unsigned i, exports_ps, num_cout, spi_ps_in_control, db_shader_control;
+       int pos_index = -1, face_index = -1;
+       int ninterp = 0;
+       boolean have_linear = FALSE, have_centroid = FALSE, have_perspective = FALSE;
+       unsigned spi_baryc_cntl;
+       uint64_t va;
+
+       if (si_pipe_shader_create(ctx, shader))
+               return;
+
+       rstate->nregs = 0;
+
+       db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
+       for (i = 0; i < rshader->ninput; i++) {
+               /* evergreen NUM_INTERP only counts values interpolated into the
+                  LDS; POSITION goes via GPRs from the SC, so it isn't counted */
+               if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
+                       pos_index = i;
+               else if (rshader->input[i].name == TGSI_SEMANTIC_FACE)
+                       face_index = i;
+               else {
+                       ninterp++;
+                       if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
+                               have_linear = TRUE;
+                       if (rshader->input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
+                               have_perspective = TRUE;
+                       if (rshader->input[i].centroid)
+                               have_centroid = TRUE;
+               }
+       }
+
+       for (i = 0; i < rshader->noutput; i++) {
+               if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
+                       db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1);
+               if (rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
+                       db_shader_control |= 0; /* XXX: OP_VAL or TEST_VAL? */
+       }
+       if (rshader->uses_kill)
+               db_shader_control |= S_02880C_KILL_ENABLE(1);
+
+       exports_ps = 0;
+       num_cout = 0;
+       for (i = 0; i < rshader->noutput; i++) {
+               if (rshader->output[i].name == TGSI_SEMANTIC_POSITION ||
+                   rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
+                       exports_ps |= 1;
+               else if (rshader->output[i].name == TGSI_SEMANTIC_COLOR) {
+                       if (rshader->fs_write_all)
+                               num_cout = rshader->nr_cbufs;
+                       else
+                               num_cout++;
+               }
+       }
+       if (!exports_ps) {
+               /* always export at least 1 component per pixel */
+               exports_ps = 2;
+       }
+
+       if (ninterp == 0) {
+               ninterp = 1;
+               have_perspective = TRUE;
+       }
+
+       if (!have_perspective && !have_linear)
+               have_perspective = TRUE;
+
+       spi_ps_in_control = S_0286D8_NUM_INTERP(ninterp);
+
+       spi_baryc_cntl = 0;
+       if (have_perspective)
+               spi_baryc_cntl |= have_centroid ?
+                       S_0286E0_PERSP_CENTROID_CNTL(1) : S_0286E0_PERSP_CENTER_CNTL(1);
+       if (have_linear)
+               spi_baryc_cntl |= have_centroid ?
+                       S_0286E0_LINEAR_CENTROID_CNTL(1) : S_0286E0_LINEAR_CENTER_CNTL(1);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_0286E0_SPI_BARYC_CNTL,
+                               spi_baryc_cntl,
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_0286CC_SPI_PS_INPUT_ENA,
+                               shader->spi_ps_input_ena,
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_0286D0_SPI_PS_INPUT_ADDR,
+                               shader->spi_ps_input_ena,
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_0286D8_SPI_PS_IN_CONTROL,
+                               spi_ps_in_control,
+                               NULL, 0);
+
+       /* XXX: Depends on Z buffer format? */
+       r600_pipe_state_add_reg(rstate,
+                               R_028710_SPI_SHADER_Z_FORMAT,
+                               0,
+                               NULL, 0);
+
+       /* XXX: Depends on color buffer format? */
+       r600_pipe_state_add_reg(rstate,
+                               R_028714_SPI_SHADER_COL_FORMAT,
+                               S_028714_COL0_EXPORT_FORMAT(V_028714_SPI_SHADER_32_ABGR),
+                               NULL, 0);
+
+       va = r600_resource_va(ctx->screen, (void *)shader->bo);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B020_SPI_SHADER_PGM_LO_PS,
+                               va >> 8,
+                               shader->bo, RADEON_USAGE_READ);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B024_SPI_SHADER_PGM_HI_PS,
+                               va >> 40,
+                               shader->bo, RADEON_USAGE_READ);
+
+       /* Last 2 reserved SGPRs are used for VCC */
+       /* XXX: Hard-coding 2 SGPRs for constant buffer */
+       r600_pipe_state_add_reg(rstate,
+                               R_00B028_SPI_SHADER_PGM_RSRC1_PS,
+                               S_00B028_VGPRS(shader->num_vgprs / 4) |
+                               S_00B028_SGPRS((shader->num_sgprs + 2 + 2 + 1) / 8),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
+                               S_00B02C_USER_SGPR(6),
+                               NULL, 0);
+
+       r600_pipe_state_add_reg(rstate, R_02880C_DB_SHADER_CONTROL,
+                               db_shader_control,
+                               NULL, 0);
+
+       shader->sprite_coord_enable = rctx->sprite_coord_enable;
+}
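The PGM_RSRC1_PS packing above allocates VGPRs in granules of 4 and SGPRs in granules of 8, with 2 SGPRs reserved for VCC and (per the XXX note) 2 more hard-coded for the constant buffer. A minimal standalone sketch of that arithmetic — the helper names are hypothetical, not driver API:

```c
#include <assert.h>

/* Hypothetical helpers mirroring the RSRC1 expressions in si_pipe_shader_ps():
 * VGPRs are encoded in granules of 4, SGPRs in granules of 8; the +2 +2 +1
 * accounts for VCC, the hard-coded constant-buffer SGPRs, and rounding. */
static unsigned ps_rsrc1_vgprs(unsigned num_vgprs)
{
	return num_vgprs / 4;
}

static unsigned ps_rsrc1_sgprs(unsigned num_sgprs)
{
	return (num_sgprs + 2 + 2 + 1) / 8;
}
```

Note that the VS variant below reserves one more SGPR (+2 +2 +2); whether that difference is intentional is not stated in the diff.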
+
+void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = &shader->rstate;
+       struct r600_shader *rshader = &shader->shader;
+       unsigned nparams, i;
+       uint64_t va;
+
+       if (si_pipe_shader_create(ctx, shader))
+               return;
+
+       /* clear previous registers */
+       rstate->nregs = 0;
+
+       /* Certain attributes (position, psize, etc.) don't count as params.
+        * VS is required to export at least one param and r600_shader_from_tgsi()
+        * takes care of adding a dummy export.
+        */
+       for (nparams = 0, i = 0 ; i < rshader->noutput; i++) {
+               if (rshader->output[i].name != TGSI_SEMANTIC_POSITION)
+                       nparams++;
+       }
+       if (nparams < 1)
+               nparams = 1;
+
+       r600_pipe_state_add_reg(rstate,
+                       R_0286C4_SPI_VS_OUT_CONFIG,
+                       S_0286C4_VS_EXPORT_COUNT(nparams - 1),
+                       NULL, 0);
+
+       r600_pipe_state_add_reg(rstate,
+                               R_02870C_SPI_SHADER_POS_FORMAT,
+                               S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+                               S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+                               S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
+                               S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP),
+                               NULL, 0);
+
+       va = r600_resource_va(ctx->screen, (void *)shader->bo);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B120_SPI_SHADER_PGM_LO_VS,
+                               va >> 8,
+                               shader->bo, RADEON_USAGE_READ);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B124_SPI_SHADER_PGM_HI_VS,
+                               va >> 40,
+                               shader->bo, RADEON_USAGE_READ);
+
+       /* Last 2 reserved SGPRs are used for VCC */
+       /* XXX: Hard-coding 2 SGPRs for constant buffer */
+       r600_pipe_state_add_reg(rstate,
+                               R_00B128_SPI_SHADER_PGM_RSRC1_VS,
+                               S_00B128_VGPRS(shader->num_vgprs / 4) |
+                               S_00B128_SGPRS((shader->num_sgprs + 2 + 2 + 2) / 8),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
+                               S_00B12C_USER_SGPR(2 + 2),
+                               NULL, 0);
+}
+
+void si_update_spi_map(struct r600_context *rctx)
+{
+       struct r600_shader *ps = &rctx->ps_shader->shader;
+       struct r600_shader *vs = &rctx->vs_shader->shader;
+       struct r600_pipe_state *rstate = &rctx->spi;
+       unsigned i, j, tmp;
+
+       rstate->nregs = 0;
+
+       for (i = 0; i < ps->ninput; i++) {
+               tmp = 0;
+
+               if (ps->input[i].name == TGSI_SEMANTIC_COLOR ||
+                   ps->input[i].name == TGSI_SEMANTIC_BCOLOR ||
+                   ps->input[i].name == TGSI_SEMANTIC_POSITION) {
+                       tmp |= S_028644_FLAT_SHADE(1);
+               }
+
+               if (ps->input[i].name == TGSI_SEMANTIC_GENERIC &&
+                   rctx->sprite_coord_enable & (1 << ps->input[i].sid)) {
+                       tmp |= S_028644_PT_SPRITE_TEX(1);
+               }
+
+               for (j = 0; j < vs->noutput; j++) {
+                       if (ps->input[i].name == vs->output[j].name &&
+                           ps->input[i].sid == vs->output[j].sid) {
+                               tmp |= S_028644_OFFSET(ps->input[i].sid);
+                               break;
+                       }
+               }
+
+               if (j == vs->noutput) {
+                       /* No corresponding output found, load defaults into input */
+                       tmp |= S_028644_OFFSET(0x20);
+               }
+
+               r600_pipe_state_add_reg(rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4,
+                                       tmp, NULL, 0);
+       }
+
+       if (rstate->nregs > 0)
+               r600_context_pipe_state_set(rctx, rstate);
+}
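The matching loop above pairs each PS input with a VS output by (name, sid) and falls back to offset 0x20 when nothing matches, which tells the hardware to load defaults. A standalone sketch of just that lookup — the types and names here are illustrative, not the driver's:

```c
#include <assert.h>

#define SPI_DEFAULT_OFFSET 0x20	/* no VS output found: hardware loads defaults */

struct sem {
	unsigned name;	/* TGSI semantic name */
	unsigned sid;	/* semantic index */
};

/* Return the SPI_PS_INPUT_CNTL offset for one PS input: the input's sid if a
 * VS output with the same (name, sid) exists, else SPI_DEFAULT_OFFSET. */
static unsigned spi_input_offset(struct sem ps_in,
				 const struct sem *vs_out, unsigned noutput)
{
	unsigned j;

	for (j = 0; j < noutput; j++) {
		if (vs_out[j].name == ps_in.name && vs_out[j].sid == ps_in.sid)
			return ps_in.sid;
	}
	return SPI_DEFAULT_OFFSET;
}
```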
+
+void *cayman_create_db_flush_dsa(struct r600_context *rctx)
+{
+       struct pipe_depth_stencil_alpha_state dsa;
+       struct r600_pipe_state *rstate;
+
+       memset(&dsa, 0, sizeof(dsa));
+
+       rstate = rctx->context.create_depth_stencil_alpha_state(&rctx->context, &dsa);
+       r600_pipe_state_add_reg(rstate,
+                               R_028000_DB_RENDER_CONTROL,
+                               S_028000_DEPTH_COPY(1) |
+                               S_028000_STENCIL_COPY(1) |
+                               S_028000_COPY_CENTROID(1),
+                               NULL, 0);
+       return rstate;
+}
diff --git a/src/gallium/drivers/radeonsi/r600.h b/src/gallium/drivers/radeonsi/r600.h
new file mode 100644 (file)
index 0000000..56915ab
--- /dev/null
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef R600_H
+#define R600_H
+
+#include "../../winsys/radeon/drm/radeon_winsys.h"
+#include "util/u_double_list.h"
+#include "util/u_vbuf.h"
+
+#define R600_ERR(fmt, args...) \
+       fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args)
+
+struct winsys_handle;
+
+enum radeon_family {
+       CHIP_UNKNOWN,
+       CHIP_CAYMAN,
+       CHIP_TAHITI,
+       CHIP_PITCAIRN,
+       CHIP_VERDE,
+       CHIP_LAST,
+};
+
+enum chip_class {
+       CAYMAN,
+       TAHITI,
+};
+
+struct r600_tiling_info {
+       unsigned num_channels;
+       unsigned num_banks;
+       unsigned group_bytes;
+};
+
+struct r600_resource {
+       struct u_vbuf_resource          b;
+
+       /* Winsys objects. */
+       struct pb_buffer                *buf;
+       struct radeon_winsys_cs_handle  *cs_buf;
+
+       /* Resource state. */
+       unsigned                        domains;
+};
+
+/* R600/R700 STATES */
+#define R600_GROUP_MAX                 16
+#define R600_BLOCK_MAX_BO              32
+#define R600_BLOCK_MAX_REG             128
+
+/* each range covers 9 bits of dword space = 512 dwords = 2k bytes */
+/* there is a block entry for each register so 512 blocks */
+/* we have no registers to read/write below 0x8000 (0x2000 in dw space) */
+/* we use some fake offsets at 0x40000 for evergreen sampler borders, so take 0x42000 as a max bound */
+#define RANGE_OFFSET_START 0x8000
+#define HASH_SHIFT 9
+#define NUM_RANGES (0x42000 - RANGE_OFFSET_START) / (4 << HASH_SHIFT) /* 128 << 9 = 64k */
+
+#define CTX_RANGE_ID(offset) ((((offset - RANGE_OFFSET_START) >> 2) >> HASH_SHIFT) & 255)
+#define CTX_BLOCK_ID(offset) (((offset - RANGE_OFFSET_START) >> 2) & ((1 << HASH_SHIFT) - 1))
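The two macros above hash a register offset into a (range, block) pair: subtract the 0x8000 base, convert bytes to dwords, then split the dword index into a 9-bit block index and a range index masked to 255. A standalone sketch of the same arithmetic (function names are illustrative):

```c
#include <assert.h>
#include <stdint.h>

#define RANGE_OFFSET_START 0x8000
#define HASH_SHIFT 9	/* 512 dwords (2 KiB of register space) per range */

/* Which 512-dword range a register offset falls into. */
static unsigned ctx_range_id(uint32_t offset)
{
	return (((offset - RANGE_OFFSET_START) >> 2) >> HASH_SHIFT) & 255;
}

/* The register's dword index within its range. */
static unsigned ctx_block_id(uint32_t offset)
{
	return ((offset - RANGE_OFFSET_START) >> 2) & ((1 << HASH_SHIFT) - 1);
}
```

For example, R_028644_SPI_PS_INPUT_CNTL_0 (0x28644) lands in range 64, block 401.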
+
+struct r600_pipe_reg {
+       uint32_t                        value;
+       struct r600_block               *block;
+       struct r600_resource            *bo;
+       enum radeon_bo_usage            bo_usage;
+       uint32_t                        id;
+};
+
+struct r600_pipe_state {
+       unsigned                        id;
+       unsigned                        nregs;
+       struct r600_pipe_reg            regs[R600_BLOCK_MAX_REG];
+};
+
+#define R600_BLOCK_STATUS_ENABLED      (1 << 0)
+#define R600_BLOCK_STATUS_DIRTY                (1 << 1)
+
+struct r600_block_reloc {
+       struct r600_resource    *bo;
+       enum radeon_bo_usage    bo_usage;
+       unsigned                bo_pm4_index;
+};
+
+struct r600_block {
+       struct list_head        list;
+       struct list_head        enable_list;
+       unsigned                status;
+       unsigned                flags;
+       unsigned                start_offset;
+       unsigned                pm4_ndwords;
+       unsigned                nbo;
+       uint16_t                nreg;
+       uint16_t                nreg_dirty;
+       uint32_t                *reg;
+       uint32_t                pm4[R600_BLOCK_MAX_REG];
+       unsigned                pm4_bo_index[R600_BLOCK_MAX_REG];
+       struct r600_block_reloc reloc[R600_BLOCK_MAX_BO];
+};
+
+struct r600_range {
+       struct r600_block       **blocks;
+};
+
+struct r600_query {
+       union {
+               uint64_t                        u64;
+               boolean                         b;
+               struct pipe_query_data_so_statistics so;
+       } result;
+       /* The kind of query */
+       unsigned                                type;
+       /* Offset of the first result for current query */
+       unsigned                                results_start;
+       /* Offset of the next free result after current query data */
+       unsigned                                results_end;
+       /* Size of the result in memory for both begin_query and end_query;
+        * this can be one or two numbers, or even the size of a structure. */
+       unsigned                                result_size;
+       /* The buffer where query results are stored. It's used as a ring:
+        * data blocks for the current query are stored sequentially from
+        * results_start to results_end, wrapping at the end of the buffer. */
+       struct r600_resource                    *buffer;
+       /* The number of dwords for begin_query or end_query. */
+       unsigned                                num_cs_dw;
+       /* linked list of queries */
+       struct list_head                        list;
+};
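The comments in r600_query describe the results buffer as a ring: blocks for the current query occupy results_start..results_end, and the end offset wraps at the buffer size. A tiny sketch of that bookkeeping (the helper is illustrative, not part of the driver):

```c
#include <assert.h>

/* Advance the write offset of a query results ring by one result block,
 * wrapping at the end of the buffer (all sizes in bytes). */
static unsigned ring_advance(unsigned results_end, unsigned result_size,
			     unsigned buffer_size)
{
	return (results_end + result_size) % buffer_size;
}
```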
+
+struct r600_so_target {
+       struct pipe_stream_output_target b;
+
+       /* The buffer where BUFFER_FILLED_SIZE is stored. */
+       struct r600_resource    *filled_size;
+       unsigned                stride;
+       unsigned                so_index;
+};
+
+#define R600_CONTEXT_DRAW_PENDING      (1 << 0)
+#define R600_CONTEXT_DST_CACHES_DIRTY  (1 << 1)
+#define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2)
+
+struct r600_draw {
+       uint32_t                vgt_num_indices;
+       uint32_t                vgt_num_instances;
+       uint32_t                vgt_index_type;
+       uint32_t                vgt_draw_initiator;
+       uint32_t                indices_bo_offset;
+       unsigned                db_render_override;
+       unsigned                db_render_control;
+       struct r600_resource    *indices;
+};
+
+struct r600_context;
+struct r600_screen;
+
+void r600_get_backend_mask(struct r600_context *ctx);
+void r600_context_fini(struct r600_context *ctx);
+void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state);
+void r600_context_flush(struct r600_context *ctx, unsigned flags);
+void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
+
+struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type);
+void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query);
+boolean r600_context_query_result(struct r600_context *ctx,
+                               struct r600_query *query,
+                               boolean wait, void *vresult);
+void r600_query_begin(struct r600_context *ctx, struct r600_query *query);
+void r600_query_end(struct r600_context *ctx, struct r600_query *query);
+void r600_context_queries_suspend(struct r600_context *ctx);
+void r600_context_queries_resume(struct r600_context *ctx);
+void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
+                           int flag_wait);
+void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence,
+                             unsigned offset, unsigned value);
+void r600_inval_shader_cache(struct r600_context *ctx);
+void r600_inval_texture_cache(struct r600_context *ctx);
+void r600_inval_vertex_cache(struct r600_context *ctx);
+void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now);
+
+void r600_context_streamout_begin(struct r600_context *ctx);
+void r600_context_streamout_end(struct r600_context *ctx);
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t);
+void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in);
+void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block);
+void r600_context_block_resource_emit_dirty(struct r600_context *ctx, struct r600_block *block);
+
+int si_context_init(struct r600_context *ctx);
+void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
+
+void _r600_pipe_state_add_reg(struct r600_context *ctx,
+                             struct r600_pipe_state *state,
+                             uint32_t offset, uint32_t value,
+                             uint32_t range_id, uint32_t block_id,
+                             struct r600_resource *bo,
+                             enum radeon_bo_usage usage);
+
+void r600_pipe_state_add_reg_noblock(struct r600_pipe_state *state,
+                                    uint32_t offset, uint32_t value,
+                                    struct r600_resource *bo,
+                                    enum radeon_bo_usage usage);
+
+#define r600_pipe_state_add_reg(state, offset, value, bo, usage) _r600_pipe_state_add_reg(rctx, state, offset, value, CTX_RANGE_ID(offset), CTX_BLOCK_ID(offset), bo, usage)
+
+static inline void r600_pipe_state_mod_reg(struct r600_pipe_state *state,
+                                          uint32_t value)
+{
+       state->regs[state->nregs].value = value;
+       state->nregs++;
+}
+
+static inline void r600_pipe_state_mod_reg_bo(struct r600_pipe_state *state,
+                                             uint32_t value, struct r600_resource *bo,
+                                             enum radeon_bo_usage usage)
+{
+       state->regs[state->nregs].value = value;
+       state->regs[state->nregs].bo = bo;
+       state->regs[state->nregs].bo_usage = usage;
+       state->nregs++;
+}
+
+#endif
diff --git a/src/gallium/drivers/radeonsi/r600_blit.c b/src/gallium/drivers/radeonsi/r600_blit.c
new file mode 100644 (file)
index 0000000..6515808
--- /dev/null
@@ -0,0 +1,379 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "util/u_surface.h"
+#include "util/u_blitter.h"
+#include "util/u_format.h"
+#include "radeonsi_pipe.h"
+
+enum r600_blitter_op /* bitmask */
+{
+       R600_SAVE_TEXTURES      = 1,
+       R600_SAVE_FRAMEBUFFER   = 2,
+       R600_DISABLE_RENDER_COND = 4,
+
+       R600_CLEAR         = 0,
+
+       R600_CLEAR_SURFACE = R600_SAVE_FRAMEBUFFER,
+
+       R600_COPY          = R600_SAVE_FRAMEBUFFER | R600_SAVE_TEXTURES |
+                            R600_DISABLE_RENDER_COND,
+
+       R600_DECOMPRESS    = R600_SAVE_FRAMEBUFFER | R600_DISABLE_RENDER_COND,
+};
+
+static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       r600_context_queries_suspend(rctx);
+
+       util_blitter_save_blend(rctx->blitter, rctx->states[R600_PIPE_STATE_BLEND]);
+       util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->states[R600_PIPE_STATE_DSA]);
+       if (rctx->states[R600_PIPE_STATE_STENCIL_REF]) {
+               util_blitter_save_stencil_ref(rctx->blitter, &rctx->stencil_ref);
+       }
+       util_blitter_save_rasterizer(rctx->blitter, rctx->states[R600_PIPE_STATE_RASTERIZER]);
+       util_blitter_save_fragment_shader(rctx->blitter, rctx->ps_shader);
+       util_blitter_save_vertex_shader(rctx->blitter, rctx->vs_shader);
+       util_blitter_save_vertex_elements(rctx->blitter, rctx->vertex_elements);
+       if (rctx->states[R600_PIPE_STATE_VIEWPORT]) {
+               util_blitter_save_viewport(rctx->blitter, &rctx->viewport);
+       }
+       util_blitter_save_vertex_buffers(rctx->blitter,
+                                        rctx->vbuf_mgr->nr_vertex_buffers,
+                                        rctx->vbuf_mgr->vertex_buffer);
+       util_blitter_save_so_targets(rctx->blitter, rctx->num_so_targets,
+                                    (struct pipe_stream_output_target**)rctx->so_targets);
+
+       if (op & R600_SAVE_FRAMEBUFFER)
+               util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer);
+
+       if (op & R600_SAVE_TEXTURES) {
+               util_blitter_save_fragment_sampler_states(
+                       rctx->blitter, rctx->ps_samplers.n_samplers,
+                       (void**)rctx->ps_samplers.samplers);
+
+               util_blitter_save_fragment_sampler_views(
+                       rctx->blitter, rctx->ps_samplers.n_views,
+                       (struct pipe_sampler_view**)rctx->ps_samplers.views);
+       }
+
+       if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) {
+               rctx->saved_render_cond = rctx->current_render_cond;
+               rctx->saved_render_cond_mode = rctx->current_render_cond_mode;
+               rctx->context.render_condition(&rctx->context, NULL, 0);
+       }
+}
+
+static void r600_blitter_end(struct pipe_context *ctx)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       if (rctx->saved_render_cond) {
+               rctx->context.render_condition(&rctx->context,
+                                              rctx->saved_render_cond,
+                                              rctx->saved_render_cond_mode);
+               rctx->saved_render_cond = NULL;
+       }
+       r600_context_queries_resume(rctx);
+}
+
+static unsigned u_num_layers(struct pipe_resource *r, unsigned level)
+{
+       switch (r->target) {
+       case PIPE_TEXTURE_CUBE:
+               return 6;
+       case PIPE_TEXTURE_3D:
+               return u_minify(r->depth0, level);
+       case PIPE_TEXTURE_1D_ARRAY:
+               return r->array_size;
+       case PIPE_TEXTURE_2D_ARRAY:
+               return r->array_size;
+       default:
+               return 1;
+       }
+}
+
+void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       unsigned layer, level;
+       float depth = 1.0f;
+
+       if (!texture->dirty_db)
+               return;
+
+       for (level = 0; level <= texture->resource.b.b.b.last_level; level++) {
+               unsigned num_layers = u_num_layers(&texture->resource.b.b.b, level);
+
+               for (layer = 0; layer < num_layers; layer++) {
+                       struct pipe_surface *zsurf, *cbsurf, surf_tmpl;
+
+                       surf_tmpl.format = texture->real_format;
+                       surf_tmpl.u.tex.level = level;
+                       surf_tmpl.u.tex.first_layer = layer;
+                       surf_tmpl.u.tex.last_layer = layer;
+                       surf_tmpl.usage = PIPE_BIND_DEPTH_STENCIL;
+
+                       zsurf = ctx->create_surface(ctx, &texture->resource.b.b.b, &surf_tmpl);
+
+                       surf_tmpl.format = texture->flushed_depth_texture->real_format;
+                       surf_tmpl.usage = PIPE_BIND_RENDER_TARGET;
+                       cbsurf = ctx->create_surface(ctx,
+                                       (struct pipe_resource*)texture->flushed_depth_texture, &surf_tmpl);
+
+                       r600_blitter_begin(ctx, R600_DECOMPRESS);
+                       util_blitter_custom_depth_stencil(rctx->blitter, zsurf, cbsurf, rctx->custom_dsa_flush, depth);
+                       r600_blitter_end(ctx);
+
+                       pipe_surface_reference(&zsurf, NULL);
+                       pipe_surface_reference(&cbsurf, NULL);
+               }
+       }
+
+       texture->dirty_db = FALSE;
+}
+
+void r600_flush_depth_textures(struct r600_context *rctx)
+{
+       unsigned int i;
+
+       /* FIXME: This handles fragment shader textures only. */
+
+       for (i = 0; i < rctx->ps_samplers.n_views; ++i) {
+               struct si_pipe_sampler_view *view;
+               struct r600_resource_texture *tex;
+
+               view = rctx->ps_samplers.views[i];
+               if (!view) continue;
+
+               tex = (struct r600_resource_texture *)view->base.texture;
+               if (!tex->depth)
+                       continue;
+
+               if (tex->is_flushing_texture)
+                       continue;
+
+               r600_blit_uncompress_depth(&rctx->context, tex);
+       }
+
+       /* also check CB here */
+       for (i = 0; i < rctx->framebuffer.nr_cbufs; i++) {
+               struct r600_resource_texture *tex;
+               tex = (struct r600_resource_texture *)rctx->framebuffer.cbufs[i]->texture;
+
+               if (!tex->depth)
+                       continue;
+
+               if (tex->is_flushing_texture)
+                       continue;
+
+               r600_blit_uncompress_depth(&rctx->context, tex);
+       }
+}
+
+static void r600_clear(struct pipe_context *ctx, unsigned buffers,
+                      const union pipe_color_union *color,
+                      double depth, unsigned stencil)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct pipe_framebuffer_state *fb = &rctx->framebuffer;
+
+       r600_blitter_begin(ctx, R600_CLEAR);
+       util_blitter_clear(rctx->blitter, fb->width, fb->height,
+                          fb->nr_cbufs, buffers, fb->nr_cbufs ? fb->cbufs[0]->format : PIPE_FORMAT_NONE,
+                          color, depth, stencil);
+       r600_blitter_end(ctx);
+}
+
+static void r600_clear_render_target(struct pipe_context *ctx,
+                                    struct pipe_surface *dst,
+                                    const union pipe_color_union *color,
+                                    unsigned dstx, unsigned dsty,
+                                    unsigned width, unsigned height)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
+       util_blitter_clear_render_target(rctx->blitter, dst, color,
+                                        dstx, dsty, width, height);
+       r600_blitter_end(ctx);
+}
+
+static void r600_clear_depth_stencil(struct pipe_context *ctx,
+                                    struct pipe_surface *dst,
+                                    unsigned clear_flags,
+                                    double depth,
+                                    unsigned stencil,
+                                    unsigned dstx, unsigned dsty,
+                                    unsigned width, unsigned height)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
+       util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil,
+                                        dstx, dsty, width, height);
+       r600_blitter_end(ctx);
+}
+
+
+/* Copy a block of pixels from one surface to another using HW. */
+static void r600_hw_copy_region(struct pipe_context *ctx,
+                               struct pipe_resource *dst,
+                               unsigned dst_level,
+                               unsigned dstx, unsigned dsty, unsigned dstz,
+                               struct pipe_resource *src,
+                               unsigned src_level,
+                               const struct pipe_box *src_box)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       r600_blitter_begin(ctx, R600_COPY);
+       util_blitter_copy_texture(rctx->blitter, dst, dst_level, dstx, dsty, dstz,
+                                 src, src_level, src_box, TRUE);
+       r600_blitter_end(ctx);
+}
+
+struct texture_orig_info {
+       unsigned format;
+       unsigned width0;
+       unsigned height0;
+};
+
+static void r600_compressed_to_blittable(struct pipe_resource *tex,
+                                  unsigned level,
+                                  struct texture_orig_info *orig)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)tex;
+       unsigned pixsize = util_format_get_blocksize(rtex->real_format);
+       int new_format;
+       int new_height, new_width;
+
+       orig->format = tex->format;
+       orig->width0 = tex->width0;
+       orig->height0 = tex->height0;
+
+       if (pixsize == 8)
+               new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
+       else
+               new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
+
+       new_width = util_format_get_nblocksx(tex->format, orig->width0);
+       new_height = util_format_get_nblocksy(tex->format, orig->height0);
+
+       tex->width0 = new_width;
+       tex->height0 = new_height;
+       tex->format = new_format;
+}
+
+static void r600_reset_blittable_to_compressed(struct pipe_resource *tex,
+                                              struct texture_orig_info *orig)
+{
+       tex->format = orig->format;
+       tex->width0 = orig->width0;
+       tex->height0 = orig->height0;
+}
+
+static void r600_resource_copy_region(struct pipe_context *ctx,
+                                     struct pipe_resource *dst,
+                                     unsigned dst_level,
+                                     unsigned dstx, unsigned dsty, unsigned dstz,
+                                     struct pipe_resource *src,
+                                     unsigned src_level,
+                                     const struct pipe_box *src_box)
+{
+       struct r600_resource_texture *rsrc = (struct r600_resource_texture*)src;
+       struct texture_orig_info orig_info[2];
+       struct pipe_box sbox;
+       const struct pipe_box *psbox;
+       boolean restore_orig[2];
+
+       memset(orig_info, 0, sizeof(orig_info));
+
+       /* Fallback for buffers. */
+       if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+               util_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
+                                          src, src_level, src_box);
+               return;
+       }
+
+       if (rsrc->depth && !rsrc->is_flushing_texture)
+               r600_texture_depth_flush(ctx, src, FALSE);
+
+       restore_orig[0] = restore_orig[1] = FALSE;
+
+       if (util_format_is_compressed(src->format)) {
+               r600_compressed_to_blittable(src, src_level, &orig_info[0]);
+               restore_orig[0] = TRUE;
+               sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x);
+               sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y);
+               sbox.z = src_box->z;
+               sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width);
+               sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height);
+               sbox.depth = src_box->depth;
+               psbox = &sbox;
+       } else {
+               psbox = src_box;
+       }
+
+       if (util_format_is_compressed(dst->format)) {
+               r600_compressed_to_blittable(dst, dst_level, &orig_info[1]);
+               restore_orig[1] = TRUE;
+               /* translate the dst box as well */
+               dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
+               dsty = util_format_get_nblocksy(orig_info[1].format, dsty);
+       }
+
+       r600_hw_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
+                           src, src_level, psbox);
+
+       if (restore_orig[0])
+               r600_reset_blittable_to_compressed(src, &orig_info[0]);
+
+       if (restore_orig[1])
+               r600_reset_blittable_to_compressed(dst, &orig_info[1]);
+}
+
+void r600_init_blit_functions(struct r600_context *rctx)
+{
+       rctx->context.clear = r600_clear;
+       rctx->context.clear_render_target = r600_clear_render_target;
+       rctx->context.clear_depth_stencil = r600_clear_depth_stencil;
+       rctx->context.resource_copy_region = r600_resource_copy_region;
+}
+
+void r600_blit_push_depth(struct pipe_context *ctx, struct r600_resource_texture *texture)
+{
+       struct pipe_box sbox;
+
+       sbox.x = sbox.y = sbox.z = 0;
+       sbox.width = texture->resource.b.b.b.width0;
+       sbox.height = texture->resource.b.b.b.height0;
+       /* XXX that might be wrong */
+       sbox.depth = 1;
+
+       r600_hw_copy_region(ctx, (struct pipe_resource *)texture, 0,
+                           0, 0, 0,
+                           (struct pipe_resource *)texture->flushed_depth_texture, 0,
+                           &sbox);
+}
diff --git a/src/gallium/drivers/radeonsi/r600_buffer.c b/src/gallium/drivers/radeonsi/r600_buffer.c
new file mode 100644 (file)
index 0000000..bb885df
--- /dev/null
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson <MostAwesomeDude@gmail.com>
+ */
+#include <byteswap.h>
+
+#include "pipe/p_screen.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_upload_mgr.h"
+
+#include "r600.h"
+#include "radeonsi_pipe.h"
+
+static void r600_buffer_destroy(struct pipe_screen *screen,
+                               struct pipe_resource *buf)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       struct r600_resource *rbuffer = r600_resource(buf);
+
+       pb_reference(&rbuffer->buf, NULL);
+       util_slab_free(&rscreen->pool_buffers, rbuffer);
+}
+
+static struct pipe_transfer *r600_get_transfer(struct pipe_context *ctx,
+                                              struct pipe_resource *resource,
+                                              unsigned level,
+                                              unsigned usage,
+                                              const struct pipe_box *box)
+{
+       struct r600_context *rctx = (struct r600_context*)ctx;
+       struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers);
+
+       transfer->resource = resource;
+       transfer->level = level;
+       transfer->usage = usage;
+       transfer->box = *box;
+       transfer->stride = 0;
+       transfer->layer_stride = 0;
+       transfer->data = NULL;
+
+       /* Note: strides are zero.  That is fine for buffers, but not
+        * for 2D textures and higher.
+        */
+       return transfer;
+}
+
+static void *r600_buffer_transfer_map(struct pipe_context *pipe,
+                                     struct pipe_transfer *transfer)
+{
+       struct r600_resource *rbuffer = r600_resource(transfer->resource);
+       struct r600_context *rctx = (struct r600_context*)pipe;
+       uint8_t *data;
+
+       if (rbuffer->b.user_ptr)
+               return (uint8_t*)rbuffer->b.user_ptr + transfer->box.x;
+
+       data = rctx->ws->buffer_map(rbuffer->buf, rctx->cs, transfer->usage);
+       if (!data)
+               return NULL;
+
+       return (uint8_t*)data + transfer->box.x;
+}
+
+static void r600_buffer_transfer_unmap(struct pipe_context *pipe,
+                                       struct pipe_transfer *transfer)
+{
+       struct r600_resource *rbuffer = r600_resource(transfer->resource);
+       struct r600_context *rctx = (struct r600_context*)pipe;
+
+       if (rbuffer->b.user_ptr)
+               return;
+
+       rctx->ws->buffer_unmap(rbuffer->buf);
+}
+
+static void r600_buffer_transfer_flush_region(struct pipe_context *pipe,
+                                               struct pipe_transfer *transfer,
+                                               const struct pipe_box *box)
+{
+}
+
+static void r600_transfer_destroy(struct pipe_context *ctx,
+                                 struct pipe_transfer *transfer)
+{
+       struct r600_context *rctx = (struct r600_context*)ctx;
+       util_slab_free(&rctx->pool_transfers, transfer);
+}
+
+static void r600_buffer_transfer_inline_write(struct pipe_context *pipe,
+                                               struct pipe_resource *resource,
+                                               unsigned level,
+                                               unsigned usage,
+                                               const struct pipe_box *box,
+                                               const void *data,
+                                               unsigned stride,
+                                               unsigned layer_stride)
+{
+       struct r600_context *rctx = (struct r600_context*)pipe;
+       struct r600_resource *rbuffer = r600_resource(resource);
+       uint8_t *map = NULL;
+
+       assert(rbuffer->b.user_ptr == NULL);
+
+       map = rctx->ws->buffer_map(rbuffer->buf, rctx->cs,
+                                  PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE | usage);
+
+       memcpy(map + box->x, data, box->width);
+
+       rctx->ws->buffer_unmap(rbuffer->buf);
+}
+
+static const struct u_resource_vtbl r600_buffer_vtbl =
+{
+       u_default_resource_get_handle,          /* get_handle */
+       r600_buffer_destroy,                    /* resource_destroy */
+       r600_get_transfer,                      /* get_transfer */
+       r600_transfer_destroy,                  /* transfer_destroy */
+       r600_buffer_transfer_map,               /* transfer_map */
+       r600_buffer_transfer_flush_region,      /* transfer_flush_region */
+       r600_buffer_transfer_unmap,             /* transfer_unmap */
+       r600_buffer_transfer_inline_write       /* transfer_inline_write */
+};
+
+bool r600_init_resource(struct r600_screen *rscreen,
+                       struct r600_resource *res,
+                       unsigned size, unsigned alignment,
+                       unsigned bind, unsigned usage)
+{
+       uint32_t initial_domain, domains;
+
+       /* Staging resources participate in transfers and blits only
+        * and are used for uploads and downloads from regular
+        * resources.  We generate them internally for some transfers.
+        */
+       if (usage == PIPE_USAGE_STAGING) {
+               domains = RADEON_DOMAIN_GTT;
+               initial_domain = RADEON_DOMAIN_GTT;
+       } else {
+               domains = RADEON_DOMAIN_GTT | RADEON_DOMAIN_VRAM;
+
+               switch(usage) {
+               case PIPE_USAGE_DYNAMIC:
+               case PIPE_USAGE_STREAM:
+               case PIPE_USAGE_STAGING:
+                       initial_domain = RADEON_DOMAIN_GTT;
+                       break;
+               case PIPE_USAGE_DEFAULT:
+               case PIPE_USAGE_STATIC:
+               case PIPE_USAGE_IMMUTABLE:
+               default:
+                       initial_domain = RADEON_DOMAIN_VRAM;
+                       break;
+               }
+       }
+
+       res->buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment, bind, initial_domain);
+       if (!res->buf) {
+               return false;
+       }
+
+       res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf);
+       res->domains = domains;
+       return true;
+}
+
+struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
+                                        const struct pipe_resource *templ)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       struct r600_resource *rbuffer;
+       /* XXX We probably want a different alignment for buffers and textures. */
+       unsigned alignment = 4096;
+
+       rbuffer = util_slab_alloc(&rscreen->pool_buffers);
+
+       rbuffer->b.b.b = *templ;
+       pipe_reference_init(&rbuffer->b.b.b.reference, 1);
+       rbuffer->b.b.b.screen = screen;
+       rbuffer->b.b.vtbl = &r600_buffer_vtbl;
+       rbuffer->b.user_ptr = NULL;
+
+       if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, templ->bind, templ->usage)) {
+               util_slab_free(&rscreen->pool_buffers, rbuffer);
+               return NULL;
+       }
+       return &rbuffer->b.b.b;
+}
+
+struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen,
+                                             void *ptr, unsigned bytes,
+                                             unsigned bind)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       struct r600_resource *rbuffer;
+
+       rbuffer = util_slab_alloc(&rscreen->pool_buffers);
+
+       pipe_reference_init(&rbuffer->b.b.b.reference, 1);
+       rbuffer->b.b.vtbl = &r600_buffer_vtbl;
+       rbuffer->b.b.b.screen = screen;
+       rbuffer->b.b.b.target = PIPE_BUFFER;
+       rbuffer->b.b.b.format = PIPE_FORMAT_R8_UNORM;
+       rbuffer->b.b.b.usage = PIPE_USAGE_IMMUTABLE;
+       rbuffer->b.b.b.bind = bind;
+       rbuffer->b.b.b.width0 = bytes;
+       rbuffer->b.b.b.height0 = 1;
+       rbuffer->b.b.b.depth0 = 1;
+       rbuffer->b.b.b.array_size = 1;
+       rbuffer->b.b.b.flags = 0;
+       rbuffer->b.user_ptr = ptr;
+       rbuffer->buf = NULL;
+       return &rbuffer->b.b.b;
+}
+
+void r600_upload_index_buffer(struct r600_context *rctx,
+                             struct pipe_index_buffer *ib, unsigned count)
+{
+       struct r600_resource *rbuffer = r600_resource(ib->buffer);
+
+       u_upload_data(rctx->vbuf_mgr->uploader, 0, count * ib->index_size,
+                     rbuffer->b.user_ptr, &ib->offset, &ib->buffer);
+}
+
+void r600_upload_const_buffer(struct r600_context *rctx, struct r600_resource **rbuffer,
+                            uint32_t *const_offset)
+{
+       if ((*rbuffer)->b.user_ptr) {
+               uint8_t *ptr = (*rbuffer)->b.user_ptr;
+               unsigned size = (*rbuffer)->b.b.b.width0;
+
+               *rbuffer = NULL;
+
+               if (R600_BIG_ENDIAN) {
+                       uint32_t *tmpPtr;
+                       unsigned i;
+
+                       if (!(tmpPtr = malloc(size))) {
+                               R600_ERR("Failed to allocate BE swap buffer.\n");
+                               return;
+                       }
+
+                       for (i = 0; i < size / 4; ++i) {
+                               tmpPtr[i] = bswap_32(((uint32_t *)ptr)[i]);
+                       }
+
+                       u_upload_data(rctx->vbuf_mgr->uploader, 0, size, tmpPtr, const_offset,
+                                     (struct pipe_resource**)rbuffer);
+
+                       free(tmpPtr);
+               } else {
+                       u_upload_data(rctx->vbuf_mgr->uploader, 0, size, ptr, const_offset,
+                                     (struct pipe_resource**)rbuffer);
+               }
+       } else {
+               *const_offset = 0;
+       }
+}
diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c
new file mode 100644 (file)
index 0000000..494b0d3
--- /dev/null
@@ -0,0 +1,1151 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#include "r600_hw_context_priv.h"
+#include "radeonsi_pipe.h"
+#include "sid.h"
+#include "util/u_memory.h"
+#include <errno.h>
+
+#define GROUP_FORCE_NEW_BLOCK  0
+
+/* Get backends mask */
+void r600_get_backend_mask(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       struct r600_resource *buffer;
+       uint32_t *results;
+       unsigned num_backends = ctx->screen->info.r600_num_backends;
+       unsigned i, mask = 0;
+
+       /* if backend_map query is supported by the kernel */
+       if (ctx->screen->info.r600_backend_map_valid) {
+               unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes;
+               unsigned backend_map = ctx->screen->info.r600_backend_map;
+               unsigned item_width, item_mask;
+
+               if (ctx->chip_class >= CAYMAN) {
+                       item_width = 4;
+                       item_mask = 0x7;
+               } else {
+                       /* Should not happen on SI, but don't leave
+                        * item_width/item_mask uninitialized. */
+                       item_width = 2;
+                       item_mask = 0x3;
+               }
+
+               while(num_tile_pipes--) {
+                       i = backend_map & item_mask;
+                       mask |= (1<<i);
+                       backend_map >>= item_width;
+               }
+               if (mask != 0) {
+                       ctx->backend_mask = mask;
+                       return;
+               }
+       }
+
+       /* otherwise backup path for older kernels */
+
+       /* create buffer for event data */
+       buffer = (struct r600_resource*)
+               pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM,
+                                  PIPE_USAGE_STAGING, ctx->max_db*16);
+       if (!buffer)
+               goto err;
+
+       /* initialize buffer with zeroes */
+       results = ctx->ws->buffer_map(buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+       if (results) {
+               uint64_t va = 0;
+
+               memset(results, 0, ctx->max_db * 4 * 4);
+               ctx->ws->buffer_unmap(buffer->buf);
+
+               /* emit EVENT_WRITE for ZPASS_DONE */
+               va = r600_resource_va(&ctx->screen->screen, (void *)buffer);
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = va >> 32;
+
+               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+               cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE);
+
+               /* analyze results */
+               results = ctx->ws->buffer_map(buffer->buf, ctx->cs, PIPE_TRANSFER_READ);
+               if (results) {
+                       for(i = 0; i < ctx->max_db; i++) {
+                               /* at least the highest bit will be set if the backend is used */
+                               if (results[i*4 + 1])
+                                       mask |= (1<<i);
+                       }
+                       ctx->ws->buffer_unmap(buffer->buf);
+               }
+       }
+
+       pipe_resource_reference((struct pipe_resource**)&buffer, NULL);
+
+       if (mask != 0) {
+               ctx->backend_mask = mask;
+               return;
+       }
+
+err:
+       /* fallback to old method - set num_backends lower bits to 1 */
+       ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends);
+       return;
+}
+
+static inline void r600_context_ps_partial_flush(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+
+       if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING))
+               return;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+
+       ctx->flags &= ~R600_CONTEXT_DRAW_PENDING;
+}
+
+void r600_init_cs(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+
+       /* All ASICs require this one */
+       cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0);
+       cs->buf[cs->cdw++] = 0x80000000;
+       cs->buf[cs->cdw++] = 0x80000000;
+
+       ctx->init_dwords = cs->cdw;
+}
+
+static void r600_init_block(struct r600_context *ctx,
+                           struct r600_block *block,
+                           const struct r600_reg *reg, int index, int nreg,
+                           unsigned opcode, unsigned offset_base)
+{
+       int i = index;
+       int j, n = nreg;
+
+       /* initialize block */
+       block->flags = 0;
+       block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */
+       block->start_offset = reg[i].offset;
+       block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0);
+       block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2;
+       block->reg = &block->pm4[block->pm4_ndwords];
+       block->pm4_ndwords += n;
+       block->nreg = n;
+       block->nreg_dirty = n;
+       LIST_INITHEAD(&block->list);
+       LIST_INITHEAD(&block->enable_list);
+
+       for (j = 0; j < n; j++) {
+               if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) {
+                       block->flags |= REG_FLAG_DIRTY_ALWAYS;
+               }
+               if (reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) {
+                       if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
+                               block->status |= R600_BLOCK_STATUS_ENABLED;
+                               LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
+                               LIST_ADDTAIL(&block->list, &ctx->dirty);
+                       }
+               }
+               if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) {
+                       block->flags |= REG_FLAG_FLUSH_CHANGE;
+               }
+
+               if (reg[i+j].flags & REG_FLAG_NEED_BO) {
+                       block->nbo++;
+                       assert(block->nbo < R600_BLOCK_MAX_BO);
+                       block->pm4_bo_index[j] = block->nbo;
+                       block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0);
+                       block->pm4[block->pm4_ndwords++] = 0x00000000;
+                       block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1;
+               }
+       }
+       /* check that we stay within the limit */
+       assert(block->pm4_ndwords < R600_BLOCK_MAX_REG);
+}
+
+int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
+                          unsigned opcode, unsigned offset_base)
+{
+       struct r600_block *block;
+       struct r600_range *range;
+       int offset;
+
+       for (unsigned i = 0, n = 0; i < nreg; i += n) {
+               /* ignore the new-block marker */
+               if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) {
+                       n = 1;
+                       continue;
+               }
+
+               /* registers that need relocation are in their own group;
+                * find the number of consecutive registers */
+               n = 0;
+               offset = reg[i].offset;
+               while (reg[i + n].offset == offset) {
+                       n++;
+                       offset += 4;
+                       if ((n + i) >= nreg)
+                               break;
+                       if (n >= (R600_BLOCK_MAX_REG - 2))
+                               break;
+               }
+
+               /* allocate new block */
+               block = calloc(1, sizeof(struct r600_block));
+               if (block == NULL) {
+                       return -ENOMEM;
+               }
+               ctx->nblocks++;
+               for (int j = 0; j < n; j++) {
+                       range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)];
+                       /* create block table if it doesn't exist */
+                       if (!range->blocks)
+                               range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *));
+                       if (!range->blocks)
+                               return -ENOMEM;
+
+                       range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block;
+               }
+
+               r600_init_block(ctx, block, reg, i, n, opcode, offset_base);
+
+       }
+       return 0;
+}
+
+
+/* cleanup */
+void r600_context_fini(struct r600_context *ctx)
+{
+       struct r600_block *block;
+       struct r600_range *range;
+
+       for (int i = 0; i < NUM_RANGES; i++) {
+               if (!ctx->range[i].blocks)
+                       continue;
+               for (int j = 0; j < (1 << HASH_SHIFT); j++) {
+                       block = ctx->range[i].blocks[j];
+                       if (block) {
+                               for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) {
+                                       range = &ctx->range[CTX_RANGE_ID(offset)];
+                                       range->blocks[CTX_BLOCK_ID(offset)] = NULL;
+                               }
+                               for (int k = 1; k <= block->nbo; k++) {
+                                       pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL);
+                               }
+                               free(block);
+                       }
+               }
+               free(ctx->range[i].blocks);
+       }
+       free(ctx->range);
+       free(ctx->blocks);
+       ctx->ws->cs_destroy(ctx->cs);
+}
+
+int r600_setup_block_table(struct r600_context *ctx)
+{
+       /* setup block table */
+       int c = 0;
+       ctx->blocks = calloc(ctx->nblocks, sizeof(void*));
+       if (!ctx->blocks)
+               return -ENOMEM;
+       for (int i = 0; i < NUM_RANGES; i++) {
+               if (!ctx->range[i].blocks)
+                       continue;
+               for (int j = 0, add; j < (1 << HASH_SHIFT); j++) {
+                       if (!ctx->range[i].blocks[j])
+                               continue;
+
+                       add = 1;
+                       for (int k = 0; k < c; k++) {
+                               if (ctx->blocks[k] == ctx->range[i].blocks[j]) {
+                                       add = 0;
+                                       break;
+                               }
+                       }
+                       if (add) {
+                               assert(c < ctx->nblocks);
+                               ctx->blocks[c++] = ctx->range[i].blocks[j];
+                               j += (ctx->range[i].blocks[j]->nreg) - 1;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
+                       boolean count_draw_in)
+{
+       struct r600_atom *state;
+
+       /* The number of dwords we already used in the CS so far. */
+       num_dw += ctx->cs->cdw;
+
+       if (count_draw_in) {
+               /* The number of dwords all the dirty states would take. */
+               LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) {
+                       num_dw += state->num_dw;
+               }
+
+               num_dw += ctx->pm4_dirty_cdwords;
+
+               /* The upper-bound of how much a draw command would take. */
+               num_dw += R600_MAX_DRAW_CS_DWORDS;
+       }
+
+       /* Count in queries_suspend. */
+       num_dw += ctx->num_cs_dw_queries_suspend;
+
+       /* Count in streamout_end at the end of CS. */
+       num_dw += ctx->num_cs_dw_streamout_end;
+
+       /* Count in render_condition(NULL) at the end of CS. */
+       if (ctx->predicate_drawing) {
+               num_dw += 3;
+       }
+
+       /* Count in framebuffer cache flushes at the end of CS. */
+       num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */
+
+       /* Save 16 dwords for the fence mechanism. */
+       num_dw += 16;
+
+       /* Flush if there's not enough space. */
+       if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+               radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC);
+       }
+}
+
+void r600_context_dirty_block(struct r600_context *ctx,
+                             struct r600_block *block,
+                             int dirty, int index)
+{
+       if ((index + 1) > block->nreg_dirty)
+               block->nreg_dirty = index + 1;
+
+       if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) {
+               block->status |= R600_BLOCK_STATUS_DIRTY;
+               ctx->pm4_dirty_cdwords += block->pm4_ndwords;
+               if (!(block->status & R600_BLOCK_STATUS_ENABLED)) {
+                       block->status |= R600_BLOCK_STATUS_ENABLED;
+                       LIST_ADDTAIL(&block->enable_list, &ctx->enable_list);
+               }
+               LIST_ADDTAIL(&block->list, &ctx->dirty);
+
+               if (block->flags & REG_FLAG_FLUSH_CHANGE) {
+                       r600_context_ps_partial_flush(ctx);
+               }
+       }
+}
+
+void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state)
+{
+       struct r600_block *block;
+       int dirty;
+       for (int i = 0; i < state->nregs; i++) {
+               unsigned id, reloc_id;
+               struct r600_pipe_reg *reg = &state->regs[i];
+
+               block = reg->block;
+               id = reg->id;
+
+               dirty = block->status & R600_BLOCK_STATUS_DIRTY;
+
+               if (reg->value != block->reg[id]) {
+                       block->reg[id] = reg->value;
+                       dirty |= R600_BLOCK_STATUS_DIRTY;
+               }
+               if (block->flags & REG_FLAG_DIRTY_ALWAYS)
+                       dirty |= R600_BLOCK_STATUS_DIRTY;
+               if (block->pm4_bo_index[id]) {
+                       /* find relocation */
+                       reloc_id = block->pm4_bo_index[id];
+                       pipe_resource_reference((struct pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b.b);
+                       block->reloc[reloc_id].bo_usage = reg->bo_usage;
+                       /* always force dirty for relocs for now */
+                       dirty |= R600_BLOCK_STATUS_DIRTY;
+               }
+
+               if (dirty)
+                       r600_context_dirty_block(ctx, block, dirty, id);
+       }
+}
+
+struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset)
+{
+       struct r600_range *range;
+       struct r600_block *block;
+       unsigned id;
+
+       range = &ctx->range[CTX_RANGE_ID(offset)];
+       block = range->blocks[CTX_BLOCK_ID(offset)];
+       offset -= block->start_offset;
+       id = block->pm4_bo_index[offset >> 2];
+       if (block->reloc[id].bo) {
+               return block->reloc[id].bo;
+       }
+       return NULL;
+}
+
+void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS);
+       int cp_dwords = block->pm4_ndwords, start_dword = 0;
+       int new_dwords = 0;
+       int nbo = block->nbo;
+
+       if (block->nreg_dirty == 0 && optional) {
+               goto out;
+       }
+
+       if (nbo) {
+               ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH;
+
+               for (int j = 0; j < block->nreg; j++) {
+                       if (block->pm4_bo_index[j]) {
+                               /* find relocation */
+                               struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]];
+                               block->pm4[reloc->bo_pm4_index] =
+                                       r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage);
+                               nbo--;
+                               if (nbo == 0)
+                                       break;
+                       }
+               }
+               ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH;
+       }
+
+       optional &= (block->nreg_dirty != block->nreg);
+       if (optional) {
+               new_dwords = block->nreg_dirty;
+               start_dword = cs->cdw;
+               cp_dwords = new_dwords + 2;
+       }
+       memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4);
+       cs->cdw += cp_dwords;
+
+       if (optional) {
+               uint32_t newword;
+
+               newword = cs->buf[start_dword];
+               newword &= PKT_COUNT_C;
+               newword |= PKT_COUNT_S(new_dwords);
+               cs->buf[start_dword] = newword;
+       }
+out:
+       block->status ^= R600_BLOCK_STATUS_DIRTY;
+       block->nreg_dirty = 0;
+       LIST_DELINIT(&block->list);
+}
+
+void r600_inval_shader_cache(struct r600_context *ctx)
+{
+       ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
+       ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
+       r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
+}
+
+void r600_inval_texture_cache(struct r600_context *ctx)
+{
+       ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
+       r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
+}
+
+void r600_inval_vertex_cache(struct r600_context *ctx)
+{
+       /* Some GPUs don't have the vertex cache and must use the texture cache instead. */
+       ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1);
+       r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
+}
+
+void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now)
+{
+       if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY))
+               return;
+
+       ctx->atom_surface_sync.flush_flags |=
+               r600_get_cb_flush_flags(ctx) |
+               (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0);
+
+       if (flush_now) {
+               r600_emit_atom(ctx, &ctx->atom_surface_sync.atom);
+       } else {
+               r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
+       }
+
+       ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
+}
+
+void r600_context_flush(struct r600_context *ctx, unsigned flags)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       struct r600_block *enable_block = NULL;
+       bool queries_suspended = false;
+       bool streamout_suspended = false;
+
+       if (cs->cdw == ctx->init_dwords)
+               return;
+
+       /* suspend queries */
+       if (ctx->num_cs_dw_queries_suspend) {
+               r600_context_queries_suspend(ctx);
+               queries_suspended = true;
+       }
+
+       if (ctx->num_cs_dw_streamout_end) {
+               r600_context_streamout_end(ctx);
+               streamout_suspended = true;
+       }
+
+       r600_flush_framebuffer(ctx, true);
+
+       /* partial flush is needed to avoid lockups on some chips with user fences */
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+
+       /* Flush the CS. */
+       ctx->ws->cs_flush(ctx->cs, flags);
+
+       ctx->pm4_dirty_cdwords = 0;
+       ctx->flags = 0;
+
+       r600_init_cs(ctx);
+
+       if (streamout_suspended) {
+               ctx->streamout_start = TRUE;
+               ctx->streamout_append_bitmask = ~0;
+       }
+
+       /* resume queries */
+       if (queries_suspended) {
+               r600_context_queries_resume(ctx);
+       }
+
+       /* Mark all enabled groups as dirty so they get re-emitted on
+        * the next draw command.
+        */
+       LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) {
+               if (!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) {
+                       LIST_ADDTAIL(&enable_block->list, &ctx->dirty);
+                       enable_block->status |= R600_BLOCK_STATUS_DIRTY;
+               }
+               ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords;
+               enable_block->nreg_dirty = enable_block->nreg;
+       }
+}
+
+void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       uint64_t va;
+
+       r600_need_cs_space(ctx, 10, FALSE);
+
+       va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo);
+       va = va + (offset << 2);
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4);
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+       cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;       /* ADDRESS_LO */
+       /* DATA_SEL | INT_EN | ADDRESS_HI */
+       cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF);
+       cs->buf[cs->cdw++] = value;                   /* DATA_LO */
+       cs->buf[cs->cdw++] = 0;                       /* DATA_HI */
+       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE);
+}
+
+static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index,
+                                      bool test_status_bit)
+{
+       uint32_t *current_result = (uint32_t*)map;
+       uint64_t start, end;
+
+       start = (uint64_t)current_result[start_index] |
+               (uint64_t)current_result[start_index+1] << 32;
+       end = (uint64_t)current_result[end_index] |
+             (uint64_t)current_result[end_index+1] << 32;
+
+       if (!test_status_bit ||
+           ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) {
+               return end - start;
+       }
+       return 0;
+}
+
+static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait)
+{
+       unsigned results_base = query->results_start;
+       char *map;
+
+       map = ctx->ws->buffer_map(query->buffer->buf, ctx->cs,
+                                 PIPE_TRANSFER_READ |
+                                 (wait ? 0 : PIPE_TRANSFER_DONTBLOCK));
+       if (!map)
+               return FALSE;
+
+       /* count all results across all data blocks */
+       switch (query->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 0, 2, true);
+                       results_base = (results_base + 16) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               while (results_base != query->results_end) {
+                       query->result.b = query->result.b ||
+                               r600_query_read_result(map + results_base, 0, 2, true) != 0;
+                       results_base = (results_base + 16) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 0, 2, false);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+               /* SAMPLE_STREAMOUTSTATS stores this structure:
+                * {
+                *    u64 NumPrimitivesWritten;
+                *    u64 PrimitiveStorageNeeded;
+                * }
+                * We only need NumPrimitivesWritten here. */
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 2, 6, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+               /* Here we read PrimitiveStorageNeeded. */
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_SO_STATISTICS:
+               while (results_base != query->results_end) {
+                       query->result.so.num_primitives_written +=
+                               r600_query_read_result(map + results_base, 2, 6, true);
+                       query->result.so.primitives_storage_needed +=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               while (results_base != query->results_end) {
+                       query->result.b = query->result.b ||
+                               r600_query_read_result(map + results_base, 2, 6, true) !=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       default:
+               assert(0);
+       }
+
+       query->results_start = query->results_end;
+       ctx->ws->buffer_unmap(query->buffer->buf);
+       return TRUE;
+}
+
+void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       unsigned new_results_end, i;
+       uint32_t *results;
+       uint64_t va;
+
+       r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE);
+
+       new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0;
+
+       /* collect current results if query buffer is full */
+       if (new_results_end == query->results_start) {
+               r600_query_result(ctx, query, TRUE);
+       }
+
+       switch (query->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+               if (results) {
+                       results = (uint32_t*)((char*)results + query->results_end);
+                       memset(results, 0, query->result_size);
+
+                       /* Set the top (valid) bits for unused backends so they
+                        * pass the status-bit test in r600_query_read_result(). */
+                       for (i = 0; i < ctx->max_db; i++) {
+                               if (!(ctx->backend_mask & (1<<i))) {
+                                       results[(i * 4)+1] = 0x80000000;
+                                       results[(i * 4)+3] = 0x80000000;
+                               }
+                       }
+                       ctx->ws->buffer_unmap(query->buffer->buf);
+               }
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+               results = (uint32_t*)((char*)results + query->results_end);
+               memset(results, 0, query->result_size);
+               ctx->ws->buffer_unmap(query->buffer->buf);
+               break;
+       default:
+               assert(0);
+       }
+
+       /* emit begin query */
+       va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
+       va += query->results_end;
+
+       switch (query->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+               cs->buf[cs->cdw++] = query->results_end;
+               cs->buf[cs->cdw++] = 0;
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
+               cs->buf[cs->cdw++] = 0;
+               cs->buf[cs->cdw++] = 0;
+               break;
+       default:
+               assert(0);
+       }
+       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
+
+       ctx->num_cs_dw_queries_suspend += query->num_cs_dw;
+}
+
+void r600_query_end(struct r600_context *ctx, struct r600_query *query)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       uint64_t va;
+
+       va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
+       /* emit end query */
+       switch (query->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               va += query->results_end + 8;
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+               cs->buf[cs->cdw++] = query->results_end + query->result_size/2;
+               cs->buf[cs->cdw++] = 0;
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               va += query->results_end + query->result_size/2;
+               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
+               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF);
+               cs->buf[cs->cdw++] = 0;
+               cs->buf[cs->cdw++] = 0;
+               break;
+       default:
+               assert(0);
+       }
+       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE);
+
+       query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0;
+       ctx->num_cs_dw_queries_suspend -= query->num_cs_dw;
+}
+
+void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation,
+                           int flag_wait)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       uint64_t va;
+
+       if (operation == PREDICATION_OP_CLEAR) {
+               r600_need_cs_space(ctx, 3, FALSE);
+
+               cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
+               cs->buf[cs->cdw++] = 0;
+               cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR);
+       } else {
+               unsigned results_base = query->results_start;
+               unsigned count;
+               uint32_t op;
+
+               /* Count the query data blocks, accounting for ring-buffer wraparound. */
+               count = (query->buffer->b.b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.b.width0;
+               count /= query->result_size;
+
+               r600_need_cs_space(ctx, 5 * count, TRUE);
+
+               op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE |
+                               (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW);
+               va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer);
+
+               /* emit predicate packets for all data blocks */
+               while (results_base != query->results_end) {
+                       cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0);
+                       cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL;
+                       cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF);
+                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+                       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer,
+                                                                            RADEON_USAGE_READ);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+
+                       /* set CONTINUE bit for all packets except the first */
+                       op |= PREDICATION_CONTINUE;
+               }
+       }
+}
+
+struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type)
+{
+       struct r600_query *query;
+       unsigned buffer_size = 4096;
+
+       query = CALLOC_STRUCT(r600_query);
+       if (query == NULL)
+               return NULL;
+
+       query->type = query_type;
+
+       switch (query_type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               query->result_size = 16 * ctx->max_db;
+               query->num_cs_dw = 6;
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               query->result_size = 16;
+               query->num_cs_dw = 8;
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+               query->result_size = 32;
+               query->num_cs_dw = 6;
+               break;
+       default:
+               assert(0);
+               FREE(query);
+               return NULL;
+       }
+
+       /* Round the buffer size down to a multiple of result_size to
+        * simplify the offset-wrapping math. */
+       buffer_size -= buffer_size % query->result_size;
+
+       /* Queries are normally read back by the CPU after
+        * being written by the GPU, hence staging is probably a good
+        * usage pattern.
+        */
+       query->buffer = (struct r600_resource*)
+               pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size);
+       if (!query->buffer) {
+               FREE(query);
+               return NULL;
+       }
+       return query;
+}
+
+void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
+{
+       pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL);
+       free(query);
+}
+
+boolean r600_context_query_result(struct r600_context *ctx,
+                               struct r600_query *query,
+                               boolean wait, void *vresult)
+{
+       boolean *result_b = (boolean*)vresult;
+       uint64_t *result_u64 = (uint64_t*)vresult;
+       struct pipe_query_data_so_statistics *result_so =
+               (struct pipe_query_data_so_statistics*)vresult;
+
+       if (!r600_query_result(ctx, query, wait))
+               return FALSE;
+
+       switch (query->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+               *result_u64 = query->result.u64;
+               break;
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               *result_b = query->result.b;
+               break;
+       case PIPE_QUERY_TIME_ELAPSED:
+               *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
+               break;
+       case PIPE_QUERY_SO_STATISTICS:
+               *result_so = query->result.so;
+               break;
+       default:
+               assert(0);
+       }
+       return TRUE;
+}
+
+void r600_context_queries_suspend(struct r600_context *ctx)
+{
+       struct r600_query *query;
+
+       LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
+               r600_query_end(ctx, query);
+       }
+       assert(ctx->num_cs_dw_queries_suspend == 0);
+}
+
+void r600_context_queries_resume(struct r600_context *ctx)
+{
+       struct r600_query *query;
+
+       assert(ctx->num_cs_dw_queries_suspend == 0);
+
+       LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) {
+               r600_query_begin(ctx, query);
+       }
+}
+
+void r600_context_streamout_begin(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       struct r600_so_target **t = ctx->so_targets;
+       unsigned *strides = ctx->vs_shader_so_strides;
+       unsigned buffer_en, i;
+
+       buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
+                   (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
+                   (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
+                   (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
+
+       ctx->num_cs_dw_streamout_end =
+               12 + /* flush_vgt_streamout */
+               util_bitcount(buffer_en) * 8 +
+               3;
+
+       r600_need_cs_space(ctx,
+                          12 + /* flush_vgt_streamout */
+                          6 + /* enables */
+                          util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
+                          util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
+                          ctx->num_cs_dw_streamout_end, TRUE);
+
+       if (ctx->chip_class >= CAYMAN) {
+               evergreen_flush_vgt_streamout(ctx);
+               evergreen_set_streamout_enable(ctx, buffer_en);
+       }
+
+       for (i = 0; i < ctx->num_so_targets; i++) {
+#if 0
+               if (t[i]) {
+                       t[i]->stride = strides[i];
+                       t[i]->so_index = i;
+
+                       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
+                       cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
+                                                       16*i - SI_CONTEXT_REG_OFFSET) >> 2;
+                       cs->buf[cs->cdw++] = (t[i]->b.buffer_offset +
+                                                       t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
+                       cs->buf[cs->cdw++] = strides[i] >> 2;              /* VTX_STRIDE (in DW) */
+                       cs->buf[cs->cdw++] = 0;                    /* BUFFER_BASE */
+
+                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+                       cs->buf[cs->cdw++] =
+                               r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
+                                                     RADEON_USAGE_WRITE);
+
+                       if (ctx->streamout_append_bitmask & (1 << i)) {
+                               /* Append. */
+                               cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                               cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
+                                                              STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
+                               cs->buf[cs->cdw++] = 0; /* unused */
+                               cs->buf[cs->cdw++] = 0; /* unused */
+                               cs->buf[cs->cdw++] = 0; /* src address lo */
+                               cs->buf[cs->cdw++] = 0; /* src address hi */
+
+                               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+                               cs->buf[cs->cdw++] =
+                                       r600_context_bo_reloc(ctx,  t[i]->filled_size,
+                                                             RADEON_USAGE_READ);
+                       } else {
+                               /* Start from the beginning. */
+                               cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                               cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
+                                                              STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
+                               cs->buf[cs->cdw++] = 0; /* unused */
+                               cs->buf[cs->cdw++] = 0; /* unused */
+                               cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
+                               cs->buf[cs->cdw++] = 0; /* unused */
+                       }
+               }
+#endif
+       }
+}
+
+void r600_context_streamout_end(struct r600_context *ctx)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       struct r600_so_target **t = ctx->so_targets;
+       unsigned i, flush_flags = 0;
+
+       evergreen_flush_vgt_streamout(ctx);
+
+       for (i = 0; i < ctx->num_so_targets; i++) {
+#if 0
+               if (t[i]) {
+                       cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                       cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) |
+                                                      STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+                                                      STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
+                       cs->buf[cs->cdw++] = 0; /* dst address lo */
+                       cs->buf[cs->cdw++] = 0; /* dst address hi */
+                       cs->buf[cs->cdw++] = 0; /* unused */
+                       cs->buf[cs->cdw++] = 0; /* unused */
+
+                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+                       cs->buf[cs->cdw++] =
+                               r600_context_bo_reloc(ctx,  t[i]->filled_size,
+                                                     RADEON_USAGE_WRITE);
+
+                       flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
+               }
+#endif
+       }
+
+       evergreen_set_streamout_enable(ctx, 0);
+
+       ctx->atom_surface_sync.flush_flags |= flush_flags;
+       r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom);
+
+       ctx->num_cs_dw_streamout_end = 0;
+
+       /* XXX print some debug info */
+       for (i = 0; i < ctx->num_so_targets; i++) {
+               if (!t[i])
+                       continue;
+
+               uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, RADEON_USAGE_READ);
+               printf("FILLED_SIZE%i: %u\n", i, *ptr);
+               ctx->ws->buffer_unmap(t[i]->filled_size->buf);
+       }
+}
+
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
+{
+       struct radeon_winsys_cs *cs = ctx->cs;
+       r600_need_cs_space(ctx, 14 + 21, TRUE);
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2;
+       cs->buf[cs->cdw++] = 0;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2;
+       cs->buf[cs->cdw++] = t->stride >> 2;
+
+#if 0
+       cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
+       cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
+       cs->buf[cs->cdw++] = 0; /* src address lo */
+       cs->buf[cs->cdw++] = 0; /* src address hi */
+       cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
+       cs->buf[cs->cdw++] = 0; /* unused */
+#endif
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ);
+
+#if 0 /* I have not found this useful yet. */
+       cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
+       cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
+       cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
+       cs->buf[cs->cdw++] = 0; /* unused */
+       cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
+       cs->buf[cs->cdw++] = 0; /* unused */
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+       cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2;
+       cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+       cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2;
+       cs->buf[cs->cdw++] = t->b.buffer_offset >> 2;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+       cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
+                                                            RADEON_USAGE_WRITE);
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+       cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+       cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
+       cs->buf[cs->cdw++] = 0;
+       cs->buf[cs->cdw++] = 0; /* reference value */
+       cs->buf[cs->cdw++] = 0xffffffff; /* mask */
+       cs->buf[cs->cdw++] = 4; /* poll interval */
+#endif
+}
diff --git a/src/gallium/drivers/radeonsi/r600_hw_context_priv.h b/src/gallium/drivers/radeonsi/r600_hw_context_priv.h
new file mode 100644 (file)
index 0000000..7d5394e
--- /dev/null
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef R600_PRIV_H
+#define R600_PRIV_H
+
+#include "radeonsi_pipe.h"
+#include "util/u_hash_table.h"
+#include "os/os_thread.h"
+
+#define R600_MAX_DRAW_CS_DWORDS 17
+
+#define PKT_COUNT_C                     0xC000FFFF
+#define PKT_COUNT_S(x)                  (((x) & 0x3FFF) << 16)
+
+/* These flags are set on individual registers and are also OR'd into the owning block's flags. */
+#define REG_FLAG_NEED_BO 1
+#define REG_FLAG_DIRTY_ALWAYS 2
+#define REG_FLAG_RV6XX_SBU 4
+#define REG_FLAG_NOT_R600 8
+#define REG_FLAG_ENABLE_ALWAYS 16
+#define REG_FLAG_FLUSH_CHANGE 64
+
+struct r600_reg {
+       unsigned                        offset;
+       unsigned                        flags;
+};
+
+/*
+ * r600_hw_context.c
+ */
+struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset);
+int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg,
+                          unsigned opcode, unsigned offset_base);
+void r600_context_dirty_block(struct r600_context *ctx, struct r600_block *block,
+                             int dirty, int index);
+int r600_setup_block_table(struct r600_context *ctx);
+void r600_init_cs(struct r600_context *ctx);
+
+/*
+ * evergreen_hw_context.c
+ */
+void evergreen_flush_vgt_streamout(struct r600_context *ctx);
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit);
+
+
+static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo,
+                                            enum radeon_bo_usage usage)
+{
+       assert(usage);
+       return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains) * 4;
+}
+
+#endif
diff --git a/src/gallium/drivers/radeonsi/r600_query.c b/src/gallium/drivers/radeonsi/r600_query.c
new file mode 100644 (file)
index 0000000..bbf7c04
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "radeonsi_pipe.h"
+#include "sid.h"
+
+static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       return (struct pipe_query*)r600_context_query_create(rctx, query_type);
+}
+
+static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       r600_context_query_destroy(rctx, (struct r600_query *)query);
+}
+
+static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_query *rquery = (struct r600_query *)query;
+
+       memset(&rquery->result, 0, sizeof(rquery->result));
+       rquery->results_start = rquery->results_end;
+       r600_query_begin(rctx, (struct r600_query *)query);
+       LIST_ADDTAIL(&rquery->list, &rctx->active_query_list);
+}
+
+static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_query *rquery = (struct r600_query *)query;
+
+       r600_query_end(rctx, rquery);
+       LIST_DELINIT(&rquery->list);
+}
+
+static boolean r600_get_query_result(struct pipe_context *ctx,
+                                       struct pipe_query *query,
+                                       boolean wait, union pipe_query_result *vresult)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_query *rquery = (struct r600_query *)query;
+
+       return r600_context_query_result(rctx, rquery, wait, vresult);
+}
+
+static void r600_render_condition(struct pipe_context *ctx,
+                                 struct pipe_query *query,
+                                 uint mode)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_query *rquery = (struct r600_query *)query;
+       int wait_flag = 0;
+
+       /* If we already have a nonzero result, render unconditionally. */
+       if (query != NULL && rquery->result.u64 != 0) {
+               if (rctx->current_render_cond) {
+                       r600_render_condition(ctx, NULL, 0);
+               }
+               return;
+       }
+
+       rctx->current_render_cond = query;
+       rctx->current_render_cond_mode = mode;
+
+       if (query == NULL) {
+               if (rctx->predicate_drawing) {
+                       rctx->predicate_drawing = false;
+                       r600_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, 1);
+               }
+               return;
+       }
+
+       if (mode == PIPE_RENDER_COND_WAIT ||
+           mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
+               wait_flag = 1;
+       }
+
+       rctx->predicate_drawing = true;
+
+       switch (rquery->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               r600_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               r600_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+               break;
+       default:
+               assert(0);
+       }
+}
+
+void r600_init_query_functions(struct r600_context *rctx)
+{
+       rctx->context.create_query = r600_create_query;
+       rctx->context.destroy_query = r600_destroy_query;
+       rctx->context.begin_query = r600_begin_query;
+       rctx->context.end_query = r600_end_query;
+       rctx->context.get_query_result = r600_get_query_result;
+
+       if (rctx->screen->info.r600_num_backends > 0)
+           rctx->context.render_condition = r600_render_condition;
+}
diff --git a/src/gallium/drivers/radeonsi/r600_resource.c b/src/gallium/drivers/radeonsi/r600_resource.c
new file mode 100644 (file)
index 0000000..7bdf6d6
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "radeonsi_pipe.h"
+
+static struct pipe_resource *r600_resource_create(struct pipe_screen *screen,
+                                               const struct pipe_resource *templ)
+{
+       if (templ->target == PIPE_BUFFER) {
+               return r600_buffer_create(screen, templ);
+       } else {
+               return r600_texture_create(screen, templ);
+       }
+}
+
+static struct pipe_resource *r600_resource_from_handle(struct pipe_screen * screen,
+                                                       const struct pipe_resource *templ,
+                                                       struct winsys_handle *whandle)
+{
+       if (templ->target == PIPE_BUFFER) {
+               return NULL;
+       } else {
+               return r600_texture_from_handle(screen, templ, whandle);
+       }
+}
+
+void r600_init_screen_resource_functions(struct pipe_screen *screen)
+{
+       screen->resource_create = r600_resource_create;
+       screen->resource_from_handle = r600_resource_from_handle;
+       screen->resource_get_handle = u_resource_get_handle_vtbl;
+       screen->resource_destroy = u_resource_destroy_vtbl;
+       screen->user_buffer_create = r600_user_buffer_create;
+}
+
+void r600_init_context_resource_functions(struct r600_context *r600)
+{
+       r600->context.get_transfer = u_get_transfer_vtbl;
+       r600->context.transfer_map = u_transfer_map_vtbl;
+       r600->context.transfer_flush_region = u_transfer_flush_region_vtbl;
+       r600->context.transfer_unmap = u_transfer_unmap_vtbl;
+       r600->context.transfer_destroy = u_transfer_destroy_vtbl;
+       r600->context.transfer_inline_write = u_transfer_inline_write_vtbl;
+}
diff --git a/src/gallium/drivers/radeonsi/r600_resource.h b/src/gallium/drivers/radeonsi/r600_resource.h
new file mode 100644 (file)
index 0000000..d6f97b0
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2010 Marek Olšák <maraeo@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef R600_RESOURCE_H
+#define R600_RESOURCE_H
+
+#include "util/u_transfer.h"
+#include "util/u_vbuf.h"
+
+/* Flag indicating that a resource will be used as a transfer source/destination and therefore should not be tiled. */
+#define R600_RESOURCE_FLAG_TRANSFER     PIPE_RESOURCE_FLAG_DRV_PRIV
+
+/* Texture transfer. */
+struct r600_transfer {
+       /* Base class. */
+       struct pipe_transfer            transfer;
+       /* Buffer transfer. */
+       struct pipe_transfer            *buffer_transfer;
+       unsigned                        offset;
+       struct pipe_resource            *staging_texture;
+};
+
+struct r600_resource_texture {
+       struct r600_resource            resource;
+
+       /* If this resource is a depth-stencil buffer on evergreen, this contains
+        * the depth part of the format. There is a separate stencil resource
+        * for the stencil buffer below. */
+       enum pipe_format                real_format;
+
+       unsigned                        offset[PIPE_MAX_TEXTURE_LEVELS];
+       unsigned                        pitch_in_bytes[PIPE_MAX_TEXTURE_LEVELS];  /* transfer */
+       unsigned                        pitch_in_blocks[PIPE_MAX_TEXTURE_LEVELS]; /* texture resource */
+       unsigned                        layer_size[PIPE_MAX_TEXTURE_LEVELS];
+       unsigned                        array_mode[PIPE_MAX_TEXTURE_LEVELS];
+       unsigned                        pitch_override;
+       unsigned                        size;
+       unsigned                        tile_type;
+       unsigned                        depth;
+       unsigned                        dirty_db;
+       struct r600_resource_texture    *stencil; /* Stencil is in a separate buffer on Evergreen. */
+       struct r600_resource_texture    *flushed_depth_texture;
+       boolean                         is_flushing_texture;
+};
+
+#define R600_TEX_IS_TILED(tex, level) ((tex)->array_mode[level] != V_038000_ARRAY_LINEAR_GENERAL && (tex)->array_mode[level] != V_038000_ARRAY_LINEAR_ALIGNED)
+
+struct r600_surface {
+       struct pipe_surface             base;
+       unsigned                        aligned_height;
+};
+
+void r600_init_screen_resource_functions(struct pipe_screen *screen);
+
+/* r600_texture */
+struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
+                                       const struct pipe_resource *templ);
+struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
+                                               const struct pipe_resource *base,
+                                               struct winsys_handle *whandle);
+
+static INLINE struct r600_resource *r600_resource(struct pipe_resource *r)
+{
+       return (struct r600_resource*)r;
+}
+
+int r600_texture_depth_flush(struct pipe_context *ctx, struct pipe_resource *texture, boolean just_create);
+
+/* r600_texture.c texture transfer functions. */
+struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
+                                               struct pipe_resource *texture,
+                                               unsigned level,
+                                               unsigned usage,
+                                               const struct pipe_box *box);
+void r600_texture_transfer_destroy(struct pipe_context *ctx,
+                                  struct pipe_transfer *trans);
+void* r600_texture_transfer_map(struct pipe_context *ctx,
+                               struct pipe_transfer* transfer);
+void r600_texture_transfer_unmap(struct pipe_context *ctx,
+                                struct pipe_transfer* transfer);
+
+struct r600_context;
+
+void r600_upload_const_buffer(struct r600_context *rctx, struct r600_resource **rbuffer, uint32_t *offset);
+
+#endif
diff --git a/src/gallium/drivers/radeonsi/r600_state_common.c b/src/gallium/drivers/radeonsi/r600_state_common.c
new file mode 100644 (file)
index 0000000..4ba83de
--- /dev/null
@@ -0,0 +1,899 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *           2010 Jerome Glisse
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie <airlied@redhat.com>
+ *          Jerome Glisse <jglisse@redhat.com>
+ */
+#include "util/u_blitter.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "pipebuffer/pb_buffer.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "r600_hw_context_priv.h"
+#include "radeonsi_pipe.h"
+#include "sid.h"
+
+static void r600_emit_surface_sync(struct r600_context *rctx, struct r600_atom *atom)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       struct r600_atom_surface_sync *a = (struct r600_atom_surface_sync*)atom;
+
+       cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+       cs->buf[cs->cdw++] = a->flush_flags;  /* CP_COHER_CNTL */
+       cs->buf[cs->cdw++] = 0xffffffff;      /* CP_COHER_SIZE */
+       cs->buf[cs->cdw++] = 0;               /* CP_COHER_BASE */
+       cs->buf[cs->cdw++] = 0x0000000A;      /* POLL_INTERVAL */
+
+       a->flush_flags = 0;
+}
+
+static void r600_emit_r6xx_flush_and_inv(struct r600_context *rctx, struct r600_atom *atom)
+{
+       struct radeon_winsys_cs *cs = rctx->cs;
+       cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+       cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
+}
+
+static void r600_init_atom(struct r600_atom *atom,
+                          void (*emit)(struct r600_context *ctx, struct r600_atom *state),
+                          unsigned num_dw,
+                          enum r600_atom_flags flags)
+{
+       atom->emit = emit;
+       atom->num_dw = num_dw;
+       atom->flags = flags;
+}
+
+void r600_init_common_atoms(struct r600_context *rctx)
+{
+       r600_init_atom(&rctx->atom_surface_sync.atom,   r600_emit_surface_sync,         5, EMIT_EARLY);
+       r600_init_atom(&rctx->atom_r6xx_flush_and_inv,  r600_emit_r6xx_flush_and_inv,   2, EMIT_EARLY);
+}
+
+unsigned r600_get_cb_flush_flags(struct r600_context *rctx)
+{
+       unsigned flags = 0;
+
+       if (rctx->framebuffer.nr_cbufs) {
+               flags |= S_0085F0_CB_ACTION_ENA(1) |
+                        (((1 << rctx->framebuffer.nr_cbufs) - 1) << S_0085F0_CB0_DEST_BASE_ENA_SHIFT);
+       }
+
+       return flags;
+}
+
+void r600_texture_barrier(struct pipe_context *ctx)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       rctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1) | r600_get_cb_flush_flags(rctx);
+       r600_atom_dirty(rctx, &rctx->atom_surface_sync.atom);
+}
+
+static bool r600_conv_pipe_prim(unsigned pprim, unsigned *prim)
+{
+       static const int prim_conv[] = {
+               V_008958_DI_PT_POINTLIST,
+               V_008958_DI_PT_LINELIST,
+               V_008958_DI_PT_LINELOOP,
+               V_008958_DI_PT_LINESTRIP,
+               V_008958_DI_PT_TRILIST,
+               V_008958_DI_PT_TRISTRIP,
+               V_008958_DI_PT_TRIFAN,
+               V_008958_DI_PT_QUADLIST,
+               V_008958_DI_PT_QUADSTRIP,
+               V_008958_DI_PT_POLYGON,
+               -1,
+               -1,
+               -1,
+               -1
+       };
+
+       *prim = prim_conv[pprim];
+       if (*prim == -1) {
+               fprintf(stderr, "%s:%d unsupported %d\n", __func__, __LINE__, pprim);
+               return false;
+       }
+       return true;
+}
+
+/* common state between evergreen and r600 */
+void r600_bind_blend_state(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_blend *blend = (struct r600_pipe_blend *)state;
+       struct r600_pipe_state *rstate;
+
+       if (state == NULL)
+               return;
+       rstate = &blend->rstate;
+       rctx->states[rstate->id] = rstate;
+       rctx->cb_target_mask = blend->cb_target_mask;
+       rctx->cb_color_control = blend->cb_color_control;
+
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void r600_set_stencil_ref(struct pipe_context *ctx,
+                                const struct r600_stencil_ref *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state);
+
+       if (rstate == NULL)
+               return;
+
+       rstate->id = R600_PIPE_STATE_STENCIL_REF;
+       r600_pipe_state_add_reg(rstate,
+                               R_028430_DB_STENCILREFMASK,
+                               S_028430_STENCILTESTVAL(state->ref_value[0]) |
+                               S_028430_STENCILMASK(state->valuemask[0]) |
+                               S_028430_STENCILWRITEMASK(state->writemask[0]),
+                               NULL, 0);
+       r600_pipe_state_add_reg(rstate,
+                               R_028434_DB_STENCILREFMASK_BF,
+                               S_028434_STENCILTESTVAL_BF(state->ref_value[1]) |
+                               S_028434_STENCILMASK_BF(state->valuemask[1]) |
+                               S_028434_STENCILWRITEMASK_BF(state->writemask[1]),
+                               NULL, 0);
+
+       free(rctx->states[R600_PIPE_STATE_STENCIL_REF]);
+       rctx->states[R600_PIPE_STATE_STENCIL_REF] = rstate;
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+void r600_set_pipe_stencil_ref(struct pipe_context *ctx,
+                              const struct pipe_stencil_ref *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_dsa *dsa = (struct r600_pipe_dsa*)rctx->states[R600_PIPE_STATE_DSA];
+       struct r600_stencil_ref ref;
+
+       rctx->stencil_ref = *state;
+
+       if (!dsa)
+               return;
+
+       ref.ref_value[0] = state->ref_value[0];
+       ref.ref_value[1] = state->ref_value[1];
+       ref.valuemask[0] = dsa->valuemask[0];
+       ref.valuemask[1] = dsa->valuemask[1];
+       ref.writemask[0] = dsa->writemask[0];
+       ref.writemask[1] = dsa->writemask[1];
+
+       r600_set_stencil_ref(ctx, &ref);
+}
+
+void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_dsa *dsa = state;
+       struct r600_pipe_state *rstate;
+       struct r600_stencil_ref ref;
+
+       if (state == NULL)
+               return;
+       rstate = &dsa->rstate;
+       rctx->states[rstate->id] = rstate;
+       rctx->alpha_ref = dsa->alpha_ref;
+       rctx->alpha_ref_dirty = true;
+       r600_context_pipe_state_set(rctx, rstate);
+
+       ref.ref_value[0] = rctx->stencil_ref.ref_value[0];
+       ref.ref_value[1] = rctx->stencil_ref.ref_value[1];
+       ref.valuemask[0] = dsa->valuemask[0];
+       ref.valuemask[1] = dsa->valuemask[1];
+       ref.writemask[0] = dsa->writemask[0];
+       ref.writemask[1] = dsa->writemask[1];
+
+       r600_set_stencil_ref(ctx, &ref);
+}
+
+void r600_bind_rs_state(struct pipe_context *ctx, void *state)
+{
+       struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       if (state == NULL)
+               return;
+
+       rctx->sprite_coord_enable = rs->sprite_coord_enable;
+       rctx->pa_sc_line_stipple = rs->pa_sc_line_stipple;
+       rctx->pa_su_sc_mode_cntl = rs->pa_su_sc_mode_cntl;
+       rctx->pa_cl_clip_cntl = rs->pa_cl_clip_cntl;
+       rctx->pa_cl_vs_out_cntl = rs->pa_cl_vs_out_cntl;
+
+       rctx->rasterizer = rs;
+
+       rctx->states[rs->rstate.id] = &rs->rstate;
+       r600_context_pipe_state_set(rctx, &rs->rstate);
+
+       if (rctx->chip_class >= CAYMAN) {
+               cayman_polygon_offset_update(rctx);
+       }
+}
+
+void r600_delete_rs_state(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
+
+       if (rctx->rasterizer == rs) {
+               rctx->rasterizer = NULL;
+       }
+       if (rctx->states[rs->rstate.id] == &rs->rstate) {
+               rctx->states[rs->rstate.id] = NULL;
+       }
+       free(rs);
+}
+
+void r600_sampler_view_destroy(struct pipe_context *ctx,
+                              struct pipe_sampler_view *state)
+{
+       struct r600_pipe_sampler_view *resource = (struct r600_pipe_sampler_view *)state;
+
+       pipe_resource_reference(&state->texture, NULL);
+       FREE(resource);
+}
+
+void r600_delete_state(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
+
+       if (rctx->states[rstate->id] == rstate) {
+               rctx->states[rstate->id] = NULL;
+       }
+       for (int i = 0; i < rstate->nregs; i++) {
+               pipe_resource_reference((struct pipe_resource**)&rstate->regs[i].bo, NULL);
+       }
+       free(rstate);
+}
+
+void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_vertex_element *v = (struct r600_vertex_element*)state;
+
+       rctx->vertex_elements = v;
+       if (v) {
+               r600_inval_shader_cache(rctx);
+               u_vbuf_bind_vertex_elements(rctx->vbuf_mgr, state,
+                                               v->vmgr_elements);
+
+               rctx->states[v->rstate.id] = &v->rstate;
+               r600_context_pipe_state_set(rctx, &v->rstate);
+       }
+}
+
+void r600_delete_vertex_element(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_vertex_element *v = (struct r600_vertex_element*)state;
+
+       if (rctx->states[v->rstate.id] == &v->rstate) {
+               rctx->states[v->rstate.id] = NULL;
+       }
+       if (rctx->vertex_elements == state)
+               rctx->vertex_elements = NULL;
+
+       u_vbuf_destroy_vertex_elements(rctx->vbuf_mgr, v->vmgr_elements);
+       FREE(state);
+}
+
+
+void r600_set_index_buffer(struct pipe_context *ctx,
+                          const struct pipe_index_buffer *ib)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       u_vbuf_set_index_buffer(rctx->vbuf_mgr, ib);
+}
+
+void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
+                            const struct pipe_vertex_buffer *buffers)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       u_vbuf_set_vertex_buffers(rctx->vbuf_mgr, count, buffers);
+}
+
+void *si_create_vertex_elements(struct pipe_context *ctx,
+                               unsigned count,
+                               const struct pipe_vertex_element *elements)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element);
+
+       assert(count < 32);
+       if (!v)
+               return NULL;
+
+       v->count = count;
+       v->vmgr_elements =
+               u_vbuf_create_vertex_elements(rctx->vbuf_mgr, count,
+                                                 elements, v->elements);
+
+       return v;
+}
+
+void *si_create_shader_state(struct pipe_context *ctx,
+                             const struct pipe_shader_state *state)
+{
+       struct si_pipe_shader *shader = CALLOC_STRUCT(si_pipe_shader);
+
+       shader->tokens = tgsi_dup_tokens(state->tokens);
+       shader->so = state->stream_output;
+
+       return shader;
+}
+
+void r600_bind_ps_shader(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       if (rctx->ps_shader != state)
+               rctx->shader_dirty = true;
+
+       /* TODO delete old shader */
+       rctx->ps_shader = (struct si_pipe_shader *)state;
+       if (state) {
+               r600_inval_shader_cache(rctx);
+               r600_context_pipe_state_set(rctx, &rctx->ps_shader->rstate);
+       }
+}
+
+void r600_bind_vs_shader(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+
+       if (rctx->vs_shader != state)
+               rctx->shader_dirty = true;
+
+       /* TODO delete old shader */
+       rctx->vs_shader = (struct si_pipe_shader *)state;
+       if (state) {
+               r600_inval_shader_cache(rctx);
+               r600_context_pipe_state_set(rctx, &rctx->vs_shader->rstate);
+       }
+}
+
+void r600_delete_ps_shader(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct si_pipe_shader *shader = (struct si_pipe_shader *)state;
+
+       if (rctx->ps_shader == shader) {
+               rctx->ps_shader = NULL;
+       }
+
+       free(shader->tokens);
+       si_pipe_shader_destroy(ctx, shader);
+       free(shader);
+}
+
+void r600_delete_vs_shader(struct pipe_context *ctx, void *state)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct si_pipe_shader *shader = (struct si_pipe_shader *)state;
+
+       if (rctx->vs_shader == shader) {
+               rctx->vs_shader = NULL;
+       }
+
+       free(shader->tokens);
+       si_pipe_shader_destroy(ctx, shader);
+       free(shader);
+}
+
+static void r600_update_alpha_ref(struct r600_context *rctx)
+{
+#if 0
+       unsigned alpha_ref;
+       struct r600_pipe_state rstate;
+
+       alpha_ref = rctx->alpha_ref;
+       rstate.nregs = 0;
+       if (rctx->export_16bpc)
+               alpha_ref &= ~0x1FFF;
+       r600_pipe_state_add_reg(&rstate, R_028438_SX_ALPHA_REF, alpha_ref, NULL, 0);
+
+       r600_context_pipe_state_set(rctx, &rstate);
+       rctx->alpha_ref_dirty = false;
+#endif
+}
+
+void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
+                             struct pipe_resource *buffer)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_resource *rbuffer = r600_resource(buffer);
+       struct r600_pipe_state *rstate;
+       uint64_t va_offset;
+       uint32_t offset;
+
+       /* Note that the state tracker can unbind constant buffers by
+        * passing NULL here.
+        */
+       if (buffer == NULL) {
+               return;
+       }
+
+       r600_inval_shader_cache(rctx);
+
+       r600_upload_const_buffer(rctx, &rbuffer, &offset);
+       va_offset = r600_resource_va(ctx->screen, (void*)rbuffer);
+       va_offset += offset;
+       //va_offset >>= 8;
+
+       switch (shader) {
+       case PIPE_SHADER_VERTEX:
+               rstate = &rctx->vs_const_buffer;
+               rstate->nregs = 0;
+               r600_pipe_state_add_reg(rstate,
+                                       R_00B138_SPI_SHADER_USER_DATA_VS_2,
+                                       va_offset, rbuffer, RADEON_USAGE_READ);
+               r600_pipe_state_add_reg(rstate,
+                                       R_00B13C_SPI_SHADER_USER_DATA_VS_3,
+                                       va_offset >> 32, NULL, 0);
+               break;
+       case PIPE_SHADER_FRAGMENT:
+               rstate = &rctx->ps_const_buffer;
+               rstate->nregs = 0;
+               r600_pipe_state_add_reg(rstate,
+                                       R_00B030_SPI_SHADER_USER_DATA_PS_0,
+                                       va_offset, rbuffer, RADEON_USAGE_READ);
+               r600_pipe_state_add_reg(rstate,
+                                       R_00B034_SPI_SHADER_USER_DATA_PS_1,
+                                       va_offset >> 32, NULL, 0);
+               break;
+       default:
+               R600_ERR("unsupported %d\n", shader);
+               return;
+       }
+
+       r600_context_pipe_state_set(rctx, rstate);
+
+       if (buffer != &rbuffer->b.b.b)
+               pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL);
+}
+
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+                     struct pipe_resource *buffer,
+                     unsigned buffer_offset,
+                     unsigned buffer_size)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_so_target *t;
+       void *ptr;
+
+       t = CALLOC_STRUCT(r600_so_target);
+       if (!t) {
+               return NULL;
+       }
+
+       t->b.reference.count = 1;
+       t->b.context = ctx;
+       pipe_resource_reference(&t->b.buffer, buffer);
+       t->b.buffer_offset = buffer_offset;
+       t->b.buffer_size = buffer_size;
+
+       t->filled_size = (struct r600_resource*)
+               pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4);
+       ptr = rctx->ws->buffer_map(t->filled_size->buf, rctx->cs, PIPE_TRANSFER_WRITE);
+       memset(ptr, 0, t->filled_size->buf->size);
+       rctx->ws->buffer_unmap(t->filled_size->buf);
+
+       return &t->b;
+}
+
+void r600_so_target_destroy(struct pipe_context *ctx,
+                           struct pipe_stream_output_target *target)
+{
+       struct r600_so_target *t = (struct r600_so_target*)target;
+       pipe_resource_reference(&t->b.buffer, NULL);
+       pipe_resource_reference((struct pipe_resource**)&t->filled_size, NULL);
+       FREE(t);
+}
+
+void r600_set_so_targets(struct pipe_context *ctx,
+                        unsigned num_targets,
+                        struct pipe_stream_output_target **targets,
+                        unsigned append_bitmask)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       unsigned i;
+
+       /* Stop streamout. */
+       if (rctx->num_so_targets) {
+               r600_context_streamout_end(rctx);
+       }
+
+       /* Set the new targets. */
+       for (i = 0; i < num_targets; i++) {
+               pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], targets[i]);
+       }
+       for (; i < rctx->num_so_targets; i++) {
+               pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], NULL);
+       }
+
+       rctx->num_so_targets = num_targets;
+       rctx->streamout_start = num_targets != 0;
+       rctx->streamout_append_bitmask = append_bitmask;
+}
+
+static void r600_vertex_buffer_update(struct r600_context *rctx)
+{
+       struct pipe_context *ctx = &rctx->context;
+       struct r600_pipe_state *rstate = &rctx->vs_user_data;
+       struct r600_resource *rbuffer, *t_list_buffer;
+       struct pipe_vertex_buffer *vertex_buffer;
+       unsigned i, count, offset;
+       uint32_t *ptr;
+       uint64_t va;
+
+       r600_inval_vertex_cache(rctx);
+
+       if (rctx->vertex_elements->vbuffer_need_offset) {
+               /* one resource per vertex element */
+               count = rctx->vertex_elements->count;
+       } else {
+               /* bind vertex buffer once */
+               count = rctx->vbuf_mgr->nr_real_vertex_buffers;
+       }
+       assert(count <= 256 / 4);
+
+       t_list_buffer = (struct r600_resource*)
+               pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
+                                  PIPE_USAGE_IMMUTABLE, 4 * 4 * count);
+       if (t_list_buffer == NULL)
+               return;
+
+       ptr = (uint32_t*)rctx->ws->buffer_map(t_list_buffer->buf,
+                                             rctx->cs,
+                                             PIPE_TRANSFER_WRITE);
+
+       for (i = 0 ; i < count; i++, ptr += 4) {
+               struct pipe_vertex_element *velem = &rctx->vertex_elements->elements[i];
+               const struct util_format_description *desc;
+               unsigned data_format, num_format;
+               int first_non_void;
+
+               if (rctx->vertex_elements->vbuffer_need_offset) {
+                       /* one resource per vertex element */
+                       unsigned vbuffer_index;
+                       vbuffer_index = rctx->vertex_elements->elements[i].vertex_buffer_index;
+                       vertex_buffer = &rctx->vbuf_mgr->real_vertex_buffer[vbuffer_index];
+                       rbuffer = (struct r600_resource*)vertex_buffer->buffer;
+                       offset = rctx->vertex_elements->vbuffer_offset[i];
+               } else {
+                       /* bind vertex buffer once */
+                       vertex_buffer = &rctx->vbuf_mgr->real_vertex_buffer[i];
+                       rbuffer = (struct r600_resource*)vertex_buffer->buffer;
+                       offset = 0;
+               }
+               if (vertex_buffer == NULL || rbuffer == NULL)
+                       continue;
+               offset += vertex_buffer->buffer_offset;
+
+               va = r600_resource_va(ctx->screen, (void*)rbuffer);
+               va += offset;
+
+               desc = util_format_description(velem->src_format);
+               first_non_void = util_format_get_first_non_void_channel(velem->src_format);
+               data_format = si_translate_vertexformat(ctx->screen,
+                                                       velem->src_format,
+                                                       desc, first_non_void);
+
+               switch (desc->channel[first_non_void].type) {
+               case UTIL_FORMAT_TYPE_FIXED:
+                       num_format = V_008F0C_BUF_NUM_FORMAT_USCALED; /* XXX */
+                       break;
+               case UTIL_FORMAT_TYPE_SIGNED:
+                       num_format = V_008F0C_BUF_NUM_FORMAT_SNORM;
+                       break;
+               case UTIL_FORMAT_TYPE_UNSIGNED:
+                       num_format = V_008F0C_BUF_NUM_FORMAT_UNORM;
+                       break;
+               case UTIL_FORMAT_TYPE_FLOAT:
+               default:
+                       num_format = V_008F0C_BUF_NUM_FORMAT_FLOAT;
+               }
+
+               /* Fill in T# buffer resource description */
+               ptr[0] = va & 0xFFFFFFFF;
+               ptr[1] = ((va >> 32) & 0xFFFF) |
+                        (vertex_buffer->stride & 0x3FFF) << 16;
+               ptr[2] = (vertex_buffer->buffer->width0 - offset) / vertex_buffer->stride;
+               /* XXX: Hardcoding an RGBA destination swizzle:
+                * dst_sel_x..w = SQ_SEL_X(4), SQ_SEL_Y(5), SQ_SEL_Z(6), SQ_SEL_W(7) */
+               ptr[3] = 4 | 5 << 3 | 6 << 6 | 7 << 9 |
+                       num_format << 12 | data_format << 15;
+
+               r600_context_bo_reloc(rctx, rbuffer, RADEON_USAGE_READ);
+       }
+
+       rstate->nregs = 0;
+
+       va = r600_resource_va(ctx->screen, (void*)t_list_buffer);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B130_SPI_SHADER_USER_DATA_VS_0,
+                               va, t_list_buffer, RADEON_USAGE_READ);
+       r600_pipe_state_add_reg(rstate,
+                               R_00B134_SPI_SHADER_USER_DATA_VS_1,
+                               va >> 32,
+                               NULL, 0);
+
+       r600_context_pipe_state_set(rctx, rstate);
+}
+
+static void si_update_derived_state(struct r600_context *rctx)
+{
+       struct pipe_context *ctx = (struct pipe_context*)rctx;
+
+       if (!rctx->blitter->running) {
+               if (rctx->have_depth_fb || rctx->have_depth_texture)
+                       r600_flush_depth_textures(rctx);
+       }
+
+       if (rctx->shader_dirty) {
+               si_pipe_shader_destroy(&rctx->context, rctx->vs_shader);
+       }
+
+       if (rctx->shader_dirty ||
+           (rctx->ps_shader->shader.fs_write_all &&
+            (rctx->ps_shader->shader.nr_cbufs != rctx->nr_cbufs)) ||
+           (rctx->sprite_coord_enable &&
+            (rctx->ps_shader->sprite_coord_enable != rctx->sprite_coord_enable))) {
+               si_pipe_shader_destroy(&rctx->context, rctx->ps_shader);
+       }
+
+       if (rctx->alpha_ref_dirty) {
+               r600_update_alpha_ref(rctx);
+       }
+
+       if (!rctx->vs_shader->bo) {
+               si_pipe_shader_vs(ctx, rctx->vs_shader);
+
+               r600_context_pipe_state_set(rctx, &rctx->vs_shader->rstate);
+       }
+
+       if (!rctx->ps_shader->bo) {
+               si_pipe_shader_ps(ctx, rctx->ps_shader);
+
+               r600_context_pipe_state_set(rctx, &rctx->ps_shader->rstate);
+       }
+
+       if (rctx->shader_dirty) {
+               si_update_spi_map(rctx);
+               rctx->shader_dirty = false;
+       }
+}
+
+void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_pipe_dsa *dsa = (struct r600_pipe_dsa*)rctx->states[R600_PIPE_STATE_DSA];
+       struct pipe_draw_info info = *dinfo;
+       struct r600_draw rdraw = {};
+       struct pipe_index_buffer ib = {};
+       unsigned prim, mask, ls_mask = 0;
+       struct r600_block *dirty_block = NULL, *next_block = NULL;
+       struct r600_atom *state = NULL, *next_state = NULL;
+       int i;
+
+       if ((!info.count && (info.indexed || !info.count_from_stream_output)) ||
+           (info.indexed && !rctx->vbuf_mgr->index_buffer.buffer) ||
+           !r600_conv_pipe_prim(info.mode, &prim)) {
+               return;
+       }
+
+       if (!rctx->ps_shader || !rctx->vs_shader)
+               return;
+
+       si_update_derived_state(rctx);
+
+       u_vbuf_draw_begin(rctx->vbuf_mgr, &info);
+       r600_vertex_buffer_update(rctx);
+
+       rdraw.vgt_num_indices = info.count;
+       rdraw.vgt_num_instances = info.instance_count;
+
+       if (info.indexed) {
+               /* Initialize the index buffer struct. */
+               pipe_resource_reference(&ib.buffer, rctx->vbuf_mgr->index_buffer.buffer);
+               ib.index_size = rctx->vbuf_mgr->index_buffer.index_size;
+               ib.offset = rctx->vbuf_mgr->index_buffer.offset + info.start * ib.index_size;
+
+               /* Translate or upload, if needed. */
+               r600_translate_index_buffer(rctx, &ib, info.count);
+
+               if (u_vbuf_resource(ib.buffer)->user_ptr) {
+                       r600_upload_index_buffer(rctx, &ib, info.count);
+               }
+
+               /* Initialize the r600_draw struct with index buffer info. */
+               if (ib.index_size == 4) {
+                       rdraw.vgt_index_type = V_028A7C_VGT_INDEX_32 |
+                               (R600_BIG_ENDIAN ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
+               } else {
+                       rdraw.vgt_index_type = V_028A7C_VGT_INDEX_16 |
+                               (R600_BIG_ENDIAN ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
+               }
+               rdraw.indices = (struct r600_resource*)ib.buffer;
+               rdraw.indices_bo_offset = ib.offset;
+               rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_DMA;
+       } else {
+               info.index_bias = info.start;
+               rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+               if (info.count_from_stream_output) {
+                       rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1);
+
+                       r600_context_draw_opaque_count(rctx, (struct r600_so_target*)info.count_from_stream_output);
+               }
+       }
+
+       rctx->vs_shader_so_strides = rctx->vs_shader->so_strides;
+
+       mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
+
+       if (rctx->vgt.id != R600_PIPE_STATE_VGT) {
+               rctx->vgt.id = R600_PIPE_STATE_VGT;
+               rctx->vgt.nregs = 0;
+               r600_pipe_state_add_reg(&rctx->vgt, R_008958_VGT_PRIMITIVE_TYPE, prim, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028238_CB_TARGET_MASK, rctx->cb_target_mask & mask, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028400_VGT_MAX_VTX_INDX, ~0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028404_VGT_MIN_VTX_INDX, 0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028408_VGT_INDX_OFFSET, info.index_bias, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info.restart_index, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info.primitive_restart, NULL, 0);
+#if 0
+               r600_pipe_state_add_reg(&rctx->vgt, R_03CFF0_SQ_VTX_BASE_VTX_LOC, 0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance, NULL, 0);
+#endif
+               r600_pipe_state_add_reg(&rctx->vgt, R_028A0C_PA_SC_LINE_STIPPLE, 0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028814_PA_SU_SC_MODE_CNTL, 0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_02881C_PA_CL_VS_OUT_CNTL, 0, NULL, 0);
+               r600_pipe_state_add_reg(&rctx->vgt, R_028810_PA_CL_CLIP_CNTL, 0x0, NULL, 0);
+       }
+
+       rctx->vgt.nregs = 0;
+       r600_pipe_state_mod_reg(&rctx->vgt, prim);
+       r600_pipe_state_mod_reg(&rctx->vgt, rctx->cb_target_mask & mask);
+       r600_pipe_state_mod_reg(&rctx->vgt, ~0);
+       r600_pipe_state_mod_reg(&rctx->vgt, 0);
+       r600_pipe_state_mod_reg(&rctx->vgt, info.index_bias);
+       r600_pipe_state_mod_reg(&rctx->vgt, info.restart_index);
+       r600_pipe_state_mod_reg(&rctx->vgt, info.primitive_restart);
+#if 0
+       r600_pipe_state_mod_reg(&rctx->vgt, 0);
+       r600_pipe_state_mod_reg(&rctx->vgt, info.start_instance);
+#endif
+
+       if (prim == V_008958_DI_PT_LINELIST)
+               ls_mask = 1;
+       else if (prim == V_008958_DI_PT_LINESTRIP)
+               ls_mask = 2;
+       r600_pipe_state_mod_reg(&rctx->vgt, S_028A0C_AUTO_RESET_CNTL(ls_mask) | rctx->pa_sc_line_stipple);
+
+       if (info.mode == PIPE_PRIM_QUADS || info.mode == PIPE_PRIM_QUAD_STRIP || info.mode == PIPE_PRIM_POLYGON) {
+               r600_pipe_state_mod_reg(&rctx->vgt, S_028814_PROVOKING_VTX_LAST(1) | rctx->pa_su_sc_mode_cntl);
+       } else {
+               r600_pipe_state_mod_reg(&rctx->vgt, rctx->pa_su_sc_mode_cntl);
+       }
+       r600_pipe_state_mod_reg(&rctx->vgt,
+                               rctx->pa_cl_vs_out_cntl /*|
+                               (rctx->rasterizer->clip_plane_enable & rctx->vs_shader->shader.clip_dist_write)*/);
+       r600_pipe_state_mod_reg(&rctx->vgt,
+                               rctx->pa_cl_clip_cntl /*|
+                               (rctx->vs_shader->shader.clip_dist_write ||
+                                rctx->vs_shader->shader.vs_prohibit_ucps ?
+                                0 : rctx->rasterizer->clip_plane_enable & 0x3F)*/);
+
+       r600_context_pipe_state_set(rctx, &rctx->vgt);
+
+       rdraw.db_render_override = dsa->db_render_override;
+       rdraw.db_render_control = dsa->db_render_control;
+
+       /* Emit states. */
+       r600_need_cs_space(rctx, 0, TRUE);
+
+       LIST_FOR_EACH_ENTRY_SAFE(state, next_state, &rctx->dirty_states, head) {
+               r600_emit_atom(rctx, state);
+       }
+       LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &rctx->dirty, list) {
+               r600_context_block_emit_dirty(rctx, dirty_block);
+       }
+       rctx->pm4_dirty_cdwords = 0;
+
+       /* Enable stream out if needed. */
+       if (rctx->streamout_start) {
+               r600_context_streamout_begin(rctx);
+               rctx->streamout_start = FALSE;
+       }
+
+       for (i = 0; i < NUM_TEX_UNITS; i++) {
+               if (rctx->ps_samplers.views[i])
+                       r600_context_bo_reloc(rctx,
+                                             (struct r600_resource*)rctx->ps_samplers.views[i]->base.texture,
+                                             RADEON_USAGE_READ);
+       }
+
+       if (rctx->chip_class >= CAYMAN) {
+               evergreen_context_draw(rctx, &rdraw);
+       }
+
+       rctx->flags |= R600_CONTEXT_DST_CACHES_DIRTY | R600_CONTEXT_DRAW_PENDING;
+
+       if (rctx->framebuffer.zsbuf) {
+               struct pipe_resource *tex = rctx->framebuffer.zsbuf->texture;
+               ((struct r600_resource_texture *)tex)->dirty_db = TRUE;
+       }
+
+       pipe_resource_reference(&ib.buffer, NULL);
+       u_vbuf_draw_end(rctx->vbuf_mgr);
+}
+
+void _r600_pipe_state_add_reg(struct r600_context *ctx,
+                             struct r600_pipe_state *state,
+                             uint32_t offset, uint32_t value,
+                             uint32_t range_id, uint32_t block_id,
+                             struct r600_resource *bo,
+                             enum radeon_bo_usage usage)
+{
+       struct r600_range *range;
+       struct r600_block *block;
+
+       if (bo) assert(usage);
+
+       range = &ctx->range[range_id];
+       block = range->blocks[block_id];
+       state->regs[state->nregs].block = block;
+       state->regs[state->nregs].id = (offset - block->start_offset) >> 2;
+
+       state->regs[state->nregs].value = value;
+       state->regs[state->nregs].bo = bo;
+       state->regs[state->nregs].bo_usage = usage;
+
+       state->nregs++;
+       assert(state->nregs < R600_BLOCK_MAX_REG);
+}
+
+void r600_pipe_state_add_reg_noblock(struct r600_pipe_state *state,
+                                    uint32_t offset, uint32_t value,
+                                    struct r600_resource *bo,
+                                    enum radeon_bo_usage usage)
+{
+       if (bo) assert(usage);
+
+       state->regs[state->nregs].id = offset;
+       state->regs[state->nregs].block = NULL;
+       state->regs[state->nregs].value = value;
+       state->regs[state->nregs].bo = bo;
+       state->regs[state->nregs].bo_usage = usage;
+
+       state->nregs++;
+       assert(state->nregs < R600_BLOCK_MAX_REG);
+}
diff --git a/src/gallium/drivers/radeonsi/r600_texture.c b/src/gallium/drivers/radeonsi/r600_texture.c
new file mode 100644 (file)
index 0000000..c9e1b83
--- /dev/null
@@ -0,0 +1,825 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ *      Corbin Simpson
+ */
+#include <errno.h>
+#include "pipe/p_screen.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "pipebuffer/pb_buffer.h"
+#include "radeonsi_pipe.h"
+#include "r600_resource.h"
+#include "sid.h"
+
+/* Copy from a full GPU texture to a transfer's staging one. */
+static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+{
+       struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
+       struct pipe_resource *texture = transfer->resource;
+
+       ctx->resource_copy_region(ctx, rtransfer->staging_texture,
+                               0, 0, 0, 0, texture, transfer->level,
+                               &transfer->box);
+}
+
+
+/* Copy from a transfer's staging texture to a full GPU one. */
+static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+{
+       struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
+       struct pipe_resource *texture = transfer->resource;
+       struct pipe_box sbox;
+
+       sbox.x = sbox.y = sbox.z = 0;
+       sbox.width = transfer->box.width;
+       sbox.height = transfer->box.height;
+       /* XXX that might be wrong */
+       sbox.depth = 1;
+       ctx->resource_copy_region(ctx, texture, transfer->level,
+                                 transfer->box.x, transfer->box.y, transfer->box.z,
+                                 rtransfer->staging_texture,
+                                 0, &sbox);
+}
+
+unsigned r600_texture_get_offset(struct r600_resource_texture *rtex,
+                                       unsigned level, unsigned layer)
+{
+       unsigned offset = rtex->offset[level];
+
+       switch (rtex->resource.b.b.b.target) {
+       case PIPE_TEXTURE_3D:
+       case PIPE_TEXTURE_CUBE:
+       default:
+               return offset + layer * rtex->layer_size[level];
+       }
+}
+
+static unsigned r600_get_block_alignment(struct pipe_screen *screen,
+                                        enum pipe_format format,
+                                        unsigned array_mode)
+{
+       struct r600_screen* rscreen = (struct r600_screen *)screen;
+       unsigned pixsize = util_format_get_blocksize(format);
+       int p_align;
+
+       switch(array_mode) {
+#if 0
+       case V_038000_ARRAY_1D_TILED_THIN1:
+               p_align = MAX2(8,
+                              ((rscreen->tiling_info.group_bytes / 8 / pixsize)));
+               break;
+       case V_038000_ARRAY_2D_TILED_THIN1:
+               p_align = MAX2(rscreen->tiling_info.num_banks,
+                              (((rscreen->tiling_info.group_bytes / 8 / pixsize)) *
+                               rscreen->tiling_info.num_banks)) * 8;
+               break;
+       case V_038000_ARRAY_LINEAR_ALIGNED:
+               p_align = MAX2(64, rscreen->tiling_info.group_bytes / pixsize);
+               break;
+       case V_038000_ARRAY_LINEAR_GENERAL:
+#endif
+       default:
+               p_align = rscreen->tiling_info.group_bytes / pixsize;
+               break;
+       }
+       return p_align;
+}
+
+static unsigned r600_get_height_alignment(struct pipe_screen *screen,
+                                         unsigned array_mode)
+{
+       struct r600_screen* rscreen = (struct r600_screen *)screen;
+       int h_align;
+
+       switch (array_mode) {
+#if 0
+       case V_038000_ARRAY_2D_TILED_THIN1:
+               h_align = rscreen->tiling_info.num_channels * 8;
+               break;
+       case V_038000_ARRAY_1D_TILED_THIN1:
+       case V_038000_ARRAY_LINEAR_ALIGNED:
+               h_align = 8;
+               break;
+       case V_038000_ARRAY_LINEAR_GENERAL:
+#endif
+       default:
+               h_align = 1;
+               break;
+       }
+       return h_align;
+}
+
+static unsigned r600_get_base_alignment(struct pipe_screen *screen,
+                                       enum pipe_format format,
+                                       unsigned array_mode)
+{
+       struct r600_screen* rscreen = (struct r600_screen *)screen;
+       unsigned pixsize = util_format_get_blocksize(format);
+       int p_align = r600_get_block_alignment(screen, format, array_mode);
+       int h_align = r600_get_height_alignment(screen, array_mode);
+       int b_align;
+
+       switch (array_mode) {
+#if 0
+       case V_038000_ARRAY_2D_TILED_THIN1:
+               b_align = MAX2(rscreen->tiling_info.num_banks * rscreen->tiling_info.num_channels * 8 * 8 * pixsize,
+                              p_align * pixsize * h_align);
+               break;
+       case V_038000_ARRAY_1D_TILED_THIN1:
+       case V_038000_ARRAY_LINEAR_ALIGNED:
+       case V_038000_ARRAY_LINEAR_GENERAL:
+#endif
+       default:
+               b_align = rscreen->tiling_info.group_bytes;
+               break;
+       }
+       return b_align;
+}
+
+static unsigned mip_minify(unsigned size, unsigned level)
+{
+       unsigned val;
+       val = u_minify(size, level);
+       if (level > 0)
+               val = util_next_power_of_two(val);
+       return val;
+}
+
+static unsigned r600_texture_get_nblocksx(struct pipe_screen *screen,
+                                         struct r600_resource_texture *rtex,
+                                         unsigned level)
+{
+       struct pipe_resource *ptex = &rtex->resource.b.b.b;
+       unsigned nblocksx, block_align, width;
+       unsigned blocksize = util_format_get_blocksize(rtex->real_format);
+
+       if (rtex->pitch_override)
+               return rtex->pitch_override / blocksize;
+
+       width = mip_minify(ptex->width0, level);
+       nblocksx = util_format_get_nblocksx(rtex->real_format, width);
+
+       block_align = r600_get_block_alignment(screen, rtex->real_format,
+                                             rtex->array_mode[level]);
+       nblocksx = align(nblocksx, block_align);
+       return nblocksx;
+}
+
+static unsigned r600_texture_get_nblocksy(struct pipe_screen *screen,
+                                         struct r600_resource_texture *rtex,
+                                         unsigned level)
+{
+       struct pipe_resource *ptex = &rtex->resource.b.b.b;
+       unsigned height, tile_height;
+
+       height = mip_minify(ptex->height0, level);
+       height = util_format_get_nblocksy(rtex->real_format, height);
+       tile_height = r600_get_height_alignment(screen,
+                                               rtex->array_mode[level]);
+
+       /* XXX Hack around an alignment issue. Fewer tests fail with this.
+        *
+        * The thing is, depth-stencil buffers should be tiled, i.e.
+        * the alignment should be >= 8. If I make them tiled, stencil starts
+        * working because it no longer overlaps with the depth buffer
+        * in memory, but texturing like drawpix-stencil breaks. */
+       if (util_format_is_depth_or_stencil(rtex->real_format) && tile_height < 8)
+               tile_height = 8;
+
+       height = align(height, tile_height);
+       return height;
+}
+
+static void r600_texture_set_array_mode(struct pipe_screen *screen,
+                                       struct r600_resource_texture *rtex,
+                                       unsigned level, unsigned array_mode)
+{
+       struct pipe_resource *ptex = &rtex->resource.b.b.b;
+
+       switch (array_mode) {
+#if 0
+       case V_0280A0_ARRAY_LINEAR_GENERAL:
+       case V_0280A0_ARRAY_LINEAR_ALIGNED:
+       case V_0280A0_ARRAY_1D_TILED_THIN1:
+#endif
+       default:
+               rtex->array_mode[level] = array_mode;
+               break;
+#if 0
+       case V_0280A0_ARRAY_2D_TILED_THIN1:
+       {
+               unsigned w, h, tile_height, tile_width;
+
+               tile_height = r600_get_height_alignment(screen, array_mode);
+               tile_width = r600_get_block_alignment(screen, rtex->real_format, array_mode);
+
+               w = mip_minify(ptex->width0, level);
+               h = mip_minify(ptex->height0, level);
+               if (w <= tile_width || h <= tile_height)
+                       rtex->array_mode[level] = V_0280A0_ARRAY_1D_TILED_THIN1;
+               else
+                       rtex->array_mode[level] = array_mode;
+       }
+       break;
+#endif
+       }
+}
+
+static void r600_setup_miptree(struct pipe_screen *screen,
+                              struct r600_resource_texture *rtex,
+                              unsigned array_mode)
+{
+       struct pipe_resource *ptex = &rtex->resource.b.b.b;
+       enum chip_class chipc = ((struct r600_screen*)screen)->chip_class;
+       unsigned size, layer_size, i, offset;
+       unsigned nblocksx, nblocksy;
+
+       for (i = 0, offset = 0; i <= ptex->last_level; i++) {
+               unsigned blocksize = util_format_get_blocksize(rtex->real_format);
+               unsigned base_align = r600_get_base_alignment(screen, rtex->real_format, array_mode);
+
+               r600_texture_set_array_mode(screen, rtex, i, array_mode);
+
+               nblocksx = r600_texture_get_nblocksx(screen, rtex, i);
+               nblocksy = r600_texture_get_nblocksy(screen, rtex, i);
+
+               if (chipc >= CAYMAN /*&& array_mode == V_038000_ARRAY_LINEAR_GENERAL*/)
+                       layer_size = align(nblocksx, 64) * nblocksy * blocksize;
+               else
+                       layer_size = nblocksx * nblocksy * blocksize;
+
+               if (ptex->target == PIPE_TEXTURE_CUBE) {
+                       /* cube maps are padded to 8 layers on Cayman and newer */
+                       size = layer_size * (chipc >= CAYMAN ? 8 : 6);
+               }
+               else if (ptex->target == PIPE_TEXTURE_3D)
+                       size = layer_size * u_minify(ptex->depth0, i);
+               else
+                       size = layer_size * ptex->array_size;
+
+               /* align base image and start of miptree */
+               if ((i == 0) || (i == 1))
+                       offset = align(offset, base_align);
+               rtex->offset[i] = offset;
+               rtex->layer_size[i] = layer_size;
+               rtex->pitch_in_blocks[i] = nblocksx; /* CB talks in elements */
+               rtex->pitch_in_bytes[i] = nblocksx * blocksize;
+
+               offset += size;
+       }
+       rtex->size = offset;
+}
+
+/* Figure out whether u_blitter will fall back to a transfer operation.
+ * If so, don't use a staging resource.
+ */
+static boolean permit_hardware_blit(struct pipe_screen *screen,
+                                       const struct pipe_resource *res)
+{
+       unsigned bind;
+
+       if (util_format_is_depth_or_stencil(res->format))
+               bind = PIPE_BIND_DEPTH_STENCIL;
+       else
+               bind = PIPE_BIND_RENDER_TARGET;
+
+       /* hack around S3TC: always claim compressed formats are blittable */
+       if (util_format_is_compressed(res->format))
+               return TRUE;
+
+       if (!screen->is_format_supported(screen,
+                               res->format,
+                               res->target,
+                               res->nr_samples,
+                                bind))
+               return FALSE;
+
+       if (!screen->is_format_supported(screen,
+                               res->format,
+                               res->target,
+                               res->nr_samples,
+                                PIPE_BIND_SAMPLER_VIEW))
+               return FALSE;
+
+       switch (res->usage) {
+       case PIPE_USAGE_STREAM:
+       case PIPE_USAGE_STAGING:
+               return FALSE;
+
+       default:
+               return TRUE;
+       }
+}
+
+static boolean r600_texture_get_handle(struct pipe_screen* screen,
+                                       struct pipe_resource *ptex,
+                                       struct winsys_handle *whandle)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)ptex;
+       struct r600_resource *resource = &rtex->resource;
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+
+       return rscreen->ws->buffer_get_handle(resource->buf,
+                                             rtex->pitch_in_bytes[0], whandle);
+}
+
+static void r600_texture_destroy(struct pipe_screen *screen,
+                                struct pipe_resource *ptex)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)ptex;
+       struct r600_resource *resource = &rtex->resource;
+
+       if (rtex->flushed_depth_texture)
+               pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL);
+
+       if (rtex->stencil)
+               pipe_resource_reference((struct pipe_resource **)&rtex->stencil, NULL);
+
+       pb_reference(&resource->buf, NULL);
+       FREE(rtex);
+}
+
+static const struct u_resource_vtbl r600_texture_vtbl =
+{
+       r600_texture_get_handle,        /* get_handle */
+       r600_texture_destroy,           /* resource_destroy */
+       r600_texture_get_transfer,      /* get_transfer */
+       r600_texture_transfer_destroy,  /* transfer_destroy */
+       r600_texture_transfer_map,      /* transfer_map */
+       u_default_transfer_flush_region,/* transfer_flush_region */
+       r600_texture_transfer_unmap,    /* transfer_unmap */
+       u_default_transfer_inline_write /* transfer_inline_write */
+};
+
+static struct r600_resource_texture *
+r600_texture_create_object(struct pipe_screen *screen,
+                          const struct pipe_resource *base,
+                          unsigned array_mode,
+                          unsigned pitch_in_bytes_override,
+                          unsigned max_buffer_size,
+                          struct pb_buffer *buf,
+                          boolean alloc_bo)
+{
+       struct r600_resource_texture *rtex;
+       struct r600_resource *resource;
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+
+       rtex = CALLOC_STRUCT(r600_resource_texture);
+       if (rtex == NULL)
+               return NULL;
+
+       resource = &rtex->resource;
+       resource->b.b.b = *base;
+       resource->b.b.vtbl = &r600_texture_vtbl;
+       pipe_reference_init(&resource->b.b.b.reference, 1);
+       resource->b.b.b.screen = screen;
+       rtex->pitch_override = pitch_in_bytes_override;
+       rtex->real_format = base->format;
+
+       /* We must split depth and stencil into two separate buffers on Evergreen. */
+       if (!(base->flags & R600_RESOURCE_FLAG_TRANSFER) &&
+           ((struct r600_screen*)screen)->chip_class >= CAYMAN &&
+           util_format_is_depth_and_stencil(base->format)) {
+               struct pipe_resource stencil;
+               unsigned stencil_pitch_override = 0;
+
+               switch (base->format) {
+               case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+                       rtex->real_format = PIPE_FORMAT_Z24X8_UNORM;
+                       break;
+               case PIPE_FORMAT_S8_UINT_Z24_UNORM:
+                       rtex->real_format = PIPE_FORMAT_X8Z24_UNORM;
+                       break;
+               case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+                       rtex->real_format = PIPE_FORMAT_Z32_FLOAT;
+                       break;
+               default:
+                       assert(0);
+                       FREE(rtex);
+                       return NULL;
+               }
+
+               /* Divide the pitch in bytes by 4 for stencil, because it has a smaller pixel size. */
+               if (pitch_in_bytes_override) {
+                       assert(base->format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
+                              base->format == PIPE_FORMAT_S8_UINT_Z24_UNORM);
+                       stencil_pitch_override = pitch_in_bytes_override / 4;
+               }
+
+               /* Allocate the stencil buffer. */
+               stencil = *base;
+               stencil.format = PIPE_FORMAT_S8_UINT;
+               rtex->stencil = r600_texture_create_object(screen, &stencil, array_mode,
+                                                          stencil_pitch_override,
+                                                          max_buffer_size, NULL, FALSE);
+               if (!rtex->stencil) {
+                       FREE(rtex);
+                       return NULL;
+               }
+               /* Proceed in creating the depth buffer. */
+       }
+
+       /* only mark depth textures the HW can hit as depth textures */
+       if (util_format_is_depth_or_stencil(rtex->real_format) && permit_hardware_blit(screen, base))
+               rtex->depth = 1;
+
+       r600_setup_miptree(screen, rtex, array_mode);
+
+       /* If we initialized a separate stencil buffer for Evergreen, place it after depth. */
+       if (rtex->stencil) {
+               unsigned stencil_align, stencil_offset;
+
+               stencil_align = r600_get_base_alignment(screen, rtex->stencil->real_format, array_mode);
+               stencil_offset = align(rtex->size, stencil_align);
+
+               for (unsigned i = 0; i <= rtex->stencil->resource.b.b.b.last_level; i++)
+                       rtex->stencil->offset[i] += stencil_offset;
+
+               rtex->size = stencil_offset + rtex->stencil->size;
+       }
+
+       /* Now create the backing buffer. */
+       if (!buf && alloc_bo) {
+               struct pipe_resource *ptex = &rtex->resource.b.b.b;
+               unsigned base_align = r600_get_base_alignment(screen, ptex->format, array_mode);
+
+               if (!r600_init_resource(rscreen, resource, rtex->size, base_align, base->bind, base->usage)) {
+                       pipe_resource_reference((struct pipe_resource**)&rtex->stencil, NULL);
+                       FREE(rtex);
+                       return NULL;
+               }
+       } else if (buf) {
+               resource->buf = buf;
+               resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf);
+               resource->domains = RADEON_DOMAIN_GTT | RADEON_DOMAIN_VRAM;
+       }
+
+       if (rtex->stencil) {
+               pb_reference(&rtex->stencil->resource.buf, rtex->resource.buf);
+               rtex->stencil->resource.cs_buf = rtex->resource.cs_buf;
+               rtex->stencil->resource.domains = rtex->resource.domains;
+       }
+       return rtex;
+}
+
+DEBUG_GET_ONCE_BOOL_OPTION(tiling_enabled, "R600_TILING", FALSE);
+
+struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
+                                               const struct pipe_resource *templ)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       unsigned array_mode = 0;
+
+       if (!(templ->flags & R600_RESOURCE_FLAG_TRANSFER) &&
+           !(templ->bind & PIPE_BIND_SCANOUT)) {
+#if 0
+               if (util_format_is_compressed(templ->format)) {
+                       array_mode = V_038000_ARRAY_1D_TILED_THIN1;
+               }
+               else if (debug_get_option_tiling_enabled() &&
+                        rscreen->info.drm_minor >= 9 &&
+                        permit_hardware_blit(screen, templ)) {
+                       array_mode = V_038000_ARRAY_2D_TILED_THIN1;
+               }
+#endif
+       }
+
+       return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode,
+                                                                 0, 0, NULL, TRUE);
+}
+
+static struct pipe_surface *r600_create_surface(struct pipe_context *pipe,
+                                               struct pipe_resource *texture,
+                                               const struct pipe_surface *surf_tmpl)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
+       struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
+       unsigned level = surf_tmpl->u.tex.level;
+
+       assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
+       if (surface == NULL)
+               return NULL;
+       /* XXX no offset */
+/*     offset = r600_texture_get_offset(rtex, level, surf_tmpl->u.tex.first_layer);*/
+       pipe_reference_init(&surface->base.reference, 1);
+       pipe_resource_reference(&surface->base.texture, texture);
+       surface->base.context = pipe;
+       surface->base.format = surf_tmpl->format;
+       surface->base.width = mip_minify(texture->width0, level);
+       surface->base.height = mip_minify(texture->height0, level);
+       surface->base.usage = surf_tmpl->usage;
+       surface->base.texture = texture;
+       surface->base.u.tex.first_layer = surf_tmpl->u.tex.first_layer;
+       surface->base.u.tex.last_layer = surf_tmpl->u.tex.last_layer;
+       surface->base.u.tex.level = level;
+
+       surface->aligned_height = r600_texture_get_nblocksy(pipe->screen,
+                                                           rtex, level);
+       return &surface->base;
+}
+
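r600_create_surface sizes the surface view with mip_minify, which halves each base dimension per mip level while clamping at 1. A minimal sketch of that helper (the real one lives in gallium's u_math utilities):

```c
#include <assert.h>

/* Halve a base dimension once per mip level, never going below 1,
 * mirroring gallium's mip_minify(). */
static unsigned minify(unsigned base, unsigned level)
{
	unsigned d = base >> level;
	return d ? d : 1;
}
```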
+static void r600_surface_destroy(struct pipe_context *pipe,
+                                struct pipe_surface *surface)
+{
+       pipe_resource_reference(&surface->texture, NULL);
+       FREE(surface);
+}
+
+struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
+                                              const struct pipe_resource *templ,
+                                              struct winsys_handle *whandle)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       struct pb_buffer *buf = NULL;
+       unsigned stride = 0;
+       unsigned array_mode = 0;
+       enum radeon_bo_layout micro, macro;
+
+       /* Support only 2D textures without mipmaps */
+       if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
+             templ->depth0 != 1 || templ->last_level != 0)
+               return NULL;
+
+       buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride);
+       if (!buf)
+               return NULL;
+
+       rscreen->ws->buffer_get_tiling(buf, &micro, &macro, NULL, NULL, NULL, NULL, NULL);
+
+#if 0
+       if (macro == RADEON_LAYOUT_TILED)
+               array_mode = V_0280A0_ARRAY_2D_TILED_THIN1;
+       else if (micro == RADEON_LAYOUT_TILED)
+               array_mode = V_0280A0_ARRAY_1D_TILED_THIN1;
+       else
+#endif
+               array_mode = 0;
+
+       return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode,
+                                                                 stride, 0, buf, FALSE);
+}
+
+int r600_texture_depth_flush(struct pipe_context *ctx,
+                            struct pipe_resource *texture, boolean just_create)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
+       struct pipe_resource resource;
+
+       if (rtex->flushed_depth_texture)
+               goto out;
+
+       resource.target = texture->target;
+       resource.format = texture->format;
+       resource.width0 = texture->width0;
+       resource.height0 = texture->height0;
+       resource.depth0 = texture->depth0;
+       resource.array_size = texture->array_size;
+       resource.last_level = texture->last_level;
+       resource.nr_samples = texture->nr_samples;
+       resource.usage = PIPE_USAGE_DYNAMIC;
+       resource.bind = texture->bind | PIPE_BIND_DEPTH_STENCIL;
+       resource.flags = R600_RESOURCE_FLAG_TRANSFER | texture->flags;
+
+       rtex->flushed_depth_texture = (struct r600_resource_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+       if (rtex->flushed_depth_texture == NULL) {
+               R600_ERR("failed to create temporary texture to hold untiled copy\n");
+               return -ENOMEM;
+       }
+
+       ((struct r600_resource_texture *)rtex->flushed_depth_texture)->is_flushing_texture = TRUE;
+out:
+       if (just_create)
+               return 0;
+
+       /* XXX: only do this if the depth texture has actually changed:
+        */
+       r600_blit_uncompress_depth(ctx, rtex);
+       return 0;
+}
+
+/* Needs adjustment for the pixel format (this counts texels, not bytes):
+ */
+static INLINE unsigned u_box_volume( const struct pipe_box *box )
+{
+       return box->width * box->depth * box->height;
+}
+
+struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
+                                               struct pipe_resource *texture,
+                                               unsigned level,
+                                               unsigned usage,
+                                               const struct pipe_box *box)
+{
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
+       struct pipe_resource resource;
+       struct r600_transfer *trans;
+       int r;
+       boolean use_staging_texture = FALSE;
+
+#if 0
+       /* We cannot map a tiled texture directly because the data is
+        * in a different order, therefore we do detiling using a blit.
+        *
+        * Also, use a temporary in GTT memory for read transfers, as
+        * the CPU is much happier reading out of cached system memory
+        * than uncached VRAM.
+        */
+       if (R600_TEX_IS_TILED(rtex, level))
+               use_staging_texture = TRUE;
+#endif
+
+       if ((usage & PIPE_TRANSFER_READ) && u_box_volume(box) > 1024)
+               use_staging_texture = TRUE;
+
+       /* XXX: Use a staging texture for uploads if the underlying BO
+        * is busy.  There is currently no interface for checking that,
+        * so do it eagerly whenever the transfer doesn't require a
+        * readback and might block.
+        */
+       if ((usage & PIPE_TRANSFER_WRITE) &&
+                       !(usage & (PIPE_TRANSFER_READ |
+                                       PIPE_TRANSFER_DONTBLOCK |
+                                       PIPE_TRANSFER_UNSYNCHRONIZED)))
+               use_staging_texture = TRUE;
+
+       if (!permit_hardware_blit(ctx->screen, texture) ||
+               (texture->flags & R600_RESOURCE_FLAG_TRANSFER))
+               use_staging_texture = FALSE;
+
+       if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY))
+               return NULL;
+
+       trans = CALLOC_STRUCT(r600_transfer);
+       if (trans == NULL)
+               return NULL;
+       pipe_resource_reference(&trans->transfer.resource, texture);
+       trans->transfer.level = level;
+       trans->transfer.usage = usage;
+       trans->transfer.box = *box;
+       if (rtex->depth) {
+               /* XXX: Only read back the rectangle which is being mapped?
+               */
+               /* XXX: When discard is set, there is no need to read back
+                * from the depth texture.
+               */
+               r = r600_texture_depth_flush(ctx, texture, FALSE);
+               if (r < 0) {
+                       R600_ERR("failed to create temporary texture to hold untiled copy\n");
+                       pipe_resource_reference(&trans->transfer.resource, NULL);
+                       FREE(trans);
+                       return NULL;
+               }
+               trans->transfer.stride = rtex->flushed_depth_texture->pitch_in_bytes[level];
+               trans->offset = r600_texture_get_offset(rtex->flushed_depth_texture, level, box->z);
+               return &trans->transfer;
+       } else if (use_staging_texture) {
+               resource.target = PIPE_TEXTURE_2D;
+               resource.format = texture->format;
+               resource.width0 = box->width;
+               resource.height0 = box->height;
+               resource.depth0 = 1;
+               resource.array_size = 1;
+               resource.last_level = 0;
+               resource.nr_samples = 0;
+               resource.usage = PIPE_USAGE_STAGING;
+               resource.bind = 0;
+               resource.flags = R600_RESOURCE_FLAG_TRANSFER;
+               /* For texture reading, the temporary (detiled) texture is used as
+                * a render target when blitting from a tiled texture. */
+               if (usage & PIPE_TRANSFER_READ) {
+                       resource.bind |= PIPE_BIND_RENDER_TARGET;
+               }
+               /* For texture writing, the temporary texture is used as a sampler
+                * when blitting into a tiled texture. */
+               if (usage & PIPE_TRANSFER_WRITE) {
+                       resource.bind |= PIPE_BIND_SAMPLER_VIEW;
+               }
+               /* Create the temporary texture. */
+               trans->staging_texture = ctx->screen->resource_create(ctx->screen, &resource);
+               if (trans->staging_texture == NULL) {
+                       R600_ERR("failed to create temporary texture to hold untiled copy\n");
+                       pipe_resource_reference(&trans->transfer.resource, NULL);
+                       FREE(trans);
+                       return NULL;
+               }
+
+               trans->transfer.stride =
+                       ((struct r600_resource_texture *)trans->staging_texture)->pitch_in_bytes[0];
+               if (usage & PIPE_TRANSFER_READ) {
+                       r600_copy_to_staging_texture(ctx, trans);
+                       /* Always referenced in the blit. */
+                       radeonsi_flush(ctx, NULL, 0);
+               }
+               return &trans->transfer;
+       }
+       trans->transfer.stride = rtex->pitch_in_bytes[level];
+       trans->transfer.layer_stride = rtex->layer_size[level];
+       trans->offset = r600_texture_get_offset(rtex, level, box->z);
+       return &trans->transfer;
+}
+
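The staging heuristics in r600_texture_get_transfer above boil down to: read transfers covering more than 1024 texels are staged, write-only transfers that might stall are staged, and resources that cannot be hardware-blitted are never staged. A simplified restatement as a standalone predicate (the flag names and the `hw_blit_ok` parameter are stand-ins for the PIPE_TRANSFER_* flags and permit_hardware_blit, not the driver's API):

```c
#include <assert.h>

enum {
	XFER_READ      = 1,
	XFER_WRITE     = 2,
	XFER_DONTBLOCK = 4,
	XFER_UNSYNC    = 8,
};

/* Decide whether a transfer should go through a staging texture:
 * large reads and potentially-blocking writes are staged; resources
 * the hardware cannot blit never are. */
static int use_staging(unsigned usage, unsigned texels, int hw_blit_ok)
{
	int staging = 0;

	if ((usage & XFER_READ) && texels > 1024)
		staging = 1;
	if ((usage & XFER_WRITE) &&
	    !(usage & (XFER_READ | XFER_DONTBLOCK | XFER_UNSYNC)))
		staging = 1;
	if (!hw_blit_ok)
		staging = 0;
	return staging;
}
```

The unblittable check comes last so it overrides both heuristics, exactly as the sequence of assignments does in the function above.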
+void r600_texture_transfer_destroy(struct pipe_context *ctx,
+                                  struct pipe_transfer *transfer)
+{
+       struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+       struct pipe_resource *texture = transfer->resource;
+       struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
+
+       if (rtransfer->staging_texture) {
+               if (transfer->usage & PIPE_TRANSFER_WRITE) {
+                       r600_copy_from_staging_texture(ctx, rtransfer);
+               }
+               pipe_resource_reference(&rtransfer->staging_texture, NULL);
+       }
+
+       if (rtex->depth && !rtex->is_flushing_texture) {
+               if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtex->flushed_depth_texture)
+                       r600_blit_push_depth(ctx, rtex);
+       }
+
+       pipe_resource_reference(&transfer->resource, NULL);
+       FREE(transfer);
+}
+
+void* r600_texture_transfer_map(struct pipe_context *ctx,
+                               struct pipe_transfer* transfer)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+       struct pb_buffer *buf;
+       enum pipe_format format = transfer->resource->format;
+       unsigned offset = 0;
+       char *map;
+
+       if (rtransfer->staging_texture) {
+               buf = ((struct r600_resource *)rtransfer->staging_texture)->buf;
+       } else {
+               struct r600_resource_texture *rtex = (struct r600_resource_texture*)transfer->resource;
+
+               if (rtex->flushed_depth_texture)
+                       buf = ((struct r600_resource *)rtex->flushed_depth_texture)->buf;
+               else
+                       buf = ((struct r600_resource *)transfer->resource)->buf;
+
+               offset = rtransfer->offset +
+                       transfer->box.y / util_format_get_blockheight(format) * transfer->stride +
+                       transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+       }
+
+       if (!(map = rctx->ws->buffer_map(buf, rctx->cs, transfer->usage))) {
+               return NULL;
+       }
+
+       return map + offset;
+}
+
+void r600_texture_transfer_unmap(struct pipe_context *ctx,
+                                struct pipe_transfer* transfer)
+{
+       struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+       struct r600_context *rctx = (struct r600_context*)ctx;
+       struct pb_buffer *buf;
+
+       if (rtransfer->staging_texture) {
+               buf = ((struct r600_resource *)rtransfer->staging_texture)->buf;
+       } else {
+               struct r600_resource_texture *rtex = (struct r600_resource_texture*)transfer->resource;
+
+               if (rtex->flushed_depth_texture) {
+                       buf = ((struct r600_resource *)rtex->flushed_depth_texture)->buf;
+               } else {
+                       buf = ((struct r600_resource *)transfer->resource)->buf;
+               }
+       }
+       rctx->ws->buffer_unmap(buf);
+}
+
+void r600_init_surface_functions(struct r600_context *r600)
+{
+       r600->context.create_surface = r600_create_surface;
+       r600->context.surface_destroy = r600_surface_destroy;
+}
diff --git a/src/gallium/drivers/radeonsi/r600_translate.c b/src/gallium/drivers/radeonsi/r600_translate.c
new file mode 100644 (file)
index 0000000..6551044
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie <airlied@redhat.com>
+ */
+
+#include "util/u_index_modify.h"
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "radeonsi_pipe.h"
+
+
+void r600_translate_index_buffer(struct r600_context *r600,
+                                struct pipe_index_buffer *ib,
+                                unsigned count)
+{
+       struct pipe_resource *out_buffer = NULL;
+       unsigned out_offset;
+       void *ptr;
+
+       switch (ib->index_size) {
+       case 1:
+               u_upload_alloc(r600->vbuf_mgr->uploader, 0, count * 2,
+                              &out_offset, &out_buffer, &ptr);
+
+               util_shorten_ubyte_elts_to_userptr(
+                               &r600->context, ib->buffer, 0, ib->offset, count, ptr);
+
+               pipe_resource_reference(&ib->buffer, NULL);
+               ib->buffer = out_buffer;
+               ib->offset = out_offset;
+               ib->index_size = 2;
+               break;
+       }
+}
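r600_translate_index_buffer above widens 8-bit indices to 16-bit because the hardware lacks native ubyte index support; the upload is sized `count * 2` accordingly. The per-element conversion performed by util_shorten_ubyte_elts_to_userptr amounts to (a hypothetical standalone sketch, not the util function itself):

```c
#include <stdint.h>
#include <stddef.h>

/* Widen 8-bit vertex indices to 16-bit, as needed when translating an
 * index buffer for hardware without ubyte index support. */
static void widen_ubyte_indices(const uint8_t *in, uint16_t *out, size_t count)
{
	for (size_t i = 0; i < count; i++)
		out[i] = in[i];
}
```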
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c
new file mode 100644 (file)
index 0000000..9e84952
--- /dev/null
@@ -0,0 +1,731 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdio.h>
+#include <errno.h>
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "util/u_blitter.h"
+#include "util/u_double_list.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_transfer.h"
+#include "util/u_surface.h"
+#include "util/u_pack_color.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "util/u_upload_mgr.h"
+#include "vl/vl_decoder.h"
+#include "vl/vl_video_buffer.h"
+#include "os/os_time.h"
+#include "pipebuffer/pb_buffer.h"
+#include "r600.h"
+#include "sid.h"
+#include "r600_resource.h"
+#include "radeonsi_pipe.h"
+#include "r600_hw_context_priv.h"
+
+/*
+ * pipe_context
+ */
+static struct r600_fence *r600_create_fence(struct r600_context *rctx)
+{
+       struct r600_screen *rscreen = rctx->screen;
+       struct r600_fence *fence = NULL;
+
+       pipe_mutex_lock(rscreen->fences.mutex);
+
+       if (!rscreen->fences.bo) {
+               /* Create the shared buffer object */
+               rscreen->fences.bo = (struct r600_resource*)
+                       pipe_buffer_create(&rscreen->screen, PIPE_BIND_CUSTOM,
+                                          PIPE_USAGE_STAGING, 4096);
+               if (!rscreen->fences.bo) {
+                       R600_ERR("r600: failed to create bo for fence objects\n");
+                       goto out;
+               }
+               rscreen->fences.data = rctx->ws->buffer_map(rscreen->fences.bo->buf,
+                                                          rctx->cs,
+                                                          PIPE_TRANSFER_READ_WRITE);
+       }
+
+       if (!LIST_IS_EMPTY(&rscreen->fences.pool)) {
+               struct r600_fence *entry;
+
+               /* Try to find a freed fence that has been signalled */
+               LIST_FOR_EACH_ENTRY(entry, &rscreen->fences.pool, head) {
+                       if (rscreen->fences.data[entry->index] != 0) {
+                               LIST_DELINIT(&entry->head);
+                               fence = entry;
+                               break;
+                       }
+               }
+       }
+
+       if (!fence) {
+               /* Allocate a new fence */
+               struct r600_fence_block *block;
+               unsigned index;
+
+               if ((rscreen->fences.next_index + 1) >= 1024) {
+                       R600_ERR("r600: too many concurrent fences\n");
+                       goto out;
+               }
+
+               index = rscreen->fences.next_index++;
+
+               if (!(index % FENCE_BLOCK_SIZE)) {
+                       /* Allocate a new block */
+                       block = CALLOC_STRUCT(r600_fence_block);
+                       if (block == NULL)
+                               goto out;
+
+                       LIST_ADD(&block->head, &rscreen->fences.blocks);
+               } else {
+                       block = LIST_ENTRY(struct r600_fence_block, rscreen->fences.blocks.next, head);
+               }
+
+               fence = &block->fences[index % FENCE_BLOCK_SIZE];
+               fence->index = index;
+       }
+
+       pipe_reference_init(&fence->reference, 1);
+
+       rscreen->fences.data[fence->index] = 0;
+       r600_context_emit_fence(rctx, rscreen->fences.bo, fence->index, 1);
+
+       /* Create a dummy BO so that fence_finish without a timeout can sleep waiting for completion */
+       fence->sleep_bo = (struct r600_resource*)
+                       pipe_buffer_create(&rctx->screen->screen, PIPE_BIND_CUSTOM,
+                                          PIPE_USAGE_STAGING, 1);
+       /* Add the fence as a dummy relocation. */
+       r600_context_bo_reloc(rctx, fence->sleep_bo, RADEON_USAGE_READWRITE);
+
+out:
+       pipe_mutex_unlock(rscreen->fences.mutex);
+       return fence;
+}
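r600_create_fence draws fences from fixed-size blocks: a monotonically increasing index is split into a block number and a slot within the block, and a fresh block is allocated whenever the index lands on a block boundary. A minimal sketch of that mapping (the FENCE_BLOCK_SIZE value here is an assumption for illustration; the driver defines its own, and the 1024 cap matches the check in the function above):

```c
#include <assert.h>

#define FENCE_BLOCK_SIZE 16   /* assumed; the driver header defines the real value */
#define MAX_FENCES       1024 /* cap enforced in r600_create_fence */

/* A new block must be allocated when the index is the first slot
 * of a block. */
static int fence_needs_new_block(unsigned index)
{
	return (index % FENCE_BLOCK_SIZE) == 0;
}

static unsigned fence_block(unsigned index)
{
	return index / FENCE_BLOCK_SIZE;
}

static unsigned fence_slot(unsigned index)
{
	return index % FENCE_BLOCK_SIZE;
}
```

Because indices are handed out sequentially, the most recently added block is always the one at the head of the block list, which is why the non-boundary case can simply take `rscreen->fences.blocks.next`.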
+
+
+void radeonsi_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
+                   unsigned flags)
+{
+       struct r600_context *rctx = (struct r600_context *)ctx;
+       struct r600_fence **rfence = (struct r600_fence**)fence;
+       struct pipe_query *render_cond = NULL;
+       unsigned render_cond_mode = 0;
+
+       if (rfence)
+               *rfence = r600_create_fence(rctx);
+
+       /* Disable render condition. */
+       if (rctx->current_render_cond) {
+               render_cond = rctx->current_render_cond;
+               render_cond_mode = rctx->current_render_cond_mode;
+               ctx->render_condition(ctx, NULL, 0);
+       }
+
+       r600_context_flush(rctx, flags);
+
+       /* Re-enable render condition. */
+       if (render_cond) {
+               ctx->render_condition(ctx, render_cond, render_cond_mode);
+       }
+}
+
+static void r600_flush_from_st(struct pipe_context *ctx,
+                              struct pipe_fence_handle **fence)
+{
+       radeonsi_flush(ctx, fence, 0);
+}
+
+static void r600_flush_from_winsys(void *ctx, unsigned flags)
+{
+       radeonsi_flush((struct pipe_context*)ctx, NULL, flags);
+}
+
+static void r600_update_num_contexts(struct r600_screen *rscreen, int diff)
+{
+       pipe_mutex_lock(rscreen->mutex_num_contexts);
+       if (diff > 0) {
+               rscreen->num_contexts++;
+
+               if (rscreen->num_contexts > 1)
+                       util_slab_set_thread_safety(&rscreen->pool_buffers,
+                                                   UTIL_SLAB_MULTITHREADED);
+       } else {
+               rscreen->num_contexts--;
+
+               if (rscreen->num_contexts <= 1)
+                       util_slab_set_thread_safety(&rscreen->pool_buffers,
+                                                   UTIL_SLAB_SINGLETHREADED);
+       }
+       pipe_mutex_unlock(rscreen->mutex_num_contexts);
+}
+
+static void r600_destroy_context(struct pipe_context *context)
+{
+       struct r600_context *rctx = (struct r600_context *)context;
+
+       rctx->context.delete_depth_stencil_alpha_state(&rctx->context, rctx->custom_dsa_flush);
+       util_unreference_framebuffer_state(&rctx->framebuffer);
+
+       r600_context_fini(rctx);
+
+       util_blitter_destroy(rctx->blitter);
+
+       for (int i = 0; i < R600_PIPE_NSTATES; i++) {
+               free(rctx->states[i]);
+       }
+
+       u_vbuf_destroy(rctx->vbuf_mgr);
+       util_slab_destroy(&rctx->pool_transfers);
+
+       r600_update_num_contexts(rctx->screen, -1);
+
+       FREE(rctx);
+}
+
+static struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv)
+{
+       struct r600_context *rctx = CALLOC_STRUCT(r600_context);
+       struct r600_screen* rscreen = (struct r600_screen *)screen;
+
+       if (rctx == NULL)
+               return NULL;
+
+       r600_update_num_contexts(rscreen, 1);
+
+       rctx->context.screen = screen;
+       rctx->context.priv = priv;
+       rctx->context.destroy = r600_destroy_context;
+       rctx->context.flush = r600_flush_from_st;
+
+       /* Easy accessing of screen/winsys. */
+       rctx->screen = rscreen;
+       rctx->ws = rscreen->ws;
+       rctx->family = rscreen->family;
+       rctx->chip_class = rscreen->chip_class;
+
+       r600_init_blit_functions(rctx);
+       r600_init_query_functions(rctx);
+       r600_init_context_resource_functions(rctx);
+       r600_init_surface_functions(rctx);
+       rctx->context.draw_vbo = r600_draw_vbo;
+
+       rctx->context.create_video_decoder = vl_create_decoder;
+       rctx->context.create_video_buffer = vl_video_buffer_create;
+
+       r600_init_common_atoms(rctx);
+
+       switch (rctx->chip_class) {
+       case TAHITI:
+               cayman_init_state_functions(rctx);
+               if (si_context_init(rctx)) {
+                       r600_destroy_context(&rctx->context);
+                       return NULL;
+               }
+               si_init_config(rctx);
+               rctx->custom_dsa_flush = cayman_create_db_flush_dsa(rctx);
+               break;
+       default:
+               R600_ERR("Unsupported chip class %d.\n", rctx->chip_class);
+               r600_destroy_context(&rctx->context);
+               return NULL;
+       }
+
+       rctx->ws->cs_set_flush_callback(rctx->cs, r600_flush_from_winsys, rctx);
+
+       util_slab_create(&rctx->pool_transfers,
+                        sizeof(struct pipe_transfer), 64,
+                        UTIL_SLAB_SINGLETHREADED);
+
+       rctx->vbuf_mgr = u_vbuf_create(&rctx->context, 1024 * 1024, 256,
+                                          PIPE_BIND_VERTEX_BUFFER |
+                                          PIPE_BIND_INDEX_BUFFER |
+                                          PIPE_BIND_CONSTANT_BUFFER,
+                                          U_VERTEX_FETCH_DWORD_ALIGNED);
+       if (!rctx->vbuf_mgr) {
+               r600_destroy_context(&rctx->context);
+               return NULL;
+       }
+       rctx->vbuf_mgr->caps.format_fixed32 = 0;
+
+       rctx->blitter = util_blitter_create(&rctx->context);
+       if (rctx->blitter == NULL) {
+               r600_destroy_context(&rctx->context);
+               return NULL;
+       }
+
+       LIST_INITHEAD(&rctx->dirty_states);
+
+       r600_get_backend_mask(rctx); /* this emits commands and must be last */
+
+       return &rctx->context;
+}
+
+/*
+ * pipe_screen
+ */
+static const char* r600_get_vendor(struct pipe_screen* pscreen)
+{
+       return "X.Org";
+}
+
+static const char *r600_get_family_name(enum radeon_family family)
+{
+       switch(family) {
+       case CHIP_CAYMAN: return "AMD CAYMAN";
+       default: return "AMD unknown";
+       }
+}
+
+static const char* r600_get_name(struct pipe_screen* pscreen)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+
+       return r600_get_family_name(rscreen->family);
+}
+
+static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+       enum radeon_family family = rscreen->family;
+
+       switch (param) {
+       /* Supported features (boolean caps). */
+       case PIPE_CAP_NPOT_TEXTURES:
+       case PIPE_CAP_TWO_SIDED_STENCIL:
+       case PIPE_CAP_DUAL_SOURCE_BLEND:
+       case PIPE_CAP_ANISOTROPIC_FILTER:
+       case PIPE_CAP_POINT_SPRITE:
+       case PIPE_CAP_OCCLUSION_QUERY:
+       case PIPE_CAP_TEXTURE_SHADOW_MAP:
+       case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+       case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+       case PIPE_CAP_TEXTURE_SWIZZLE:
+       case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+       case PIPE_CAP_DEPTH_CLIP_DISABLE:
+       case PIPE_CAP_SHADER_STENCIL_EXPORT:
+       case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+       case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+       case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+       case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+       case PIPE_CAP_SM3:
+       case PIPE_CAP_SEAMLESS_CUBE_MAP:
+       case PIPE_CAP_PRIMITIVE_RESTART:
+       case PIPE_CAP_CONDITIONAL_RENDER:
+       case PIPE_CAP_TEXTURE_BARRIER:
+       case PIPE_CAP_INDEP_BLEND_ENABLE:
+       case PIPE_CAP_INDEP_BLEND_FUNC:
+       case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+       case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+               return 1;
+
+       case PIPE_CAP_GLSL_FEATURE_LEVEL:
+               return debug_get_bool_option("R600_GLSL130", FALSE) ? 130 : 120;
+
+       /* Unsupported features. */
+       case PIPE_CAP_TGSI_INSTANCEID:
+       case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+       case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+       case PIPE_CAP_SCALED_RESOLVE:
+       case PIPE_CAP_TGSI_CAN_COMPACT_VARYINGS:
+       case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+       case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+       case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+       case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+               return 0;
+
+       /* Stream output. */
+       case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+               return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0;
+       case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+               return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 1 : 0;
+       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+       case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+               return 16*4;
+
+       /* Texturing. */
+       case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+       case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+       case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+               return 15;
+       case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+               return rscreen->info.drm_minor >= 9 ? 16384 : 0;
+       case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+               return 32;
+
+       /* Render targets. */
+       case PIPE_CAP_MAX_RENDER_TARGETS:
+               /* FIXME some r6xx are buggy and can only do 4 */
+               return 8;
+
+       /* Timer queries, present when the clock frequency is non-zero. */
+       case PIPE_CAP_TIMER_QUERY:
+               return rscreen->info.r600_clock_crystal_freq != 0;
+
+       case PIPE_CAP_MIN_TEXEL_OFFSET:
+               return -8;
+
+       case PIPE_CAP_MAX_TEXEL_OFFSET:
+               return 7;
+       }
+       return 0;
+}
+
+static float r600_get_paramf(struct pipe_screen* pscreen,
+                            enum pipe_capf param)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+       enum radeon_family family = rscreen->family;
+
+       switch (param) {
+       case PIPE_CAPF_MAX_LINE_WIDTH:
+       case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+       case PIPE_CAPF_MAX_POINT_WIDTH:
+       case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+               return 16384.0f;
+       case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+               return 16.0f;
+       case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+               return 16.0f;
+       case PIPE_CAPF_GUARD_BAND_LEFT:
+       case PIPE_CAPF_GUARD_BAND_TOP:
+       case PIPE_CAPF_GUARD_BAND_RIGHT:
+       case PIPE_CAPF_GUARD_BAND_BOTTOM:
+               return 0.0f;
+       }
+       return 0.0f;
+}
+
+static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enum pipe_shader_cap param)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+       switch(shader)
+       {
+       case PIPE_SHADER_FRAGMENT:
+       case PIPE_SHADER_VERTEX:
+               break;
+       case PIPE_SHADER_GEOMETRY:
+               /* TODO: support and enable geometry programs */
+               return 0;
+       default:
+               /* TODO: support tessellation */
+               return 0;
+       }
+
+       /* TODO: all these should be fixed, since r600 surely supports much more! */
+       switch (param) {
+       case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+       case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+       case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+       case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+               return 16384;
+       case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+               return 8; /* FIXME */
+       case PIPE_SHADER_CAP_MAX_INPUTS:
+               if(shader == PIPE_SHADER_FRAGMENT)
+                       return 34;
+               else
+                       return 32;
+       case PIPE_SHADER_CAP_MAX_TEMPS:
+               return 256; /* Max native temporaries. */
+       case PIPE_SHADER_CAP_MAX_ADDRS:
+               /* FIXME Isn't this equal to TEMPS? */
+               return 1; /* Max native address registers */
+       case PIPE_SHADER_CAP_MAX_CONSTS:
+               return R600_MAX_CONST_BUFFER_SIZE;
+       case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+               return R600_MAX_CONST_BUFFERS;
+       case PIPE_SHADER_CAP_MAX_PREDS:
+               return 0; /* FIXME */
+       case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+               return 1;
+       case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+       case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+       case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+       case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+       case PIPE_SHADER_CAP_INTEGERS:
+               return 0;
+       case PIPE_SHADER_CAP_SUBROUTINES:
+               return 0;
+       case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+               return 16;
+       }
+       return 0;
+}
+
+static int r600_get_video_param(struct pipe_screen *screen,
+                               enum pipe_video_profile profile,
+                               enum pipe_video_cap param)
+{
+       switch (param) {
+       case PIPE_VIDEO_CAP_SUPPORTED:
+               return vl_profile_supported(screen, profile);
+       case PIPE_VIDEO_CAP_NPOT_TEXTURES:
+               return 1;
+       case PIPE_VIDEO_CAP_MAX_WIDTH:
+       case PIPE_VIDEO_CAP_MAX_HEIGHT:
+               return vl_video_buffer_max_size(screen);
+       case PIPE_VIDEO_CAP_PREFERED_FORMAT:
+               return PIPE_FORMAT_NV12;
+       default:
+               return 0;
+       }
+}
+
+static void r600_destroy_screen(struct pipe_screen* pscreen)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+
+       if (rscreen == NULL)
+               return;
+
+       if (rscreen->fences.bo) {
+               struct r600_fence_block *entry, *tmp;
+
+               LIST_FOR_EACH_ENTRY_SAFE(entry, tmp, &rscreen->fences.blocks, head) {
+                       LIST_DEL(&entry->head);
+                       FREE(entry);
+               }
+
+               rscreen->ws->buffer_unmap(rscreen->fences.bo->buf);
+               pipe_resource_reference((struct pipe_resource**)&rscreen->fences.bo, NULL);
+       }
+       pipe_mutex_destroy(rscreen->fences.mutex);
+
+       rscreen->ws->destroy(rscreen->ws);
+
+       util_slab_destroy(&rscreen->pool_buffers);
+       pipe_mutex_destroy(rscreen->mutex_num_contexts);
+       FREE(rscreen);
+}
+
+static void r600_fence_reference(struct pipe_screen *pscreen,
+                                 struct pipe_fence_handle **ptr,
+                                 struct pipe_fence_handle *fence)
+{
+       struct r600_fence **oldf = (struct r600_fence**)ptr;
+       struct r600_fence *newf = (struct r600_fence*)fence;
+
+       if (pipe_reference(&(*oldf)->reference, &newf->reference)) {
+               struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+               pipe_mutex_lock(rscreen->fences.mutex);
+               pipe_resource_reference((struct pipe_resource**)&(*oldf)->sleep_bo, NULL);
+               LIST_ADDTAIL(&(*oldf)->head, &rscreen->fences.pool);
+               pipe_mutex_unlock(rscreen->fences.mutex);
+       }
+
+       *ptr = fence;
+}
+
+static boolean r600_fence_signalled(struct pipe_screen *pscreen,
+                                    struct pipe_fence_handle *fence)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+       struct r600_fence *rfence = (struct r600_fence*)fence;
+
+       return rscreen->fences.data[rfence->index];
+}
+
+static boolean r600_fence_finish(struct pipe_screen *pscreen,
+                                 struct pipe_fence_handle *fence,
+                                 uint64_t timeout)
+{
+       struct r600_screen *rscreen = (struct r600_screen *)pscreen;
+       struct r600_fence *rfence = (struct r600_fence*)fence;
+       int64_t start_time = 0;
+       unsigned spins = 0;
+
+       if (timeout != PIPE_TIMEOUT_INFINITE) {
+               start_time = os_time_get();
+
+               /* Convert to microseconds. */
+               timeout /= 1000;
+       }
+
+       while (rscreen->fences.data[rfence->index] == 0) {
+               /* Special-case infinite timeout - wait for the dummy BO to become idle */
+               if (timeout == PIPE_TIMEOUT_INFINITE) {
+                       rscreen->ws->buffer_wait(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE);
+                       break;
+               }
+
+               /* The dummy BO will be busy until the CS including the fence has completed, or
+                * the GPU is reset. Don't bother continuing to spin when the BO is idle. */
+               if (!rscreen->ws->buffer_is_busy(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE))
+                       break;
+
+               if (++spins % 256)
+                       continue;
+#ifdef PIPE_OS_UNIX
+               sched_yield();
+#else
+               os_time_sleep(10);
+#endif
+               if (timeout != PIPE_TIMEOUT_INFINITE &&
+                   os_time_get() - start_time >= timeout) {
+                       break;
+               }
+       }
+
+       return rscreen->fences.data[rfence->index] != 0;
+}
+
+static int evergreen_interpret_tiling(struct r600_screen *rscreen, uint32_t tiling_config)
+{
+       switch (tiling_config & 0xf) {
+       case 0:
+               rscreen->tiling_info.num_channels = 1;
+               break;
+       case 1:
+               rscreen->tiling_info.num_channels = 2;
+               break;
+       case 2:
+               rscreen->tiling_info.num_channels = 4;
+               break;
+       case 3:
+               rscreen->tiling_info.num_channels = 8;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       switch ((tiling_config & 0xf0) >> 4) {
+       case 0:
+               rscreen->tiling_info.num_banks = 4;
+               break;
+       case 1:
+               rscreen->tiling_info.num_banks = 8;
+               break;
+       case 2:
+               rscreen->tiling_info.num_banks = 16;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       switch ((tiling_config & 0xf00) >> 8) {
+       case 0:
+               rscreen->tiling_info.group_bytes = 256;
+               break;
+       case 1:
+               rscreen->tiling_info.group_bytes = 512;
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int r600_init_tiling(struct r600_screen *rscreen)
+{
+       uint32_t tiling_config = rscreen->info.r600_tiling_config;
+
+       /* set default group bytes, overridden by tiling info ioctl */
+       rscreen->tiling_info.group_bytes = 512;
+
+       if (!tiling_config)
+               return 0;
+
+       return evergreen_interpret_tiling(rscreen, tiling_config);
+}
+
+static unsigned radeon_family_from_device(unsigned device)
+{
+       switch (device) {
+#define CHIPSET(pciid, name, family) case pciid: return CHIP_##family;
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+       default:
+               return CHIP_UNKNOWN;
+       }
+}
+
+struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
+{
+       struct r600_screen *rscreen = CALLOC_STRUCT(r600_screen);
+       if (rscreen == NULL) {
+               return NULL;
+       }
+
+       rscreen->ws = ws;
+       ws->query_info(ws, &rscreen->info);
+
+       rscreen->family = radeon_family_from_device(rscreen->info.pci_id);
+       if (rscreen->family == CHIP_UNKNOWN) {
+               fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->info.pci_id);
+               FREE(rscreen);
+               return NULL;
+       }
+
+       /* setup class */
+       if (rscreen->family >= CHIP_TAHITI) {
+               rscreen->chip_class = TAHITI;
+       } else {
+               fprintf(stderr, "r600: Unsupported family %d\n", rscreen->family);
+               FREE(rscreen);
+               return NULL;
+       }
+
+       if (r600_init_tiling(rscreen)) {
+               FREE(rscreen);
+               return NULL;
+       }
+
+       rscreen->screen.destroy = r600_destroy_screen;
+       rscreen->screen.get_name = r600_get_name;
+       rscreen->screen.get_vendor = r600_get_vendor;
+       rscreen->screen.get_param = r600_get_param;
+       rscreen->screen.get_shader_param = r600_get_shader_param;
+       rscreen->screen.get_paramf = r600_get_paramf;
+       rscreen->screen.get_video_param = r600_get_video_param;
+       rscreen->screen.is_format_supported = si_is_format_supported;
+       rscreen->screen.is_video_format_supported = vl_video_buffer_is_format_supported;
+       rscreen->screen.context_create = r600_create_context;
+       rscreen->screen.fence_reference = r600_fence_reference;
+       rscreen->screen.fence_signalled = r600_fence_signalled;
+       rscreen->screen.fence_finish = r600_fence_finish;
+       r600_init_screen_resource_functions(&rscreen->screen);
+
+       util_format_s3tc_init();
+
+       util_slab_create(&rscreen->pool_buffers,
+                        sizeof(struct r600_resource), 64,
+                        UTIL_SLAB_SINGLETHREADED);
+
+       pipe_mutex_init(rscreen->mutex_num_contexts);
+
+       rscreen->fences.bo = NULL;
+       rscreen->fences.data = NULL;
+       rscreen->fences.next_index = 0;
+       LIST_INITHEAD(&rscreen->fences.pool);
+       LIST_INITHEAD(&rscreen->fences.blocks);
+       pipe_mutex_init(rscreen->fences.mutex);
+
+       return &rscreen->screen;
+}
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
new file mode 100644 (file)
index 0000000..f4a1219
--- /dev/null
@@ -0,0 +1,490 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+#ifndef RADEONSI_PIPE_H
+#define RADEONSI_PIPE_H
+
+#include "../../winsys/radeon/drm/radeon_winsys.h"
+
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_context.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_slab.h"
+#include "util/u_vbuf.h"
+#include "r600.h"
+#include "radeonsi_public.h"
+#include "r600_resource.h"
+
+#define R600_MAX_CONST_BUFFERS 1
+#define R600_MAX_CONST_BUFFER_SIZE 4096
+
+#ifdef PIPE_ARCH_BIG_ENDIAN
+#define R600_BIG_ENDIAN 1
+#else
+#define R600_BIG_ENDIAN 0
+#endif
+
+enum r600_atom_flags {
+       /* When set, atoms are added at the beginning of the dirty list
+        * instead of the end. */
+       EMIT_EARLY = (1 << 0)
+};
+
+/* This encapsulates a state or an operation which can be emitted into the GPU
+ * command stream. It's not limited to states only, it can be used for anything
+ * that wants to write commands into the CS (e.g. cache flushes). */
+struct r600_atom {
+       void (*emit)(struct r600_context *ctx, struct r600_atom *state);
+
+       unsigned                num_dw;
+       enum r600_atom_flags    flags;
+       bool                    dirty;
+
+       struct list_head        head;
+};
+
+struct r600_atom_surface_sync {
+       struct r600_atom atom;
+       unsigned flush_flags; /* CP_COHER_CNTL */
+};
+
+enum r600_pipe_state_id {
+       R600_PIPE_STATE_BLEND = 0,
+       R600_PIPE_STATE_BLEND_COLOR,
+       R600_PIPE_STATE_CONFIG,
+       R600_PIPE_STATE_SEAMLESS_CUBEMAP,
+       R600_PIPE_STATE_CLIP,
+       R600_PIPE_STATE_SCISSOR,
+       R600_PIPE_STATE_VIEWPORT,
+       R600_PIPE_STATE_RASTERIZER,
+       R600_PIPE_STATE_VGT,
+       R600_PIPE_STATE_FRAMEBUFFER,
+       R600_PIPE_STATE_DSA,
+       R600_PIPE_STATE_STENCIL_REF,
+       R600_PIPE_STATE_PS_SHADER,
+       R600_PIPE_STATE_VS_SHADER,
+       R600_PIPE_STATE_CONSTANT,
+       R600_PIPE_STATE_SAMPLER,
+       R600_PIPE_STATE_RESOURCE,
+       R600_PIPE_STATE_POLYGON_OFFSET,
+       R600_PIPE_NSTATES
+};
+
+struct r600_pipe_fences {
+       struct r600_resource            *bo;
+       unsigned                        *data;
+       unsigned                        next_index;
+       /* linked list of preallocated blocks */
+       struct list_head                blocks;
+       /* linked list of freed fences */
+       struct list_head                pool;
+       pipe_mutex                      mutex;
+};
+
+struct r600_screen {
+       struct pipe_screen              screen;
+       struct radeon_winsys            *ws;
+       unsigned                        family;
+       enum chip_class                 chip_class;
+       struct radeon_info              info;
+       struct r600_tiling_info         tiling_info;
+       struct util_slab_mempool        pool_buffers;
+       struct r600_pipe_fences         fences;
+
+       unsigned                        num_contexts;
+
+       /* for thread-safe write accessing to num_contexts */
+       pipe_mutex                      mutex_num_contexts;
+};
+
+struct si_pipe_sampler_view {
+       struct pipe_sampler_view        base;
+       uint32_t                        state[8];
+};
+
+struct si_pipe_sampler_state {
+       uint32_t                        val[4];
+};
+
+struct r600_pipe_rasterizer {
+       struct r600_pipe_state          rstate;
+       boolean                         flatshade;
+       unsigned                        sprite_coord_enable;
+       unsigned                        pa_sc_line_stipple;
+       unsigned                        pa_su_sc_mode_cntl;
+       unsigned                        pa_cl_clip_cntl;
+       unsigned                        pa_cl_vs_out_cntl;
+       float                           offset_units;
+       float                           offset_scale;
+};
+
+struct r600_pipe_blend {
+       struct r600_pipe_state          rstate;
+       unsigned                        cb_target_mask;
+       unsigned                        cb_color_control;
+};
+
+struct r600_pipe_dsa {
+       struct r600_pipe_state          rstate;
+       unsigned                        alpha_ref;
+       unsigned                        db_render_override;
+       unsigned                        db_render_control;
+       ubyte                           valuemask[2];
+       ubyte                           writemask[2];
+};
+
+struct r600_vertex_element
+{
+       unsigned                        count;
+       struct pipe_vertex_element      elements[PIPE_MAX_ATTRIBS];
+       struct u_vbuf_elements          *vmgr_elements;
+       unsigned                        fs_size;
+       struct r600_pipe_state          rstate;
+       /* if the offset is too big for the fetch instruction we need to adjust
+        * the vertex buffer offset; record here the offset we need to add
+        */
+       unsigned                        vbuffer_need_offset;
+       unsigned                        vbuffer_offset[PIPE_MAX_ATTRIBS];
+};
+
+struct r600_shader_io {
+       unsigned                name;
+       unsigned                gpr;
+       unsigned                done;
+       int                     sid;
+       unsigned                interpolate;
+       boolean                 centroid;
+       unsigned                lds_pos; /* for evergreen */
+};
+
+struct r600_shader {
+       unsigned                ninput;
+       unsigned                noutput;
+       struct r600_shader_io   input[32];
+       struct r600_shader_io   output[32];
+       boolean                 uses_kill;
+       boolean                 fs_write_all;
+       unsigned                nr_cbufs;
+};
+
+struct si_pipe_shader {
+       struct r600_shader              shader;
+       struct r600_pipe_state          rstate;
+       struct r600_resource            *bo;
+       struct r600_vertex_element      vertex_elements;
+       struct tgsi_token               *tokens;
+       unsigned                        num_sgprs;
+       unsigned                        num_vgprs;
+       unsigned                        spi_ps_input_ena;
+       unsigned        sprite_coord_enable;
+       struct pipe_stream_output_info  so;
+       unsigned                        so_strides[4];
+};
+
+/* needed for blitter save */
+#define NUM_TEX_UNITS 16
+
+struct r600_textures_info {
+       struct r600_pipe_state          rstate;
+       struct si_pipe_sampler_view     *views[NUM_TEX_UNITS];
+       struct si_pipe_sampler_state    *samplers[NUM_TEX_UNITS];
+       unsigned                        n_views;
+       unsigned                        n_samplers;
+       bool                            samplers_dirty;
+       bool                            is_array_sampler[NUM_TEX_UNITS];
+};
+
+struct r600_fence {
+       struct pipe_reference           reference;
+       unsigned                        index; /* in the shared bo */
+       struct r600_resource            *sleep_bo;
+       struct list_head                head;
+};
+
+#define FENCE_BLOCK_SIZE 16
+
+struct r600_fence_block {
+       struct r600_fence               fences[FENCE_BLOCK_SIZE];
+       struct list_head                head;
+};
+
+#define R600_CONSTANT_ARRAY_SIZE 256
+#define R600_RESOURCE_ARRAY_SIZE 160
+
+struct r600_stencil_ref
+{
+       ubyte ref_value[2];
+       ubyte valuemask[2];
+       ubyte writemask[2];
+};
+
+struct r600_context {
+       struct pipe_context             context;
+       struct blitter_context          *blitter;
+       enum radeon_family              family;
+       enum chip_class                 chip_class;
+       void                            *custom_dsa_flush;
+       struct r600_screen              *screen;
+       struct radeon_winsys            *ws;
+       struct r600_pipe_state          *states[R600_PIPE_NSTATES];
+       struct r600_vertex_element      *vertex_elements;
+       struct pipe_framebuffer_state   framebuffer;
+       unsigned                        cb_target_mask;
+       unsigned                        cb_color_control;
+       unsigned                        pa_sc_line_stipple;
+       unsigned                        pa_su_sc_mode_cntl;
+       unsigned                        pa_cl_clip_cntl;
+       unsigned                        pa_cl_vs_out_cntl;
+       /* for saving when using blitter */
+       struct pipe_stencil_ref         stencil_ref;
+       struct pipe_viewport_state      viewport;
+       struct pipe_clip_state          clip;
+       struct r600_pipe_state          config;
+       struct si_pipe_shader   *ps_shader;
+       struct si_pipe_shader   *vs_shader;
+       struct r600_pipe_state          vs_const_buffer;
+       struct r600_pipe_state          vs_user_data;
+       struct r600_pipe_state          ps_const_buffer;
+       struct r600_pipe_rasterizer     *rasterizer;
+       struct r600_pipe_state          vgt;
+       struct r600_pipe_state          spi;
+       struct pipe_query               *current_render_cond;
+       unsigned                        current_render_cond_mode;
+       struct pipe_query               *saved_render_cond;
+       unsigned                        saved_render_cond_mode;
+       /* shader information */
+       unsigned                        sprite_coord_enable;
+       boolean                         export_16bpc;
+       unsigned                        alpha_ref;
+       boolean                         alpha_ref_dirty;
+       unsigned                        nr_cbufs;
+       struct r600_textures_info       vs_samplers;
+       struct r600_textures_info       ps_samplers;
+       boolean                         shader_dirty;
+
+       struct u_vbuf                   *vbuf_mgr;
+       struct util_slab_mempool        pool_transfers;
+       boolean                         have_depth_texture, have_depth_fb;
+
+       unsigned default_ps_gprs, default_vs_gprs;
+
+       /* States based on r600_state. */
+       struct list_head                dirty_states;
+       struct r600_atom_surface_sync   atom_surface_sync;
+       struct r600_atom                atom_r6xx_flush_and_inv;
+
+       /* Below are variables from the old r600_context.
+        */
+       struct radeon_winsys_cs *cs;
+
+       struct r600_range       *range;
+       unsigned                nblocks;
+       struct r600_block       **blocks;
+       struct list_head        dirty;
+       struct list_head        enable_list;
+       unsigned                pm4_dirty_cdwords;
+       unsigned                ctx_pm4_ndwords;
+       unsigned                init_dwords;
+
+       /* The list of active queries. Only one query of each type can be active. */
+       struct list_head        active_query_list;
+       unsigned                num_cs_dw_queries_suspend;
+       unsigned                num_cs_dw_streamout_end;
+
+       unsigned                backend_mask;
+       unsigned                max_db; /* for OQ */
+       unsigned                flags;
+       boolean                 predicate_drawing;
+
+       unsigned                num_so_targets;
+       struct r600_so_target   *so_targets[PIPE_MAX_SO_BUFFERS];
+       boolean                 streamout_start;
+       unsigned                streamout_append_bitmask;
+       unsigned                *vs_so_stride_in_dw;
+       unsigned                *vs_shader_so_strides;
+};
+
+static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
+{
+       atom->emit(rctx, atom);
+       atom->dirty = false;
+       if (atom->head.next && atom->head.prev)
+               LIST_DELINIT(&atom->head);
+}
+
+static INLINE void r600_atom_dirty(struct r600_context *rctx, struct r600_atom *state)
+{
+       if (!state->dirty) {
+               if (state->flags & EMIT_EARLY) {
+                       LIST_ADD(&state->head, &rctx->dirty_states);
+               } else {
+                       LIST_ADDTAIL(&state->head, &rctx->dirty_states);
+               }
+               state->dirty = true;
+       }
+}
+
+/* evergreen_state.c */
+void cayman_init_state_functions(struct r600_context *rctx);
+void si_init_config(struct r600_context *rctx);
+void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader);
+void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader);
+void si_update_spi_map(struct r600_context *rctx);
+void *cayman_create_db_flush_dsa(struct r600_context *rctx);
+void cayman_polygon_offset_update(struct r600_context *rctx);
+uint32_t si_translate_vertexformat(struct pipe_screen *screen,
+                                  enum pipe_format format,
+                                  const struct util_format_description *desc,
+                                  int first_non_void);
+boolean si_is_format_supported(struct pipe_screen *screen,
+                              enum pipe_format format,
+                              enum pipe_texture_target target,
+                              unsigned sample_count,
+                              unsigned usage);
+
+/* r600_blit.c */
+void r600_init_blit_functions(struct r600_context *rctx);
+void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture);
+void r600_blit_push_depth(struct pipe_context *ctx, struct r600_resource_texture *texture);
+void r600_flush_depth_textures(struct r600_context *rctx);
+
+/* r600_buffer.c */
+bool r600_init_resource(struct r600_screen *rscreen,
+                       struct r600_resource *res,
+                       unsigned size, unsigned alignment,
+                       unsigned bind, unsigned usage);
+struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
+                                        const struct pipe_resource *templ);
+struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen,
+                                             void *ptr, unsigned bytes,
+                                             unsigned bind);
+void r600_upload_index_buffer(struct r600_context *rctx,
+                             struct pipe_index_buffer *ib, unsigned count);
+
+
+/* r600_pipe.c */
+void radeonsi_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
+                   unsigned flags);
+
+/* r600_query.c */
+void r600_init_query_functions(struct r600_context *rctx);
+
+/* r600_resource.c */
+void r600_init_context_resource_functions(struct r600_context *r600);
+
+/* radeonsi_shader.c */
+int si_pipe_shader_create(struct pipe_context *ctx, struct si_pipe_shader *shader);
+void si_pipe_shader_destroy(struct pipe_context *ctx, struct si_pipe_shader *shader);
+
+/* r600_texture.c */
+void r600_init_screen_texture_functions(struct pipe_screen *screen);
+void r600_init_surface_functions(struct r600_context *r600);
+unsigned r600_texture_get_offset(struct r600_resource_texture *rtex,
+                                       unsigned level, unsigned layer);
+
+/* r600_translate.c */
+void r600_translate_index_buffer(struct r600_context *r600,
+                                struct pipe_index_buffer *ib,
+                                unsigned count);
+
+/* r600_state_common.c */
+void r600_init_common_atoms(struct r600_context *rctx);
+unsigned r600_get_cb_flush_flags(struct r600_context *rctx);
+void r600_texture_barrier(struct pipe_context *ctx);
+void r600_set_index_buffer(struct pipe_context *ctx,
+                          const struct pipe_index_buffer *ib);
+void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
+                            const struct pipe_vertex_buffer *buffers);
+void *si_create_vertex_elements(struct pipe_context *ctx,
+                               unsigned count,
+                               const struct pipe_vertex_element *elements);
+void r600_delete_vertex_element(struct pipe_context *ctx, void *state);
+void r600_bind_blend_state(struct pipe_context *ctx, void *state);
+void r600_bind_dsa_state(struct pipe_context *ctx, void *state);
+void r600_bind_rs_state(struct pipe_context *ctx, void *state);
+void r600_delete_rs_state(struct pipe_context *ctx, void *state);
+void r600_sampler_view_destroy(struct pipe_context *ctx,
+                              struct pipe_sampler_view *state);
+void r600_delete_state(struct pipe_context *ctx, void *state);
+void r600_bind_vertex_elements(struct pipe_context *ctx, void *state);
+void *si_create_shader_state(struct pipe_context *ctx,
+                            const struct pipe_shader_state *state);
+void r600_bind_ps_shader(struct pipe_context *ctx, void *state);
+void r600_bind_vs_shader(struct pipe_context *ctx, void *state);
+void r600_delete_ps_shader(struct pipe_context *ctx, void *state);
+void r600_delete_vs_shader(struct pipe_context *ctx, void *state);
+void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
+                             struct pipe_resource *buffer);
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+                     struct pipe_resource *buffer,
+                     unsigned buffer_offset,
+                     unsigned buffer_size);
+void r600_so_target_destroy(struct pipe_context *ctx,
+                           struct pipe_stream_output_target *target);
+void r600_set_so_targets(struct pipe_context *ctx,
+                        unsigned num_targets,
+                        struct pipe_stream_output_target **targets,
+                        unsigned append_bitmask);
+void r600_set_pipe_stencil_ref(struct pipe_context *ctx,
+                              const struct pipe_stencil_ref *state);
+void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
+
+/*
+ * common helpers
+ */
+static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits)
+{
+       return value * (1 << frac_bits);
+}
+#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
+
+static inline unsigned r600_tex_aniso_filter(unsigned filter)
+{
+       if (filter <= 1)   return 0;
+       if (filter <= 2)   return 1;
+       if (filter <= 4)   return 2;
+       if (filter <= 8)   return 3;
+        /* else */        return 4;
+}
+
+/* 12.4 fixed-point */
+static INLINE unsigned r600_pack_float_12p4(float x)
+{
+       return x <= 0    ? 0 :
+              x >= 4096 ? 0xffff : x * 16;
+}
+
+static INLINE uint64_t r600_resource_va(struct pipe_screen *screen, struct pipe_resource *resource)
+{
+       struct r600_screen *rscreen = (struct r600_screen*)screen;
+       struct r600_resource *rresource = (struct r600_resource*)resource;
+
+       return rscreen->ws->buffer_get_virtual_address(rresource->cs_buf);
+}
+
+#endif
diff --git a/src/gallium/drivers/radeonsi/radeonsi_public.h b/src/gallium/drivers/radeonsi/radeonsi_public.h
new file mode 100644 (file)
index 0000000..5dcec0f
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef RADEONSI_PUBLIC_H
+#define RADEONSI_PUBLIC_H
+
+struct radeon_winsys;
+
+struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws);
+
+#endif
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c
new file mode 100644 (file)
index 0000000..50f2e39
--- /dev/null
@@ -0,0 +1,565 @@
+
+#include "gallivm/lp_bld_tgsi_action.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_tgsi.h"
+#include "radeon_llvm.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "radeonsi_pipe.h"
+#include "radeonsi_shader.h"
+#include "sid.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+
+/*
+static ps_remap_inputs(
+       struct tgsi_llvm_context * tl_ctx,
+       unsigned tgsi_index,
+       unsigned tgsi_chan)
+{
+       :
+}
+
+struct si_input
+{
+       struct list_head head;
+       unsigned tgsi_index;
+       unsigned tgsi_chan;
+       unsigned order;
+};
+*/
+
+
+struct si_shader_context
+{
+       struct radeon_llvm_context radeon_bld;
+       struct r600_context *rctx;
+       struct tgsi_parse_context parse;
+       struct tgsi_token * tokens;
+       struct si_pipe_shader *shader;
+       unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. */
+/*     unsigned num_inputs; */
+/*     struct list_head inputs; */
+/*     unsigned * input_mappings *//* From TGSI to SI hw */
+/*     struct tgsi_shader_info info;*/
+};
+
+static struct si_shader_context * si_shader_context(
+       struct lp_build_tgsi_context * bld_base)
+{
+       return (struct si_shader_context *)bld_base;
+}
+
+
+#define PERSPECTIVE_BASE 0
+#define LINEAR_BASE 9
+
+#define SAMPLE_OFFSET 0
+#define CENTER_OFFSET 2
+#define CENTROID_OFFSET 4
+
+#define USE_SGPR_MAX_SUFFIX_LEN 5
+
+enum sgpr_type {
+       SGPR_I32,
+       SGPR_I64,
+       SGPR_PTR_V4I32,
+       SGPR_PTR_V8I32
+};
+
+static LLVMValueRef use_sgpr(
+       struct gallivm_state * gallivm,
+       enum sgpr_type type,
+       unsigned sgpr)
+{
+       LLVMValueRef sgpr_index;
+       LLVMValueRef sgpr_value;
+       LLVMTypeRef ret_type;
+
+       sgpr_index = lp_build_const_int32(gallivm, sgpr);
+
+       if (type == SGPR_I32) {
+               ret_type = LLVMInt32TypeInContext(gallivm->context);
+               return lp_build_intrinsic_unary(gallivm->builder,
+                                               "llvm.SI.use.sgpr.i32",
+                                               ret_type, sgpr_index);
+       }
+
+       ret_type = LLVMInt64TypeInContext(gallivm->context);
+       sgpr_value = lp_build_intrinsic_unary(gallivm->builder,
+                               "llvm.SI.use.sgpr.i64",
+                                ret_type, sgpr_index);
+
+       switch (type) {
+       case SGPR_I64:
+               return sgpr_value;
+       case SGPR_PTR_V4I32:
+               ret_type = LLVMInt32TypeInContext(gallivm->context);
+               ret_type = LLVMVectorType(ret_type, 4);
+               ret_type = LLVMPointerType(ret_type,
+                                       0 /*XXX: Specify address space*/);
+               return LLVMBuildIntToPtr(gallivm->builder, sgpr_value,
+                                                               ret_type, "");
+       case SGPR_PTR_V8I32:
+               ret_type = LLVMInt32TypeInContext(gallivm->context);
+               ret_type = LLVMVectorType(ret_type, 8);
+               ret_type = LLVMPointerType(ret_type,
+                                       0 /*XXX: Specify address space*/);
+               return LLVMBuildIntToPtr(gallivm->builder, sgpr_value,
+                                                               ret_type, "");
+       default:
+               assert(!"Unsupported SGPR type in use_sgpr()");
+               return NULL;
+       }
+}
+
+static void declare_input_vs(
+       struct si_shader_context * si_shader_ctx,
+       unsigned input_index,
+       const struct tgsi_full_declaration *decl)
+{
+       LLVMValueRef t_list_ptr;
+       LLVMValueRef t_offset;
+       LLVMValueRef attribute_offset;
+       LLVMValueRef buffer_index_reg;
+       LLVMValueRef args[4];
+       LLVMTypeRef vec4_type;
+       LLVMValueRef input;
+       struct lp_build_context * uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+       struct lp_build_context * base = &si_shader_ctx->radeon_bld.soa.bld_base.base;
+       struct r600_context *rctx = si_shader_ctx->rctx;
+       struct pipe_vertex_element *velem = &rctx->vertex_elements->elements[input_index];
+       unsigned chan;
+
+       /* XXX: Communicate with the rest of the driver about which SGPR the T#
+        * list pointer is going to be stored in.  Hard code to SGPR[0-1] for
+        * now */
+       t_list_ptr = use_sgpr(base->gallivm, SGPR_I64, 0);
+
+       t_offset = lp_build_const_int32(base->gallivm,
+                                       4 * velem->vertex_buffer_index);
+       attribute_offset = lp_build_const_int32(base->gallivm, velem->src_offset);
+
+       /* Load the buffer index, which is always stored in VGPR0
+        * for vertex shaders */
+       buffer_index_reg = lp_build_intrinsic(base->gallivm->builder,
+               "llvm.SI.vs.load.buffer.index", uint->elem_type, NULL, 0);
+
+       vec4_type = LLVMVectorType(base->elem_type, 4);
+       args[0] = t_list_ptr;
+       args[1] = t_offset;
+       args[2] = attribute_offset;
+       args[3] = buffer_index_reg;
+       input = lp_build_intrinsic(base->gallivm->builder,
+               "llvm.SI.vs.load.input", vec4_type, args, 4);
+
+       /* Break up the vec4 into individual components */
+       for (chan = 0; chan < 4; chan++) {
+               LLVMValueRef llvm_chan = lp_build_const_int32(base->gallivm, chan);
+               /* XXX: Use a helper function for this.  There is one in
+                * tgsi_llvm.c. */
+               si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
+                               LLVMBuildExtractElement(base->gallivm->builder,
+                               input, llvm_chan, "");
+       }
+}
+
+static void declare_input_fs(
+       struct si_shader_context * si_shader_ctx,
+       unsigned input_index,
+       const struct tgsi_full_declaration *decl)
+{
+       const char * intr_name;
+       unsigned chan;
+       struct lp_build_context * base =
+                               &si_shader_ctx->radeon_bld.soa.bld_base.base;
+       struct gallivm_state * gallivm = base->gallivm;
+
+       /* This value is:
+        * [15:0] NewPrimMask (Bit mask for each quad.  It is set if the
+        *                     quad begins a new primitive.  Bit 0 always needs
+        *                     to be unset)
+        * [31:16] ParamOffset
+        */
+       LLVMValueRef params = use_sgpr(base->gallivm, SGPR_I32, 6);
+
+
+       /* XXX: Is this the input_index? */
+       LLVMValueRef attr_number = lp_build_const_int32(gallivm, input_index);
+
+       /* XXX: Handle all possible interpolation modes */
+       switch (decl->Declaration.Interpolate) {
+       case TGSI_INTERPOLATE_COLOR:
+               if (si_shader_ctx->rctx->rasterizer->flatshade)
+                       intr_name = "llvm.SI.fs.interp.constant";
+               else
+                       intr_name = "llvm.SI.fs.interp.linear.center";
+               break;
+       case TGSI_INTERPOLATE_CONSTANT:
+               intr_name = "llvm.SI.fs.interp.constant";
+               break;
+       case TGSI_INTERPOLATE_LINEAR:
+               intr_name = "llvm.SI.fs.interp.linear.center";
+               break;
+       default:
+               fprintf(stderr, "Warning: Unhandled interpolation mode.\n");
+               return;
+       }
+
+       /* XXX: Could there be more than TGSI_NUM_CHANNELS (4) ? */
+       for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+               LLVMValueRef args[3];
+               LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
+               unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
+               LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context);
+               args[0] = llvm_chan;
+               args[1] = attr_number;
+               args[2] = params;
+               si_shader_ctx->radeon_bld.inputs[soa_index] =
+                       lp_build_intrinsic(gallivm->builder, intr_name,
+                                               input_type, args, 3);
+       }
+}
+
+static void declare_input(
+       struct radeon_llvm_context * radeon_bld,
+       unsigned input_index,
+       const struct tgsi_full_declaration *decl)
+{
+       struct si_shader_context * si_shader_ctx =
+                               si_shader_context(&radeon_bld->soa.bld_base);
+       if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) {
+               declare_input_vs(si_shader_ctx, input_index, decl);
+       } else if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+               declare_input_fs(si_shader_ctx, input_index, decl);
+       } else {
+               fprintf(stderr, "Warning: Unsupported shader type.\n");
+       }
+}
+
+static LLVMValueRef fetch_constant(
+       struct lp_build_tgsi_context * bld_base,
+       const struct tgsi_full_src_register *reg,
+       enum tgsi_opcode_type type,
+       unsigned swizzle)
+{
+       struct lp_build_context * base = &bld_base->base;
+
+       LLVMValueRef const_ptr;
+       LLVMValueRef offset;
+
+       /* XXX: Assume the pointer to the constant buffer is being stored in
+        * SGPR[2:3] */
+       const_ptr = use_sgpr(base->gallivm, SGPR_I64, 1);
+
+       /* XXX: This assumes that the constant buffer is not packed, so
+        * CONST[0].x will have an offset of 0 and CONST[1].x will have an
+        * offset of 4. */
+       offset = lp_build_const_int32(base->gallivm,
+                                       (reg->Register.Index * 4) + swizzle);
+
+       return lp_build_intrinsic_binary(base->gallivm->builder,
+               "llvm.SI.load.const", base->elem_type, const_ptr, offset);
+}
+
+
+/* Declare some intrinsics with the correct attributes */
+static void si_llvm_emit_prologue(struct lp_build_tgsi_context * bld_base)
+{
+       LLVMValueRef function;
+       struct gallivm_state * gallivm = bld_base->base.gallivm;
+
+       LLVMTypeRef i64 = LLVMInt64TypeInContext(gallivm->context);
+       LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
+
+       /* declare i32 @llvm.SI.use.sgpr.i32(i32) */
+       function = lp_declare_intrinsic(gallivm->module, "llvm.SI.use.sgpr.i32",
+                                       i32, &i32, 1);
+       LLVMAddFunctionAttr(function, LLVMReadNoneAttribute);
+
+       /* declare i64 @llvm.SI.use.sgpr.i64(i32) */
+       function = lp_declare_intrinsic(gallivm->module, "llvm.SI.use.sgpr.i64",
+                                       i64, &i32, 1);
+       LLVMAddFunctionAttr(function, LLVMReadNoneAttribute);
+}
+
+/* XXX: This is partially implemented for VS only at this point.  It is not complete */
+static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
+{
+       struct si_shader_context * si_shader_ctx = si_shader_context(bld_base);
+       struct r600_shader * shader = &si_shader_ctx->shader->shader;
+       struct lp_build_context * base = &bld_base->base;
+       struct lp_build_context * uint =
+                               &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld;
+       struct tgsi_parse_context *parse = &si_shader_ctx->parse;
+       LLVMValueRef last_args[9] = { 0 };
+
+       while (!tgsi_parse_end_of_tokens(parse)) {
+               /* XXX: component_bits controls which components of the output
+                * registers actually get exported (e.g. bit 0 means export the
+                * X component, bit 1 means export the Y component, etc.).  It
+                * is hard-coded to 0xf for now.  In the future, we might
+                * want to do something else. */
+               unsigned component_bits = 0xf;
+               unsigned chan;
+               struct tgsi_full_declaration *d =
+                                       &parse->FullToken.FullDeclaration;
+               LLVMValueRef args[9];
+               unsigned target;
+               unsigned index;
+               unsigned color_count = 0;
+               unsigned param_count = 0;
+               int i;
+
+               tgsi_parse_token(parse);
+               if (parse->FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION)
+                       continue;
+
+               switch (d->Declaration.File) {
+               case TGSI_FILE_INPUT:
+                       i = shader->ninput++;
+                       shader->input[i].name = d->Semantic.Name;
+                       shader->input[i].sid = d->Semantic.Index;
+                       shader->input[i].interpolate = d->Declaration.Interpolate;
+                       shader->input[i].centroid = d->Declaration.Centroid;
+                       break;
+               case TGSI_FILE_OUTPUT:
+                       i = shader->noutput++;
+                       shader->output[i].name = d->Semantic.Name;
+                       shader->output[i].sid = d->Semantic.Index;
+                       shader->output[i].interpolate = d->Declaration.Interpolate;
+                       break;
+               }
+
+               if (d->Declaration.File != TGSI_FILE_OUTPUT)
+                       continue;
+
+               for (index = d->Range.First; index <= d->Range.Last; index++) {
+                       for (chan = 0; chan < 4; chan++ ) {
+                               LLVMValueRef out_ptr =
+                                       si_shader_ctx->radeon_bld.soa.outputs
+                                       [index][chan];
+                               /* +5 because the first output value will be
+                                * the 6th argument to the intrinsic. */
+                               args[chan + 5] = LLVMBuildLoad(
+                                       base->gallivm->builder, out_ptr, "");
+                       }
+
+                       /* XXX: We probably need to keep track of the output
+                        * values, so we know what we are passing to the next
+                        * stage. */
+
+                       /* Select the correct target */
+                       switch(d->Semantic.Name) {
+                       case TGSI_SEMANTIC_POSITION:
+                               target = V_008DFC_SQ_EXP_POS;
+                               break;
+                       case TGSI_SEMANTIC_COLOR:
+                               if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) {
+                                       target = V_008DFC_SQ_EXP_PARAM + param_count;
+                                       param_count++;
+                               } else {
+                                       target = V_008DFC_SQ_EXP_MRT + color_count;
+                                       color_count++;
+                               }
+                               break;
+                       case TGSI_SEMANTIC_GENERIC:
+                               target = V_008DFC_SQ_EXP_PARAM + param_count;
+                               param_count++;
+                               break;
+                       default:
+                               target = 0;
+                               fprintf(stderr,
+                                       "Warning: SI unhandled output type:%d\n",
+                                       d->Semantic.Name);
+                       }
+
+                       /* Specify which components to enable */
+                       args[0] = lp_build_const_int32(base->gallivm,
+                                                               component_bits);
+
+                       /* Specify whether the EXEC mask represents the valid mask */
+                       args[1] = lp_build_const_int32(base->gallivm, 0);
+
+                       /* Specify whether this is the last export */
+                       args[2] = lp_build_const_int32(base->gallivm, 0);
+
+                       /* Specify the target we are exporting */
+                       args[3] = lp_build_const_int32(base->gallivm, target);
+
+                       /* Set COMPR flag to zero to export data as 32-bit */
+                       args[4] = uint->zero;
+
+                       if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX ?
+                           (d->Semantic.Name == TGSI_SEMANTIC_POSITION) :
+                           (d->Semantic.Name == TGSI_SEMANTIC_COLOR)) {
+                               if (last_args[0]) {
+                                       lp_build_intrinsic(base->gallivm->builder,
+                                                          "llvm.SI.export",
+                                                          LLVMVoidTypeInContext(base->gallivm->context),
+                                                          last_args, 9);
+                               }
+
+                               memcpy(last_args, args, sizeof(args));
+                       } else {
+                               lp_build_intrinsic(base->gallivm->builder,
+                                                  "llvm.SI.export",
+                                                  LLVMVoidTypeInContext(base->gallivm->context),
+                                                  args, 9);
+                       }
+
+               }
+       }
+
+       /* Specify whether the EXEC mask represents the valid mask */
+       last_args[1] = lp_build_const_int32(base->gallivm,
+                                           si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT);
+
+       /* Specify that this is the last export */
+       last_args[2] = lp_build_const_int32(base->gallivm, 1);
+
+       lp_build_intrinsic(base->gallivm->builder,
+                          "llvm.SI.export",
+                          LLVMVoidTypeInContext(base->gallivm->context),
+                          last_args, 9);
+
+/* XXX: Look up what this function does */
+/*             ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);*/
+}
+
+static void tex_fetch_args(
+       struct lp_build_tgsi_context * bld_base,
+       struct lp_build_emit_data * emit_data)
+{
+       /* WriteMask */
+       emit_data->args[0] = lp_build_const_int32(bld_base->base.gallivm,
+                               emit_data->inst->Dst[0].Register.WriteMask);
+
+       /* Coordinates */
+       /* XXX: Not all sample instructions need 4 address arguments. */
+       emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                                       0, LP_CHAN_ALL);
+
+       /* Resource */
+       emit_data->args[2] = use_sgpr(bld_base->base.gallivm, SGPR_I64, 2);
+       emit_data->args[3] = lp_build_const_int32(bld_base->base.gallivm,
+                                                 32 * emit_data->inst->Src[2].Register.Index);
+
+       /* Sampler */
+       emit_data->args[4] = use_sgpr(bld_base->base.gallivm, SGPR_I64, 1);
+       emit_data->args[5] = lp_build_const_int32(bld_base->base.gallivm,
+                                                 16 * emit_data->inst->Src[2].Register.Index);
+
+       /* Dimensions */
+       /* XXX: We might want to pass this information to the shader at some
+        * point. */
+/*     emit_data->args[4] = lp_build_const_int32(bld_base->base.gallivm,
+                                       emit_data->inst->Texture.Texture);
+*/
+
+       emit_data->arg_count = 6;
+       /* XXX: To optimize, we could use a float or v2f32, if the last bits of
+        * the writemask are clear */
+       emit_data->dst_type = LLVMVectorType(
+                       LLVMFloatTypeInContext(bld_base->base.gallivm->context),
+                       4);
+}
+
+static const struct lp_build_tgsi_action tex_action = {
+       .fetch_args = tex_fetch_args,
+       .emit = lp_build_tgsi_intrinsic,
+       .intr_name = "llvm.SI.sample"
+};
+
+
+int si_pipe_shader_create(
+       struct pipe_context *ctx,
+       struct si_pipe_shader *shader)
+{
+       struct r600_context *rctx = (struct r600_context*)ctx;
+       struct si_shader_context si_shader_ctx;
+       struct tgsi_shader_info shader_info;
+       struct lp_build_tgsi_context * bld_base;
+       LLVMModuleRef mod;
+       unsigned char * inst_bytes;
+       unsigned inst_byte_count;
+       unsigned i;
+
+       radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
+       bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
+
+       tgsi_scan_shader(shader->tokens, &shader_info);
+       bld_base->info = &shader_info;
+       bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
+       bld_base->emit_prologue = si_llvm_emit_prologue;
+       bld_base->emit_epilogue = si_llvm_emit_epilogue;
+
+       bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action;
+
+       si_shader_ctx.radeon_bld.load_input = declare_input;
+       si_shader_ctx.tokens = shader->tokens;
+       tgsi_parse_init(&si_shader_ctx.parse, si_shader_ctx.tokens);
+       si_shader_ctx.shader = shader;
+       si_shader_ctx.type = si_shader_ctx.parse.FullHeader.Processor.Processor;
+       si_shader_ctx.rctx = rctx;
+
+       shader->shader.nr_cbufs = rctx->nr_cbufs;
+
+       lp_build_tgsi_llvm(bld_base, shader->tokens);
+
+       radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld);
+
+       mod = bld_base->base.gallivm->module;
+       tgsi_dump(shader->tokens, 0);
+       LLVMDumpModule(mod);
+       radeon_llvm_compile(mod, &inst_bytes, &inst_byte_count, "SI", 1 /* dump */);
+       fprintf(stderr, "SI CODE:\n");
+       for (i = 0; i < inst_byte_count; i += 4) {
+               fprintf(stderr, "%02x%02x%02x%02x\n", inst_bytes[i + 3],
+                       inst_bytes[i + 2], inst_bytes[i + 1],
+                       inst_bytes[i]);
+       }
+
+       shader->num_sgprs = util_le32_to_cpu(*(uint32_t*)inst_bytes);
+       shader->num_vgprs = util_le32_to_cpu(*(uint32_t*)(inst_bytes + 4));
+       shader->spi_ps_input_ena = util_le32_to_cpu(*(uint32_t*)(inst_bytes + 8));
+
+       tgsi_parse_free(&si_shader_ctx.parse);
+
+       /* copy new shader */
+       if (shader->bo == NULL) {
+               uint32_t *ptr;
+
+               shader->bo = (struct r600_resource*)
+                       pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, inst_byte_count);
+               if (shader->bo == NULL) {
+                       return -ENOMEM;
+               }
+               ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->buf, rctx->cs, PIPE_TRANSFER_WRITE);
+               if (0 /*R600_BIG_ENDIAN*/) {
+                       for (i = 0; i < (inst_byte_count-12)/4; ++i) {
+                               ptr[i] = util_bswap32(*(uint32_t*)(inst_bytes+12 + i*4));
+                       }
+               } else {
+                       memcpy(ptr, inst_bytes + 12, inst_byte_count - 12);
+               }
+               rctx->ws->buffer_unmap(shader->bo->buf);
+       }
+
+       free(inst_bytes);
+
+       return 0;
+}
+
+void si_pipe_shader_destroy(struct pipe_context *ctx, struct si_pipe_shader *shader)
+{
+       pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
+
+       memset(&shader->shader, 0, sizeof(struct r600_shader));
+}
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h b/src/gallium/drivers/radeonsi/radeonsi_shader.h
new file mode 100644 (file)
index 0000000..cd742f5
--- /dev/null
@@ -0,0 +1,4 @@
+
+struct tgsi_token;
+
+void si_test(struct tgsi_token * token, unsigned type);
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
new file mode 100644 (file)
index 0000000..325445c
--- /dev/null
@@ -0,0 +1,7668 @@
+/*
+ * Southern Islands Register documentation
+ *
+ * Copyright (C) 2011  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SID_H
+#define SID_H
+
+/* si values */
+#define SI_CONFIG_REG_OFFSET                 0x00008000
+#define SI_CONFIG_REG_END                    0x0000B000
+#define SI_SH_REG_OFFSET                     0x0000B000
+#define SI_SH_REG_END                        0x0000C000
+#define SI_CONTEXT_REG_OFFSET                0x00028000
+#define SI_CONTEXT_REG_END                   0x00029000
+
+#define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
+#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
+#define EVENT_TYPE_ZPASS_DONE                  0x15
+#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH       0x1f
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS       0x20
+#define                EVENT_TYPE(x)                           ((x) << 0)
+#define                EVENT_INDEX(x)                          ((x) << 8)
+               /* EVENT_INDEX values:
+                * 0 - any non-TS event
+                * 1 - ZPASS_DONE
+                * 2 - SAMPLE_PIPELINESTAT
+                * 3 - SAMPLE_STREAMOUTSTAT*
+                * 4 - *S_PARTIAL_FLUSH
+                * 5 - TS events
+                */
+
+#define PREDICATION_OP_CLEAR 0x0
+#define PREDICATION_OP_ZPASS 0x1
+#define PREDICATION_OP_PRIMCOUNT 0x2
+
+#define PRED_OP(x) ((x) << 16)
+
+#define PREDICATION_CONTINUE (1 << 31)
+
+#define PREDICATION_HINT_WAIT (0 << 12)
+#define PREDICATION_HINT_NOWAIT_DRAW (1 << 12)
+
+#define PREDICATION_DRAW_NOT_VISIBLE (0 << 8)
+#define PREDICATION_DRAW_VISIBLE (1 << 8)
+
+#define R600_TEXEL_PITCH_ALIGNMENT_MASK        0x7
+
+#define PKT3_NOP                               0x10
+#define PKT3_SET_PREDICATION                   0x20
+#define PKT3_COND_EXEC                         0x22
+#define PKT3_PRED_EXEC                         0x23
+#define PKT3_START_3D_CMDBUF                   0x24
+#define PKT3_DRAW_INDEX_2                      0x27
+#define PKT3_CONTEXT_CONTROL                   0x28
+#define PKT3_INDEX_TYPE                        0x2A
+#define PKT3_DRAW_INDEX                        0x2B
+#define PKT3_DRAW_INDEX_AUTO                   0x2D
+#define PKT3_DRAW_INDEX_IMMD                   0x2E
+#define PKT3_NUM_INSTANCES                     0x2F
+#define PKT3_STRMOUT_BUFFER_UPDATE             0x34
+#define PKT3_MEM_SEMAPHORE                     0x39
+#define PKT3_MPEG_INDEX                        0x3A
+#define PKT3_WAIT_REG_MEM                      0x3C
+#define                WAIT_REG_MEM_EQUAL              3
+#define PKT3_MEM_WRITE                         0x3D
+#define PKT3_INDIRECT_BUFFER                   0x32
+#define PKT3_SURFACE_SYNC                      0x43
+#define PKT3_ME_INITIALIZE                     0x44
+#define PKT3_COND_WRITE                        0x45
+#define PKT3_EVENT_WRITE                       0x46
+#define PKT3_EVENT_WRITE_EOP                   0x47
+#define PKT3_EVENT_WRITE_EOS                   0x48
+#define PKT3_ONE_REG_WRITE                     0x57
+#define PKT3_SET_CONFIG_REG                    0x68
+#define PKT3_SET_CONTEXT_REG                   0x69
+#define PKT3_SET_SH_REG                        0x76
+
+#define PKT_TYPE_S(x)                   (((x) & 0x3) << 30)
+#define PKT_TYPE_G(x)                   (((x) >> 30) & 0x3)
+#define PKT_TYPE_C                      0x3FFFFFFF
+#define PKT_COUNT_S(x)                  (((x) & 0x3FFF) << 16)
+#define PKT_COUNT_G(x)                  (((x) >> 16) & 0x3FFF)
+#define PKT_COUNT_C                     0xC000FFFF
+#define PKT0_BASE_INDEX_S(x)            (((x) & 0xFFFF) << 0)
+#define PKT0_BASE_INDEX_G(x)            (((x) >> 0) & 0xFFFF)
+#define PKT0_BASE_INDEX_C               0xFFFF0000
+#define PKT3_IT_OPCODE_S(x)             (((x) & 0xFF) << 8)
+#define PKT3_IT_OPCODE_G(x)             (((x) >> 8) & 0xFF)
+#define PKT3_IT_OPCODE_C                0xFFFF00FF
+#define PKT3_PREDICATE(x)               (((x) >> 0) & 0x1)
+#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count))
+#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
+
+#define R_0084FC_CP_STRMOUT_CNTL                                       0x0084FC
+#define   S_0084FC_OFFSET_UPDATE_DONE(x)                             (((x) & 0x1) << 0)
+#define R_0085F0_CP_COHER_CNTL                                          0x0085F0
+#define   S_0085F0_DEST_BASE_0_ENA(x)                                 (((x) & 0x1) << 0)
+#define   G_0085F0_DEST_BASE_0_ENA(x)                                 (((x) >> 0) & 0x1)
+#define   C_0085F0_DEST_BASE_0_ENA                                    0xFFFFFFFE
+#define   S_0085F0_DEST_BASE_1_ENA(x)                                 (((x) & 0x1) << 1)
+#define   G_0085F0_DEST_BASE_1_ENA(x)                                 (((x) >> 1) & 0x1)
+#define   C_0085F0_DEST_BASE_1_ENA                                    0xFFFFFFFD
+#define   S_0085F0_CB0_DEST_BASE_ENA_SHIFT                                   6
+#define   S_0085F0_CB0_DEST_BASE_ENA(x)                               (((x) & 0x1) << 6)
+#define   G_0085F0_CB0_DEST_BASE_ENA(x)                               (((x) >> 6) & 0x1)
+#define   C_0085F0_CB0_DEST_BASE_ENA                                  0xFFFFFFBF
+#define   S_0085F0_CB1_DEST_BASE_ENA(x)                               (((x) & 0x1) << 7)
+#define   G_0085F0_CB1_DEST_BASE_ENA(x)                               (((x) >> 7) & 0x1)
+#define   C_0085F0_CB1_DEST_BASE_ENA                                  0xFFFFFF7F
+#define   S_0085F0_CB2_DEST_BASE_ENA(x)                               (((x) & 0x1) << 8)
+#define   G_0085F0_CB2_DEST_BASE_ENA(x)                               (((x) >> 8) & 0x1)
+#define   C_0085F0_CB2_DEST_BASE_ENA                                  0xFFFFFEFF
+#define   S_0085F0_CB3_DEST_BASE_ENA(x)                               (((x) & 0x1) << 9)
+#define   G_0085F0_CB3_DEST_BASE_ENA(x)                               (((x) >> 9) & 0x1)
+#define   C_0085F0_CB3_DEST_BASE_ENA                                  0xFFFFFDFF
+#define   S_0085F0_CB4_DEST_BASE_ENA(x)                               (((x) & 0x1) << 10)
+#define   G_0085F0_CB4_DEST_BASE_ENA(x)                               (((x) >> 10) & 0x1)
+#define   C_0085F0_CB4_DEST_BASE_ENA                                  0xFFFFFBFF
+#define   S_0085F0_CB5_DEST_BASE_ENA(x)                               (((x) & 0x1) << 11)
+#define   G_0085F0_CB5_DEST_BASE_ENA(x)                               (((x) >> 11) & 0x1)
+#define   C_0085F0_CB5_DEST_BASE_ENA                                  0xFFFFF7FF
+#define   S_0085F0_CB6_DEST_BASE_ENA(x)                               (((x) & 0x1) << 12)
+#define   G_0085F0_CB6_DEST_BASE_ENA(x)                               (((x) >> 12) & 0x1)
+#define   C_0085F0_CB6_DEST_BASE_ENA                                  0xFFFFEFFF
+#define   S_0085F0_CB7_DEST_BASE_ENA(x)                               (((x) & 0x1) << 13)
+#define   G_0085F0_CB7_DEST_BASE_ENA(x)                               (((x) >> 13) & 0x1)
+#define   C_0085F0_CB7_DEST_BASE_ENA                                  0xFFFFDFFF
+#define   S_0085F0_DB_DEST_BASE_ENA(x)                                (((x) & 0x1) << 14)
+#define   G_0085F0_DB_DEST_BASE_ENA(x)                                (((x) >> 14) & 0x1)
+#define   C_0085F0_DB_DEST_BASE_ENA                                   0xFFFFBFFF
+#define   S_0085F0_DEST_BASE_2_ENA(x)                                 (((x) & 0x1) << 19)
+#define   G_0085F0_DEST_BASE_2_ENA(x)                                 (((x) >> 19) & 0x1)
+#define   C_0085F0_DEST_BASE_2_ENA                                    0xFFF7FFFF
+#define   S_0085F0_DEST_BASE_3_ENA(x)                                 (((x) & 0x1) << 21)
+#define   G_0085F0_DEST_BASE_3_ENA(x)                                 (((x) >> 21) & 0x1)
+#define   C_0085F0_DEST_BASE_3_ENA                                    0xFFDFFFFF
+#define   S_0085F0_TCL1_ACTION_ENA(x)                                 (((x) & 0x1) << 22)
+#define   G_0085F0_TCL1_ACTION_ENA(x)                                 (((x) >> 22) & 0x1)
+#define   C_0085F0_TCL1_ACTION_ENA                                    0xFFBFFFFF
+#define   S_0085F0_TC_ACTION_ENA(x)                                   (((x) & 0x1) << 23)
+#define   G_0085F0_TC_ACTION_ENA(x)                                   (((x) >> 23) & 0x1)
+#define   C_0085F0_TC_ACTION_ENA                                      0xFF7FFFFF
+#define   S_0085F0_CB_ACTION_ENA(x)                                   (((x) & 0x1) << 25)
+#define   G_0085F0_CB_ACTION_ENA(x)                                   (((x) >> 25) & 0x1)
+#define   C_0085F0_CB_ACTION_ENA                                      0xFDFFFFFF
+#define   S_0085F0_DB_ACTION_ENA(x)                                   (((x) & 0x1) << 26)
+#define   G_0085F0_DB_ACTION_ENA(x)                                   (((x) >> 26) & 0x1)
+#define   C_0085F0_DB_ACTION_ENA                                      0xFBFFFFFF
+#define   S_0085F0_SH_KCACHE_ACTION_ENA(x)                            (((x) & 0x1) << 27)
+#define   G_0085F0_SH_KCACHE_ACTION_ENA(x)                            (((x) >> 27) & 0x1)
+#define   C_0085F0_SH_KCACHE_ACTION_ENA                               0xF7FFFFFF
+#define   S_0085F0_SH_ICACHE_ACTION_ENA(x)                            (((x) & 0x1) << 29)
+#define   G_0085F0_SH_ICACHE_ACTION_ENA(x)                            (((x) >> 29) & 0x1)
+#define   C_0085F0_SH_ICACHE_ACTION_ENA                               0xDFFFFFFF
+#define R_0085F4_CP_COHER_SIZE                                          0x0085F4
+#define R_0085F8_CP_COHER_BASE                                          0x0085F8
+#define R_0088B0_VGT_VTX_VECT_EJECT_REG                                 0x0088B0
+#define   S_0088B0_PRIM_COUNT(x)                                      (((x) & 0x3FF) << 0)
+#define   G_0088B0_PRIM_COUNT(x)                                      (((x) >> 0) & 0x3FF)
+#define   C_0088B0_PRIM_COUNT                                         0xFFFFFC00
+#define R_0088C4_VGT_CACHE_INVALIDATION                                 0x0088C4
+#define   S_0088C4_VS_NO_EXTRA_BUFFER(x)                              (((x) & 0x1) << 5)
+#define   G_0088C4_VS_NO_EXTRA_BUFFER(x)                              (((x) >> 5) & 0x1)
+#define   C_0088C4_VS_NO_EXTRA_BUFFER                                 0xFFFFFFDF
+#define   S_0088C4_STREAMOUT_FULL_FLUSH(x)                            (((x) & 0x1) << 13)
+#define   G_0088C4_STREAMOUT_FULL_FLUSH(x)                            (((x) >> 13) & 0x1)
+#define   C_0088C4_STREAMOUT_FULL_FLUSH                               0xFFFFDFFF
+#define   S_0088C4_ES_LIMIT(x)                                        (((x) & 0x1F) << 16)
+#define   G_0088C4_ES_LIMIT(x)                                        (((x) >> 16) & 0x1F)
+#define   C_0088C4_ES_LIMIT                                           0xFFE0FFFF
+#define R_0088C8_VGT_ESGS_RING_SIZE                                     0x0088C8
+#define R_0088CC_VGT_GSVS_RING_SIZE                                     0x0088CC
+#define R_0088D4_VGT_GS_VERTEX_REUSE                                    0x0088D4
+#define   S_0088D4_VERT_REUSE(x)                                      (((x) & 0x1F) << 0)
+#define   G_0088D4_VERT_REUSE(x)                                      (((x) >> 0) & 0x1F)
+#define   C_0088D4_VERT_REUSE                                         0xFFFFFFE0
+#define R_008958_VGT_PRIMITIVE_TYPE                                     0x008958
+#define   S_008958_PRIM_TYPE(x)                                       (((x) & 0x3F) << 0)
+#define   G_008958_PRIM_TYPE(x)                                       (((x) >> 0) & 0x3F)
+#define   C_008958_PRIM_TYPE                                          0xFFFFFFC0
+#define     V_008958_DI_PT_NONE                                     0x00
+#define     V_008958_DI_PT_POINTLIST                                0x01
+#define     V_008958_DI_PT_LINELIST                                 0x02
+#define     V_008958_DI_PT_LINESTRIP                                0x03
+#define     V_008958_DI_PT_TRILIST                                  0x04
+#define     V_008958_DI_PT_TRIFAN                                   0x05
+#define     V_008958_DI_PT_TRISTRIP                                 0x06
+#define     V_008958_DI_PT_UNUSED_0                                 0x07
+#define     V_008958_DI_PT_UNUSED_1                                 0x08
+#define     V_008958_DI_PT_PATCH                                    0x09
+#define     V_008958_DI_PT_LINELIST_ADJ                             0x0A
+#define     V_008958_DI_PT_LINESTRIP_ADJ                            0x0B
+#define     V_008958_DI_PT_TRILIST_ADJ                              0x0C
+#define     V_008958_DI_PT_TRISTRIP_ADJ                             0x0D
+#define     V_008958_DI_PT_UNUSED_3                                 0x0E
+#define     V_008958_DI_PT_UNUSED_4                                 0x0F
+#define     V_008958_DI_PT_TRI_WITH_WFLAGS                          0x10
+#define     V_008958_DI_PT_RECTLIST                                 0x11
+#define     V_008958_DI_PT_LINELOOP                                 0x12
+#define     V_008958_DI_PT_QUADLIST                                 0x13
+#define     V_008958_DI_PT_QUADSTRIP                                0x14
+#define     V_008958_DI_PT_POLYGON                                  0x15
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V0                     0x16
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V1                     0x17
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V2                     0x18
+#define     V_008958_DI_PT_2D_COPY_RECT_LIST_V3                     0x19
+#define     V_008958_DI_PT_2D_FILL_RECT_LIST                        0x1A
+#define     V_008958_DI_PT_2D_LINE_STRIP                            0x1B
+#define     V_008958_DI_PT_2D_TRI_STRIP                             0x1C
+#define R_00895C_VGT_INDEX_TYPE                                         0x00895C
+#define   S_00895C_INDEX_TYPE(x)                                      (((x) & 0x03) << 0)
+#define   G_00895C_INDEX_TYPE(x)                                      (((x) >> 0) & 0x03)
+#define   C_00895C_INDEX_TYPE                                         0xFFFFFFFC
+#define     V_00895C_DI_INDEX_SIZE_16_BIT                           0x00
+#define     V_00895C_DI_INDEX_SIZE_32_BIT                           0x01
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0                       0x008960
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1                       0x008964
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2                       0x008968
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3                       0x00896C
+#define R_008970_VGT_NUM_INDICES                                        0x008970
+#define R_008974_VGT_NUM_INSTANCES                                      0x008974
+#define R_008988_VGT_TF_RING_SIZE                                       0x008988
+#define   S_008988_SIZE(x)                                            (((x) & 0xFFFF) << 0)
+#define   G_008988_SIZE(x)                                            (((x) >> 0) & 0xFFFF)
+#define   C_008988_SIZE                                               0xFFFF0000
+#define R_0089B0_VGT_HS_OFFCHIP_PARAM                                   0x0089B0
+#define   S_0089B0_OFFCHIP_BUFFERING(x)                               (((x) & 0x7F) << 0)
+#define   G_0089B0_OFFCHIP_BUFFERING(x)                               (((x) >> 0) & 0x7F)
+#define   C_0089B0_OFFCHIP_BUFFERING                                  0xFFFFFF80
+#define R_0089B8_VGT_TF_MEMORY_BASE                                     0x0089B8
+#define R_008A14_PA_CL_ENHANCE                                          0x008A14
+#define   S_008A14_CLIP_VTX_REORDER_ENA(x)                            (((x) & 0x1) << 0)
+#define   G_008A14_CLIP_VTX_REORDER_ENA(x)                            (((x) >> 0) & 0x1)
+#define   C_008A14_CLIP_VTX_REORDER_ENA                               0xFFFFFFFE
+#define   S_008A14_NUM_CLIP_SEQ(x)                                    (((x) & 0x03) << 1)
+#define   G_008A14_NUM_CLIP_SEQ(x)                                    (((x) >> 1) & 0x03)
+#define   C_008A14_NUM_CLIP_SEQ                                       0xFFFFFFF9
+#define   S_008A14_CLIPPED_PRIM_SEQ_STALL(x)                          (((x) & 0x1) << 3)
+#define   G_008A14_CLIPPED_PRIM_SEQ_STALL(x)                          (((x) >> 3) & 0x1)
+#define   C_008A14_CLIPPED_PRIM_SEQ_STALL                             0xFFFFFFF7
+#define   S_008A14_VE_NAN_PROC_DISABLE(x)                             (((x) & 0x1) << 4)
+#define   G_008A14_VE_NAN_PROC_DISABLE(x)                             (((x) >> 4) & 0x1)
+#define   C_008A14_VE_NAN_PROC_DISABLE                                0xFFFFFFEF
+#define R_008A60_PA_SU_LINE_STIPPLE_VALUE                               0x008A60
+#define   S_008A60_LINE_STIPPLE_VALUE(x)                              (((x) & 0xFFFFFF) << 0)
+#define   G_008A60_LINE_STIPPLE_VALUE(x)                              (((x) >> 0) & 0xFFFFFF)
+#define   C_008A60_LINE_STIPPLE_VALUE                                 0xFF000000
+#define R_008B10_PA_SC_LINE_STIPPLE_STATE                               0x008B10
+#define   S_008B10_CURRENT_PTR(x)                                     (((x) & 0x0F) << 0)
+#define   G_008B10_CURRENT_PTR(x)                                     (((x) >> 0) & 0x0F)
+#define   C_008B10_CURRENT_PTR                                        0xFFFFFFF0
+#define   S_008B10_CURRENT_COUNT(x)                                   (((x) & 0xFF) << 8)
+#define   G_008B10_CURRENT_COUNT(x)                                   (((x) >> 8) & 0xFF)
+#define   C_008B10_CURRENT_COUNT                                      0xFFFF00FF
+#define R_008BF0_PA_SC_ENHANCE                                          0x008BF0
+#define   S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x)                       (((x) & 0x1) << 0)
+#define   G_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x)                       (((x) >> 0) & 0x1)
+#define   C_008BF0_ENABLE_PA_SC_OUT_OF_ORDER                          0xFFFFFFFE
+#define   S_008BF0_DISABLE_SC_DB_TILE_FIX(x)                          (((x) & 0x1) << 1)
+#define   G_008BF0_DISABLE_SC_DB_TILE_FIX(x)                          (((x) >> 1) & 0x1)
+#define   C_008BF0_DISABLE_SC_DB_TILE_FIX                             0xFFFFFFFD
+#define   S_008BF0_DISABLE_AA_MASK_FULL_FIX(x)                        (((x) & 0x1) << 2)
+#define   G_008BF0_DISABLE_AA_MASK_FULL_FIX(x)                        (((x) >> 2) & 0x1)
+#define   C_008BF0_DISABLE_AA_MASK_FULL_FIX                           0xFFFFFFFB
+#define   S_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS(x)                  (((x) & 0x1) << 3)
+#define   G_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS(x)                  (((x) >> 3) & 0x1)
+#define   C_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS                     0xFFFFFFF7
+#define   S_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID(x)               (((x) & 0x1) << 4)
+#define   G_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID(x)               (((x) >> 4) & 0x1)
+#define   C_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID                  0xFFFFFFEF
+#define   S_008BF0_DISABLE_SCISSOR_FIX(x)                             (((x) & 0x1) << 5)
+#define   G_008BF0_DISABLE_SCISSOR_FIX(x)                             (((x) >> 5) & 0x1)
+#define   C_008BF0_DISABLE_SCISSOR_FIX                                0xFFFFFFDF
+#define   S_008BF0_DISABLE_PW_BUBBLE_COLLAPSE(x)                      (((x) & 0x03) << 6)
+#define   G_008BF0_DISABLE_PW_BUBBLE_COLLAPSE(x)                      (((x) >> 6) & 0x03)
+#define   C_008BF0_DISABLE_PW_BUBBLE_COLLAPSE                         0xFFFFFF3F
+#define   S_008BF0_SEND_UNLIT_STILES_TO_PACKER(x)                     (((x) & 0x1) << 8)
+#define   G_008BF0_SEND_UNLIT_STILES_TO_PACKER(x)                     (((x) >> 8) & 0x1)
+#define   C_008BF0_SEND_UNLIT_STILES_TO_PACKER                        0xFFFFFEFF
+#define   S_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION(x)              (((x) & 0x1) << 9)
+#define   G_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION(x)              (((x) >> 9) & 0x1)
+#define   C_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION                 0xFFFFFDFF
+#define R_008C08_SQC_CACHES                                             0x008C08
+#define   S_008C08_INST_INVALIDATE(x)                                 (((x) & 0x1) << 0)
+#define   G_008C08_INST_INVALIDATE(x)                                 (((x) >> 0) & 0x1)
+#define   C_008C08_INST_INVALIDATE                                    0xFFFFFFFE
+#define   S_008C08_DATA_INVALIDATE(x)                                 (((x) & 0x1) << 1)
+#define   G_008C08_DATA_INVALIDATE(x)                                 (((x) >> 1) & 0x1)
+#define   C_008C08_DATA_INVALIDATE                                    0xFFFFFFFD
+#define R_008C0C_SQ_RANDOM_WAVE_PRI                                     0x008C0C
+#define   S_008C0C_RET(x)                                             (((x) & 0x7F) << 0)
+#define   G_008C0C_RET(x)                                             (((x) >> 0) & 0x7F)
+#define   C_008C0C_RET                                                0xFFFFFF80
+#define   S_008C0C_RUI(x)                                             (((x) & 0x07) << 7)
+#define   G_008C0C_RUI(x)                                             (((x) >> 7) & 0x07)
+#define   C_008C0C_RUI                                                0xFFFFFC7F
+#define   S_008C0C_RNG(x)                                             (((x) & 0x7FF) << 10)
+#define   G_008C0C_RNG(x)                                             (((x) >> 10) & 0x7FF)
+#define   C_008C0C_RNG                                                0xFFE003FF
+#if 0
+#define R_008DFC_SQ_INST                                                0x008DFC
+#define R_008DFC_SQ_VOP1                                                0x008DFC
+#define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
+#define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
+#define   C_008DFC_SRC0                                               0xFFFFFE00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_OP(x)                                              (((x) & 0xFF) << 9)
+#define   G_008DFC_OP(x)                                              (((x) >> 9) & 0xFF)
+#define   C_008DFC_OP                                                 0xFFFE01FF
+#define     V_008DFC_SQ_V_NOP                                       0x00
+#define     V_008DFC_SQ_V_MOV_B32                                   0x01
+#define     V_008DFC_SQ_V_READFIRSTLANE_B32                         0x02
+#define     V_008DFC_SQ_V_CVT_I32_F64                               0x03
+#define     V_008DFC_SQ_V_CVT_F64_I32                               0x04
+#define     V_008DFC_SQ_V_CVT_F32_I32                               0x05
+#define     V_008DFC_SQ_V_CVT_F32_U32                               0x06
+#define     V_008DFC_SQ_V_CVT_U32_F32                               0x07
+#define     V_008DFC_SQ_V_CVT_I32_F32                               0x08
+#define     V_008DFC_SQ_V_MOV_FED_B32                               0x09
+#define     V_008DFC_SQ_V_CVT_F16_F32                               0x0A
+#define     V_008DFC_SQ_V_CVT_F32_F16                               0x0B
+#define     V_008DFC_SQ_V_CVT_RPI_I32_F32                           0x0C
+#define     V_008DFC_SQ_V_CVT_FLR_I32_F32                           0x0D
+#define     V_008DFC_SQ_V_CVT_OFF_F32_I4                            0x0E
+#define     V_008DFC_SQ_V_CVT_F32_F64                               0x0F
+#define     V_008DFC_SQ_V_CVT_F64_F32                               0x10
+#define     V_008DFC_SQ_V_CVT_F32_UBYTE0                            0x11
+#define     V_008DFC_SQ_V_CVT_F32_UBYTE1                            0x12
+#define     V_008DFC_SQ_V_CVT_F32_UBYTE2                            0x13
+#define     V_008DFC_SQ_V_CVT_F32_UBYTE3                            0x14
+#define     V_008DFC_SQ_V_CVT_U32_F64                               0x15
+#define     V_008DFC_SQ_V_CVT_F64_U32                               0x16
+#define     V_008DFC_SQ_V_FRACT_F32                                 0x20
+#define     V_008DFC_SQ_V_TRUNC_F32                                 0x21
+#define     V_008DFC_SQ_V_CEIL_F32                                  0x22
+#define     V_008DFC_SQ_V_RNDNE_F32                                 0x23
+#define     V_008DFC_SQ_V_FLOOR_F32                                 0x24
+#define     V_008DFC_SQ_V_EXP_F32                                   0x25
+#define     V_008DFC_SQ_V_LOG_CLAMP_F32                             0x26
+#define     V_008DFC_SQ_V_LOG_F32                                   0x27
+#define     V_008DFC_SQ_V_RCP_CLAMP_F32                             0x28
+#define     V_008DFC_SQ_V_RCP_LEGACY_F32                            0x29
+#define     V_008DFC_SQ_V_RCP_F32                                   0x2A
+#define     V_008DFC_SQ_V_RCP_IFLAG_F32                             0x2B
+#define     V_008DFC_SQ_V_RSQ_CLAMP_F32                             0x2C
+#define     V_008DFC_SQ_V_RSQ_LEGACY_F32                            0x2D
+#define     V_008DFC_SQ_V_RSQ_F32                                   0x2E
+#define     V_008DFC_SQ_V_RCP_F64                                   0x2F
+#define     V_008DFC_SQ_V_RCP_CLAMP_F64                             0x30
+#define     V_008DFC_SQ_V_RSQ_F64                                   0x31
+#define     V_008DFC_SQ_V_RSQ_CLAMP_F64                             0x32
+#define     V_008DFC_SQ_V_SQRT_F32                                  0x33
+#define     V_008DFC_SQ_V_SQRT_F64                                  0x34
+#define     V_008DFC_SQ_V_SIN_F32                                   0x35
+#define     V_008DFC_SQ_V_COS_F32                                   0x36
+#define     V_008DFC_SQ_V_NOT_B32                                   0x37
+#define     V_008DFC_SQ_V_BFREV_B32                                 0x38
+#define     V_008DFC_SQ_V_FFBH_U32                                  0x39
+#define     V_008DFC_SQ_V_FFBL_B32                                  0x3A
+#define     V_008DFC_SQ_V_FFBH_I32                                  0x3B
+#define     V_008DFC_SQ_V_FREXP_EXP_I32_F64                         0x3C
+#define     V_008DFC_SQ_V_FREXP_MANT_F64                            0x3D
+#define     V_008DFC_SQ_V_FRACT_F64                                 0x3E
+#define     V_008DFC_SQ_V_FREXP_EXP_I32_F32                         0x3F
+#define     V_008DFC_SQ_V_FREXP_MANT_F32                            0x40
+#define     V_008DFC_SQ_V_CLREXCP                                   0x41
+#define     V_008DFC_SQ_V_MOVRELD_B32                               0x42
+#define     V_008DFC_SQ_V_MOVRELS_B32                               0x43
+#define     V_008DFC_SQ_V_MOVRELSD_B32                              0x44
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 17)
+#define   G_008DFC_VDST(x)                                            (((x) >> 17) & 0xFF)
+#define   C_008DFC_VDST                                               0xFE01FFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x7F) << 25)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 25) & 0x7F)
+#define   C_008DFC_ENCODING                                           0x01FFFFFF
+#define     V_008DFC_SQ_ENC_VOP1_FIELD                              0x3F
+#define R_008DFC_SQ_MIMG_1                                              0x008DFC
+#define   S_008DFC_VADDR(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_VADDR(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_VADDR                                              0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VDATA(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_VDATA(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_VDATA                                              0xFFFF00FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_SRSRC(x)                                           (((x) & 0x1F) << 16)
+#define   G_008DFC_SRSRC(x)                                           (((x) >> 16) & 0x1F)
+#define   C_008DFC_SRSRC                                              0xFFE0FFFF
+#define   S_008DFC_SSAMP(x)                                           (((x) & 0x1F) << 21)
+#define   G_008DFC_SSAMP(x)                                           (((x) >> 21) & 0x1F)
+#define   C_008DFC_SSAMP                                              0xFC1FFFFF
+#define R_008DFC_SQ_VOP3_1                                              0x008DFC
+#define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
+#define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
+#define   C_008DFC_SRC0                                               0xFFFFFE00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_SRC1(x)                                            (((x) & 0x1FF) << 9)
+#define   G_008DFC_SRC1(x)                                            (((x) >> 9) & 0x1FF)
+#define   C_008DFC_SRC1                                               0xFFFC01FF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_SRC2(x)                                            (((x) & 0x1FF) << 18)
+#define   G_008DFC_SRC2(x)                                            (((x) >> 18) & 0x1FF)
+#define   C_008DFC_SRC2                                               0xF803FFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_OMOD(x)                                            (((x) & 0x03) << 27)
+#define   G_008DFC_OMOD(x)                                            (((x) >> 27) & 0x03)
+#define   C_008DFC_OMOD                                               0xE7FFFFFF
+#define     V_008DFC_SQ_OMOD_OFF                                    0x00
+#define     V_008DFC_SQ_OMOD_M2                                     0x01
+#define     V_008DFC_SQ_OMOD_M4                                     0x02
+#define     V_008DFC_SQ_OMOD_D2                                     0x03
+#define   S_008DFC_NEG(x)                                             (((x) & 0x07) << 29)
+#define   G_008DFC_NEG(x)                                             (((x) >> 29) & 0x07)
+#define   C_008DFC_NEG                                                0x1FFFFFFF
+#define R_008DFC_SQ_MUBUF_1                                             0x008DFC
+#define   S_008DFC_VADDR(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_VADDR(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_VADDR                                              0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VDATA(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_VDATA(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_VDATA                                              0xFFFF00FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_SRSRC(x)                                           (((x) & 0x1F) << 16)
+#define   G_008DFC_SRSRC(x)                                           (((x) >> 16) & 0x1F)
+#define   C_008DFC_SRSRC                                              0xFFE0FFFF
+#define   S_008DFC_SLC(x)                                             (((x) & 0x1) << 22)
+#define   G_008DFC_SLC(x)                                             (((x) >> 22) & 0x1)
+#define   C_008DFC_SLC                                                0xFFBFFFFF
+#define   S_008DFC_TFE(x)                                             (((x) & 0x1) << 23)
+#define   G_008DFC_TFE(x)                                             (((x) >> 23) & 0x1)
+#define   C_008DFC_TFE                                                0xFF7FFFFF
+#define   S_008DFC_SOFFSET(x)                                         (((x) & 0xFF) << 24)
+#define   G_008DFC_SOFFSET(x)                                         (((x) >> 24) & 0xFF)
+#define   C_008DFC_SOFFSET                                            0x00FFFFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define R_008DFC_SQ_DS_0                                                0x008DFC
+#define   S_008DFC_OFFSET0(x)                                         (((x) & 0xFF) << 0)
+#define   G_008DFC_OFFSET0(x)                                         (((x) >> 0) & 0xFF)
+#define   C_008DFC_OFFSET0                                            0xFFFFFF00
+#define   S_008DFC_OFFSET1(x)                                         (((x) & 0xFF) << 8)
+#define   G_008DFC_OFFSET1(x)                                         (((x) >> 8) & 0xFF)
+#define   C_008DFC_OFFSET1                                            0xFFFF00FF
+#define   S_008DFC_GDS(x)                                             (((x) & 0x1) << 17)
+#define   G_008DFC_GDS(x)                                             (((x) >> 17) & 0x1)
+#define   C_008DFC_GDS                                                0xFFFDFFFF
+#define   S_008DFC_OP(x)                                              (((x) & 0xFF) << 18)
+#define   G_008DFC_OP(x)                                              (((x) >> 18) & 0xFF)
+#define   C_008DFC_OP                                                 0xFC03FFFF
+#define     V_008DFC_SQ_DS_ADD_U32                                  0x00
+#define     V_008DFC_SQ_DS_SUB_U32                                  0x01
+#define     V_008DFC_SQ_DS_RSUB_U32                                 0x02
+#define     V_008DFC_SQ_DS_INC_U32                                  0x03
+#define     V_008DFC_SQ_DS_DEC_U32                                  0x04
+#define     V_008DFC_SQ_DS_MIN_I32                                  0x05
+#define     V_008DFC_SQ_DS_MAX_I32                                  0x06
+#define     V_008DFC_SQ_DS_MIN_U32                                  0x07
+#define     V_008DFC_SQ_DS_MAX_U32                                  0x08
+#define     V_008DFC_SQ_DS_AND_B32                                  0x09
+#define     V_008DFC_SQ_DS_OR_B32                                   0x0A
+#define     V_008DFC_SQ_DS_XOR_B32                                  0x0B
+#define     V_008DFC_SQ_DS_MSKOR_B32                                0x0C
+#define     V_008DFC_SQ_DS_WRITE_B32                                0x0D
+#define     V_008DFC_SQ_DS_WRITE2_B32                               0x0E
+#define     V_008DFC_SQ_DS_WRITE2ST64_B32                           0x0F
+#define     V_008DFC_SQ_DS_CMPST_B32                                0x10
+#define     V_008DFC_SQ_DS_CMPST_F32                                0x11
+#define     V_008DFC_SQ_DS_MIN_F32                                  0x12
+#define     V_008DFC_SQ_DS_MAX_F32                                  0x13
+#define     V_008DFC_SQ_DS_GWS_INIT                                 0x19
+#define     V_008DFC_SQ_DS_GWS_SEMA_V                               0x1A
+#define     V_008DFC_SQ_DS_GWS_SEMA_BR                              0x1B
+#define     V_008DFC_SQ_DS_GWS_SEMA_P                               0x1C
+#define     V_008DFC_SQ_DS_GWS_BARRIER                              0x1D
+#define     V_008DFC_SQ_DS_WRITE_B8                                 0x1E
+#define     V_008DFC_SQ_DS_WRITE_B16                                0x1F
+#define     V_008DFC_SQ_DS_ADD_RTN_U32                              0x20
+#define     V_008DFC_SQ_DS_SUB_RTN_U32                              0x21
+#define     V_008DFC_SQ_DS_RSUB_RTN_U32                             0x22
+#define     V_008DFC_SQ_DS_INC_RTN_U32                              0x23
+#define     V_008DFC_SQ_DS_DEC_RTN_U32                              0x24
+#define     V_008DFC_SQ_DS_MIN_RTN_I32                              0x25
+#define     V_008DFC_SQ_DS_MAX_RTN_I32                              0x26
+#define     V_008DFC_SQ_DS_MIN_RTN_U32                              0x27
+#define     V_008DFC_SQ_DS_MAX_RTN_U32                              0x28
+#define     V_008DFC_SQ_DS_AND_RTN_B32                              0x29
+#define     V_008DFC_SQ_DS_OR_RTN_B32                               0x2A
+#define     V_008DFC_SQ_DS_XOR_RTN_B32                              0x2B
+#define     V_008DFC_SQ_DS_MSKOR_RTN_B32                            0x2C
+#define     V_008DFC_SQ_DS_WRXCHG_RTN_B32                           0x2D
+#define     V_008DFC_SQ_DS_WRXCHG2_RTN_B32                          0x2E
+#define     V_008DFC_SQ_DS_WRXCHG2ST64_RTN_B32                      0x2F
+#define     V_008DFC_SQ_DS_CMPST_RTN_B32                            0x30
+#define     V_008DFC_SQ_DS_CMPST_RTN_F32                            0x31
+#define     V_008DFC_SQ_DS_MIN_RTN_F32                              0x32
+#define     V_008DFC_SQ_DS_MAX_RTN_F32                              0x33
+#define     V_008DFC_SQ_DS_SWIZZLE_B32                              0x35
+#define     V_008DFC_SQ_DS_READ_B32                                 0x36
+#define     V_008DFC_SQ_DS_READ2_B32                                0x37
+#define     V_008DFC_SQ_DS_READ2ST64_B32                            0x38
+#define     V_008DFC_SQ_DS_READ_I8                                  0x39
+#define     V_008DFC_SQ_DS_READ_U8                                  0x3A
+#define     V_008DFC_SQ_DS_READ_I16                                 0x3B
+#define     V_008DFC_SQ_DS_READ_U16                                 0x3C
+#define     V_008DFC_SQ_DS_CONSUME                                  0x3D
+#define     V_008DFC_SQ_DS_APPEND                                   0x3E
+#define     V_008DFC_SQ_DS_ORDERED_COUNT                            0x3F
+#define     V_008DFC_SQ_DS_ADD_U64                                  0x40
+#define     V_008DFC_SQ_DS_SUB_U64                                  0x41
+#define     V_008DFC_SQ_DS_RSUB_U64                                 0x42
+#define     V_008DFC_SQ_DS_INC_U64                                  0x43
+#define     V_008DFC_SQ_DS_DEC_U64                                  0x44
+#define     V_008DFC_SQ_DS_MIN_I64                                  0x45
+#define     V_008DFC_SQ_DS_MAX_I64                                  0x46
+#define     V_008DFC_SQ_DS_MIN_U64                                  0x47
+#define     V_008DFC_SQ_DS_MAX_U64                                  0x48
+#define     V_008DFC_SQ_DS_AND_B64                                  0x49
+#define     V_008DFC_SQ_DS_OR_B64                                   0x4A
+#define     V_008DFC_SQ_DS_XOR_B64                                  0x4B
+#define     V_008DFC_SQ_DS_MSKOR_B64                                0x4C
+#define     V_008DFC_SQ_DS_WRITE_B64                                0x4D
+#define     V_008DFC_SQ_DS_WRITE2_B64                               0x4E
+#define     V_008DFC_SQ_DS_WRITE2ST64_B64                           0x4F
+#define     V_008DFC_SQ_DS_CMPST_B64                                0x50
+#define     V_008DFC_SQ_DS_CMPST_F64                                0x51
+#define     V_008DFC_SQ_DS_MIN_F64                                  0x52
+#define     V_008DFC_SQ_DS_MAX_F64                                  0x53
+#define     V_008DFC_SQ_DS_ADD_RTN_U64                              0x60
+#define     V_008DFC_SQ_DS_SUB_RTN_U64                              0x61
+#define     V_008DFC_SQ_DS_RSUB_RTN_U64                             0x62
+#define     V_008DFC_SQ_DS_INC_RTN_U64                              0x63
+#define     V_008DFC_SQ_DS_DEC_RTN_U64                              0x64
+#define     V_008DFC_SQ_DS_MIN_RTN_I64                              0x65
+#define     V_008DFC_SQ_DS_MAX_RTN_I64                              0x66
+#define     V_008DFC_SQ_DS_MIN_RTN_U64                              0x67
+#define     V_008DFC_SQ_DS_MAX_RTN_U64                              0x68
+#define     V_008DFC_SQ_DS_AND_RTN_B64                              0x69
+#define     V_008DFC_SQ_DS_OR_RTN_B64                               0x6A
+#define     V_008DFC_SQ_DS_XOR_RTN_B64                              0x6B
+#define     V_008DFC_SQ_DS_MSKOR_RTN_B64                            0x6C
+#define     V_008DFC_SQ_DS_WRXCHG_RTN_B64                           0x6D
+#define     V_008DFC_SQ_DS_WRXCHG2_RTN_B64                          0x6E
+#define     V_008DFC_SQ_DS_WRXCHG2ST64_RTN_B64                      0x6F
+#define     V_008DFC_SQ_DS_CMPST_RTN_B64                            0x70
+#define     V_008DFC_SQ_DS_CMPST_RTN_F64                            0x71
+#define     V_008DFC_SQ_DS_MIN_RTN_F64                              0x72
+#define     V_008DFC_SQ_DS_MAX_RTN_F64                              0x73
+#define     V_008DFC_SQ_DS_READ_B64                                 0x76
+#define     V_008DFC_SQ_DS_READ2_B64                                0x77
+#define     V_008DFC_SQ_DS_READ2ST64_B64                            0x78
+#define     V_008DFC_SQ_DS_ADD_SRC2_U32                             0x80
+#define     V_008DFC_SQ_DS_SUB_SRC2_U32                             0x81
+#define     V_008DFC_SQ_DS_RSUB_SRC2_U32                            0x82
+#define     V_008DFC_SQ_DS_INC_SRC2_U32                             0x83
+#define     V_008DFC_SQ_DS_DEC_SRC2_U32                             0x84
+#define     V_008DFC_SQ_DS_MIN_SRC2_I32                             0x85
+#define     V_008DFC_SQ_DS_MAX_SRC2_I32                             0x86
+#define     V_008DFC_SQ_DS_MIN_SRC2_U32                             0x87
+#define     V_008DFC_SQ_DS_MAX_SRC2_U32                             0x88
+#define     V_008DFC_SQ_DS_AND_SRC2_B32                             0x89
+#define     V_008DFC_SQ_DS_OR_SRC2_B32                              0x8A
+#define     V_008DFC_SQ_DS_XOR_SRC2_B32                             0x8B
+#define     V_008DFC_SQ_DS_WRITE_SRC2_B32                           0x8D
+#define     V_008DFC_SQ_DS_MIN_SRC2_F32                             0x92
+#define     V_008DFC_SQ_DS_MAX_SRC2_F32                             0x93
+#define     V_008DFC_SQ_DS_ADD_SRC2_U64                             0xC0
+#define     V_008DFC_SQ_DS_SUB_SRC2_U64                             0xC1
+#define     V_008DFC_SQ_DS_RSUB_SRC2_U64                            0xC2
+#define     V_008DFC_SQ_DS_INC_SRC2_U64                             0xC3
+#define     V_008DFC_SQ_DS_DEC_SRC2_U64                             0xC4
+#define     V_008DFC_SQ_DS_MIN_SRC2_I64                             0xC5
+#define     V_008DFC_SQ_DS_MAX_SRC2_I64                             0xC6
+#define     V_008DFC_SQ_DS_MIN_SRC2_U64                             0xC7
+#define     V_008DFC_SQ_DS_MAX_SRC2_U64                             0xC8
+#define     V_008DFC_SQ_DS_AND_SRC2_B64                             0xC9
+#define     V_008DFC_SQ_DS_OR_SRC2_B64                              0xCA
+#define     V_008DFC_SQ_DS_XOR_SRC2_B64                             0xCB
+#define     V_008DFC_SQ_DS_WRITE_SRC2_B64                           0xCD
+#define     V_008DFC_SQ_DS_MIN_SRC2_F64                             0xD2
+#define     V_008DFC_SQ_DS_MAX_SRC2_F64                             0xD3
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_DS_FIELD                                0x36
+#define R_008DFC_SQ_SOPC                                                0x008DFC
+#define   S_008DFC_SSRC0(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_SSRC0(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_SSRC0                                              0xFFFFFF00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define   S_008DFC_SSRC1(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_SSRC1(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_SSRC1                                              0xFFFF00FF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define   S_008DFC_OP(x)                                              (((x) & 0x7F) << 16)
+#define   G_008DFC_OP(x)                                              (((x) >> 16) & 0x7F)
+#define   C_008DFC_OP                                                 0xFF80FFFF
+#define     V_008DFC_SQ_S_CMP_EQ_I32                                0x00
+#define     V_008DFC_SQ_S_CMP_LG_I32                                0x01
+#define     V_008DFC_SQ_S_CMP_GT_I32                                0x02
+#define     V_008DFC_SQ_S_CMP_GE_I32                                0x03
+#define     V_008DFC_SQ_S_CMP_LT_I32                                0x04
+#define     V_008DFC_SQ_S_CMP_LE_I32                                0x05
+#define     V_008DFC_SQ_S_CMP_EQ_U32                                0x06
+#define     V_008DFC_SQ_S_CMP_LG_U32                                0x07
+#define     V_008DFC_SQ_S_CMP_GT_U32                                0x08
+#define     V_008DFC_SQ_S_CMP_GE_U32                                0x09
+#define     V_008DFC_SQ_S_CMP_LT_U32                                0x0A
+#define     V_008DFC_SQ_S_CMP_LE_U32                                0x0B
+#define     V_008DFC_SQ_S_BITCMP0_B32                               0x0C
+#define     V_008DFC_SQ_S_BITCMP1_B32                               0x0D
+#define     V_008DFC_SQ_S_BITCMP0_B64                               0x0E
+#define     V_008DFC_SQ_S_BITCMP1_B64                               0x0F
+#define     V_008DFC_SQ_S_SETVSKIP                                  0x10
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x1FF) << 23)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 23) & 0x1FF)
+#define   C_008DFC_ENCODING                                           0x007FFFFF
+#define     V_008DFC_SQ_ENC_SOPC_FIELD                              0x17E
+#endif
+#define R_008DFC_SQ_EXP_0                                               0x008DFC
+#define   S_008DFC_EN(x)                                              (((x) & 0x0F) << 0)
+#define   G_008DFC_EN(x)                                              (((x) >> 0) & 0x0F)
+#define   C_008DFC_EN                                                 0xFFFFFFF0
+#define   S_008DFC_TGT(x)                                             (((x) & 0x3F) << 4)
+#define   G_008DFC_TGT(x)                                             (((x) >> 4) & 0x3F)
+#define   C_008DFC_TGT                                                0xFFFFFC0F
+#define     V_008DFC_SQ_EXP_MRT                                     0x00
+#define     V_008DFC_SQ_EXP_MRTZ                                    0x08
+#define     V_008DFC_SQ_EXP_NULL                                    0x09
+#define     V_008DFC_SQ_EXP_POS                                     0x0C
+#define     V_008DFC_SQ_EXP_PARAM                                   0x20
+#define   S_008DFC_COMPR(x)                                           (((x) & 0x1) << 10)
+#define   G_008DFC_COMPR(x)                                           (((x) >> 10) & 0x1)
+#define   C_008DFC_COMPR                                              0xFFFFFBFF
+#define   S_008DFC_DONE(x)                                            (((x) & 0x1) << 11)
+#define   G_008DFC_DONE(x)                                            (((x) >> 11) & 0x1)
+#define   C_008DFC_DONE                                               0xFFFFF7FF
+#define   S_008DFC_VM(x)                                              (((x) & 0x1) << 12)
+#define   G_008DFC_VM(x)                                              (((x) >> 12) & 0x1)
+#define   C_008DFC_VM                                                 0xFFFFEFFF
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_EXP_FIELD                               0x3E
+#if 0
+#define R_008DFC_SQ_MIMG_0                                              0x008DFC
+#define   S_008DFC_DMASK(x)                                           (((x) & 0x0F) << 8)
+#define   G_008DFC_DMASK(x)                                           (((x) >> 8) & 0x0F)
+#define   C_008DFC_DMASK                                              0xFFFFF0FF
+#define   S_008DFC_UNORM(x)                                           (((x) & 0x1) << 12)
+#define   G_008DFC_UNORM(x)                                           (((x) >> 12) & 0x1)
+#define   C_008DFC_UNORM                                              0xFFFFEFFF
+#define   S_008DFC_GLC(x)                                             (((x) & 0x1) << 13)
+#define   G_008DFC_GLC(x)                                             (((x) >> 13) & 0x1)
+#define   C_008DFC_GLC                                                0xFFFFDFFF
+#define   S_008DFC_DA(x)                                              (((x) & 0x1) << 14)
+#define   G_008DFC_DA(x)                                              (((x) >> 14) & 0x1)
+#define   C_008DFC_DA                                                 0xFFFFBFFF
+#define   S_008DFC_R128(x)                                            (((x) & 0x1) << 15)
+#define   G_008DFC_R128(x)                                            (((x) >> 15) & 0x1)
+#define   C_008DFC_R128                                               0xFFFF7FFF
+#define   S_008DFC_TFE(x)                                             (((x) & 0x1) << 16)
+#define   G_008DFC_TFE(x)                                             (((x) >> 16) & 0x1)
+#define   C_008DFC_TFE                                                0xFFFEFFFF
+#define   S_008DFC_LWE(x)                                             (((x) & 0x1) << 17)
+#define   G_008DFC_LWE(x)                                             (((x) >> 17) & 0x1)
+#define   C_008DFC_LWE                                                0xFFFDFFFF
+#define   S_008DFC_OP(x)                                              (((x) & 0x7F) << 18)
+#define   G_008DFC_OP(x)                                              (((x) >> 18) & 0x7F)
+#define   C_008DFC_OP                                                 0xFE03FFFF
+#define     V_008DFC_SQ_IMAGE_LOAD                                  0x00
+#define     V_008DFC_SQ_IMAGE_LOAD_MIP                              0x01
+#define     V_008DFC_SQ_IMAGE_LOAD_PCK                              0x02
+#define     V_008DFC_SQ_IMAGE_LOAD_PCK_SGN                          0x03
+#define     V_008DFC_SQ_IMAGE_LOAD_MIP_PCK                          0x04
+#define     V_008DFC_SQ_IMAGE_LOAD_MIP_PCK_SGN                      0x05
+#define     V_008DFC_SQ_IMAGE_STORE                                 0x08
+#define     V_008DFC_SQ_IMAGE_STORE_MIP                             0x09
+#define     V_008DFC_SQ_IMAGE_STORE_PCK                             0x0A
+#define     V_008DFC_SQ_IMAGE_STORE_MIP_PCK                         0x0B
+#define     V_008DFC_SQ_IMAGE_GET_RESINFO                           0x0E
+#define     V_008DFC_SQ_IMAGE_ATOMIC_SWAP                           0x0F
+#define     V_008DFC_SQ_IMAGE_ATOMIC_CMPSWAP                        0x10
+#define     V_008DFC_SQ_IMAGE_ATOMIC_ADD                            0x11
+#define     V_008DFC_SQ_IMAGE_ATOMIC_SUB                            0x12
+#define     V_008DFC_SQ_IMAGE_ATOMIC_RSUB                           0x13
+#define     V_008DFC_SQ_IMAGE_ATOMIC_SMIN                           0x14
+#define     V_008DFC_SQ_IMAGE_ATOMIC_UMIN                           0x15
+#define     V_008DFC_SQ_IMAGE_ATOMIC_SMAX                           0x16
+#define     V_008DFC_SQ_IMAGE_ATOMIC_UMAX                           0x17
+#define     V_008DFC_SQ_IMAGE_ATOMIC_AND                            0x18
+#define     V_008DFC_SQ_IMAGE_ATOMIC_OR                             0x19
+#define     V_008DFC_SQ_IMAGE_ATOMIC_XOR                            0x1A
+#define     V_008DFC_SQ_IMAGE_ATOMIC_INC                            0x1B
+#define     V_008DFC_SQ_IMAGE_ATOMIC_DEC                            0x1C
+#define     V_008DFC_SQ_IMAGE_ATOMIC_FCMPSWAP                       0x1D
+#define     V_008DFC_SQ_IMAGE_ATOMIC_FMIN                           0x1E
+#define     V_008DFC_SQ_IMAGE_ATOMIC_FMAX                           0x1F
+#define     V_008DFC_SQ_IMAGE_SAMPLE                                0x20
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CL                             0x21
+#define     V_008DFC_SQ_IMAGE_SAMPLE_D                              0x22
+#define     V_008DFC_SQ_IMAGE_SAMPLE_D_CL                           0x23
+#define     V_008DFC_SQ_IMAGE_SAMPLE_L                              0x24
+#define     V_008DFC_SQ_IMAGE_SAMPLE_B                              0x25
+#define     V_008DFC_SQ_IMAGE_SAMPLE_B_CL                           0x26
+#define     V_008DFC_SQ_IMAGE_SAMPLE_LZ                             0x27
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C                              0x28
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CL                           0x29
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_D                            0x2A
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_D_CL                         0x2B
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_L                            0x2C
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_B                            0x2D
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_B_CL                         0x2E
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_LZ                           0x2F
+#define     V_008DFC_SQ_IMAGE_SAMPLE_O                              0x30
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CL_O                           0x31
+#define     V_008DFC_SQ_IMAGE_SAMPLE_D_O                            0x32
+#define     V_008DFC_SQ_IMAGE_SAMPLE_D_CL_O                         0x33
+#define     V_008DFC_SQ_IMAGE_SAMPLE_L_O                            0x34
+#define     V_008DFC_SQ_IMAGE_SAMPLE_B_O                            0x35
+#define     V_008DFC_SQ_IMAGE_SAMPLE_B_CL_O                         0x36
+#define     V_008DFC_SQ_IMAGE_SAMPLE_LZ_O                           0x37
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_O                            0x38
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CL_O                         0x39
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_D_O                          0x3A
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_D_CL_O                       0x3B
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_L_O                          0x3C
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_B_O                          0x3D
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_B_CL_O                       0x3E
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_LZ_O                         0x3F
+#define     V_008DFC_SQ_IMAGE_GATHER4                               0x40
+#define     V_008DFC_SQ_IMAGE_GATHER4_CL                            0x41
+#define     V_008DFC_SQ_IMAGE_GATHER4_L                             0x44
+#define     V_008DFC_SQ_IMAGE_GATHER4_B                             0x45
+#define     V_008DFC_SQ_IMAGE_GATHER4_B_CL                          0x46
+#define     V_008DFC_SQ_IMAGE_GATHER4_LZ                            0x47
+#define     V_008DFC_SQ_IMAGE_GATHER4_C                             0x48
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_CL                          0x49
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_L                           0x4C
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_B                           0x4D
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_B_CL                        0x4E
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_LZ                          0x4F
+#define     V_008DFC_SQ_IMAGE_GATHER4_O                             0x50
+#define     V_008DFC_SQ_IMAGE_GATHER4_CL_O                          0x51
+#define     V_008DFC_SQ_IMAGE_GATHER4_L_O                           0x54
+#define     V_008DFC_SQ_IMAGE_GATHER4_B_O                           0x55
+#define     V_008DFC_SQ_IMAGE_GATHER4_B_CL_O                        0x56
+#define     V_008DFC_SQ_IMAGE_GATHER4_LZ_O                          0x57
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_O                           0x58
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_CL_O                        0x59
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_L_O                         0x5C
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_B_O                         0x5D
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_B_CL_O                      0x5E
+#define     V_008DFC_SQ_IMAGE_GATHER4_C_LZ_O                        0x5F
+#define     V_008DFC_SQ_IMAGE_GET_LOD                               0x60
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CD                             0x68
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CD_CL                          0x69
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CD                           0x6A
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CD_CL                        0x6B
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CD_O                           0x6C
+#define     V_008DFC_SQ_IMAGE_SAMPLE_CD_CL_O                        0x6D
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CD_O                         0x6E
+#define     V_008DFC_SQ_IMAGE_SAMPLE_C_CD_CL_O                      0x6F
+#define     V_008DFC_SQ_IMAGE_RSRC256                               0x7E
+#define     V_008DFC_SQ_IMAGE_SAMPLER                               0x7F
+#define   S_008DFC_SLC(x)                                             (((x) & 0x1) << 25)
+#define   G_008DFC_SLC(x)                                             (((x) >> 25) & 0x1)
+#define   C_008DFC_SLC                                                0xFDFFFFFF
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_MIMG_FIELD                              0x3C
+#define R_008DFC_SQ_SOPP                                                0x008DFC
+#define   S_008DFC_SIMM16(x)                                          (((x) & 0xFFFF) << 0)
+#define   G_008DFC_SIMM16(x)                                          (((x) >> 0) & 0xFFFF)
+#define   C_008DFC_SIMM16                                             0xFFFF0000
+#define   S_008DFC_OP(x)                                              (((x) & 0x7F) << 16)
+#define   G_008DFC_OP(x)                                              (((x) >> 16) & 0x7F)
+#define   C_008DFC_OP                                                 0xFF80FFFF
+#define     V_008DFC_SQ_S_NOP                                       0x00
+#define     V_008DFC_SQ_S_ENDPGM                                    0x01
+#define     V_008DFC_SQ_S_BRANCH                                    0x02
+#define     V_008DFC_SQ_S_CBRANCH_SCC0                              0x04
+#define     V_008DFC_SQ_S_CBRANCH_SCC1                              0x05
+#define     V_008DFC_SQ_S_CBRANCH_VCCZ                              0x06
+#define     V_008DFC_SQ_S_CBRANCH_VCCNZ                             0x07
+#define     V_008DFC_SQ_S_CBRANCH_EXECZ                             0x08
+#define     V_008DFC_SQ_S_CBRANCH_EXECNZ                            0x09
+#define     V_008DFC_SQ_S_BARRIER                                   0x0A
+#define     V_008DFC_SQ_S_WAITCNT                                   0x0C
+#define     V_008DFC_SQ_S_SETHALT                                   0x0D
+#define     V_008DFC_SQ_S_SLEEP                                     0x0E
+#define     V_008DFC_SQ_S_SETPRIO                                   0x0F
+#define     V_008DFC_SQ_S_SENDMSG                                   0x10
+#define     V_008DFC_SQ_S_SENDMSGHALT                               0x11
+#define     V_008DFC_SQ_S_TRAP                                      0x12
+#define     V_008DFC_SQ_S_ICACHE_INV                                0x13
+#define     V_008DFC_SQ_S_INCPERFLEVEL                              0x14
+#define     V_008DFC_SQ_S_DECPERFLEVEL                              0x15
+#define     V_008DFC_SQ_S_TTRACEDATA                                0x16
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x1FF) << 23)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 23) & 0x1FF)
+#define   C_008DFC_ENCODING                                           0x007FFFFF
+#define     V_008DFC_SQ_ENC_SOPP_FIELD                              0x17F
+#define R_008DFC_SQ_VINTRP                                              0x008DFC
+#define   S_008DFC_VSRC(x)                                            (((x) & 0xFF) << 0)
+#define   G_008DFC_VSRC(x)                                            (((x) >> 0) & 0xFF)
+#define   C_008DFC_VSRC                                               0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_ATTRCHAN(x)                                        (((x) & 0x03) << 8)
+#define   G_008DFC_ATTRCHAN(x)                                        (((x) >> 8) & 0x03)
+#define   C_008DFC_ATTRCHAN                                           0xFFFFFCFF
+#define     V_008DFC_SQ_CHAN_X                                      0x00
+#define     V_008DFC_SQ_CHAN_Y                                      0x01
+#define     V_008DFC_SQ_CHAN_Z                                      0x02
+#define     V_008DFC_SQ_CHAN_W                                      0x03
+#define   S_008DFC_ATTR(x)                                            (((x) & 0x3F) << 10)
+#define   G_008DFC_ATTR(x)                                            (((x) >> 10) & 0x3F)
+#define   C_008DFC_ATTR                                               0xFFFF03FF
+#define     V_008DFC_SQ_ATTR                                        0x00
+#define   S_008DFC_OP(x)                                              (((x) & 0x03) << 16)
+#define   G_008DFC_OP(x)                                              (((x) >> 16) & 0x03)
+#define   C_008DFC_OP                                                 0xFFFCFFFF
+#define     V_008DFC_SQ_V_INTERP_P1_F32                             0x00
+#define     V_008DFC_SQ_V_INTERP_P2_F32                             0x01
+#define     V_008DFC_SQ_V_INTERP_MOV_F32                            0x02
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 18)
+#define   G_008DFC_VDST(x)                                            (((x) >> 18) & 0xFF)
+#define   C_008DFC_VDST                                               0xFC03FFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_VINTRP_FIELD                            0x32
+#define R_008DFC_SQ_MTBUF_0                                             0x008DFC
+#define   S_008DFC_OFFSET(x)                                          (((x) & 0xFFF) << 0)
+#define   G_008DFC_OFFSET(x)                                          (((x) >> 0) & 0xFFF)
+#define   C_008DFC_OFFSET                                             0xFFFFF000
+#define   S_008DFC_OFFEN(x)                                           (((x) & 0x1) << 12)
+#define   G_008DFC_OFFEN(x)                                           (((x) >> 12) & 0x1)
+#define   C_008DFC_OFFEN                                              0xFFFFEFFF
+#define   S_008DFC_IDXEN(x)                                           (((x) & 0x1) << 13)
+#define   G_008DFC_IDXEN(x)                                           (((x) >> 13) & 0x1)
+#define   C_008DFC_IDXEN                                              0xFFFFDFFF
+#define   S_008DFC_GLC(x)                                             (((x) & 0x1) << 14)
+#define   G_008DFC_GLC(x)                                             (((x) >> 14) & 0x1)
+#define   C_008DFC_GLC                                                0xFFFFBFFF
+#define   S_008DFC_ADDR64(x)                                          (((x) & 0x1) << 15)
+#define   G_008DFC_ADDR64(x)                                          (((x) >> 15) & 0x1)
+#define   C_008DFC_ADDR64                                             0xFFFF7FFF
+#define   S_008DFC_OP(x)                                              (((x) & 0x07) << 16)
+#define   G_008DFC_OP(x)                                              (((x) >> 16) & 0x07)
+#define   C_008DFC_OP                                                 0xFFF8FFFF
+#define     V_008DFC_SQ_TBUFFER_LOAD_FORMAT_X                       0x00
+#define     V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XY                      0x01
+#define     V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XYZ                     0x02
+#define     V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XYZW                    0x03
+#define     V_008DFC_SQ_TBUFFER_STORE_FORMAT_X                      0x04
+#define     V_008DFC_SQ_TBUFFER_STORE_FORMAT_XY                     0x05
+#define     V_008DFC_SQ_TBUFFER_STORE_FORMAT_XYZ                    0x06
+#define     V_008DFC_SQ_TBUFFER_STORE_FORMAT_XYZW                   0x07
+#define   S_008DFC_DFMT(x)                                            (((x) & 0x0F) << 19)
+#define   G_008DFC_DFMT(x)                                            (((x) >> 19) & 0x0F)
+#define   C_008DFC_DFMT                                               0xFF87FFFF
+#define   S_008DFC_NFMT(x)                                            (((x) & 0x07) << 23)
+#define   G_008DFC_NFMT(x)                                            (((x) >> 23) & 0x07)
+#define   C_008DFC_NFMT                                               0xFC7FFFFF
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_MTBUF_FIELD                             0x3A
+#define R_008DFC_SQ_SMRD                                                0x008DFC
+#define   S_008DFC_OFFSET(x)                                          (((x) & 0xFF) << 0)
+#define   G_008DFC_OFFSET(x)                                          (((x) >> 0) & 0xFF)
+#define   C_008DFC_OFFSET                                             0xFFFFFF00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define   S_008DFC_IMM(x)                                             (((x) & 0x1) << 8)
+#define   G_008DFC_IMM(x)                                             (((x) >> 8) & 0x1)
+#define   C_008DFC_IMM                                                0xFFFFFEFF
+#define   S_008DFC_SBASE(x)                                           (((x) & 0x3F) << 9)
+#define   G_008DFC_SBASE(x)                                           (((x) >> 9) & 0x3F)
+#define   C_008DFC_SBASE                                              0xFFFF81FF
+#define   S_008DFC_SDST(x)                                            (((x) & 0x7F) << 15)
+#define   G_008DFC_SDST(x)                                            (((x) >> 15) & 0x7F)
+#define   C_008DFC_SDST                                               0xFFC07FFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define   S_008DFC_OP(x)                                              (((x) & 0x1F) << 22)
+#define   G_008DFC_OP(x)                                              (((x) >> 22) & 0x1F)
+#define   C_008DFC_OP                                                 0xF83FFFFF
+#define     V_008DFC_SQ_S_LOAD_DWORD                                0x00
+#define     V_008DFC_SQ_S_LOAD_DWORDX2                              0x01
+#define     V_008DFC_SQ_S_LOAD_DWORDX4                              0x02
+#define     V_008DFC_SQ_S_LOAD_DWORDX8                              0x03
+#define     V_008DFC_SQ_S_LOAD_DWORDX16                             0x04
+#define     V_008DFC_SQ_S_BUFFER_LOAD_DWORD                         0x08
+#define     V_008DFC_SQ_S_BUFFER_LOAD_DWORDX2                       0x09
+#define     V_008DFC_SQ_S_BUFFER_LOAD_DWORDX4                       0x0A
+#define     V_008DFC_SQ_S_BUFFER_LOAD_DWORDX8                       0x0B
+#define     V_008DFC_SQ_S_BUFFER_LOAD_DWORDX16                      0x0C
+#define     V_008DFC_SQ_S_MEMTIME                                   0x1E
+#define     V_008DFC_SQ_S_DCACHE_INV                                0x1F
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x1F) << 27)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 27) & 0x1F)
+#define   C_008DFC_ENCODING                                           0x07FFFFFF
+#define     V_008DFC_SQ_ENC_SMRD_FIELD                              0x18
+#define R_008DFC_SQ_EXP_1                                               0x008DFC
+#define   S_008DFC_VSRC0(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_VSRC0(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_VSRC0                                              0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VSRC1(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_VSRC1(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_VSRC1                                              0xFFFF00FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VSRC2(x)                                           (((x) & 0xFF) << 16)
+#define   G_008DFC_VSRC2(x)                                           (((x) >> 16) & 0xFF)
+#define   C_008DFC_VSRC2                                              0xFF00FFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VSRC3(x)                                           (((x) & 0xFF) << 24)
+#define   G_008DFC_VSRC3(x)                                           (((x) >> 24) & 0xFF)
+#define   C_008DFC_VSRC3                                              0x00FFFFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define R_008DFC_SQ_DS_1                                                0x008DFC
+#define   S_008DFC_ADDR(x)                                            (((x) & 0xFF) << 0)
+#define   G_008DFC_ADDR(x)                                            (((x) >> 0) & 0xFF)
+#define   C_008DFC_ADDR                                               0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_DATA0(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_DATA0(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_DATA0                                              0xFFFF00FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_DATA1(x)                                           (((x) & 0xFF) << 16)
+#define   G_008DFC_DATA1(x)                                           (((x) >> 16) & 0xFF)
+#define   C_008DFC_DATA1                                              0xFF00FFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 24)
+#define   G_008DFC_VDST(x)                                            (((x) >> 24) & 0xFF)
+#define   C_008DFC_VDST                                               0x00FFFFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define R_008DFC_SQ_VOPC                                                0x008DFC
+#define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
+#define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
+#define   C_008DFC_SRC0                                               0xFFFFFE00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_VSRC1(x)                                           (((x) & 0xFF) << 9)
+#define   G_008DFC_VSRC1(x)                                           (((x) >> 9) & 0xFF)
+#define   C_008DFC_VSRC1                                              0xFFFE01FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_OP(x)                                              (((x) & 0xFF) << 17)
+#define   G_008DFC_OP(x)                                              (((x) >> 17) & 0xFF)
+#define   C_008DFC_OP                                                 0xFE01FFFF
+#define     V_008DFC_SQ_V_CMP_F_F32                                 0x00
+#define     V_008DFC_SQ_V_CMP_LT_F32                                0x01
+#define     V_008DFC_SQ_V_CMP_EQ_F32                                0x02
+#define     V_008DFC_SQ_V_CMP_LE_F32                                0x03
+#define     V_008DFC_SQ_V_CMP_GT_F32                                0x04
+#define     V_008DFC_SQ_V_CMP_LG_F32                                0x05
+#define     V_008DFC_SQ_V_CMP_GE_F32                                0x06
+#define     V_008DFC_SQ_V_CMP_O_F32                                 0x07
+#define     V_008DFC_SQ_V_CMP_U_F32                                 0x08
+#define     V_008DFC_SQ_V_CMP_NGE_F32                               0x09
+#define     V_008DFC_SQ_V_CMP_NLG_F32                               0x0A
+#define     V_008DFC_SQ_V_CMP_NGT_F32                               0x0B
+#define     V_008DFC_SQ_V_CMP_NLE_F32                               0x0C
+#define     V_008DFC_SQ_V_CMP_NEQ_F32                               0x0D
+#define     V_008DFC_SQ_V_CMP_NLT_F32                               0x0E
+#define     V_008DFC_SQ_V_CMP_TRU_F32                               0x0F
+#define     V_008DFC_SQ_V_CMPX_F_F32                                0x10
+#define     V_008DFC_SQ_V_CMPX_LT_F32                               0x11
+#define     V_008DFC_SQ_V_CMPX_EQ_F32                               0x12
+#define     V_008DFC_SQ_V_CMPX_LE_F32                               0x13
+#define     V_008DFC_SQ_V_CMPX_GT_F32                               0x14
+#define     V_008DFC_SQ_V_CMPX_LG_F32                               0x15
+#define     V_008DFC_SQ_V_CMPX_GE_F32                               0x16
+#define     V_008DFC_SQ_V_CMPX_O_F32                                0x17
+#define     V_008DFC_SQ_V_CMPX_U_F32                                0x18
+#define     V_008DFC_SQ_V_CMPX_NGE_F32                              0x19
+#define     V_008DFC_SQ_V_CMPX_NLG_F32                              0x1A
+#define     V_008DFC_SQ_V_CMPX_NGT_F32                              0x1B
+#define     V_008DFC_SQ_V_CMPX_NLE_F32                              0x1C
+#define     V_008DFC_SQ_V_CMPX_NEQ_F32                              0x1D
+#define     V_008DFC_SQ_V_CMPX_NLT_F32                              0x1E
+#define     V_008DFC_SQ_V_CMPX_TRU_F32                              0x1F
+#define     V_008DFC_SQ_V_CMP_F_F64                                 0x20
+#define     V_008DFC_SQ_V_CMP_LT_F64                                0x21
+#define     V_008DFC_SQ_V_CMP_EQ_F64                                0x22
+#define     V_008DFC_SQ_V_CMP_LE_F64                                0x23
+#define     V_008DFC_SQ_V_CMP_GT_F64                                0x24
+#define     V_008DFC_SQ_V_CMP_LG_F64                                0x25
+#define     V_008DFC_SQ_V_CMP_GE_F64                                0x26
+#define     V_008DFC_SQ_V_CMP_O_F64                                 0x27
+#define     V_008DFC_SQ_V_CMP_U_F64                                 0x28
+#define     V_008DFC_SQ_V_CMP_NGE_F64                               0x29
+#define     V_008DFC_SQ_V_CMP_NLG_F64                               0x2A
+#define     V_008DFC_SQ_V_CMP_NGT_F64                               0x2B
+#define     V_008DFC_SQ_V_CMP_NLE_F64                               0x2C
+#define     V_008DFC_SQ_V_CMP_NEQ_F64                               0x2D
+#define     V_008DFC_SQ_V_CMP_NLT_F64                               0x2E
+#define     V_008DFC_SQ_V_CMP_TRU_F64                               0x2F
+#define     V_008DFC_SQ_V_CMPX_F_F64                                0x30
+#define     V_008DFC_SQ_V_CMPX_LT_F64                               0x31
+#define     V_008DFC_SQ_V_CMPX_EQ_F64                               0x32
+#define     V_008DFC_SQ_V_CMPX_LE_F64                               0x33
+#define     V_008DFC_SQ_V_CMPX_GT_F64                               0x34
+#define     V_008DFC_SQ_V_CMPX_LG_F64                               0x35
+#define     V_008DFC_SQ_V_CMPX_GE_F64                               0x36
+#define     V_008DFC_SQ_V_CMPX_O_F64                                0x37
+#define     V_008DFC_SQ_V_CMPX_U_F64                                0x38
+#define     V_008DFC_SQ_V_CMPX_NGE_F64                              0x39
+#define     V_008DFC_SQ_V_CMPX_NLG_F64                              0x3A
+#define     V_008DFC_SQ_V_CMPX_NGT_F64                              0x3B
+#define     V_008DFC_SQ_V_CMPX_NLE_F64                              0x3C
+#define     V_008DFC_SQ_V_CMPX_NEQ_F64                              0x3D
+#define     V_008DFC_SQ_V_CMPX_NLT_F64                              0x3E
+#define     V_008DFC_SQ_V_CMPX_TRU_F64                              0x3F
+#define     V_008DFC_SQ_V_CMPS_F_F32                                0x40
+#define     V_008DFC_SQ_V_CMPS_LT_F32                               0x41
+#define     V_008DFC_SQ_V_CMPS_EQ_F32                               0x42
+#define     V_008DFC_SQ_V_CMPS_LE_F32                               0x43
+#define     V_008DFC_SQ_V_CMPS_GT_F32                               0x44
+#define     V_008DFC_SQ_V_CMPS_LG_F32                               0x45
+#define     V_008DFC_SQ_V_CMPS_GE_F32                               0x46
+#define     V_008DFC_SQ_V_CMPS_O_F32                                0x47
+#define     V_008DFC_SQ_V_CMPS_U_F32                                0x48
+#define     V_008DFC_SQ_V_CMPS_NGE_F32                              0x49
+#define     V_008DFC_SQ_V_CMPS_NLG_F32                              0x4A
+#define     V_008DFC_SQ_V_CMPS_NGT_F32                              0x4B
+#define     V_008DFC_SQ_V_CMPS_NLE_F32                              0x4C
+#define     V_008DFC_SQ_V_CMPS_NEQ_F32                              0x4D
+#define     V_008DFC_SQ_V_CMPS_NLT_F32                              0x4E
+#define     V_008DFC_SQ_V_CMPS_TRU_F32                              0x4F
+#define     V_008DFC_SQ_V_CMPSX_F_F32                               0x50
+#define     V_008DFC_SQ_V_CMPSX_LT_F32                              0x51
+#define     V_008DFC_SQ_V_CMPSX_EQ_F32                              0x52
+#define     V_008DFC_SQ_V_CMPSX_LE_F32                              0x53
+#define     V_008DFC_SQ_V_CMPSX_GT_F32                              0x54
+#define     V_008DFC_SQ_V_CMPSX_LG_F32                              0x55
+#define     V_008DFC_SQ_V_CMPSX_GE_F32                              0x56
+#define     V_008DFC_SQ_V_CMPSX_O_F32                               0x57
+#define     V_008DFC_SQ_V_CMPSX_U_F32                               0x58
+#define     V_008DFC_SQ_V_CMPSX_NGE_F32                             0x59
+#define     V_008DFC_SQ_V_CMPSX_NLG_F32                             0x5A
+#define     V_008DFC_SQ_V_CMPSX_NGT_F32                             0x5B
+#define     V_008DFC_SQ_V_CMPSX_NLE_F32                             0x5C
+#define     V_008DFC_SQ_V_CMPSX_NEQ_F32                             0x5D
+#define     V_008DFC_SQ_V_CMPSX_NLT_F32                             0x5E
+#define     V_008DFC_SQ_V_CMPSX_TRU_F32                             0x5F
+#define     V_008DFC_SQ_V_CMPS_F_F64                                0x60
+#define     V_008DFC_SQ_V_CMPS_LT_F64                               0x61
+#define     V_008DFC_SQ_V_CMPS_EQ_F64                               0x62
+#define     V_008DFC_SQ_V_CMPS_LE_F64                               0x63
+#define     V_008DFC_SQ_V_CMPS_GT_F64                               0x64
+#define     V_008DFC_SQ_V_CMPS_LG_F64                               0x65
+#define     V_008DFC_SQ_V_CMPS_GE_F64                               0x66
+#define     V_008DFC_SQ_V_CMPS_O_F64                                0x67
+#define     V_008DFC_SQ_V_CMPS_U_F64                                0x68
+#define     V_008DFC_SQ_V_CMPS_NGE_F64                              0x69
+#define     V_008DFC_SQ_V_CMPS_NLG_F64                              0x6A
+#define     V_008DFC_SQ_V_CMPS_NGT_F64                              0x6B
+#define     V_008DFC_SQ_V_CMPS_NLE_F64                              0x6C
+#define     V_008DFC_SQ_V_CMPS_NEQ_F64                              0x6D
+#define     V_008DFC_SQ_V_CMPS_NLT_F64                              0x6E
+#define     V_008DFC_SQ_V_CMPS_TRU_F64                              0x6F
+#define     V_008DFC_SQ_V_CMPSX_F_F64                               0x70
+#define     V_008DFC_SQ_V_CMPSX_LT_F64                              0x71
+#define     V_008DFC_SQ_V_CMPSX_EQ_F64                              0x72
+#define     V_008DFC_SQ_V_CMPSX_LE_F64                              0x73
+#define     V_008DFC_SQ_V_CMPSX_GT_F64                              0x74
+#define     V_008DFC_SQ_V_CMPSX_LG_F64                              0x75
+#define     V_008DFC_SQ_V_CMPSX_GE_F64                              0x76
+#define     V_008DFC_SQ_V_CMPSX_O_F64                               0x77
+#define     V_008DFC_SQ_V_CMPSX_U_F64                               0x78
+#define     V_008DFC_SQ_V_CMPSX_NGE_F64                             0x79
+#define     V_008DFC_SQ_V_CMPSX_NLG_F64                             0x7A
+#define     V_008DFC_SQ_V_CMPSX_NGT_F64                             0x7B
+#define     V_008DFC_SQ_V_CMPSX_NLE_F64                             0x7C
+#define     V_008DFC_SQ_V_CMPSX_NEQ_F64                             0x7D
+#define     V_008DFC_SQ_V_CMPSX_NLT_F64                             0x7E
+#define     V_008DFC_SQ_V_CMPSX_TRU_F64                             0x7F
+#define     V_008DFC_SQ_V_CMP_F_I32                                 0x80
+#define     V_008DFC_SQ_V_CMP_LT_I32                                0x81
+#define     V_008DFC_SQ_V_CMP_EQ_I32                                0x82
+#define     V_008DFC_SQ_V_CMP_LE_I32                                0x83
+#define     V_008DFC_SQ_V_CMP_GT_I32                                0x84
+#define     V_008DFC_SQ_V_CMP_NE_I32                                0x85
+#define     V_008DFC_SQ_V_CMP_GE_I32                                0x86
+#define     V_008DFC_SQ_V_CMP_T_I32                                 0x87
+#define     V_008DFC_SQ_V_CMP_CLASS_F32                             0x88
+#define     V_008DFC_SQ_V_CMPX_F_I32                                0x90
+#define     V_008DFC_SQ_V_CMPX_LT_I32                               0x91
+#define     V_008DFC_SQ_V_CMPX_EQ_I32                               0x92
+#define     V_008DFC_SQ_V_CMPX_LE_I32                               0x93
+#define     V_008DFC_SQ_V_CMPX_GT_I32                               0x94
+#define     V_008DFC_SQ_V_CMPX_NE_I32                               0x95
+#define     V_008DFC_SQ_V_CMPX_GE_I32                               0x96
+#define     V_008DFC_SQ_V_CMPX_T_I32                                0x97
+#define     V_008DFC_SQ_V_CMPX_CLASS_F32                            0x98
+#define     V_008DFC_SQ_V_CMP_F_I64                                 0xA0
+#define     V_008DFC_SQ_V_CMP_LT_I64                                0xA1
+#define     V_008DFC_SQ_V_CMP_EQ_I64                                0xA2
+#define     V_008DFC_SQ_V_CMP_LE_I64                                0xA3
+#define     V_008DFC_SQ_V_CMP_GT_I64                                0xA4
+#define     V_008DFC_SQ_V_CMP_NE_I64                                0xA5
+#define     V_008DFC_SQ_V_CMP_GE_I64                                0xA6
+#define     V_008DFC_SQ_V_CMP_T_I64                                 0xA7
+#define     V_008DFC_SQ_V_CMP_CLASS_F64                             0xA8
+#define     V_008DFC_SQ_V_CMPX_F_I64                                0xB0
+#define     V_008DFC_SQ_V_CMPX_LT_I64                               0xB1
+#define     V_008DFC_SQ_V_CMPX_EQ_I64                               0xB2
+#define     V_008DFC_SQ_V_CMPX_LE_I64                               0xB3
+#define     V_008DFC_SQ_V_CMPX_GT_I64                               0xB4
+#define     V_008DFC_SQ_V_CMPX_NE_I64                               0xB5
+#define     V_008DFC_SQ_V_CMPX_GE_I64                               0xB6
+#define     V_008DFC_SQ_V_CMPX_T_I64                                0xB7
+#define     V_008DFC_SQ_V_CMPX_CLASS_F64                            0xB8
+#define     V_008DFC_SQ_V_CMP_F_U32                                 0xC0
+#define     V_008DFC_SQ_V_CMP_LT_U32                                0xC1
+#define     V_008DFC_SQ_V_CMP_EQ_U32                                0xC2
+#define     V_008DFC_SQ_V_CMP_LE_U32                                0xC3
+#define     V_008DFC_SQ_V_CMP_GT_U32                                0xC4
+#define     V_008DFC_SQ_V_CMP_NE_U32                                0xC5
+#define     V_008DFC_SQ_V_CMP_GE_U32                                0xC6
+#define     V_008DFC_SQ_V_CMP_T_U32                                 0xC7
+#define     V_008DFC_SQ_V_CMPX_F_U32                                0xD0
+#define     V_008DFC_SQ_V_CMPX_LT_U32                               0xD1
+#define     V_008DFC_SQ_V_CMPX_EQ_U32                               0xD2
+#define     V_008DFC_SQ_V_CMPX_LE_U32                               0xD3
+#define     V_008DFC_SQ_V_CMPX_GT_U32                               0xD4
+#define     V_008DFC_SQ_V_CMPX_NE_U32                               0xD5
+#define     V_008DFC_SQ_V_CMPX_GE_U32                               0xD6
+#define     V_008DFC_SQ_V_CMPX_T_U32                                0xD7
+#define     V_008DFC_SQ_V_CMP_F_U64                                 0xE0
+#define     V_008DFC_SQ_V_CMP_LT_U64                                0xE1
+#define     V_008DFC_SQ_V_CMP_EQ_U64                                0xE2
+#define     V_008DFC_SQ_V_CMP_LE_U64                                0xE3
+#define     V_008DFC_SQ_V_CMP_GT_U64                                0xE4
+#define     V_008DFC_SQ_V_CMP_NE_U64                                0xE5
+#define     V_008DFC_SQ_V_CMP_GE_U64                                0xE6
+#define     V_008DFC_SQ_V_CMP_T_U64                                 0xE7
+#define     V_008DFC_SQ_V_CMPX_F_U64                                0xF0
+#define     V_008DFC_SQ_V_CMPX_LT_U64                               0xF1
+#define     V_008DFC_SQ_V_CMPX_EQ_U64                               0xF2
+#define     V_008DFC_SQ_V_CMPX_LE_U64                               0xF3
+#define     V_008DFC_SQ_V_CMPX_GT_U64                               0xF4
+#define     V_008DFC_SQ_V_CMPX_NE_U64                               0xF5
+#define     V_008DFC_SQ_V_CMPX_GE_U64                               0xF6
+#define     V_008DFC_SQ_V_CMPX_T_U64                                0xF7
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x7F) << 25)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 25) & 0x7F)
+#define   C_008DFC_ENCODING                                           0x01FFFFFF
+#define     V_008DFC_SQ_ENC_VOPC_FIELD                              0x3E
+#define R_008DFC_SQ_SOP1                                                0x008DFC
+#define   S_008DFC_SSRC0(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_SSRC0(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_SSRC0                                              0xFFFFFF00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define   S_008DFC_OP(x)                                              (((x) & 0xFF) << 8)
+#define   G_008DFC_OP(x)                                              (((x) >> 8) & 0xFF)
+#define   C_008DFC_OP                                                 0xFFFF00FF
+#define     V_008DFC_SQ_S_MOV_B32                                   0x03
+#define     V_008DFC_SQ_S_MOV_B64                                   0x04
+#define     V_008DFC_SQ_S_CMOV_B32                                  0x05
+#define     V_008DFC_SQ_S_CMOV_B64                                  0x06
+#define     V_008DFC_SQ_S_NOT_B32                                   0x07
+#define     V_008DFC_SQ_S_NOT_B64                                   0x08
+#define     V_008DFC_SQ_S_WQM_B32                                   0x09
+#define     V_008DFC_SQ_S_WQM_B64                                   0x0A
+#define     V_008DFC_SQ_S_BREV_B32                                  0x0B
+#define     V_008DFC_SQ_S_BREV_B64                                  0x0C
+#define     V_008DFC_SQ_S_BCNT0_I32_B32                             0x0D
+#define     V_008DFC_SQ_S_BCNT0_I32_B64                             0x0E
+#define     V_008DFC_SQ_S_BCNT1_I32_B32                             0x0F
+#define     V_008DFC_SQ_S_BCNT1_I32_B64                             0x10
+#define     V_008DFC_SQ_S_FF0_I32_B32                               0x11
+#define     V_008DFC_SQ_S_FF0_I32_B64                               0x12
+#define     V_008DFC_SQ_S_FF1_I32_B32                               0x13
+#define     V_008DFC_SQ_S_FF1_I32_B64                               0x14
+#define     V_008DFC_SQ_S_FLBIT_I32_B32                             0x15
+#define     V_008DFC_SQ_S_FLBIT_I32_B64                             0x16
+#define     V_008DFC_SQ_S_FLBIT_I32                                 0x17
+#define     V_008DFC_SQ_S_FLBIT_I32_I64                             0x18
+#define     V_008DFC_SQ_S_SEXT_I32_I8                               0x19
+#define     V_008DFC_SQ_S_SEXT_I32_I16                              0x1A
+#define     V_008DFC_SQ_S_BITSET0_B32                               0x1B
+#define     V_008DFC_SQ_S_BITSET0_B64                               0x1C
+#define     V_008DFC_SQ_S_BITSET1_B32                               0x1D
+#define     V_008DFC_SQ_S_BITSET1_B64                               0x1E
+#define     V_008DFC_SQ_S_GETPC_B64                                 0x1F
+#define     V_008DFC_SQ_S_SETPC_B64                                 0x20
+#define     V_008DFC_SQ_S_SWAPPC_B64                                0x21
+#define     V_008DFC_SQ_S_RFE_B64                                   0x22
+#define     V_008DFC_SQ_S_AND_SAVEEXEC_B64                          0x24
+#define     V_008DFC_SQ_S_OR_SAVEEXEC_B64                           0x25
+#define     V_008DFC_SQ_S_XOR_SAVEEXEC_B64                          0x26
+#define     V_008DFC_SQ_S_ANDN2_SAVEEXEC_B64                        0x27
+#define     V_008DFC_SQ_S_ORN2_SAVEEXEC_B64                         0x28
+#define     V_008DFC_SQ_S_NAND_SAVEEXEC_B64                         0x29
+#define     V_008DFC_SQ_S_NOR_SAVEEXEC_B64                          0x2A
+#define     V_008DFC_SQ_S_XNOR_SAVEEXEC_B64                         0x2B
+#define     V_008DFC_SQ_S_QUADMASK_B32                              0x2C
+#define     V_008DFC_SQ_S_QUADMASK_B64                              0x2D
+#define     V_008DFC_SQ_S_MOVRELS_B32                               0x2E
+#define     V_008DFC_SQ_S_MOVRELS_B64                               0x2F
+#define     V_008DFC_SQ_S_MOVRELD_B32                               0x30
+#define     V_008DFC_SQ_S_MOVRELD_B64                               0x31
+#define     V_008DFC_SQ_S_CBRANCH_JOIN                              0x32
+#define     V_008DFC_SQ_S_MOV_REGRD_B32                             0x33
+#define     V_008DFC_SQ_S_ABS_I32                                   0x34
+#define     V_008DFC_SQ_S_MOV_FED_B32                               0x35
+#define   S_008DFC_SDST(x)                                            (((x) & 0x7F) << 16)
+#define   G_008DFC_SDST(x)                                            (((x) >> 16) & 0x7F)
+#define   C_008DFC_SDST                                               0xFF80FFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x1FF) << 23)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 23) & 0x1FF)
+#define   C_008DFC_ENCODING                                           0x007FFFFF
+#define     V_008DFC_SQ_ENC_SOP1_FIELD                              0x17D
+#define R_008DFC_SQ_MTBUF_1                                             0x008DFC
+#define   S_008DFC_VADDR(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_VADDR(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_VADDR                                              0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VDATA(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_VDATA(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_VDATA                                              0xFFFF00FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_SRSRC(x)                                           (((x) & 0x1F) << 16)
+#define   G_008DFC_SRSRC(x)                                           (((x) >> 16) & 0x1F)
+#define   C_008DFC_SRSRC                                              0xFFE0FFFF
+#define   S_008DFC_SLC(x)                                             (((x) & 0x1) << 22)
+#define   G_008DFC_SLC(x)                                             (((x) >> 22) & 0x1)
+#define   C_008DFC_SLC                                                0xFFBFFFFF
+#define   S_008DFC_TFE(x)                                             (((x) & 0x1) << 23)
+#define   G_008DFC_TFE(x)                                             (((x) >> 23) & 0x1)
+#define   C_008DFC_TFE                                                0xFF7FFFFF
+#define   S_008DFC_SOFFSET(x)                                         (((x) & 0xFF) << 24)
+#define   G_008DFC_SOFFSET(x)                                         (((x) >> 24) & 0xFF)
+#define   C_008DFC_SOFFSET                                            0x00FFFFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define R_008DFC_SQ_SOP2                                                0x008DFC
+#define   S_008DFC_SSRC0(x)                                           (((x) & 0xFF) << 0)
+#define   G_008DFC_SSRC0(x)                                           (((x) >> 0) & 0xFF)
+#define   C_008DFC_SSRC0                                              0xFFFFFF00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define   S_008DFC_SSRC1(x)                                           (((x) & 0xFF) << 8)
+#define   G_008DFC_SSRC1(x)                                           (((x) >> 8) & 0xFF)
+#define   C_008DFC_SSRC1                                              0xFFFF00FF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define   S_008DFC_SDST(x)                                            (((x) & 0x7F) << 16)
+#define   G_008DFC_SDST(x)                                            (((x) >> 16) & 0x7F)
+#define   C_008DFC_SDST                                               0xFF80FFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define   S_008DFC_OP(x)                                              (((x) & 0x7F) << 23)
+#define   G_008DFC_OP(x)                                              (((x) >> 23) & 0x7F)
+#define   C_008DFC_OP                                                 0xC07FFFFF
+#define     V_008DFC_SQ_S_ADD_U32                                   0x00
+#define     V_008DFC_SQ_S_SUB_U32                                   0x01
+#define     V_008DFC_SQ_S_ADD_I32                                   0x02
+#define     V_008DFC_SQ_S_SUB_I32                                   0x03
+#define     V_008DFC_SQ_S_ADDC_U32                                  0x04
+#define     V_008DFC_SQ_S_SUBB_U32                                  0x05
+#define     V_008DFC_SQ_S_MIN_I32                                   0x06
+#define     V_008DFC_SQ_S_MIN_U32                                   0x07
+#define     V_008DFC_SQ_S_MAX_I32                                   0x08
+#define     V_008DFC_SQ_S_MAX_U32                                   0x09
+#define     V_008DFC_SQ_S_CSELECT_B32                               0x0A
+#define     V_008DFC_SQ_S_CSELECT_B64                               0x0B
+#define     V_008DFC_SQ_S_AND_B32                                   0x0E
+#define     V_008DFC_SQ_S_AND_B64                                   0x0F
+#define     V_008DFC_SQ_S_OR_B32                                    0x10
+#define     V_008DFC_SQ_S_OR_B64                                    0x11
+#define     V_008DFC_SQ_S_XOR_B32                                   0x12
+#define     V_008DFC_SQ_S_XOR_B64                                   0x13
+#define     V_008DFC_SQ_S_ANDN2_B32                                 0x14
+#define     V_008DFC_SQ_S_ANDN2_B64                                 0x15
+#define     V_008DFC_SQ_S_ORN2_B32                                  0x16
+#define     V_008DFC_SQ_S_ORN2_B64                                  0x17
+#define     V_008DFC_SQ_S_NAND_B32                                  0x18
+#define     V_008DFC_SQ_S_NAND_B64                                  0x19
+#define     V_008DFC_SQ_S_NOR_B32                                   0x1A
+#define     V_008DFC_SQ_S_NOR_B64                                   0x1B
+#define     V_008DFC_SQ_S_XNOR_B32                                  0x1C
+#define     V_008DFC_SQ_S_XNOR_B64                                  0x1D
+#define     V_008DFC_SQ_S_LSHL_B32                                  0x1E
+#define     V_008DFC_SQ_S_LSHL_B64                                  0x1F
+#define     V_008DFC_SQ_S_LSHR_B32                                  0x20
+#define     V_008DFC_SQ_S_LSHR_B64                                  0x21
+#define     V_008DFC_SQ_S_ASHR_I32                                  0x22
+#define     V_008DFC_SQ_S_ASHR_I64                                  0x23
+#define     V_008DFC_SQ_S_BFM_B32                                   0x24
+#define     V_008DFC_SQ_S_BFM_B64                                   0x25
+#define     V_008DFC_SQ_S_MUL_I32                                   0x26
+#define     V_008DFC_SQ_S_BFE_U32                                   0x27
+#define     V_008DFC_SQ_S_BFE_I32                                   0x28
+#define     V_008DFC_SQ_S_BFE_U64                                   0x29
+#define     V_008DFC_SQ_S_BFE_I64                                   0x2A
+#define     V_008DFC_SQ_S_CBRANCH_G_FORK                            0x2B
+#define     V_008DFC_SQ_S_ABSDIFF_I32                               0x2C
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x03) << 30)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 30) & 0x03)
+#define   C_008DFC_ENCODING                                           0x3FFFFFFF
+#define     V_008DFC_SQ_ENC_SOP2_FIELD                              0x02
+#define R_008DFC_SQ_SOPK                                                0x008DFC
+#define   S_008DFC_SIMM16(x)                                          (((x) & 0xFFFF) << 0)
+#define   G_008DFC_SIMM16(x)                                          (((x) >> 0) & 0xFFFF)
+#define   C_008DFC_SIMM16                                             0xFFFF0000
+#define   S_008DFC_SDST(x)                                            (((x) & 0x7F) << 16)
+#define   G_008DFC_SDST(x)                                            (((x) >> 16) & 0x7F)
+#define   C_008DFC_SDST                                               0xFF80FFFF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define   S_008DFC_OP(x)                                              (((x) & 0x1F) << 23)
+#define   G_008DFC_OP(x)                                              (((x) >> 23) & 0x1F)
+#define   C_008DFC_OP                                                 0xF07FFFFF
+#define     V_008DFC_SQ_S_MOVK_I32                                  0x00
+#define     V_008DFC_SQ_S_CMOVK_I32                                 0x02
+#define     V_008DFC_SQ_S_CMPK_EQ_I32                               0x03
+#define     V_008DFC_SQ_S_CMPK_LG_I32                               0x04
+#define     V_008DFC_SQ_S_CMPK_GT_I32                               0x05
+#define     V_008DFC_SQ_S_CMPK_GE_I32                               0x06
+#define     V_008DFC_SQ_S_CMPK_LT_I32                               0x07
+#define     V_008DFC_SQ_S_CMPK_LE_I32                               0x08
+#define     V_008DFC_SQ_S_CMPK_EQ_U32                               0x09
+#define     V_008DFC_SQ_S_CMPK_LG_U32                               0x0A
+#define     V_008DFC_SQ_S_CMPK_GT_U32                               0x0B
+#define     V_008DFC_SQ_S_CMPK_GE_U32                               0x0C
+#define     V_008DFC_SQ_S_CMPK_LT_U32                               0x0D
+#define     V_008DFC_SQ_S_CMPK_LE_U32                               0x0E
+#define     V_008DFC_SQ_S_ADDK_I32                                  0x0F
+#define     V_008DFC_SQ_S_MULK_I32                                  0x10
+#define     V_008DFC_SQ_S_CBRANCH_I_FORK                            0x11
+#define     V_008DFC_SQ_S_GETREG_B32                                0x12
+#define     V_008DFC_SQ_S_SETREG_B32                                0x13
+#define     V_008DFC_SQ_S_GETREG_REGRD_B32                          0x14
+#define     V_008DFC_SQ_S_SETREG_IMM32_B32                          0x15
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x0F) << 28)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 28) & 0x0F)
+#define   C_008DFC_ENCODING                                           0x0FFFFFFF
+#define     V_008DFC_SQ_ENC_SOPK_FIELD                              0x0B
+#define R_008DFC_SQ_VOP3_0                                              0x008DFC
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 0)
+#define   G_008DFC_VDST(x)                                            (((x) >> 0) & 0xFF)
+#define   C_008DFC_VDST                                               0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_ABS(x)                                             (((x) & 0x07) << 8)
+#define   G_008DFC_ABS(x)                                             (((x) >> 8) & 0x07)
+#define   C_008DFC_ABS                                                0xFFFFF8FF
+#define   S_008DFC_CLAMP(x)                                           (((x) & 0x1) << 11)
+#define   G_008DFC_CLAMP(x)                                           (((x) >> 11) & 0x1)
+#define   C_008DFC_CLAMP                                              0xFFFFF7FF
+#define   S_008DFC_OP(x)                                              (((x) & 0x1FF) << 17)
+#define   G_008DFC_OP(x)                                              (((x) >> 17) & 0x1FF)
+#define   C_008DFC_OP                                                 0xFC01FFFF
+#define     V_008DFC_SQ_V_OPC_OFFSET                                0x00
+#define     V_008DFC_SQ_V_OP2_OFFSET                                0x100
+#define     V_008DFC_SQ_V_MAD_LEGACY_F32                            0x140
+#define     V_008DFC_SQ_V_MAD_F32                                   0x141
+#define     V_008DFC_SQ_V_MAD_I32_I24                               0x142
+#define     V_008DFC_SQ_V_MAD_U32_U24                               0x143
+#define     V_008DFC_SQ_V_CUBEID_F32                                0x144
+#define     V_008DFC_SQ_V_CUBESC_F32                                0x145
+#define     V_008DFC_SQ_V_CUBETC_F32                                0x146
+#define     V_008DFC_SQ_V_CUBEMA_F32                                0x147
+#define     V_008DFC_SQ_V_BFE_U32                                   0x148
+#define     V_008DFC_SQ_V_BFE_I32                                   0x149
+#define     V_008DFC_SQ_V_BFI_B32                                   0x14A
+#define     V_008DFC_SQ_V_FMA_F32                                   0x14B
+#define     V_008DFC_SQ_V_FMA_F64                                   0x14C
+#define     V_008DFC_SQ_V_LERP_U8                                   0x14D
+#define     V_008DFC_SQ_V_ALIGNBIT_B32                              0x14E
+#define     V_008DFC_SQ_V_ALIGNBYTE_B32                             0x14F
+#define     V_008DFC_SQ_V_MULLIT_F32                                0x150
+#define     V_008DFC_SQ_V_MIN3_F32                                  0x151
+#define     V_008DFC_SQ_V_MIN3_I32                                  0x152
+#define     V_008DFC_SQ_V_MIN3_U32                                  0x153
+#define     V_008DFC_SQ_V_MAX3_F32                                  0x154
+#define     V_008DFC_SQ_V_MAX3_I32                                  0x155
+#define     V_008DFC_SQ_V_MAX3_U32                                  0x156
+#define     V_008DFC_SQ_V_MED3_F32                                  0x157
+#define     V_008DFC_SQ_V_MED3_I32                                  0x158
+#define     V_008DFC_SQ_V_MED3_U32                                  0x159
+#define     V_008DFC_SQ_V_SAD_U8                                    0x15A
+#define     V_008DFC_SQ_V_SAD_HI_U8                                 0x15B
+#define     V_008DFC_SQ_V_SAD_U16                                   0x15C
+#define     V_008DFC_SQ_V_SAD_U32                                   0x15D
+#define     V_008DFC_SQ_V_CVT_PK_U8_F32                             0x15E
+#define     V_008DFC_SQ_V_DIV_FIXUP_F32                             0x15F
+#define     V_008DFC_SQ_V_DIV_FIXUP_F64                             0x160
+#define     V_008DFC_SQ_V_LSHL_B64                                  0x161
+#define     V_008DFC_SQ_V_LSHR_B64                                  0x162
+#define     V_008DFC_SQ_V_ASHR_I64                                  0x163
+#define     V_008DFC_SQ_V_ADD_F64                                   0x164
+#define     V_008DFC_SQ_V_MUL_F64                                   0x165
+#define     V_008DFC_SQ_V_MIN_F64                                   0x166
+#define     V_008DFC_SQ_V_MAX_F64                                   0x167
+#define     V_008DFC_SQ_V_LDEXP_F64                                 0x168
+#define     V_008DFC_SQ_V_MUL_LO_U32                                0x169
+#define     V_008DFC_SQ_V_MUL_HI_U32                                0x16A
+#define     V_008DFC_SQ_V_MUL_LO_I32                                0x16B
+#define     V_008DFC_SQ_V_MUL_HI_I32                                0x16C
+#define     V_008DFC_SQ_V_DIV_SCALE_F32                             0x16D
+#define     V_008DFC_SQ_V_DIV_SCALE_F64                             0x16E
+#define     V_008DFC_SQ_V_DIV_FMAS_F32                              0x16F
+#define     V_008DFC_SQ_V_DIV_FMAS_F64                              0x170
+#define     V_008DFC_SQ_V_MSAD_U8                                   0x171
+#define     V_008DFC_SQ_V_QSAD_U8                                   0x172
+#define     V_008DFC_SQ_V_MQSAD_U8                                  0x173
+#define     V_008DFC_SQ_V_TRIG_PREOP_F64                            0x174
+#define     V_008DFC_SQ_V_OP1_OFFSET                                0x180
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_VOP3_FIELD                              0x34
+#define R_008DFC_SQ_VOP2                                                0x008DFC
+#define   S_008DFC_SRC0(x)                                            (((x) & 0x1FF) << 0)
+#define   G_008DFC_SRC0(x)                                            (((x) >> 0) & 0x1FF)
+#define   C_008DFC_SRC0                                               0xFFFFFE00
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define     V_008DFC_SQ_M0                                          0x7C
+#define     V_008DFC_SQ_EXEC_LO                                     0x7E
+#define     V_008DFC_SQ_EXEC_HI                                     0x7F
+#define     V_008DFC_SQ_SRC_0                                       0x80
+#define     V_008DFC_SQ_SRC_1_INT                                   0x81
+#define     V_008DFC_SQ_SRC_2_INT                                   0x82
+#define     V_008DFC_SQ_SRC_3_INT                                   0x83
+#define     V_008DFC_SQ_SRC_4_INT                                   0x84
+#define     V_008DFC_SQ_SRC_5_INT                                   0x85
+#define     V_008DFC_SQ_SRC_6_INT                                   0x86
+#define     V_008DFC_SQ_SRC_7_INT                                   0x87
+#define     V_008DFC_SQ_SRC_8_INT                                   0x88
+#define     V_008DFC_SQ_SRC_9_INT                                   0x89
+#define     V_008DFC_SQ_SRC_10_INT                                  0x8A
+#define     V_008DFC_SQ_SRC_11_INT                                  0x8B
+#define     V_008DFC_SQ_SRC_12_INT                                  0x8C
+#define     V_008DFC_SQ_SRC_13_INT                                  0x8D
+#define     V_008DFC_SQ_SRC_14_INT                                  0x8E
+#define     V_008DFC_SQ_SRC_15_INT                                  0x8F
+#define     V_008DFC_SQ_SRC_16_INT                                  0x90
+#define     V_008DFC_SQ_SRC_17_INT                                  0x91
+#define     V_008DFC_SQ_SRC_18_INT                                  0x92
+#define     V_008DFC_SQ_SRC_19_INT                                  0x93
+#define     V_008DFC_SQ_SRC_20_INT                                  0x94
+#define     V_008DFC_SQ_SRC_21_INT                                  0x95
+#define     V_008DFC_SQ_SRC_22_INT                                  0x96
+#define     V_008DFC_SQ_SRC_23_INT                                  0x97
+#define     V_008DFC_SQ_SRC_24_INT                                  0x98
+#define     V_008DFC_SQ_SRC_25_INT                                  0x99
+#define     V_008DFC_SQ_SRC_26_INT                                  0x9A
+#define     V_008DFC_SQ_SRC_27_INT                                  0x9B
+#define     V_008DFC_SQ_SRC_28_INT                                  0x9C
+#define     V_008DFC_SQ_SRC_29_INT                                  0x9D
+#define     V_008DFC_SQ_SRC_30_INT                                  0x9E
+#define     V_008DFC_SQ_SRC_31_INT                                  0x9F
+#define     V_008DFC_SQ_SRC_32_INT                                  0xA0
+#define     V_008DFC_SQ_SRC_33_INT                                  0xA1
+#define     V_008DFC_SQ_SRC_34_INT                                  0xA2
+#define     V_008DFC_SQ_SRC_35_INT                                  0xA3
+#define     V_008DFC_SQ_SRC_36_INT                                  0xA4
+#define     V_008DFC_SQ_SRC_37_INT                                  0xA5
+#define     V_008DFC_SQ_SRC_38_INT                                  0xA6
+#define     V_008DFC_SQ_SRC_39_INT                                  0xA7
+#define     V_008DFC_SQ_SRC_40_INT                                  0xA8
+#define     V_008DFC_SQ_SRC_41_INT                                  0xA9
+#define     V_008DFC_SQ_SRC_42_INT                                  0xAA
+#define     V_008DFC_SQ_SRC_43_INT                                  0xAB
+#define     V_008DFC_SQ_SRC_44_INT                                  0xAC
+#define     V_008DFC_SQ_SRC_45_INT                                  0xAD
+#define     V_008DFC_SQ_SRC_46_INT                                  0xAE
+#define     V_008DFC_SQ_SRC_47_INT                                  0xAF
+#define     V_008DFC_SQ_SRC_48_INT                                  0xB0
+#define     V_008DFC_SQ_SRC_49_INT                                  0xB1
+#define     V_008DFC_SQ_SRC_50_INT                                  0xB2
+#define     V_008DFC_SQ_SRC_51_INT                                  0xB3
+#define     V_008DFC_SQ_SRC_52_INT                                  0xB4
+#define     V_008DFC_SQ_SRC_53_INT                                  0xB5
+#define     V_008DFC_SQ_SRC_54_INT                                  0xB6
+#define     V_008DFC_SQ_SRC_55_INT                                  0xB7
+#define     V_008DFC_SQ_SRC_56_INT                                  0xB8
+#define     V_008DFC_SQ_SRC_57_INT                                  0xB9
+#define     V_008DFC_SQ_SRC_58_INT                                  0xBA
+#define     V_008DFC_SQ_SRC_59_INT                                  0xBB
+#define     V_008DFC_SQ_SRC_60_INT                                  0xBC
+#define     V_008DFC_SQ_SRC_61_INT                                  0xBD
+#define     V_008DFC_SQ_SRC_62_INT                                  0xBE
+#define     V_008DFC_SQ_SRC_63_INT                                  0xBF
+#define     V_008DFC_SQ_SRC_64_INT                                  0xC0
+#define     V_008DFC_SQ_SRC_M_1_INT                                 0xC1
+#define     V_008DFC_SQ_SRC_M_2_INT                                 0xC2
+#define     V_008DFC_SQ_SRC_M_3_INT                                 0xC3
+#define     V_008DFC_SQ_SRC_M_4_INT                                 0xC4
+#define     V_008DFC_SQ_SRC_M_5_INT                                 0xC5
+#define     V_008DFC_SQ_SRC_M_6_INT                                 0xC6
+#define     V_008DFC_SQ_SRC_M_7_INT                                 0xC7
+#define     V_008DFC_SQ_SRC_M_8_INT                                 0xC8
+#define     V_008DFC_SQ_SRC_M_9_INT                                 0xC9
+#define     V_008DFC_SQ_SRC_M_10_INT                                0xCA
+#define     V_008DFC_SQ_SRC_M_11_INT                                0xCB
+#define     V_008DFC_SQ_SRC_M_12_INT                                0xCC
+#define     V_008DFC_SQ_SRC_M_13_INT                                0xCD
+#define     V_008DFC_SQ_SRC_M_14_INT                                0xCE
+#define     V_008DFC_SQ_SRC_M_15_INT                                0xCF
+#define     V_008DFC_SQ_SRC_M_16_INT                                0xD0
+#define     V_008DFC_SQ_SRC_0_5                                     0xF0
+#define     V_008DFC_SQ_SRC_M_0_5                                   0xF1
+#define     V_008DFC_SQ_SRC_1                                       0xF2
+#define     V_008DFC_SQ_SRC_M_1                                     0xF3
+#define     V_008DFC_SQ_SRC_2                                       0xF4
+#define     V_008DFC_SQ_SRC_M_2                                     0xF5
+#define     V_008DFC_SQ_SRC_4                                       0xF6
+#define     V_008DFC_SQ_SRC_M_4                                     0xF7
+#define     V_008DFC_SQ_SRC_VCCZ                                    0xFB
+#define     V_008DFC_SQ_SRC_EXECZ                                   0xFC
+#define     V_008DFC_SQ_SRC_SCC                                     0xFD
+#define     V_008DFC_SQ_SRC_LDS_DIRECT                              0xFE
+#define     V_008DFC_SQ_SRC_VGPR                                    0x100
+#define   S_008DFC_VSRC1(x)                                           (((x) & 0xFF) << 9)
+#define   G_008DFC_VSRC1(x)                                           (((x) >> 9) & 0xFF)
+#define   C_008DFC_VSRC1                                              0xFFFE01FF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 17)
+#define   G_008DFC_VDST(x)                                            (((x) >> 17) & 0xFF)
+#define   C_008DFC_VDST                                               0xFE01FFFF
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_OP(x)                                              (((x) & 0x3F) << 25)
+#define   G_008DFC_OP(x)                                              (((x) >> 25) & 0x3F)
+#define   C_008DFC_OP                                                 0x81FFFFFF
+#define     V_008DFC_SQ_V_CNDMASK_B32                               0x00
+#define     V_008DFC_SQ_V_READLANE_B32                              0x01
+#define     V_008DFC_SQ_V_WRITELANE_B32                             0x02
+#define     V_008DFC_SQ_V_ADD_F32                                   0x03
+#define     V_008DFC_SQ_V_SUB_F32                                   0x04
+#define     V_008DFC_SQ_V_SUBREV_F32                                0x05
+#define     V_008DFC_SQ_V_MAC_LEGACY_F32                            0x06
+#define     V_008DFC_SQ_V_MUL_LEGACY_F32                            0x07
+#define     V_008DFC_SQ_V_MUL_F32                                   0x08
+#define     V_008DFC_SQ_V_MUL_I32_I24                               0x09
+#define     V_008DFC_SQ_V_MUL_HI_I32_I24                            0x0A
+#define     V_008DFC_SQ_V_MUL_U32_U24                               0x0B
+#define     V_008DFC_SQ_V_MUL_HI_U32_U24                            0x0C
+#define     V_008DFC_SQ_V_MIN_LEGACY_F32                            0x0D
+#define     V_008DFC_SQ_V_MAX_LEGACY_F32                            0x0E
+#define     V_008DFC_SQ_V_MIN_F32                                   0x0F
+#define     V_008DFC_SQ_V_MAX_F32                                   0x10
+#define     V_008DFC_SQ_V_MIN_I32                                   0x11
+#define     V_008DFC_SQ_V_MAX_I32                                   0x12
+#define     V_008DFC_SQ_V_MIN_U32                                   0x13
+#define     V_008DFC_SQ_V_MAX_U32                                   0x14
+#define     V_008DFC_SQ_V_LSHR_B32                                  0x15
+#define     V_008DFC_SQ_V_LSHRREV_B32                               0x16
+#define     V_008DFC_SQ_V_ASHR_I32                                  0x17
+#define     V_008DFC_SQ_V_ASHRREV_I32                               0x18
+#define     V_008DFC_SQ_V_LSHL_B32                                  0x19
+#define     V_008DFC_SQ_V_LSHLREV_B32                               0x1A
+#define     V_008DFC_SQ_V_AND_B32                                   0x1B
+#define     V_008DFC_SQ_V_OR_B32                                    0x1C
+#define     V_008DFC_SQ_V_XOR_B32                                   0x1D
+#define     V_008DFC_SQ_V_BFM_B32                                   0x1E
+#define     V_008DFC_SQ_V_MAC_F32                                   0x1F
+#define     V_008DFC_SQ_V_MADMK_F32                                 0x20
+#define     V_008DFC_SQ_V_MADAK_F32                                 0x21
+#define     V_008DFC_SQ_V_BCNT_U32_B32                              0x22
+#define     V_008DFC_SQ_V_MBCNT_LO_U32_B32                          0x23
+#define     V_008DFC_SQ_V_MBCNT_HI_U32_B32                          0x24
+#define     V_008DFC_SQ_V_ADD_I32                                   0x25
+#define     V_008DFC_SQ_V_SUB_I32                                   0x26
+#define     V_008DFC_SQ_V_SUBREV_I32                                0x27
+#define     V_008DFC_SQ_V_ADDC_U32                                  0x28
+#define     V_008DFC_SQ_V_SUBB_U32                                  0x29
+#define     V_008DFC_SQ_V_SUBBREV_U32                               0x2A
+#define     V_008DFC_SQ_V_LDEXP_F32                                 0x2B
+#define     V_008DFC_SQ_V_CVT_PKACCUM_U8_F32                        0x2C
+#define     V_008DFC_SQ_V_CVT_PKNORM_I16_F32                        0x2D
+#define     V_008DFC_SQ_V_CVT_PKNORM_U16_F32                        0x2E
+#define     V_008DFC_SQ_V_CVT_PKRTZ_F16_F32                         0x2F
+#define     V_008DFC_SQ_V_CVT_PK_U16_U32                            0x30
+#define     V_008DFC_SQ_V_CVT_PK_I16_I32                            0x31
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x1) << 31)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 31) & 0x1)
+#define   C_008DFC_ENCODING                                           0x7FFFFFFF
+#define R_008DFC_SQ_VOP3_0_SDST_ENC                                     0x008DFC
+#define   S_008DFC_VDST(x)                                            (((x) & 0xFF) << 0)
+#define   G_008DFC_VDST(x)                                            (((x) >> 0) & 0xFF)
+#define   C_008DFC_VDST                                               0xFFFFFF00
+#define     V_008DFC_SQ_VGPR                                        0x00
+#define   S_008DFC_SDST(x)                                            (((x) & 0x7F) << 8)
+#define   G_008DFC_SDST(x)                                            (((x) >> 8) & 0x7F)
+#define   C_008DFC_SDST                                               0xFFFF80FF
+#define     V_008DFC_SQ_SGPR                                        0x00
+#define     V_008DFC_SQ_VCC_LO                                      0x6A
+#define     V_008DFC_SQ_VCC_HI                                      0x6B
+#define     V_008DFC_SQ_TBA_LO                                      0x6C
+#define     V_008DFC_SQ_TBA_HI                                      0x6D
+#define     V_008DFC_SQ_TMA_LO                                      0x6E
+#define     V_008DFC_SQ_TMA_HI                                      0x6F
+#define     V_008DFC_SQ_TTMP0                                       0x70
+#define     V_008DFC_SQ_TTMP1                                       0x71
+#define     V_008DFC_SQ_TTMP2                                       0x72
+#define     V_008DFC_SQ_TTMP3                                       0x73
+#define     V_008DFC_SQ_TTMP4                                       0x74
+#define     V_008DFC_SQ_TTMP5                                       0x75
+#define     V_008DFC_SQ_TTMP6                                       0x76
+#define     V_008DFC_SQ_TTMP7                                       0x77
+#define     V_008DFC_SQ_TTMP8                                       0x78
+#define     V_008DFC_SQ_TTMP9                                       0x79
+#define     V_008DFC_SQ_TTMP10                                      0x7A
+#define     V_008DFC_SQ_TTMP11                                      0x7B
+#define   S_008DFC_OP(x)                                              (((x) & 0x1FF) << 17)
+#define   G_008DFC_OP(x)                                              (((x) >> 17) & 0x1FF)
+#define   C_008DFC_OP                                                 0xFC01FFFF
+#define     V_008DFC_SQ_V_OPC_OFFSET                                0x00
+#define     V_008DFC_SQ_V_OP2_OFFSET                                0x100
+#define     V_008DFC_SQ_V_MAD_LEGACY_F32                            0x140
+#define     V_008DFC_SQ_V_MAD_F32                                   0x141
+#define     V_008DFC_SQ_V_MAD_I32_I24                               0x142
+#define     V_008DFC_SQ_V_MAD_U32_U24                               0x143
+#define     V_008DFC_SQ_V_CUBEID_F32                                0x144
+#define     V_008DFC_SQ_V_CUBESC_F32                                0x145
+#define     V_008DFC_SQ_V_CUBETC_F32                                0x146
+#define     V_008DFC_SQ_V_CUBEMA_F32                                0x147
+#define     V_008DFC_SQ_V_BFE_U32                                   0x148
+#define     V_008DFC_SQ_V_BFE_I32                                   0x149
+#define     V_008DFC_SQ_V_BFI_B32                                   0x14A
+#define     V_008DFC_SQ_V_FMA_F32                                   0x14B
+#define     V_008DFC_SQ_V_FMA_F64                                   0x14C
+#define     V_008DFC_SQ_V_LERP_U8                                   0x14D
+#define     V_008DFC_SQ_V_ALIGNBIT_B32                              0x14E
+#define     V_008DFC_SQ_V_ALIGNBYTE_B32                             0x14F
+#define     V_008DFC_SQ_V_MULLIT_F32                                0x150
+#define     V_008DFC_SQ_V_MIN3_F32                                  0x151
+#define     V_008DFC_SQ_V_MIN3_I32                                  0x152
+#define     V_008DFC_SQ_V_MIN3_U32                                  0x153
+#define     V_008DFC_SQ_V_MAX3_F32                                  0x154
+#define     V_008DFC_SQ_V_MAX3_I32                                  0x155
+#define     V_008DFC_SQ_V_MAX3_U32                                  0x156
+#define     V_008DFC_SQ_V_MED3_F32                                  0x157
+#define     V_008DFC_SQ_V_MED3_I32                                  0x158
+#define     V_008DFC_SQ_V_MED3_U32                                  0x159
+#define     V_008DFC_SQ_V_SAD_U8                                    0x15A
+#define     V_008DFC_SQ_V_SAD_HI_U8                                 0x15B
+#define     V_008DFC_SQ_V_SAD_U16                                   0x15C
+#define     V_008DFC_SQ_V_SAD_U32                                   0x15D
+#define     V_008DFC_SQ_V_CVT_PK_U8_F32                             0x15E
+#define     V_008DFC_SQ_V_DIV_FIXUP_F32                             0x15F
+#define     V_008DFC_SQ_V_DIV_FIXUP_F64                             0x160
+#define     V_008DFC_SQ_V_LSHL_B64                                  0x161
+#define     V_008DFC_SQ_V_LSHR_B64                                  0x162
+#define     V_008DFC_SQ_V_ASHR_I64                                  0x163
+#define     V_008DFC_SQ_V_ADD_F64                                   0x164
+#define     V_008DFC_SQ_V_MUL_F64                                   0x165
+#define     V_008DFC_SQ_V_MIN_F64                                   0x166
+#define     V_008DFC_SQ_V_MAX_F64                                   0x167
+#define     V_008DFC_SQ_V_LDEXP_F64                                 0x168
+#define     V_008DFC_SQ_V_MUL_LO_U32                                0x169
+#define     V_008DFC_SQ_V_MUL_HI_U32                                0x16A
+#define     V_008DFC_SQ_V_MUL_LO_I32                                0x16B
+#define     V_008DFC_SQ_V_MUL_HI_I32                                0x16C
+#define     V_008DFC_SQ_V_DIV_SCALE_F32                             0x16D
+#define     V_008DFC_SQ_V_DIV_SCALE_F64                             0x16E
+#define     V_008DFC_SQ_V_DIV_FMAS_F32                              0x16F
+#define     V_008DFC_SQ_V_DIV_FMAS_F64                              0x170
+#define     V_008DFC_SQ_V_MSAD_U8                                   0x171
+#define     V_008DFC_SQ_V_QSAD_U8                                   0x172
+#define     V_008DFC_SQ_V_MQSAD_U8                                  0x173
+#define     V_008DFC_SQ_V_TRIG_PREOP_F64                            0x174
+#define     V_008DFC_SQ_V_OP1_OFFSET                                0x180
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_VOP3_FIELD                              0x34
+#define R_008DFC_SQ_MUBUF_0                                             0x008DFC
+#define   S_008DFC_OFFSET(x)                                          (((x) & 0xFFF) << 0)
+#define   G_008DFC_OFFSET(x)                                          (((x) >> 0) & 0xFFF)
+#define   C_008DFC_OFFSET                                             0xFFFFF000
+#define   S_008DFC_OFFEN(x)                                           (((x) & 0x1) << 12)
+#define   G_008DFC_OFFEN(x)                                           (((x) >> 12) & 0x1)
+#define   C_008DFC_OFFEN                                              0xFFFFEFFF
+#define   S_008DFC_IDXEN(x)                                           (((x) & 0x1) << 13)
+#define   G_008DFC_IDXEN(x)                                           (((x) >> 13) & 0x1)
+#define   C_008DFC_IDXEN                                              0xFFFFDFFF
+#define   S_008DFC_GLC(x)                                             (((x) & 0x1) << 14)
+#define   G_008DFC_GLC(x)                                             (((x) >> 14) & 0x1)
+#define   C_008DFC_GLC                                                0xFFFFBFFF
+#define   S_008DFC_ADDR64(x)                                          (((x) & 0x1) << 15)
+#define   G_008DFC_ADDR64(x)                                          (((x) >> 15) & 0x1)
+#define   C_008DFC_ADDR64                                             0xFFFF7FFF
+#define   S_008DFC_LDS(x)                                             (((x) & 0x1) << 16)
+#define   G_008DFC_LDS(x)                                             (((x) >> 16) & 0x1)
+#define   C_008DFC_LDS                                                0xFFFEFFFF
+#define   S_008DFC_OP(x)                                              (((x) & 0x7F) << 18)
+#define   G_008DFC_OP(x)                                              (((x) >> 18) & 0x7F)
+#define   C_008DFC_OP                                                 0xFE03FFFF
+#define     V_008DFC_SQ_BUFFER_LOAD_FORMAT_X                        0x00
+#define     V_008DFC_SQ_BUFFER_LOAD_FORMAT_XY                       0x01
+#define     V_008DFC_SQ_BUFFER_LOAD_FORMAT_XYZ                      0x02
+#define     V_008DFC_SQ_BUFFER_LOAD_FORMAT_XYZW                     0x03
+#define     V_008DFC_SQ_BUFFER_STORE_FORMAT_X                       0x04
+#define     V_008DFC_SQ_BUFFER_STORE_FORMAT_XY                      0x05
+#define     V_008DFC_SQ_BUFFER_STORE_FORMAT_XYZ                     0x06
+#define     V_008DFC_SQ_BUFFER_STORE_FORMAT_XYZW                    0x07
+#define     V_008DFC_SQ_BUFFER_LOAD_UBYTE                           0x08
+#define     V_008DFC_SQ_BUFFER_LOAD_SBYTE                           0x09
+#define     V_008DFC_SQ_BUFFER_LOAD_USHORT                          0x0A
+#define     V_008DFC_SQ_BUFFER_LOAD_SSHORT                          0x0B
+#define     V_008DFC_SQ_BUFFER_LOAD_DWORD                           0x0C
+#define     V_008DFC_SQ_BUFFER_LOAD_DWORDX2                         0x0D
+#define     V_008DFC_SQ_BUFFER_LOAD_DWORDX4                         0x0E
+#define     V_008DFC_SQ_BUFFER_STORE_BYTE                           0x18
+#define     V_008DFC_SQ_BUFFER_STORE_SHORT                          0x1A
+#define     V_008DFC_SQ_BUFFER_STORE_DWORD                          0x1C
+#define     V_008DFC_SQ_BUFFER_STORE_DWORDX2                        0x1D
+#define     V_008DFC_SQ_BUFFER_STORE_DWORDX4                        0x1E
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SWAP                          0x30
+#define     V_008DFC_SQ_BUFFER_ATOMIC_CMPSWAP                       0x31
+#define     V_008DFC_SQ_BUFFER_ATOMIC_ADD                           0x32
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SUB                           0x33
+#define     V_008DFC_SQ_BUFFER_ATOMIC_RSUB                          0x34
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SMIN                          0x35
+#define     V_008DFC_SQ_BUFFER_ATOMIC_UMIN                          0x36
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SMAX                          0x37
+#define     V_008DFC_SQ_BUFFER_ATOMIC_UMAX                          0x38
+#define     V_008DFC_SQ_BUFFER_ATOMIC_AND                           0x39
+#define     V_008DFC_SQ_BUFFER_ATOMIC_OR                            0x3A
+#define     V_008DFC_SQ_BUFFER_ATOMIC_XOR                           0x3B
+#define     V_008DFC_SQ_BUFFER_ATOMIC_INC                           0x3C
+#define     V_008DFC_SQ_BUFFER_ATOMIC_DEC                           0x3D
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FCMPSWAP                      0x3E
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FMIN                          0x3F
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FMAX                          0x40
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SWAP_X2                       0x50
+#define     V_008DFC_SQ_BUFFER_ATOMIC_CMPSWAP_X2                    0x51
+#define     V_008DFC_SQ_BUFFER_ATOMIC_ADD_X2                        0x52
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SUB_X2                        0x53
+#define     V_008DFC_SQ_BUFFER_ATOMIC_RSUB_X2                       0x54
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SMIN_X2                       0x55
+#define     V_008DFC_SQ_BUFFER_ATOMIC_UMIN_X2                       0x56
+#define     V_008DFC_SQ_BUFFER_ATOMIC_SMAX_X2                       0x57
+#define     V_008DFC_SQ_BUFFER_ATOMIC_UMAX_X2                       0x58
+#define     V_008DFC_SQ_BUFFER_ATOMIC_AND_X2                        0x59
+#define     V_008DFC_SQ_BUFFER_ATOMIC_OR_X2                         0x5A
+#define     V_008DFC_SQ_BUFFER_ATOMIC_XOR_X2                        0x5B
+#define     V_008DFC_SQ_BUFFER_ATOMIC_INC_X2                        0x5C
+#define     V_008DFC_SQ_BUFFER_ATOMIC_DEC_X2                        0x5D
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FCMPSWAP_X2                   0x5E
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FMIN_X2                       0x5F
+#define     V_008DFC_SQ_BUFFER_ATOMIC_FMAX_X2                       0x60
+#define     V_008DFC_SQ_BUFFER_WBINVL1_SC                           0x70
+#define     V_008DFC_SQ_BUFFER_WBINVL1                              0x71
+#define   S_008DFC_ENCODING(x)                                        (((x) & 0x3F) << 26)
+#define   G_008DFC_ENCODING(x)                                        (((x) >> 26) & 0x3F)
+#define   C_008DFC_ENCODING                                           0x03FFFFFF
+#define     V_008DFC_SQ_ENC_MUBUF_FIELD                             0x38
+#endif
+#define R_008F00_SQ_BUF_RSRC_WORD0                                      0x008F00
+#define R_008F04_SQ_BUF_RSRC_WORD1                                      0x008F04
+#define   S_008F04_BASE_ADDRESS_HI(x)                                 (((x) & 0xFFFF) << 0)
+#define   G_008F04_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFFFF)
+#define   C_008F04_BASE_ADDRESS_HI                                    0xFFFF0000
+#define   S_008F04_STRIDE(x)                                          (((x) & 0x3FFF) << 16)
+#define   G_008F04_STRIDE(x)                                          (((x) >> 16) & 0x3FFF)
+#define   C_008F04_STRIDE                                             0xC000FFFF
+#define   S_008F04_CACHE_SWIZZLE(x)                                   (((x) & 0x1) << 30)
+#define   G_008F04_CACHE_SWIZZLE(x)                                   (((x) >> 30) & 0x1)
+#define   C_008F04_CACHE_SWIZZLE                                      0xBFFFFFFF
+#define   S_008F04_SWIZZLE_ENABLE(x)                                  (((x) & 0x1) << 31)
+#define   G_008F04_SWIZZLE_ENABLE(x)                                  (((x) >> 31) & 0x1)
+#define   C_008F04_SWIZZLE_ENABLE                                     0x7FFFFFFF
+#define R_008F08_SQ_BUF_RSRC_WORD2                                      0x008F08
+#define R_008F0C_SQ_BUF_RSRC_WORD3                                      0x008F0C
+#define   S_008F0C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
+#define   G_008F0C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
+#define   C_008F0C_DST_SEL_X                                          0xFFFFFFF8
+#define     V_008F0C_SQ_SEL_0                                       0x00
+#define     V_008F0C_SQ_SEL_1                                       0x01
+#define     V_008F0C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F0C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F0C_SQ_SEL_X                                       0x04
+#define     V_008F0C_SQ_SEL_Y                                       0x05
+#define     V_008F0C_SQ_SEL_Z                                       0x06
+#define     V_008F0C_SQ_SEL_W                                       0x07
+#define   S_008F0C_DST_SEL_Y(x)                                       (((x) & 0x07) << 3)
+#define   G_008F0C_DST_SEL_Y(x)                                       (((x) >> 3) & 0x07)
+#define   C_008F0C_DST_SEL_Y                                          0xFFFFFFC7
+#define     V_008F0C_SQ_SEL_0                                       0x00
+#define     V_008F0C_SQ_SEL_1                                       0x01
+#define     V_008F0C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F0C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F0C_SQ_SEL_X                                       0x04
+#define     V_008F0C_SQ_SEL_Y                                       0x05
+#define     V_008F0C_SQ_SEL_Z                                       0x06
+#define     V_008F0C_SQ_SEL_W                                       0x07
+#define   S_008F0C_DST_SEL_Z(x)                                       (((x) & 0x07) << 6)
+#define   G_008F0C_DST_SEL_Z(x)                                       (((x) >> 6) & 0x07)
+#define   C_008F0C_DST_SEL_Z                                          0xFFFFFE3F
+#define     V_008F0C_SQ_SEL_0                                       0x00
+#define     V_008F0C_SQ_SEL_1                                       0x01
+#define     V_008F0C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F0C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F0C_SQ_SEL_X                                       0x04
+#define     V_008F0C_SQ_SEL_Y                                       0x05
+#define     V_008F0C_SQ_SEL_Z                                       0x06
+#define     V_008F0C_SQ_SEL_W                                       0x07
+#define   S_008F0C_DST_SEL_W(x)                                       (((x) & 0x07) << 9)
+#define   G_008F0C_DST_SEL_W(x)                                       (((x) >> 9) & 0x07)
+#define   C_008F0C_DST_SEL_W                                          0xFFFFF1FF
+#define     V_008F0C_SQ_SEL_0                                       0x00
+#define     V_008F0C_SQ_SEL_1                                       0x01
+#define     V_008F0C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F0C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F0C_SQ_SEL_X                                       0x04
+#define     V_008F0C_SQ_SEL_Y                                       0x05
+#define     V_008F0C_SQ_SEL_Z                                       0x06
+#define     V_008F0C_SQ_SEL_W                                       0x07
+#define   S_008F0C_NUM_FORMAT(x)                                      (((x) & 0x07) << 12)
+#define   G_008F0C_NUM_FORMAT(x)                                      (((x) >> 12) & 0x07)
+#define   C_008F0C_NUM_FORMAT                                         0xFFFF8FFF
+#define     V_008F0C_BUF_NUM_FORMAT_UNORM                           0x00
+#define     V_008F0C_BUF_NUM_FORMAT_SNORM                           0x01
+#define     V_008F0C_BUF_NUM_FORMAT_USCALED                         0x02
+#define     V_008F0C_BUF_NUM_FORMAT_SSCALED                         0x03
+#define     V_008F0C_BUF_NUM_FORMAT_UINT                            0x04
+#define     V_008F0C_BUF_NUM_FORMAT_SINT                            0x05
+#define     V_008F0C_BUF_NUM_FORMAT_SNORM_OGL                       0x06
+#define     V_008F0C_BUF_NUM_FORMAT_FLOAT                           0x07
+#define   S_008F0C_DATA_FORMAT(x)                                     (((x) & 0x0F) << 15)
+#define   G_008F0C_DATA_FORMAT(x)                                     (((x) >> 15) & 0x0F)
+#define   C_008F0C_DATA_FORMAT                                        0xFFF87FFF
+#define     V_008F0C_BUF_DATA_FORMAT_INVALID                        0x00
+#define     V_008F0C_BUF_DATA_FORMAT_8                              0x01
+#define     V_008F0C_BUF_DATA_FORMAT_16                             0x02
+#define     V_008F0C_BUF_DATA_FORMAT_8_8                            0x03
+#define     V_008F0C_BUF_DATA_FORMAT_32                             0x04
+#define     V_008F0C_BUF_DATA_FORMAT_16_16                          0x05
+#define     V_008F0C_BUF_DATA_FORMAT_10_11_11                       0x06
+#define     V_008F0C_BUF_DATA_FORMAT_11_11_10                       0x07
+#define     V_008F0C_BUF_DATA_FORMAT_10_10_10_2                     0x08
+#define     V_008F0C_BUF_DATA_FORMAT_2_10_10_10                     0x09
+#define     V_008F0C_BUF_DATA_FORMAT_8_8_8_8                        0x0A
+#define     V_008F0C_BUF_DATA_FORMAT_32_32                          0x0B
+#define     V_008F0C_BUF_DATA_FORMAT_16_16_16_16                    0x0C
+#define     V_008F0C_BUF_DATA_FORMAT_32_32_32                       0x0D
+#define     V_008F0C_BUF_DATA_FORMAT_32_32_32_32                    0x0E
+#define     V_008F0C_BUF_DATA_FORMAT_RESERVED_15                    0x0F
+#define   S_008F0C_ELEMENT_SIZE(x)                                    (((x) & 0x03) << 19)
+#define   G_008F0C_ELEMENT_SIZE(x)                                    (((x) >> 19) & 0x03)
+#define   C_008F0C_ELEMENT_SIZE                                       0xFFE7FFFF
+#define   S_008F0C_INDEX_STRIDE(x)                                    (((x) & 0x03) << 21)
+#define   G_008F0C_INDEX_STRIDE(x)                                    (((x) >> 21) & 0x03)
+#define   C_008F0C_INDEX_STRIDE                                       0xFF9FFFFF
+#define   S_008F0C_ADD_TID_ENABLE(x)                                  (((x) & 0x1) << 23)
+#define   G_008F0C_ADD_TID_ENABLE(x)                                  (((x) >> 23) & 0x1)
+#define   C_008F0C_ADD_TID_ENABLE                                     0xFF7FFFFF
+#define   S_008F0C_HASH_ENABLE(x)                                     (((x) & 0x1) << 25)
+#define   G_008F0C_HASH_ENABLE(x)                                     (((x) >> 25) & 0x1)
+#define   C_008F0C_HASH_ENABLE                                        0xFDFFFFFF
+#define   S_008F0C_HEAP(x)                                            (((x) & 0x1) << 26)
+#define   G_008F0C_HEAP(x)                                            (((x) >> 26) & 0x1)
+#define   C_008F0C_HEAP                                               0xFBFFFFFF
+#define   S_008F0C_TYPE(x)                                            (((x) & 0x03) << 30)
+#define   G_008F0C_TYPE(x)                                            (((x) >> 30) & 0x03)
+#define   C_008F0C_TYPE                                               0x3FFFFFFF
+#define     V_008F0C_SQ_RSRC_BUF                                    0x00
+#define     V_008F0C_SQ_RSRC_BUF_RSVD_1                             0x01
+#define     V_008F0C_SQ_RSRC_BUF_RSVD_2                             0x02
+#define     V_008F0C_SQ_RSRC_BUF_RSVD_3                             0x03
+#define R_008F10_SQ_IMG_RSRC_WORD0                                      0x008F10
+#define R_008F14_SQ_IMG_RSRC_WORD1                                      0x008F14
+#define   S_008F14_BASE_ADDRESS_HI(x)                                 (((x) & 0xFF) << 0)
+#define   G_008F14_BASE_ADDRESS_HI(x)                                 (((x) >> 0) & 0xFF)
+#define   C_008F14_BASE_ADDRESS_HI                                    0xFFFFFF00
+#define   S_008F14_MIN_LOD(x)                                         (((x) & 0xFFF) << 8)
+#define   G_008F14_MIN_LOD(x)                                         (((x) >> 8) & 0xFFF)
+#define   C_008F14_MIN_LOD                                            0xFFF000FF
+#define   S_008F14_DATA_FORMAT(x)                                     (((x) & 0x3F) << 20)
+#define   G_008F14_DATA_FORMAT(x)                                     (((x) >> 20) & 0x3F)
+#define   C_008F14_DATA_FORMAT                                        0xFC0FFFFF
+#define     V_008F14_IMG_DATA_FORMAT_INVALID                        0x00
+#define     V_008F14_IMG_DATA_FORMAT_8                              0x01
+#define     V_008F14_IMG_DATA_FORMAT_16                             0x02
+#define     V_008F14_IMG_DATA_FORMAT_8_8                            0x03
+#define     V_008F14_IMG_DATA_FORMAT_32                             0x04
+#define     V_008F14_IMG_DATA_FORMAT_16_16                          0x05
+#define     V_008F14_IMG_DATA_FORMAT_10_11_11                       0x06
+#define     V_008F14_IMG_DATA_FORMAT_11_11_10                       0x07
+#define     V_008F14_IMG_DATA_FORMAT_10_10_10_2                     0x08
+#define     V_008F14_IMG_DATA_FORMAT_2_10_10_10                     0x09
+#define     V_008F14_IMG_DATA_FORMAT_8_8_8_8                        0x0A
+#define     V_008F14_IMG_DATA_FORMAT_32_32                          0x0B
+#define     V_008F14_IMG_DATA_FORMAT_16_16_16_16                    0x0C
+#define     V_008F14_IMG_DATA_FORMAT_32_32_32                       0x0D
+#define     V_008F14_IMG_DATA_FORMAT_32_32_32_32                    0x0E
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_15                    0x0F
+#define     V_008F14_IMG_DATA_FORMAT_5_6_5                          0x10
+#define     V_008F14_IMG_DATA_FORMAT_1_5_5_5                        0x11
+#define     V_008F14_IMG_DATA_FORMAT_5_5_5_1                        0x12
+#define     V_008F14_IMG_DATA_FORMAT_4_4_4_4                        0x13
+#define     V_008F14_IMG_DATA_FORMAT_8_24                           0x14
+#define     V_008F14_IMG_DATA_FORMAT_24_8                           0x15
+#define     V_008F14_IMG_DATA_FORMAT_X24_8_32                       0x16
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_23                    0x17
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_24                    0x18
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_25                    0x19
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_26                    0x1A
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_27                    0x1B
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_28                    0x1C
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_29                    0x1D
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_30                    0x1E
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_31                    0x1F
+#define     V_008F14_IMG_DATA_FORMAT_GB_GR                          0x20
+#define     V_008F14_IMG_DATA_FORMAT_BG_RG                          0x21
+#define     V_008F14_IMG_DATA_FORMAT_5_9_9_9                        0x22
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_42                    0x2A
+#define     V_008F14_IMG_DATA_FORMAT_RESERVED_43                    0x2B
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1                   0x2C
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1                   0x2D
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1                   0x2E
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2                   0x2F
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2                   0x30
+#define     V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4                   0x31
+#define     V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1                 0x32
+#define     V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2                  0x33
+#define     V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2                 0x34
+#define     V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4                  0x35
+#define     V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8                  0x36
+#define     V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4                 0x37
+#define     V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8                 0x38
+#define     V_008F14_IMG_DATA_FORMAT_4_4                            0x39
+#define     V_008F14_IMG_DATA_FORMAT_6_5_5                          0x3A
+#define     V_008F14_IMG_DATA_FORMAT_1                              0x3B
+#define     V_008F14_IMG_DATA_FORMAT_1_REVERSED                     0x3C
+#define     V_008F14_IMG_DATA_FORMAT_32_AS_8                        0x3D
+#define     V_008F14_IMG_DATA_FORMAT_32_AS_8_8                      0x3E
+#define     V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32              0x3F
+#define   S_008F14_NUM_FORMAT(x)                                      (((x) & 0x0F) << 26)
+#define   G_008F14_NUM_FORMAT(x)                                      (((x) >> 26) & 0x0F)
+#define   C_008F14_NUM_FORMAT                                         0xC3FFFFFF
+#define     V_008F14_IMG_NUM_FORMAT_UNORM                           0x00
+#define     V_008F14_IMG_NUM_FORMAT_SNORM                           0x01
+#define     V_008F14_IMG_NUM_FORMAT_USCALED                         0x02
+#define     V_008F14_IMG_NUM_FORMAT_SSCALED                         0x03
+#define     V_008F14_IMG_NUM_FORMAT_UINT                            0x04
+#define     V_008F14_IMG_NUM_FORMAT_SINT                            0x05
+#define     V_008F14_IMG_NUM_FORMAT_SNORM_OGL                       0x06
+#define     V_008F14_IMG_NUM_FORMAT_FLOAT                           0x07
+#define     V_008F14_IMG_NUM_FORMAT_RESERVED_8                      0x08
+#define     V_008F14_IMG_NUM_FORMAT_SRGB                            0x09
+#define     V_008F14_IMG_NUM_FORMAT_UBNORM                          0x0A
+#define     V_008F14_IMG_NUM_FORMAT_UBNORM_OGL                      0x0B
+#define     V_008F14_IMG_NUM_FORMAT_UBINT                           0x0C
+#define     V_008F14_IMG_NUM_FORMAT_UBSCALED                        0x0D
+#define     V_008F14_IMG_NUM_FORMAT_RESERVED_14                     0x0E
+#define     V_008F14_IMG_NUM_FORMAT_RESERVED_15                     0x0F
+#define R_008F18_SQ_IMG_RSRC_WORD2                                      0x008F18
+#define   S_008F18_WIDTH(x)                                           (((x) & 0x3FFF) << 0)
+#define   G_008F18_WIDTH(x)                                           (((x) >> 0) & 0x3FFF)
+#define   C_008F18_WIDTH                                              0xFFFFC000
+#define   S_008F18_HEIGHT(x)                                          (((x) & 0x3FFF) << 14)
+#define   G_008F18_HEIGHT(x)                                          (((x) >> 14) & 0x3FFF)
+#define   C_008F18_HEIGHT                                             0xF0003FFF
+#define   S_008F18_PERF_MOD(x)                                        (((x) & 0x07) << 28)
+#define   G_008F18_PERF_MOD(x)                                        (((x) >> 28) & 0x07)
+#define   C_008F18_PERF_MOD                                           0x8FFFFFFF
+#define   S_008F18_INTERLACED(x)                                      (((x) & 0x1) << 31)
+#define   G_008F18_INTERLACED(x)                                      (((x) >> 31) & 0x1)
+#define   C_008F18_INTERLACED                                         0x7FFFFFFF
+#define R_008F1C_SQ_IMG_RSRC_WORD3                                      0x008F1C
+#define   S_008F1C_DST_SEL_X(x)                                       (((x) & 0x07) << 0)
+#define   G_008F1C_DST_SEL_X(x)                                       (((x) >> 0) & 0x07)
+#define   C_008F1C_DST_SEL_X                                          0xFFFFFFF8
+#define     V_008F1C_SQ_SEL_0                                       0x00
+#define     V_008F1C_SQ_SEL_1                                       0x01
+#define     V_008F1C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F1C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F1C_SQ_SEL_X                                       0x04
+#define     V_008F1C_SQ_SEL_Y                                       0x05
+#define     V_008F1C_SQ_SEL_Z                                       0x06
+#define     V_008F1C_SQ_SEL_W                                       0x07
+#define   S_008F1C_DST_SEL_Y(x)                                       (((x) & 0x07) << 3)
+#define   G_008F1C_DST_SEL_Y(x)                                       (((x) >> 3) & 0x07)
+#define   C_008F1C_DST_SEL_Y                                          0xFFFFFFC7
+#define     V_008F1C_SQ_SEL_0                                       0x00
+#define     V_008F1C_SQ_SEL_1                                       0x01
+#define     V_008F1C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F1C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F1C_SQ_SEL_X                                       0x04
+#define     V_008F1C_SQ_SEL_Y                                       0x05
+#define     V_008F1C_SQ_SEL_Z                                       0x06
+#define     V_008F1C_SQ_SEL_W                                       0x07
+#define   S_008F1C_DST_SEL_Z(x)                                       (((x) & 0x07) << 6)
+#define   G_008F1C_DST_SEL_Z(x)                                       (((x) >> 6) & 0x07)
+#define   C_008F1C_DST_SEL_Z                                          0xFFFFFE3F
+#define     V_008F1C_SQ_SEL_0                                       0x00
+#define     V_008F1C_SQ_SEL_1                                       0x01
+#define     V_008F1C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F1C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F1C_SQ_SEL_X                                       0x04
+#define     V_008F1C_SQ_SEL_Y                                       0x05
+#define     V_008F1C_SQ_SEL_Z                                       0x06
+#define     V_008F1C_SQ_SEL_W                                       0x07
+#define   S_008F1C_DST_SEL_W(x)                                       (((x) & 0x07) << 9)
+#define   G_008F1C_DST_SEL_W(x)                                       (((x) >> 9) & 0x07)
+#define   C_008F1C_DST_SEL_W                                          0xFFFFF1FF
+#define     V_008F1C_SQ_SEL_0                                       0x00
+#define     V_008F1C_SQ_SEL_1                                       0x01
+#define     V_008F1C_SQ_SEL_RESERVED_0                              0x02
+#define     V_008F1C_SQ_SEL_RESERVED_1                              0x03
+#define     V_008F1C_SQ_SEL_X                                       0x04
+#define     V_008F1C_SQ_SEL_Y                                       0x05
+#define     V_008F1C_SQ_SEL_Z                                       0x06
+#define     V_008F1C_SQ_SEL_W                                       0x07
+#define   S_008F1C_BASE_LEVEL(x)                                      (((x) & 0x0F) << 12)
+#define   G_008F1C_BASE_LEVEL(x)                                      (((x) >> 12) & 0x0F)
+#define   C_008F1C_BASE_LEVEL                                         0xFFFF0FFF
+#define   S_008F1C_LAST_LEVEL(x)                                      (((x) & 0x0F) << 16)
+#define   G_008F1C_LAST_LEVEL(x)                                      (((x) >> 16) & 0x0F)
+#define   C_008F1C_LAST_LEVEL                                         0xFFF0FFFF
+#define   S_008F1C_TILING_INDEX(x)                                    (((x) & 0x1F) << 20)
+#define   G_008F1C_TILING_INDEX(x)                                    (((x) >> 20) & 0x1F)
+#define   C_008F1C_TILING_INDEX                                       0xFE0FFFFF
+#define   S_008F1C_POW2_PAD(x)                                        (((x) & 0x1) << 25)
+#define   G_008F1C_POW2_PAD(x)                                        (((x) >> 25) & 0x1)
+#define   C_008F1C_POW2_PAD                                           0xFDFFFFFF
+#define   S_008F1C_TYPE(x)                                            (((x) & 0x0F) << 28)
+#define   G_008F1C_TYPE(x)                                            (((x) >> 28) & 0x0F)
+#define   C_008F1C_TYPE                                               0x0FFFFFFF
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_0                             0x00
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_1                             0x01
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_2                             0x02
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_3                             0x03
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_4                             0x04
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_5                             0x05
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_6                             0x06
+#define     V_008F1C_SQ_RSRC_IMG_RSVD_7                             0x07
+#define     V_008F1C_SQ_RSRC_IMG_1D                                 0x08
+#define     V_008F1C_SQ_RSRC_IMG_2D                                 0x09
+#define     V_008F1C_SQ_RSRC_IMG_3D                                 0x0A
+#define     V_008F1C_SQ_RSRC_IMG_CUBE                               0x0B
+#define     V_008F1C_SQ_RSRC_IMG_1D_ARRAY                           0x0C
+#define     V_008F1C_SQ_RSRC_IMG_2D_ARRAY                           0x0D
+#define     V_008F1C_SQ_RSRC_IMG_2D_MSAA                            0x0E
+#define     V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY                      0x0F
+#define R_008F20_SQ_IMG_RSRC_WORD4                                      0x008F20
+#define   S_008F20_DEPTH(x)                                           (((x) & 0x1FFF) << 0)
+#define   G_008F20_DEPTH(x)                                           (((x) >> 0) & 0x1FFF)
+#define   C_008F20_DEPTH                                              0xFFFFE000
+#define   S_008F20_PITCH(x)                                           (((x) & 0x3FFF) << 13)
+#define   G_008F20_PITCH(x)                                           (((x) >> 13) & 0x3FFF)
+#define   C_008F20_PITCH                                              0xF8001FFF
+#define R_008F24_SQ_IMG_RSRC_WORD5                                      0x008F24
+#define   S_008F24_BASE_ARRAY(x)                                      (((x) & 0x1FFF) << 0)
+#define   G_008F24_BASE_ARRAY(x)                                      (((x) >> 0) & 0x1FFF)
+#define   C_008F24_BASE_ARRAY                                         0xFFFFE000
+#define   S_008F24_LAST_ARRAY(x)                                      (((x) & 0x1FFF) << 13)
+#define   G_008F24_LAST_ARRAY(x)                                      (((x) >> 13) & 0x1FFF)
+#define   C_008F24_LAST_ARRAY                                         0xFC001FFF
+#define R_008F28_SQ_IMG_RSRC_WORD6                                      0x008F28
+#define   S_008F28_MIN_LOD_WARN(x)                                    (((x) & 0xFFF) << 0)
+#define   G_008F28_MIN_LOD_WARN(x)                                    (((x) >> 0) & 0xFFF)
+#define   C_008F28_MIN_LOD_WARN                                       0xFFFFF000
+#define R_008F2C_SQ_IMG_RSRC_WORD7                                      0x008F2C
+#define R_008F30_SQ_IMG_SAMP_WORD0                                      0x008F30
+#define   S_008F30_CLAMP_X(x)                                         (((x) & 0x07) << 0)
+#define   G_008F30_CLAMP_X(x)                                         (((x) >> 0) & 0x07)
+#define   C_008F30_CLAMP_X                                            0xFFFFFFF8
+#define     V_008F30_SQ_TEX_WRAP                                    0x00
+#define     V_008F30_SQ_TEX_MIRROR                                  0x01
+#define     V_008F30_SQ_TEX_CLAMP_LAST_TEXEL                        0x02
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL                  0x03
+#define     V_008F30_SQ_TEX_CLAMP_HALF_BORDER                       0x04
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER                 0x05
+#define     V_008F30_SQ_TEX_CLAMP_BORDER                            0x06
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_BORDER                      0x07
+#define   S_008F30_CLAMP_Y(x)                                         (((x) & 0x07) << 3)
+#define   G_008F30_CLAMP_Y(x)                                         (((x) >> 3) & 0x07)
+#define   C_008F30_CLAMP_Y                                            0xFFFFFFC7
+#define     V_008F30_SQ_TEX_WRAP                                    0x00
+#define     V_008F30_SQ_TEX_MIRROR                                  0x01
+#define     V_008F30_SQ_TEX_CLAMP_LAST_TEXEL                        0x02
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL                  0x03
+#define     V_008F30_SQ_TEX_CLAMP_HALF_BORDER                       0x04
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER                 0x05
+#define     V_008F30_SQ_TEX_CLAMP_BORDER                            0x06
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_BORDER                      0x07
+#define   S_008F30_CLAMP_Z(x)                                         (((x) & 0x07) << 6)
+#define   G_008F30_CLAMP_Z(x)                                         (((x) >> 6) & 0x07)
+#define   C_008F30_CLAMP_Z                                            0xFFFFFE3F
+#define     V_008F30_SQ_TEX_WRAP                                    0x00
+#define     V_008F30_SQ_TEX_MIRROR                                  0x01
+#define     V_008F30_SQ_TEX_CLAMP_LAST_TEXEL                        0x02
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL                  0x03
+#define     V_008F30_SQ_TEX_CLAMP_HALF_BORDER                       0x04
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER                 0x05
+#define     V_008F30_SQ_TEX_CLAMP_BORDER                            0x06
+#define     V_008F30_SQ_TEX_MIRROR_ONCE_BORDER                      0x07
+#define   S_008F30_DEPTH_COMPARE_FUNC(x)                              (((x) & 0x07) << 12)
+#define   G_008F30_DEPTH_COMPARE_FUNC(x)                              (((x) >> 12) & 0x07)
+#define   C_008F30_DEPTH_COMPARE_FUNC                                 0xFFFF8FFF
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER                     0x00
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_LESS                      0x01
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL                     0x02
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL                 0x03
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER                   0x04
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL                  0x05
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL              0x06
+#define     V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS                    0x07
+#define   S_008F30_FORCE_UNNORMALIZED(x)                              (((x) & 0x1) << 15)
+#define   G_008F30_FORCE_UNNORMALIZED(x)                              (((x) >> 15) & 0x1)
+#define   C_008F30_FORCE_UNNORMALIZED                                 0xFFFF7FFF
+#define   S_008F30_MC_COORD_TRUNC(x)                                  (((x) & 0x1) << 19)
+#define   G_008F30_MC_COORD_TRUNC(x)                                  (((x) >> 19) & 0x1)
+#define   C_008F30_MC_COORD_TRUNC                                     0xFFF7FFFF
+#define   S_008F30_FORCE_DEGAMMA(x)                                   (((x) & 0x1) << 20)
+#define   G_008F30_FORCE_DEGAMMA(x)                                   (((x) >> 20) & 0x1)
+#define   C_008F30_FORCE_DEGAMMA                                      0xFFEFFFFF
+#define   S_008F30_TRUNC_COORD(x)                                     (((x) & 0x1) << 27)
+#define   G_008F30_TRUNC_COORD(x)                                     (((x) >> 27) & 0x1)
+#define   C_008F30_TRUNC_COORD                                        0xF7FFFFFF
+#define   S_008F30_DISABLE_CUBE_WRAP(x)                               (((x) & 0x1) << 28)
+#define   G_008F30_DISABLE_CUBE_WRAP(x)                               (((x) >> 28) & 0x1)
+#define   C_008F30_DISABLE_CUBE_WRAP                                  0xEFFFFFFF
+#define   S_008F30_FILTER_MODE(x)                                     (((x) & 0x03) << 29)
+#define   G_008F30_FILTER_MODE(x)                                     (((x) >> 29) & 0x03)
+#define   C_008F30_FILTER_MODE                                        0x9FFFFFFF
+#define R_008F34_SQ_IMG_SAMP_WORD1                                      0x008F34
+#define   S_008F34_MIN_LOD(x)                                         (((x) & 0xFFF) << 0)
+#define   G_008F34_MIN_LOD(x)                                         (((x) >> 0) & 0xFFF)
+#define   C_008F34_MIN_LOD                                            0xFFFFF000
+#define   S_008F34_MAX_LOD(x)                                         (((x) & 0xFFF) << 12)
+#define   G_008F34_MAX_LOD(x)                                         (((x) >> 12) & 0xFFF)
+#define   C_008F34_MAX_LOD                                            0xFF000FFF
+#define   S_008F34_PERF_MIP(x)                                        (((x) & 0x0F) << 24)
+#define   G_008F34_PERF_MIP(x)                                        (((x) >> 24) & 0x0F)
+#define   C_008F34_PERF_MIP                                           0xF0FFFFFF
+#define   S_008F34_PERF_Z(x)                                          (((x) & 0x0F) << 28)
+#define   G_008F34_PERF_Z(x)                                          (((x) >> 28) & 0x0F)
+#define   C_008F34_PERF_Z                                             0x0FFFFFFF
+#define R_008F38_SQ_IMG_SAMP_WORD2                                      0x008F38
+#define   S_008F38_LOD_BIAS(x)                                        (((x) & 0x3FFF) << 0)
+#define   G_008F38_LOD_BIAS(x)                                        (((x) >> 0) & 0x3FFF)
+#define   C_008F38_LOD_BIAS                                           0xFFFFC000
+#define   S_008F38_LOD_BIAS_SEC(x)                                    (((x) & 0x3F) << 14)
+#define   G_008F38_LOD_BIAS_SEC(x)                                    (((x) >> 14) & 0x3F)
+#define   C_008F38_LOD_BIAS_SEC                                       0xFFF03FFF
+#define   S_008F38_XY_MAG_FILTER(x)                                   (((x) & 0x03) << 20)
+#define   G_008F38_XY_MAG_FILTER(x)                                   (((x) >> 20) & 0x03)
+#define   C_008F38_XY_MAG_FILTER                                      0xFFCFFFFF
+#define     V_008F38_SQ_TEX_XY_FILTER_POINT                         0x00
+#define     V_008F38_SQ_TEX_XY_FILTER_BILINEAR                      0x01
+#define   S_008F38_XY_MIN_FILTER(x)                                   (((x) & 0x03) << 22)
+#define   G_008F38_XY_MIN_FILTER(x)                                   (((x) >> 22) & 0x03)
+#define   C_008F38_XY_MIN_FILTER                                      0xFF3FFFFF
+#define     V_008F38_SQ_TEX_XY_FILTER_POINT                         0x00
+#define     V_008F38_SQ_TEX_XY_FILTER_BILINEAR                      0x01
+#define   S_008F38_Z_FILTER(x)                                        (((x) & 0x03) << 24)
+#define   G_008F38_Z_FILTER(x)                                        (((x) >> 24) & 0x03)
+#define   C_008F38_Z_FILTER                                           0xFCFFFFFF
+#define     V_008F38_SQ_TEX_Z_FILTER_NONE                           0x00
+#define     V_008F38_SQ_TEX_Z_FILTER_POINT                          0x01
+#define     V_008F38_SQ_TEX_Z_FILTER_LINEAR                         0x02
+#define   S_008F38_MIP_FILTER(x)                                      (((x) & 0x03) << 26)
+#define   G_008F38_MIP_FILTER(x)                                      (((x) >> 26) & 0x03)
+#define   C_008F38_MIP_FILTER                                         0xF3FFFFFF
+#define     V_008F38_SQ_TEX_Z_FILTER_NONE                           0x00
+#define     V_008F38_SQ_TEX_Z_FILTER_POINT                          0x01
+#define     V_008F38_SQ_TEX_Z_FILTER_LINEAR                         0x02
+#define   S_008F38_MIP_POINT_PRECLAMP(x)                              (((x) & 0x1) << 28)
+#define   G_008F38_MIP_POINT_PRECLAMP(x)                              (((x) >> 28) & 0x1)
+#define   C_008F38_MIP_POINT_PRECLAMP                                 0xEFFFFFFF
+#define   S_008F38_DISABLE_LSB_CEIL(x)                                (((x) & 0x1) << 29)
+#define   G_008F38_DISABLE_LSB_CEIL(x)                                (((x) >> 29) & 0x1)
+#define   C_008F38_DISABLE_LSB_CEIL                                   0xDFFFFFFF
+#define   S_008F38_FILTER_PREC_FIX(x)                                 (((x) & 0x1) << 30)
+#define   G_008F38_FILTER_PREC_FIX(x)                                 (((x) >> 30) & 0x1)
+#define   C_008F38_FILTER_PREC_FIX                                    0xBFFFFFFF
+#define R_008F3C_SQ_IMG_SAMP_WORD3                                      0x008F3C
+#define   S_008F3C_BORDER_COLOR_PTR(x)                                (((x) & 0xFFF) << 0)
+#define   G_008F3C_BORDER_COLOR_PTR(x)                                (((x) >> 0) & 0xFFF)
+#define   C_008F3C_BORDER_COLOR_PTR                                   0xFFFFF000
+#define   S_008F3C_BORDER_COLOR_TYPE(x)                               (((x) & 0x03) << 30)
+#define   G_008F3C_BORDER_COLOR_TYPE(x)                               (((x) >> 30) & 0x03)
+#define   C_008F3C_BORDER_COLOR_TYPE                                  0x3FFFFFFF
+#define     V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK                0x00
+#define     V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK               0x01
+#define     V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE               0x02
+#define     V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER                   0x03
+#define R_0090DC_SPI_DYN_GPR_LOCK_EN                                    0x0090DC
+#define   S_0090DC_VS_LOW_THRESHOLD(x)                                (((x) & 0x0F) << 0)
+#define   G_0090DC_VS_LOW_THRESHOLD(x)                                (((x) >> 0) & 0x0F)
+#define   C_0090DC_VS_LOW_THRESHOLD                                   0xFFFFFFF0
+#define   S_0090DC_GS_LOW_THRESHOLD(x)                                (((x) & 0x0F) << 4)
+#define   G_0090DC_GS_LOW_THRESHOLD(x)                                (((x) >> 4) & 0x0F)
+#define   C_0090DC_GS_LOW_THRESHOLD                                   0xFFFFFF0F
+#define   S_0090DC_ES_LOW_THRESHOLD(x)                                (((x) & 0x0F) << 8)
+#define   G_0090DC_ES_LOW_THRESHOLD(x)                                (((x) >> 8) & 0x0F)
+#define   C_0090DC_ES_LOW_THRESHOLD                                   0xFFFFF0FF
+#define   S_0090DC_HS_LOW_THRESHOLD(x)                                (((x) & 0x0F) << 12)
+#define   G_0090DC_HS_LOW_THRESHOLD(x)                                (((x) >> 12) & 0x0F)
+#define   C_0090DC_HS_LOW_THRESHOLD                                   0xFFFF0FFF
+#define   S_0090DC_LS_LOW_THRESHOLD(x)                                (((x) & 0x0F) << 16)
+#define   G_0090DC_LS_LOW_THRESHOLD(x)                                (((x) >> 16) & 0x0F)
+#define   C_0090DC_LS_LOW_THRESHOLD                                   0xFFF0FFFF
+#define R_0090E0_SPI_STATIC_THREAD_MGMT_1                               0x0090E0
+#define   S_0090E0_PS_CU_EN(x)                                        (((x) & 0xFFFF) << 0)
+#define   G_0090E0_PS_CU_EN(x)                                        (((x) >> 0) & 0xFFFF)
+#define   C_0090E0_PS_CU_EN                                           0xFFFF0000
+#define   S_0090E0_VS_CU_EN(x)                                        (((x) & 0xFFFF) << 16)
+#define   G_0090E0_VS_CU_EN(x)                                        (((x) >> 16) & 0xFFFF)
+#define   C_0090E0_VS_CU_EN                                           0x0000FFFF
+#define R_0090E4_SPI_STATIC_THREAD_MGMT_2                               0x0090E4
+#define   S_0090E4_GS_CU_EN(x)                                        (((x) & 0xFFFF) << 0)
+#define   G_0090E4_GS_CU_EN(x)                                        (((x) >> 0) & 0xFFFF)
+#define   C_0090E4_GS_CU_EN                                           0xFFFF0000
+#define   S_0090E4_ES_CU_EN(x)                                        (((x) & 0xFFFF) << 16)
+#define   G_0090E4_ES_CU_EN(x)                                        (((x) >> 16) & 0xFFFF)
+#define   C_0090E4_ES_CU_EN                                           0x0000FFFF
+#define R_0090E8_SPI_STATIC_THREAD_MGMT_3                               0x0090E8
+#define   S_0090E8_LSHS_CU_EN(x)                                      (((x) & 0xFFFF) << 0)
+#define   G_0090E8_LSHS_CU_EN(x)                                      (((x) >> 0) & 0xFFFF)
+#define   C_0090E8_LSHS_CU_EN                                         0xFFFF0000
+#define R_0090EC_SPI_PS_MAX_WAVE_ID                                     0x0090EC
+#define   S_0090EC_MAX_WAVE_ID(x)                                     (((x) & 0xFFF) << 0)
+#define   G_0090EC_MAX_WAVE_ID(x)                                     (((x) >> 0) & 0xFFF)
+#define   C_0090EC_MAX_WAVE_ID                                        0xFFFFF000
+#define R_0090F0_SPI_ARB_PRIORITY                                       0x0090F0
+#define   S_0090F0_RING_ORDER_TS0(x)                                  (((x) & 0x07) << 0)
+#define   G_0090F0_RING_ORDER_TS0(x)                                  (((x) >> 0) & 0x07)
+#define   C_0090F0_RING_ORDER_TS0                                     0xFFFFFFF8
+#define     V_0090F0_X_R0                                           0x00
+#define   S_0090F0_RING_ORDER_TS1(x)                                  (((x) & 0x07) << 3)
+#define   G_0090F0_RING_ORDER_TS1(x)                                  (((x) >> 3) & 0x07)
+#define   C_0090F0_RING_ORDER_TS1                                     0xFFFFFFC7
+#define   S_0090F0_RING_ORDER_TS2(x)                                  (((x) & 0x07) << 6)
+#define   G_0090F0_RING_ORDER_TS2(x)                                  (((x) >> 6) & 0x07)
+#define   C_0090F0_RING_ORDER_TS2                                     0xFFFFFE3F
+#define R_0090F4_SPI_ARB_CYCLES_0                                       0x0090F4
+#define   S_0090F4_TS0_DURATION(x)                                    (((x) & 0xFFFF) << 0)
+#define   G_0090F4_TS0_DURATION(x)                                    (((x) >> 0) & 0xFFFF)
+#define   C_0090F4_TS0_DURATION                                       0xFFFF0000
+#define   S_0090F4_TS1_DURATION(x)                                    (((x) & 0xFFFF) << 16)
+#define   G_0090F4_TS1_DURATION(x)                                    (((x) >> 16) & 0xFFFF)
+#define   C_0090F4_TS1_DURATION                                       0x0000FFFF
+#define R_0090F8_SPI_ARB_CYCLES_1                                       0x0090F8
+#define   S_0090F8_TS2_DURATION(x)                                    (((x) & 0xFFFF) << 0)
+#define   G_0090F8_TS2_DURATION(x)                                    (((x) >> 0) & 0xFFFF)
+#define   C_0090F8_TS2_DURATION                                       0xFFFF0000
+#define R_009100_SPI_CONFIG_CNTL                                        0x009100
+#define   S_009100_GPR_WRITE_PRIORITY(x)                              (((x) & 0x1FFFFF) << 0)
+#define   G_009100_GPR_WRITE_PRIORITY(x)                              (((x) >> 0) & 0x1FFFFF)
+#define   C_009100_GPR_WRITE_PRIORITY                                 0xFFE00000
+#define   S_009100_EXP_PRIORITY_ORDER(x)                              (((x) & 0x07) << 21)
+#define   G_009100_EXP_PRIORITY_ORDER(x)                              (((x) >> 21) & 0x07)
+#define   C_009100_EXP_PRIORITY_ORDER                                 0xFF1FFFFF
+#define   S_009100_ENABLE_SQG_TOP_EVENTS(x)                           (((x) & 0x1) << 24)
+#define   G_009100_ENABLE_SQG_TOP_EVENTS(x)                           (((x) >> 24) & 0x1)
+#define   C_009100_ENABLE_SQG_TOP_EVENTS                              0xFEFFFFFF
+#define   S_009100_ENABLE_SQG_BOP_EVENTS(x)                           (((x) & 0x1) << 25)
+#define   G_009100_ENABLE_SQG_BOP_EVENTS(x)                           (((x) >> 25) & 0x1)
+#define   C_009100_ENABLE_SQG_BOP_EVENTS                              0xFDFFFFFF
+#define   S_009100_RSRC_MGMT_RESET(x)                                 (((x) & 0x1) << 26)
+#define   G_009100_RSRC_MGMT_RESET(x)                                 (((x) >> 26) & 0x1)
+#define   C_009100_RSRC_MGMT_RESET                                    0xFBFFFFFF
+#define R_00913C_SPI_CONFIG_CNTL_1                                      0x00913C
+#define   S_00913C_VTX_DONE_DELAY(x)                                  (((x) & 0x0F) << 0)
+#define   G_00913C_VTX_DONE_DELAY(x)                                  (((x) >> 0) & 0x0F)
+#define   C_00913C_VTX_DONE_DELAY                                     0xFFFFFFF0
+#define     V_00913C_X_DELAY_14_CLKS                                0x00
+#define     V_00913C_X_DELAY_16_CLKS                                0x01
+#define     V_00913C_X_DELAY_18_CLKS                                0x02
+#define     V_00913C_X_DELAY_20_CLKS                                0x03
+#define     V_00913C_X_DELAY_22_CLKS                                0x04
+#define     V_00913C_X_DELAY_24_CLKS                                0x05
+#define     V_00913C_X_DELAY_26_CLKS                                0x06
+#define     V_00913C_X_DELAY_28_CLKS                                0x07
+#define     V_00913C_X_DELAY_30_CLKS                                0x08
+#define     V_00913C_X_DELAY_32_CLKS                                0x09
+#define     V_00913C_X_DELAY_34_CLKS                                0x0A
+#define     V_00913C_X_DELAY_4_CLKS                                 0x0B
+#define     V_00913C_X_DELAY_6_CLKS                                 0x0C
+#define     V_00913C_X_DELAY_8_CLKS                                 0x0D
+#define     V_00913C_X_DELAY_10_CLKS                                0x0E
+#define     V_00913C_X_DELAY_12_CLKS                                0x0F
+#define   S_00913C_INTERP_ONE_PRIM_PER_ROW(x)                         (((x) & 0x1) << 4)
+#define   G_00913C_INTERP_ONE_PRIM_PER_ROW(x)                         (((x) >> 4) & 0x1)
+#define   C_00913C_INTERP_ONE_PRIM_PER_ROW                            0xFFFFFFEF
+#define   S_00913C_PC_LIMIT_ENABLE(x)                                 (((x) & 0x1) << 6)
+#define   G_00913C_PC_LIMIT_ENABLE(x)                                 (((x) >> 6) & 0x1)
+#define   C_00913C_PC_LIMIT_ENABLE                                    0xFFFFFFBF
+#define   S_00913C_PC_LIMIT_STRICT(x)                                 (((x) & 0x1) << 7)
+#define   G_00913C_PC_LIMIT_STRICT(x)                                 (((x) >> 7) & 0x1)
+#define   C_00913C_PC_LIMIT_STRICT                                    0xFFFFFF7F
+#define   S_00913C_PC_LIMIT_SIZE(x)                                   (((x) & 0xFFFF) << 16)
+#define   G_00913C_PC_LIMIT_SIZE(x)                                   (((x) >> 16) & 0xFFFF)
+#define   C_00913C_PC_LIMIT_SIZE                                      0x0000FFFF
+#define R_00936C_SPI_RESOURCE_RESERVE_CU_AB_0                           0x00936C
+#define   S_00936C_TYPE_A(x)                                          (((x) & 0x0F) << 0)
+#define   G_00936C_TYPE_A(x)                                          (((x) >> 0) & 0x0F)
+#define   C_00936C_TYPE_A                                             0xFFFFFFF0
+#define   S_00936C_VGPR_A(x)                                          (((x) & 0x07) << 4)
+#define   G_00936C_VGPR_A(x)                                          (((x) >> 4) & 0x07)
+#define   C_00936C_VGPR_A                                             0xFFFFFF8F
+#define   S_00936C_SGPR_A(x)                                          (((x) & 0x07) << 7)
+#define   G_00936C_SGPR_A(x)                                          (((x) >> 7) & 0x07)
+#define   C_00936C_SGPR_A                                             0xFFFFFC7F
+#define   S_00936C_LDS_A(x)                                           (((x) & 0x07) << 10)
+#define   G_00936C_LDS_A(x)                                           (((x) >> 10) & 0x07)
+#define   C_00936C_LDS_A                                              0xFFFFE3FF
+#define   S_00936C_WAVES_A(x)                                         (((x) & 0x03) << 13)
+#define   G_00936C_WAVES_A(x)                                         (((x) >> 13) & 0x03)
+#define   C_00936C_WAVES_A                                            0xFFFF9FFF
+#define   S_00936C_EN_A(x)                                            (((x) & 0x1) << 15)
+#define   G_00936C_EN_A(x)                                            (((x) >> 15) & 0x1)
+#define   C_00936C_EN_A                                               0xFFFF7FFF
+#define   S_00936C_TYPE_B(x)                                          (((x) & 0x0F) << 16)
+#define   G_00936C_TYPE_B(x)                                          (((x) >> 16) & 0x0F)
+#define   C_00936C_TYPE_B                                             0xFFF0FFFF
+#define   S_00936C_VGPR_B(x)                                          (((x) & 0x07) << 20)
+#define   G_00936C_VGPR_B(x)                                          (((x) >> 20) & 0x07)
+#define   C_00936C_VGPR_B                                             0xFF8FFFFF
+#define   S_00936C_SGPR_B(x)                                          (((x) & 0x07) << 23)
+#define   G_00936C_SGPR_B(x)                                          (((x) >> 23) & 0x07)
+#define   C_00936C_SGPR_B                                             0xFC7FFFFF
+#define   S_00936C_LDS_B(x)                                           (((x) & 0x07) << 26)
+#define   G_00936C_LDS_B(x)                                           (((x) >> 26) & 0x07)
+#define   C_00936C_LDS_B                                              0xE3FFFFFF
+#define   S_00936C_WAVES_B(x)                                         (((x) & 0x03) << 29)
+#define   G_00936C_WAVES_B(x)                                         (((x) >> 29) & 0x03)
+#define   C_00936C_WAVES_B                                            0x9FFFFFFF
+#define   S_00936C_EN_B(x)                                            (((x) & 0x1) << 31)
+#define   G_00936C_EN_B(x)                                            (((x) >> 31) & 0x1)
+#define   C_00936C_EN_B                                               0x7FFFFFFF
+#define R_00950C_TA_CS_BC_BASE_ADDR                                     0x00950C
+#define R_009858_DB_SUBTILE_CONTROL                                     0x009858
+#define   S_009858_MSAA1_X(x)                                         (((x) & 0x03) << 0)
+#define   G_009858_MSAA1_X(x)                                         (((x) >> 0) & 0x03)
+#define   C_009858_MSAA1_X                                            0xFFFFFFFC
+#define   S_009858_MSAA1_Y(x)                                         (((x) & 0x03) << 2)
+#define   G_009858_MSAA1_Y(x)                                         (((x) >> 2) & 0x03)
+#define   C_009858_MSAA1_Y                                            0xFFFFFFF3
+#define   S_009858_MSAA2_X(x)                                         (((x) & 0x03) << 4)
+#define   G_009858_MSAA2_X(x)                                         (((x) >> 4) & 0x03)
+#define   C_009858_MSAA2_X                                            0xFFFFFFCF
+#define   S_009858_MSAA2_Y(x)                                         (((x) & 0x03) << 6)
+#define   G_009858_MSAA2_Y(x)                                         (((x) >> 6) & 0x03)
+#define   C_009858_MSAA2_Y                                            0xFFFFFF3F
+#define   S_009858_MSAA4_X(x)                                         (((x) & 0x03) << 8)
+#define   G_009858_MSAA4_X(x)                                         (((x) >> 8) & 0x03)
+#define   C_009858_MSAA4_X                                            0xFFFFFCFF
+#define   S_009858_MSAA4_Y(x)                                         (((x) & 0x03) << 10)
+#define   G_009858_MSAA4_Y(x)                                         (((x) >> 10) & 0x03)
+#define   C_009858_MSAA4_Y                                            0xFFFFF3FF
+#define   S_009858_MSAA8_X(x)                                         (((x) & 0x03) << 12)
+#define   G_009858_MSAA8_X(x)                                         (((x) >> 12) & 0x03)
+#define   C_009858_MSAA8_X                                            0xFFFFCFFF
+#define   S_009858_MSAA8_Y(x)                                         (((x) & 0x03) << 14)
+#define   G_009858_MSAA8_Y(x)                                         (((x) >> 14) & 0x03)
+#define   C_009858_MSAA8_Y                                            0xFFFF3FFF
+#define   S_009858_MSAA16_X(x)                                        (((x) & 0x03) << 16)
+#define   G_009858_MSAA16_X(x)                                        (((x) >> 16) & 0x03)
+#define   C_009858_MSAA16_X                                           0xFFFCFFFF
+#define   S_009858_MSAA16_Y(x)                                        (((x) & 0x03) << 18)
+#define   G_009858_MSAA16_Y(x)                                        (((x) >> 18) & 0x03)
+#define   C_009858_MSAA16_Y                                           0xFFF3FFFF
+#define R_009910_GB_TILE_MODE0                                          0x009910
+#define   S_009910_MICRO_TILE_MODE(x)                                 (((x) & 0x03) << 0)
+#define   G_009910_MICRO_TILE_MODE(x)                                 (((x) >> 0) & 0x03)
+#define   C_009910_MICRO_TILE_MODE                                    0xFFFFFFFC
+#define     V_009910_ADDR_SURF_DISPLAY_MICRO_TILING                 0x00
+#define     V_009910_ADDR_SURF_THIN_MICRO_TILING                    0x01
+#define     V_009910_ADDR_SURF_DEPTH_MICRO_TILING                   0x02
+#define     V_009910_ADDR_SURF_THICK_MICRO_TILING                   0x03
+#define   S_009910_ARRAY_MODE(x)                                      (((x) & 0x0F) << 2)
+#define   G_009910_ARRAY_MODE(x)                                      (((x) >> 2) & 0x0F)
+#define   C_009910_ARRAY_MODE                                         0xFFFFFFC3
+#define     V_009910_ARRAY_LINEAR_GENERAL                           0x00
+#define     V_009910_ARRAY_LINEAR_ALIGNED                           0x01
+#define     V_009910_ARRAY_1D_TILED_THIN1                           0x02
+#define     V_009910_ARRAY_1D_TILED_THICK                           0x03
+#define     V_009910_ARRAY_2D_TILED_THIN1                           0x04
+#define     V_009910_ARRAY_2D_TILED_THICK                           0x07
+#define     V_009910_ARRAY_2D_TILED_XTHICK                          0x08
+#define     V_009910_ARRAY_3D_TILED_THIN1                           0x0C
+#define     V_009910_ARRAY_3D_TILED_THICK                           0x0D
+#define     V_009910_ARRAY_3D_TILED_XTHICK                          0x0E
+#define     V_009910_ARRAY_POWER_SAVE                               0x0F
+#define   S_009910_PIPE_CONFIG(x)                                     (((x) & 0x1F) << 6)
+#define   G_009910_PIPE_CONFIG(x)                                     (((x) >> 6) & 0x1F)
+#define   C_009910_PIPE_CONFIG                                        0xFFFFF83F
+#define     V_009910_ADDR_SURF_P2                                   0x00
+#define     V_009910_ADDR_SURF_P2_RESERVED0                         0x01
+#define     V_009910_ADDR_SURF_P2_RESERVED1                         0x02
+#define     V_009910_ADDR_SURF_P2_RESERVED2                         0x03
+#define     V_009910_X_ADDR_SURF_P4_8X16                            0x04
+#define     V_009910_X_ADDR_SURF_P4_16X16                           0x05
+#define     V_009910_X_ADDR_SURF_P4_16X32                           0x06
+#define     V_009910_X_ADDR_SURF_P4_32X32                           0x07
+#define     V_009910_X_ADDR_SURF_P8_16X16_8X16                      0x08
+#define     V_009910_X_ADDR_SURF_P8_16X32_8X16                      0x09
+#define     V_009910_X_ADDR_SURF_P8_32X32_8X16                      0x0A
+#define     V_009910_X_ADDR_SURF_P8_16X32_16X16                     0x0B
+#define     V_009910_X_ADDR_SURF_P8_32X32_16X16                     0x0C
+#define     V_009910_X_ADDR_SURF_P8_32X32_16X32                     0x0D
+#define     V_009910_X_ADDR_SURF_P8_32X64_32X32                     0x0E
+#define   S_009910_TILE_SPLIT(x)                                      (((x) & 0x07) << 11)
+#define   G_009910_TILE_SPLIT(x)                                      (((x) >> 11) & 0x07)
+#define   C_009910_TILE_SPLIT                                         0xFFFFC7FF
+#define     V_009910_ADDR_SURF_TILE_SPLIT_64B                       0x00
+#define     V_009910_ADDR_SURF_TILE_SPLIT_128B                      0x01
+#define     V_009910_ADDR_SURF_TILE_SPLIT_256B                      0x02
+#define     V_009910_ADDR_SURF_TILE_SPLIT_512B                      0x03
+#define     V_009910_ADDR_SURF_TILE_SPLIT_1KB                       0x04
+#define     V_009910_ADDR_SURF_TILE_SPLIT_2KB                       0x05
+#define     V_009910_ADDR_SURF_TILE_SPLIT_4KB                       0x06
+#define   S_009910_BANK_WIDTH(x)                                      (((x) & 0x03) << 14)
+#define   G_009910_BANK_WIDTH(x)                                      (((x) >> 14) & 0x03)
+#define   C_009910_BANK_WIDTH                                         0xFFFF3FFF
+#define     V_009910_ADDR_SURF_BANK_WIDTH_1                         0x00
+#define     V_009910_ADDR_SURF_BANK_WIDTH_2                         0x01
+#define     V_009910_ADDR_SURF_BANK_WIDTH_4                         0x02
+#define     V_009910_ADDR_SURF_BANK_WIDTH_8                         0x03
+#define   S_009910_BANK_HEIGHT(x)                                     (((x) & 0x03) << 16)
+#define   G_009910_BANK_HEIGHT(x)                                     (((x) >> 16) & 0x03)
+#define   C_009910_BANK_HEIGHT                                        0xFFFCFFFF
+#define     V_009910_ADDR_SURF_BANK_HEIGHT_1                        0x00
+#define     V_009910_ADDR_SURF_BANK_HEIGHT_2                        0x01
+#define     V_009910_ADDR_SURF_BANK_HEIGHT_4                        0x02
+#define     V_009910_ADDR_SURF_BANK_HEIGHT_8                        0x03
+#define   S_009910_MACRO_TILE_ASPECT(x)                               (((x) & 0x03) << 18)
+#define   G_009910_MACRO_TILE_ASPECT(x)                               (((x) >> 18) & 0x03)
+#define   C_009910_MACRO_TILE_ASPECT                                  0xFFF3FFFF
+#define     V_009910_ADDR_SURF_MACRO_ASPECT_1                       0x00
+#define     V_009910_ADDR_SURF_MACRO_ASPECT_2                       0x01
+#define     V_009910_ADDR_SURF_MACRO_ASPECT_4                       0x02
+#define     V_009910_ADDR_SURF_MACRO_ASPECT_8                       0x03
+#define   S_009910_NUM_BANKS(x)                                       (((x) & 0x03) << 20)
+#define   G_009910_NUM_BANKS(x)                                       (((x) >> 20) & 0x03)
+#define   C_009910_NUM_BANKS                                          0xFFCFFFFF
+#define     V_009910_ADDR_SURF_2_BANK                               0x00
+#define     V_009910_ADDR_SURF_4_BANK                               0x01
+#define     V_009910_ADDR_SURF_8_BANK                               0x02
+#define     V_009910_ADDR_SURF_16_BANK                              0x03
+#define R_00B020_SPI_SHADER_PGM_LO_PS                                   0x00B020
+#define R_00B024_SPI_SHADER_PGM_HI_PS                                   0x00B024
+#define   S_00B024_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B024_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B024_MEM_BASE                                           0xFFFFFF00
+#define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
+#define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B028_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B028_VGPRS                                              0xFFFFFFC0
+#define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B028_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B028_SGPRS                                              0xFFFFFC3F
+#define   S_00B028_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B028_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B028_PRIORITY                                           0xFFFFF3FF
+#define   S_00B028_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B028_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B028_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B028_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B028_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B028_PRIV                                               0xFFEFFFFF
+#define   S_00B028_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B028_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B028_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B028_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B028_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B028_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B028_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B028_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B028_IEEE_MODE                                          0xFF7FFFFF
+#define   S_00B028_CU_GROUP_DISABLE(x)                                (((x) & 0x1) << 24)
+#define   G_00B028_CU_GROUP_DISABLE(x)                                (((x) >> 24) & 0x1)
+#define   C_00B028_CU_GROUP_DISABLE                                   0xFEFFFFFF
+#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
+#define   S_00B02C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B02C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B02C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B02C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B02C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B02C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B02C_WAVE_CNT_EN(x)                                     (((x) & 0x1) << 7)
+#define   G_00B02C_WAVE_CNT_EN(x)                                     (((x) >> 7) & 0x1)
+#define   C_00B02C_WAVE_CNT_EN                                        0xFFFFFF7F
+#define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
+#define   G_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) >> 8) & 0xFF)
+#define   C_00B02C_EXTRA_LDS_SIZE                                     0xFFFF00FF
+#define   S_00B02C_EXCP_EN(x)                                         (((x) & 0x7F) << 16)
+#define   G_00B02C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F)
+#define   C_00B02C_EXCP_EN                                            0xFF80FFFF
+#define R_00B030_SPI_SHADER_USER_DATA_PS_0                              0x00B030
+#define R_00B034_SPI_SHADER_USER_DATA_PS_1                              0x00B034
+#define R_00B038_SPI_SHADER_USER_DATA_PS_2                              0x00B038
+#define R_00B03C_SPI_SHADER_USER_DATA_PS_3                              0x00B03C
+#define R_00B040_SPI_SHADER_USER_DATA_PS_4                              0x00B040
+#define R_00B044_SPI_SHADER_USER_DATA_PS_5                              0x00B044
+#define R_00B048_SPI_SHADER_USER_DATA_PS_6                              0x00B048
+#define R_00B04C_SPI_SHADER_USER_DATA_PS_7                              0x00B04C
+#define R_00B050_SPI_SHADER_USER_DATA_PS_8                              0x00B050
+#define R_00B054_SPI_SHADER_USER_DATA_PS_9                              0x00B054
+#define R_00B058_SPI_SHADER_USER_DATA_PS_10                             0x00B058
+#define R_00B05C_SPI_SHADER_USER_DATA_PS_11                             0x00B05C
+#define R_00B060_SPI_SHADER_USER_DATA_PS_12                             0x00B060
+#define R_00B064_SPI_SHADER_USER_DATA_PS_13                             0x00B064
+#define R_00B068_SPI_SHADER_USER_DATA_PS_14                             0x00B068
+#define R_00B06C_SPI_SHADER_USER_DATA_PS_15                             0x00B06C
+#define R_00B120_SPI_SHADER_PGM_LO_VS                                   0x00B120
+#define R_00B124_SPI_SHADER_PGM_HI_VS                                   0x00B124
+#define   S_00B124_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B124_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B124_MEM_BASE                                           0xFFFFFF00
+#define R_00B128_SPI_SHADER_PGM_RSRC1_VS                                0x00B128
+#define   S_00B128_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B128_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B128_VGPRS                                              0xFFFFFFC0
+#define   S_00B128_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B128_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B128_SGPRS                                              0xFFFFFC3F
+#define   S_00B128_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B128_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B128_PRIORITY                                           0xFFFFF3FF
+#define   S_00B128_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B128_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B128_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B128_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B128_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B128_PRIV                                               0xFFEFFFFF
+#define   S_00B128_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B128_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B128_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B128_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B128_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B128_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B128_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B128_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B128_IEEE_MODE                                          0xFF7FFFFF
+#define   S_00B128_VGPR_COMP_CNT(x)                                   (((x) & 0x03) << 24)
+#define   G_00B128_VGPR_COMP_CNT(x)                                   (((x) >> 24) & 0x03)
+#define   C_00B128_VGPR_COMP_CNT                                      0xFCFFFFFF
+#define   S_00B128_CU_GROUP_ENABLE(x)                                 (((x) & 0x1) << 26)
+#define   G_00B128_CU_GROUP_ENABLE(x)                                 (((x) >> 26) & 0x1)
+#define   C_00B128_CU_GROUP_ENABLE                                    0xFBFFFFFF
+#define R_00B12C_SPI_SHADER_PGM_RSRC2_VS                                0x00B12C
+#define   S_00B12C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B12C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B12C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B12C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B12C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B12C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B12C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
+#define   G_00B12C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
+#define   C_00B12C_OC_LDS_EN                                          0xFFFFFF7F
+#define   S_00B12C_SO_BASE0_EN(x)                                     (((x) & 0x1) << 8)
+#define   G_00B12C_SO_BASE0_EN(x)                                     (((x) >> 8) & 0x1)
+#define   C_00B12C_SO_BASE0_EN                                        0xFFFFFEFF
+#define   S_00B12C_SO_BASE1_EN(x)                                     (((x) & 0x1) << 9)
+#define   G_00B12C_SO_BASE1_EN(x)                                     (((x) >> 9) & 0x1)
+#define   C_00B12C_SO_BASE1_EN                                        0xFFFFFDFF
+#define   S_00B12C_SO_BASE2_EN(x)                                     (((x) & 0x1) << 10)
+#define   G_00B12C_SO_BASE2_EN(x)                                     (((x) >> 10) & 0x1)
+#define   C_00B12C_SO_BASE2_EN                                        0xFFFFFBFF
+#define   S_00B12C_SO_BASE3_EN(x)                                     (((x) & 0x1) << 11)
+#define   G_00B12C_SO_BASE3_EN(x)                                     (((x) >> 11) & 0x1)
+#define   C_00B12C_SO_BASE3_EN                                        0xFFFFF7FF
+#define   S_00B12C_SO_EN(x)                                           (((x) & 0x1) << 12)
+#define   G_00B12C_SO_EN(x)                                           (((x) >> 12) & 0x1)
+#define   C_00B12C_SO_EN                                              0xFFFFEFFF
+#define   S_00B12C_EXCP_EN(x)                                         (((x) & 0x7F) << 13)
+#define   G_00B12C_EXCP_EN(x)                                         (((x) >> 13) & 0x7F)
+#define   C_00B12C_EXCP_EN                                            0xFFF01FFF
+#define R_00B130_SPI_SHADER_USER_DATA_VS_0                              0x00B130
+#define R_00B134_SPI_SHADER_USER_DATA_VS_1                              0x00B134
+#define R_00B138_SPI_SHADER_USER_DATA_VS_2                              0x00B138
+#define R_00B13C_SPI_SHADER_USER_DATA_VS_3                              0x00B13C
+#define R_00B140_SPI_SHADER_USER_DATA_VS_4                              0x00B140
+#define R_00B144_SPI_SHADER_USER_DATA_VS_5                              0x00B144
+#define R_00B148_SPI_SHADER_USER_DATA_VS_6                              0x00B148
+#define R_00B14C_SPI_SHADER_USER_DATA_VS_7                              0x00B14C
+#define R_00B150_SPI_SHADER_USER_DATA_VS_8                              0x00B150
+#define R_00B154_SPI_SHADER_USER_DATA_VS_9                              0x00B154
+#define R_00B158_SPI_SHADER_USER_DATA_VS_10                             0x00B158
+#define R_00B15C_SPI_SHADER_USER_DATA_VS_11                             0x00B15C
+#define R_00B160_SPI_SHADER_USER_DATA_VS_12                             0x00B160
+#define R_00B164_SPI_SHADER_USER_DATA_VS_13                             0x00B164
+#define R_00B168_SPI_SHADER_USER_DATA_VS_14                             0x00B168
+#define R_00B16C_SPI_SHADER_USER_DATA_VS_15                             0x00B16C
+#define R_00B220_SPI_SHADER_PGM_LO_GS                                   0x00B220
+#define R_00B224_SPI_SHADER_PGM_HI_GS                                   0x00B224
+#define   S_00B224_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B224_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B224_MEM_BASE                                           0xFFFFFF00
+#define R_00B228_SPI_SHADER_PGM_RSRC1_GS                                0x00B228
+#define   S_00B228_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B228_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B228_VGPRS                                              0xFFFFFFC0
+#define   S_00B228_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B228_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B228_SGPRS                                              0xFFFFFC3F
+#define   S_00B228_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B228_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B228_PRIORITY                                           0xFFFFF3FF
+#define   S_00B228_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B228_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B228_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B228_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B228_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B228_PRIV                                               0xFFEFFFFF
+#define   S_00B228_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B228_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B228_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B228_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B228_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B228_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B228_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B228_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B228_IEEE_MODE                                          0xFF7FFFFF
+#define   S_00B228_CU_GROUP_ENABLE(x)                                 (((x) & 0x1) << 24)
+#define   G_00B228_CU_GROUP_ENABLE(x)                                 (((x) >> 24) & 0x1)
+#define   C_00B228_CU_GROUP_ENABLE                                    0xFEFFFFFF
+#define R_00B22C_SPI_SHADER_PGM_RSRC2_GS                                0x00B22C
+#define   S_00B22C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B22C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B22C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B22C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B22C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B22C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B22C_EXCP_EN(x)                                         (((x) & 0x7F) << 7)
+#define   G_00B22C_EXCP_EN(x)                                         (((x) >> 7) & 0x7F)
+#define   C_00B22C_EXCP_EN                                            0xFFFFC07F
+#define R_00B230_SPI_SHADER_USER_DATA_GS_0                              0x00B230
+#define R_00B320_SPI_SHADER_PGM_LO_ES                                   0x00B320
+#define R_00B324_SPI_SHADER_PGM_HI_ES                                   0x00B324
+#define   S_00B324_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B324_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B324_MEM_BASE                                           0xFFFFFF00
+#define R_00B328_SPI_SHADER_PGM_RSRC1_ES                                0x00B328
+#define   S_00B328_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B328_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B328_VGPRS                                              0xFFFFFFC0
+#define   S_00B328_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B328_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B328_SGPRS                                              0xFFFFFC3F
+#define   S_00B328_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B328_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B328_PRIORITY                                           0xFFFFF3FF
+#define   S_00B328_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B328_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B328_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B328_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B328_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B328_PRIV                                               0xFFEFFFFF
+#define   S_00B328_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B328_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B328_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B328_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B328_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B328_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B328_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B328_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B328_IEEE_MODE                                          0xFF7FFFFF
+#define   S_00B328_VGPR_COMP_CNT(x)                                   (((x) & 0x03) << 24)
+#define   G_00B328_VGPR_COMP_CNT(x)                                   (((x) >> 24) & 0x03)
+#define   C_00B328_VGPR_COMP_CNT                                      0xFCFFFFFF
+#define   S_00B328_CU_GROUP_ENABLE(x)                                 (((x) & 0x1) << 26)
+#define   G_00B328_CU_GROUP_ENABLE(x)                                 (((x) >> 26) & 0x1)
+#define   C_00B328_CU_GROUP_ENABLE                                    0xFBFFFFFF
+#define R_00B32C_SPI_SHADER_PGM_RSRC2_ES                                0x00B32C
+#define   S_00B32C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B32C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B32C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B32C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B32C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B32C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B32C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
+#define   G_00B32C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
+#define   C_00B32C_OC_LDS_EN                                          0xFFFFFF7F
+#define   S_00B32C_EXCP_EN(x)                                         (((x) & 0x7F) << 8)
+#define   G_00B32C_EXCP_EN(x)                                         (((x) >> 8) & 0x7F)
+#define   C_00B32C_EXCP_EN                                            0xFFFF80FF
+#define R_00B330_SPI_SHADER_USER_DATA_ES_0                              0x00B330
+#define R_00B420_SPI_SHADER_PGM_LO_HS                                   0x00B420
+#define R_00B424_SPI_SHADER_PGM_HI_HS                                   0x00B424
+#define   S_00B424_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B424_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B424_MEM_BASE                                           0xFFFFFF00
+#define R_00B428_SPI_SHADER_PGM_RSRC1_HS                                0x00B428
+#define   S_00B428_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B428_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B428_VGPRS                                              0xFFFFFFC0
+#define   S_00B428_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B428_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B428_SGPRS                                              0xFFFFFC3F
+#define   S_00B428_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B428_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B428_PRIORITY                                           0xFFFFF3FF
+#define   S_00B428_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B428_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B428_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B428_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B428_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B428_PRIV                                               0xFFEFFFFF
+#define   S_00B428_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B428_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B428_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B428_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B428_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B428_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B428_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B428_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B428_IEEE_MODE                                          0xFF7FFFFF
+#define R_00B42C_SPI_SHADER_PGM_RSRC2_HS                                0x00B42C
+#define   S_00B42C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B42C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B42C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B42C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B42C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B42C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B42C_OC_LDS_EN(x)                                       (((x) & 0x1) << 7)
+#define   G_00B42C_OC_LDS_EN(x)                                       (((x) >> 7) & 0x1)
+#define   C_00B42C_OC_LDS_EN                                          0xFFFFFF7F
+#define   S_00B42C_TG_SIZE_EN(x)                                      (((x) & 0x1) << 8)
+#define   G_00B42C_TG_SIZE_EN(x)                                      (((x) >> 8) & 0x1)
+#define   C_00B42C_TG_SIZE_EN                                         0xFFFFFEFF
+#define   S_00B42C_EXCP_EN(x)                                         (((x) & 0x7F) << 9)
+#define   G_00B42C_EXCP_EN(x)                                         (((x) >> 9) & 0x7F)
+#define   C_00B42C_EXCP_EN                                            0xFFFF01FF
+#define R_00B430_SPI_SHADER_USER_DATA_HS_0                              0x00B430
+#define R_00B520_SPI_SHADER_PGM_LO_LS                                   0x00B520
+#define R_00B524_SPI_SHADER_PGM_HI_LS                                   0x00B524
+#define   S_00B524_MEM_BASE(x)                                        (((x) & 0xFF) << 0)
+#define   G_00B524_MEM_BASE(x)                                        (((x) >> 0) & 0xFF)
+#define   C_00B524_MEM_BASE                                           0xFFFFFF00
+#define R_00B528_SPI_SHADER_PGM_RSRC1_LS                                0x00B528
+#define   S_00B528_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B528_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B528_VGPRS                                              0xFFFFFFC0
+#define   S_00B528_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B528_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B528_SGPRS                                              0xFFFFFC3F
+#define   S_00B528_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B528_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B528_PRIORITY                                           0xFFFFF3FF
+#define   S_00B528_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B528_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B528_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B528_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B528_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B528_PRIV                                               0xFFEFFFFF
+#define   S_00B528_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B528_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B528_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B528_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B528_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B528_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B528_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B528_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B528_IEEE_MODE                                          0xFF7FFFFF
+#define   S_00B528_VGPR_COMP_CNT(x)                                   (((x) & 0x03) << 24)
+#define   G_00B528_VGPR_COMP_CNT(x)                                   (((x) >> 24) & 0x03)
+#define   C_00B528_VGPR_COMP_CNT                                      0xFCFFFFFF
+#define R_00B52C_SPI_SHADER_PGM_RSRC2_LS                                0x00B52C
+#define   S_00B52C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B52C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B52C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B52C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B52C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B52C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B52C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 7)
+#define   G_00B52C_LDS_SIZE(x)                                        (((x) >> 7) & 0x1FF)
+#define   C_00B52C_LDS_SIZE                                           0xFFFF007F
+#define   S_00B52C_EXCP_EN(x)                                         (((x) & 0x7F) << 16)
+#define   G_00B52C_EXCP_EN(x)                                         (((x) >> 16) & 0x7F)
+#define   C_00B52C_EXCP_EN                                            0xFF80FFFF
+#define R_00B530_SPI_SHADER_USER_DATA_LS_0                              0x00B530
+#define R_00B800_COMPUTE_DISPATCH_INITIATOR                             0x00B800
+#define   S_00B800_COMPUTE_SHADER_EN(x)                               (((x) & 0x1) << 0)
+#define   G_00B800_COMPUTE_SHADER_EN(x)                               (((x) >> 0) & 0x1)
+#define   C_00B800_COMPUTE_SHADER_EN                                  0xFFFFFFFE
+#define   S_00B800_PARTIAL_TG_EN(x)                                   (((x) & 0x1) << 1)
+#define   G_00B800_PARTIAL_TG_EN(x)                                   (((x) >> 1) & 0x1)
+#define   C_00B800_PARTIAL_TG_EN                                      0xFFFFFFFD
+#define   S_00B800_FORCE_START_AT_000(x)                              (((x) & 0x1) << 2)
+#define   G_00B800_FORCE_START_AT_000(x)                              (((x) >> 2) & 0x1)
+#define   C_00B800_FORCE_START_AT_000                                 0xFFFFFFFB
+#define   S_00B800_ORDERED_APPEND_ENBL(x)                             (((x) & 0x1) << 3)
+#define   G_00B800_ORDERED_APPEND_ENBL(x)                             (((x) >> 3) & 0x1)
+#define   C_00B800_ORDERED_APPEND_ENBL                                0xFFFFFFF7
+#define R_00B804_COMPUTE_DIM_X                                          0x00B804
+#define R_00B808_COMPUTE_DIM_Y                                          0x00B808
+#define R_00B80C_COMPUTE_DIM_Z                                          0x00B80C
+#define R_00B810_COMPUTE_START_X                                        0x00B810
+#define R_00B814_COMPUTE_START_Y                                        0x00B814
+#define R_00B818_COMPUTE_START_Z                                        0x00B818
+#define R_00B81C_COMPUTE_NUM_THREAD_X                                   0x00B81C
+#define   S_00B81C_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
+#define   G_00B81C_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
+#define   C_00B81C_NUM_THREAD_FULL                                    0xFFFF0000
+#define   S_00B81C_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
+#define   G_00B81C_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
+#define   C_00B81C_NUM_THREAD_PARTIAL                                 0x0000FFFF
+#define R_00B820_COMPUTE_NUM_THREAD_Y                                   0x00B820
+#define   S_00B820_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
+#define   G_00B820_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
+#define   C_00B820_NUM_THREAD_FULL                                    0xFFFF0000
+#define   S_00B820_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
+#define   G_00B820_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
+#define   C_00B820_NUM_THREAD_PARTIAL                                 0x0000FFFF
+#define R_00B824_COMPUTE_NUM_THREAD_Z                                   0x00B824
+#define   S_00B824_NUM_THREAD_FULL(x)                                 (((x) & 0xFFFF) << 0)
+#define   G_00B824_NUM_THREAD_FULL(x)                                 (((x) >> 0) & 0xFFFF)
+#define   C_00B824_NUM_THREAD_FULL                                    0xFFFF0000
+#define   S_00B824_NUM_THREAD_PARTIAL(x)                              (((x) & 0xFFFF) << 16)
+#define   G_00B824_NUM_THREAD_PARTIAL(x)                              (((x) >> 16) & 0xFFFF)
+#define   C_00B824_NUM_THREAD_PARTIAL                                 0x0000FFFF
+#define R_00B82C_COMPUTE_MAX_WAVE_ID                                    0x00B82C
+#define   S_00B82C_MAX_WAVE_ID(x)                                     (((x) & 0xFFF) << 0)
+#define   G_00B82C_MAX_WAVE_ID(x)                                     (((x) >> 0) & 0xFFF)
+#define   C_00B82C_MAX_WAVE_ID                                        0xFFFFF000
+#define R_00B830_COMPUTE_PGM_LO                                         0x00B830
+#define R_00B834_COMPUTE_PGM_HI                                         0x00B834
+#define   S_00B834_DATA(x)                                            (((x) & 0xFF) << 0)
+#define   G_00B834_DATA(x)                                            (((x) >> 0) & 0xFF)
+#define   C_00B834_DATA                                               0xFFFFFF00
+#define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
+#define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
+#define   G_00B848_VGPRS(x)                                           (((x) >> 0) & 0x3F)
+#define   C_00B848_VGPRS                                              0xFFFFFFC0
+#define   S_00B848_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define   G_00B848_SGPRS(x)                                           (((x) >> 6) & 0x0F)
+#define   C_00B848_SGPRS                                              0xFFFFFC3F
+#define   S_00B848_PRIORITY(x)                                        (((x) & 0x03) << 10)
+#define   G_00B848_PRIORITY(x)                                        (((x) >> 10) & 0x03)
+#define   C_00B848_PRIORITY                                           0xFFFFF3FF
+#define   S_00B848_FLOAT_MODE(x)                                      (((x) & 0xFF) << 12)
+#define   G_00B848_FLOAT_MODE(x)                                      (((x) >> 12) & 0xFF)
+#define   C_00B848_FLOAT_MODE                                         0xFFF00FFF
+#define   S_00B848_PRIV(x)                                            (((x) & 0x1) << 20)
+#define   G_00B848_PRIV(x)                                            (((x) >> 20) & 0x1)
+#define   C_00B848_PRIV                                               0xFFEFFFFF
+#define   S_00B848_DX10_CLAMP(x)                                      (((x) & 0x1) << 21)
+#define   G_00B848_DX10_CLAMP(x)                                      (((x) >> 21) & 0x1)
+#define   C_00B848_DX10_CLAMP                                         0xFFDFFFFF
+#define   S_00B848_DEBUG_MODE(x)                                      (((x) & 0x1) << 22)
+#define   G_00B848_DEBUG_MODE(x)                                      (((x) >> 22) & 0x1)
+#define   C_00B848_DEBUG_MODE                                         0xFFBFFFFF
+#define   S_00B848_IEEE_MODE(x)                                       (((x) & 0x1) << 23)
+#define   G_00B848_IEEE_MODE(x)                                       (((x) >> 23) & 0x1)
+#define   C_00B848_IEEE_MODE                                          0xFF7FFFFF
+#define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
+#define   S_00B84C_SCRATCH_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_00B84C_SCRATCH_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_00B84C_SCRATCH_EN                                         0xFFFFFFFE
+#define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
+#define   G_00B84C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
+#define   C_00B84C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
+#define   G_00B84C_TGID_X_EN(x)                                       (((x) >> 7) & 0x1)
+#define   C_00B84C_TGID_X_EN                                          0xFFFFFF7F
+#define   S_00B84C_TGID_Y_EN(x)                                       (((x) & 0x1) << 8)
+#define   G_00B84C_TGID_Y_EN(x)                                       (((x) >> 8) & 0x1)
+#define   C_00B84C_TGID_Y_EN                                          0xFFFFFEFF
+#define   S_00B84C_TGID_Z_EN(x)                                       (((x) & 0x1) << 9)
+#define   G_00B84C_TGID_Z_EN(x)                                       (((x) >> 9) & 0x1)
+#define   C_00B84C_TGID_Z_EN                                          0xFFFFFDFF
+#define   S_00B84C_TG_SIZE_EN(x)                                      (((x) & 0x1) << 10)
+#define   G_00B84C_TG_SIZE_EN(x)                                      (((x) >> 10) & 0x1)
+#define   C_00B84C_TG_SIZE_EN                                         0xFFFFFBFF
+#define   S_00B84C_TIDIG_COMP_CNT(x)                                  (((x) & 0x03) << 11)
+#define   G_00B84C_TIDIG_COMP_CNT(x)                                  (((x) >> 11) & 0x03)
+#define   C_00B84C_TIDIG_COMP_CNT                                     0xFFFFE7FF
+#define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
+#define   G_00B84C_LDS_SIZE(x)                                        (((x) >> 15) & 0x1FF)
+#define   C_00B84C_LDS_SIZE                                           0xFF007FFF
+#define   S_00B84C_EXCP_EN(x)                                         (((x) & 0x7F) << 24)
+#define   G_00B84C_EXCP_EN(x)                                         (((x) >> 24) & 0x7F)
+#define   C_00B84C_EXCP_EN                                            0x80FFFFFF
+#define R_00B854_COMPUTE_RESOURCE_LIMITS                                0x00B854
+#define   S_00B854_WAVES_PER_SH(x)                                    (((x) & 0x3F) << 0)
+#define   G_00B854_WAVES_PER_SH(x)                                    (((x) >> 0) & 0x3F)
+#define   C_00B854_WAVES_PER_SH                                       0xFFFFFFC0
+#define   S_00B854_TG_PER_CU(x)                                       (((x) & 0x0F) << 12)
+#define   G_00B854_TG_PER_CU(x)                                       (((x) >> 12) & 0x0F)
+#define   C_00B854_TG_PER_CU                                          0xFFFF0FFF
+#define   S_00B854_LOCK_THRESHOLD(x)                                  (((x) & 0x3F) << 16)
+#define   G_00B854_LOCK_THRESHOLD(x)                                  (((x) >> 16) & 0x3F)
+#define   C_00B854_LOCK_THRESHOLD                                     0xFFC0FFFF
+#define   S_00B854_SIMD_DEST_CNTL(x)                                  (((x) & 0x1) << 22)
+#define   G_00B854_SIMD_DEST_CNTL(x)                                  (((x) >> 22) & 0x1)
+#define   C_00B854_SIMD_DEST_CNTL                                     0xFFBFFFFF
+#define R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0                         0x00B858
+#define   S_00B858_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0)
+#define   G_00B858_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF)
+#define   C_00B858_SH0_CU_EN                                          0xFFFF0000
+#define   S_00B858_SH1_CU_EN(x)                                       (((x) & 0xFFFF) << 16)
+#define   G_00B858_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF)
+#define   C_00B858_SH1_CU_EN                                          0x0000FFFF
+#define R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1                         0x00B85C
+#define   S_00B85C_SH0_CU_EN(x)                                       (((x) & 0xFFFF) << 0)
+#define   G_00B85C_SH0_CU_EN(x)                                       (((x) >> 0) & 0xFFFF)
+#define   C_00B85C_SH0_CU_EN                                          0xFFFF0000
+#define   S_00B85C_SH1_CU_EN(x)                                       (((x) & 0xFFFF) << 16)
+#define   G_00B85C_SH1_CU_EN(x)                                       (((x) >> 16) & 0xFFFF)
+#define   C_00B85C_SH1_CU_EN                                          0x0000FFFF
+#define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
+#define   S_00B860_WAVES(x)                                           (((x) & 0xFFF) << 0)
+#define   G_00B860_WAVES(x)                                           (((x) >> 0) & 0xFFF)
+#define   C_00B860_WAVES                                              0xFFFFF000
+#define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+#define   G_00B860_WAVESIZE(x)                                        (((x) >> 12) & 0x1FFF)
+#define   C_00B860_WAVESIZE                                           0xFE000FFF
+#define R_00B900_COMPUTE_USER_DATA_0                                    0x00B900
+#define R_028000_DB_RENDER_CONTROL                                      0x028000
+#define   S_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) & 0x1) << 0)
+#define   G_028000_DEPTH_CLEAR_ENABLE(x)                              (((x) >> 0) & 0x1)
+#define   C_028000_DEPTH_CLEAR_ENABLE                                 0xFFFFFFFE
+#define   S_028000_STENCIL_CLEAR_ENABLE(x)                            (((x) & 0x1) << 1)
+#define   G_028000_STENCIL_CLEAR_ENABLE(x)                            (((x) >> 1) & 0x1)
+#define   C_028000_STENCIL_CLEAR_ENABLE                               0xFFFFFFFD
+#define   S_028000_DEPTH_COPY(x)                                      (((x) & 0x1) << 2)
+#define   G_028000_DEPTH_COPY(x)                                      (((x) >> 2) & 0x1)
+#define   C_028000_DEPTH_COPY                                         0xFFFFFFFB
+#define   S_028000_STENCIL_COPY(x)                                    (((x) & 0x1) << 3)
+#define   G_028000_STENCIL_COPY(x)                                    (((x) >> 3) & 0x1)
+#define   C_028000_STENCIL_COPY                                       0xFFFFFFF7
+#define   S_028000_RESUMMARIZE_ENABLE(x)                              (((x) & 0x1) << 4)
+#define   G_028000_RESUMMARIZE_ENABLE(x)                              (((x) >> 4) & 0x1)
+#define   C_028000_RESUMMARIZE_ENABLE                                 0xFFFFFFEF
+#define   S_028000_STENCIL_COMPRESS_DISABLE(x)                        (((x) & 0x1) << 5)
+#define   G_028000_STENCIL_COMPRESS_DISABLE(x)                        (((x) >> 5) & 0x1)
+#define   C_028000_STENCIL_COMPRESS_DISABLE                           0xFFFFFFDF
+#define   S_028000_DEPTH_COMPRESS_DISABLE(x)                          (((x) & 0x1) << 6)
+#define   G_028000_DEPTH_COMPRESS_DISABLE(x)                          (((x) >> 6) & 0x1)
+#define   C_028000_DEPTH_COMPRESS_DISABLE                             0xFFFFFFBF
+#define   S_028000_COPY_CENTROID(x)                                   (((x) & 0x1) << 7)
+#define   G_028000_COPY_CENTROID(x)                                   (((x) >> 7) & 0x1)
+#define   C_028000_COPY_CENTROID                                      0xFFFFFF7F
+#define   S_028000_COPY_SAMPLE(x)                                     (((x) & 0x0F) << 8)
+#define   G_028000_COPY_SAMPLE(x)                                     (((x) >> 8) & 0x0F)
+#define   C_028000_COPY_SAMPLE                                        0xFFFFF0FF
+#define R_028004_DB_COUNT_CONTROL                                       0x028004
+#define   S_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) & 0x1) << 0)
+#define   G_028004_ZPASS_INCREMENT_DISABLE(x)                         (((x) >> 0) & 0x1)
+#define   C_028004_ZPASS_INCREMENT_DISABLE                            0xFFFFFFFE
+#define   S_028004_PERFECT_ZPASS_COUNTS(x)                            (((x) & 0x1) << 1)
+#define   G_028004_PERFECT_ZPASS_COUNTS(x)                            (((x) >> 1) & 0x1)
+#define   C_028004_PERFECT_ZPASS_COUNTS                               0xFFFFFFFD
+#define   S_028004_SAMPLE_RATE(x)                                     (((x) & 0x07) << 4)
+#define   G_028004_SAMPLE_RATE(x)                                     (((x) >> 4) & 0x07)
+#define   C_028004_SAMPLE_RATE                                        0xFFFFFF8F
+#define R_028008_DB_DEPTH_VIEW                                          0x028008
+#define   S_028008_SLICE_START(x)                                     (((x) & 0x7FF) << 0)
+#define   G_028008_SLICE_START(x)                                     (((x) >> 0) & 0x7FF)
+#define   C_028008_SLICE_START                                        0xFFFFF800
+#define   S_028008_SLICE_MAX(x)                                       (((x) & 0x7FF) << 13)
+#define   G_028008_SLICE_MAX(x)                                       (((x) >> 13) & 0x7FF)
+#define   C_028008_SLICE_MAX                                          0xFF001FFF
+#define   S_028008_Z_READ_ONLY(x)                                     (((x) & 0x1) << 24)
+#define   G_028008_Z_READ_ONLY(x)                                     (((x) >> 24) & 0x1)
+#define   C_028008_Z_READ_ONLY                                        0xFEFFFFFF
+#define   S_028008_STENCIL_READ_ONLY(x)                               (((x) & 0x1) << 25)
+#define   G_028008_STENCIL_READ_ONLY(x)                               (((x) >> 25) & 0x1)
+#define   C_028008_STENCIL_READ_ONLY                                  0xFDFFFFFF
+#define R_02800C_DB_RENDER_OVERRIDE                                     0x02800C
+#define   S_02800C_FORCE_HIZ_ENABLE(x)                                (((x) & 0x03) << 0)
+#define   G_02800C_FORCE_HIZ_ENABLE(x)                                (((x) >> 0) & 0x03)
+#define   C_02800C_FORCE_HIZ_ENABLE                                   0xFFFFFFFC
+#define     V_02800C_FORCE_OFF                                      0x00
+#define     V_02800C_FORCE_ENABLE                                   0x01
+#define     V_02800C_FORCE_DISABLE                                  0x02
+#define     V_02800C_FORCE_RESERVED                                 0x03
+#define   S_02800C_FORCE_HIS_ENABLE0(x)                               (((x) & 0x03) << 2)
+#define   G_02800C_FORCE_HIS_ENABLE0(x)                               (((x) >> 2) & 0x03)
+#define   C_02800C_FORCE_HIS_ENABLE0                                  0xFFFFFFF3
+#define     V_02800C_FORCE_OFF                                      0x00
+#define     V_02800C_FORCE_ENABLE                                   0x01
+#define     V_02800C_FORCE_DISABLE                                  0x02
+#define     V_02800C_FORCE_RESERVED                                 0x03
+#define   S_02800C_FORCE_HIS_ENABLE1(x)                               (((x) & 0x03) << 4)
+#define   G_02800C_FORCE_HIS_ENABLE1(x)                               (((x) >> 4) & 0x03)
+#define   C_02800C_FORCE_HIS_ENABLE1                                  0xFFFFFFCF
+#define     V_02800C_FORCE_OFF                                      0x00
+#define     V_02800C_FORCE_ENABLE                                   0x01
+#define     V_02800C_FORCE_DISABLE                                  0x02
+#define     V_02800C_FORCE_RESERVED                                 0x03
+#define   S_02800C_FORCE_SHADER_Z_ORDER(x)                            (((x) & 0x1) << 6)
+#define   G_02800C_FORCE_SHADER_Z_ORDER(x)                            (((x) >> 6) & 0x1)
+#define   C_02800C_FORCE_SHADER_Z_ORDER                               0xFFFFFFBF
+#define   S_02800C_FAST_Z_DISABLE(x)                                  (((x) & 0x1) << 7)
+#define   G_02800C_FAST_Z_DISABLE(x)                                  (((x) >> 7) & 0x1)
+#define   C_02800C_FAST_Z_DISABLE                                     0xFFFFFF7F
+#define   S_02800C_FAST_STENCIL_DISABLE(x)                            (((x) & 0x1) << 8)
+#define   G_02800C_FAST_STENCIL_DISABLE(x)                            (((x) >> 8) & 0x1)
+#define   C_02800C_FAST_STENCIL_DISABLE                               0xFFFFFEFF
+#define   S_02800C_NOOP_CULL_DISABLE(x)                               (((x) & 0x1) << 9)
+#define   G_02800C_NOOP_CULL_DISABLE(x)                               (((x) >> 9) & 0x1)
+#define   C_02800C_NOOP_CULL_DISABLE                                  0xFFFFFDFF
+#define   S_02800C_FORCE_COLOR_KILL(x)                                (((x) & 0x1) << 10)
+#define   G_02800C_FORCE_COLOR_KILL(x)                                (((x) >> 10) & 0x1)
+#define   C_02800C_FORCE_COLOR_KILL                                   0xFFFFFBFF
+#define   S_02800C_FORCE_Z_READ(x)                                    (((x) & 0x1) << 11)
+#define   G_02800C_FORCE_Z_READ(x)                                    (((x) >> 11) & 0x1)
+#define   C_02800C_FORCE_Z_READ                                       0xFFFFF7FF
+#define   S_02800C_FORCE_STENCIL_READ(x)                              (((x) & 0x1) << 12)
+#define   G_02800C_FORCE_STENCIL_READ(x)                              (((x) >> 12) & 0x1)
+#define   C_02800C_FORCE_STENCIL_READ                                 0xFFFFEFFF
+#define   S_02800C_FORCE_FULL_Z_RANGE(x)                              (((x) & 0x03) << 13)
+#define   G_02800C_FORCE_FULL_Z_RANGE(x)                              (((x) >> 13) & 0x03)
+#define   C_02800C_FORCE_FULL_Z_RANGE                                 0xFFFF9FFF
+#define     V_02800C_FORCE_OFF                                      0x00
+#define     V_02800C_FORCE_ENABLE                                   0x01
+#define     V_02800C_FORCE_DISABLE                                  0x02
+#define     V_02800C_FORCE_RESERVED                                 0x03
+#define   S_02800C_FORCE_QC_SMASK_CONFLICT(x)                         (((x) & 0x1) << 15)
+#define   G_02800C_FORCE_QC_SMASK_CONFLICT(x)                         (((x) >> 15) & 0x1)
+#define   C_02800C_FORCE_QC_SMASK_CONFLICT                            0xFFFF7FFF
+#define   S_02800C_DISABLE_VIEWPORT_CLAMP(x)                          (((x) & 0x1) << 16)
+#define   G_02800C_DISABLE_VIEWPORT_CLAMP(x)                          (((x) >> 16) & 0x1)
+#define   C_02800C_DISABLE_VIEWPORT_CLAMP                             0xFFFEFFFF
+#define   S_02800C_IGNORE_SC_ZRANGE(x)                                (((x) & 0x1) << 17)
+#define   G_02800C_IGNORE_SC_ZRANGE(x)                                (((x) >> 17) & 0x1)
+#define   C_02800C_IGNORE_SC_ZRANGE                                   0xFFFDFFFF
+#define   S_02800C_DISABLE_FULLY_COVERED(x)                           (((x) & 0x1) << 18)
+#define   G_02800C_DISABLE_FULLY_COVERED(x)                           (((x) >> 18) & 0x1)
+#define   C_02800C_DISABLE_FULLY_COVERED                              0xFFFBFFFF
+#define   S_02800C_FORCE_Z_LIMIT_SUMM(x)                              (((x) & 0x03) << 19)
+#define   G_02800C_FORCE_Z_LIMIT_SUMM(x)                              (((x) >> 19) & 0x03)
+#define   C_02800C_FORCE_Z_LIMIT_SUMM                                 0xFFE7FFFF
+#define     V_02800C_FORCE_SUMM_OFF                                 0x00
+#define     V_02800C_FORCE_SUMM_MINZ                                0x01
+#define     V_02800C_FORCE_SUMM_MAXZ                                0x02
+#define     V_02800C_FORCE_SUMM_BOTH                                0x03
+#define   S_02800C_MAX_TILES_IN_DTT(x)                                (((x) & 0x1F) << 21)
+#define   G_02800C_MAX_TILES_IN_DTT(x)                                (((x) >> 21) & 0x1F)
+#define   C_02800C_MAX_TILES_IN_DTT                                   0xFC1FFFFF
+#define   S_02800C_DISABLE_TILE_RATE_TILES(x)                         (((x) & 0x1) << 26)
+#define   G_02800C_DISABLE_TILE_RATE_TILES(x)                         (((x) >> 26) & 0x1)
+#define   C_02800C_DISABLE_TILE_RATE_TILES                            0xFBFFFFFF
+#define   S_02800C_FORCE_Z_DIRTY(x)                                   (((x) & 0x1) << 27)
+#define   G_02800C_FORCE_Z_DIRTY(x)                                   (((x) >> 27) & 0x1)
+#define   C_02800C_FORCE_Z_DIRTY                                      0xF7FFFFFF
+#define   S_02800C_FORCE_STENCIL_DIRTY(x)                             (((x) & 0x1) << 28)
+#define   G_02800C_FORCE_STENCIL_DIRTY(x)                             (((x) >> 28) & 0x1)
+#define   C_02800C_FORCE_STENCIL_DIRTY                                0xEFFFFFFF
+#define   S_02800C_FORCE_Z_VALID(x)                                   (((x) & 0x1) << 29)
+#define   G_02800C_FORCE_Z_VALID(x)                                   (((x) >> 29) & 0x1)
+#define   C_02800C_FORCE_Z_VALID                                      0xDFFFFFFF
+#define   S_02800C_FORCE_STENCIL_VALID(x)                             (((x) & 0x1) << 30)
+#define   G_02800C_FORCE_STENCIL_VALID(x)                             (((x) >> 30) & 0x1)
+#define   C_02800C_FORCE_STENCIL_VALID                                0xBFFFFFFF
+#define   S_02800C_PRESERVE_COMPRESSION(x)                            (((x) & 0x1) << 31)
+#define   G_02800C_PRESERVE_COMPRESSION(x)                            (((x) >> 31) & 0x1)
+#define   C_02800C_PRESERVE_COMPRESSION                               0x7FFFFFFF
+#define R_028010_DB_RENDER_OVERRIDE2                                    0x028010
+#define   S_028010_PARTIAL_SQUAD_LAUNCH_CONTROL(x)                    (((x) & 0x03) << 0)
+#define   G_028010_PARTIAL_SQUAD_LAUNCH_CONTROL(x)                    (((x) >> 0) & 0x03)
+#define   C_028010_PARTIAL_SQUAD_LAUNCH_CONTROL                       0xFFFFFFFC
+#define     V_028010_PSLC_AUTO                                      0x00
+#define     V_028010_PSLC_ON_HANG_ONLY                              0x01
+#define     V_028010_PSLC_ASAP                                      0x02
+#define     V_028010_PSLC_COUNTDOWN                                 0x03
+#define   S_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN(x)                  (((x) & 0x07) << 2)
+#define   G_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN(x)                  (((x) >> 2) & 0x07)
+#define   C_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN                     0xFFFFFFE3
+#define   S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO(x)              (((x) & 0x1) << 5)
+#define   G_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO(x)              (((x) >> 5) & 0x1)
+#define   C_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO                 0xFFFFFFDF
+#define   S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(x)              (((x) & 0x1) << 6)
+#define   G_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(x)              (((x) >> 6) & 0x1)
+#define   C_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION                 0xFFFFFFBF
+#define   S_028010_DISABLE_COLOR_ON_VALIDATION(x)                     (((x) & 0x1) << 7)
+#define   G_028010_DISABLE_COLOR_ON_VALIDATION(x)                     (((x) >> 7) & 0x1)
+#define   C_028010_DISABLE_COLOR_ON_VALIDATION                        0xFFFFFF7F
+#define   S_028010_DECOMPRESS_Z_ON_FLUSH(x)                           (((x) & 0x1) << 8)
+#define   G_028010_DECOMPRESS_Z_ON_FLUSH(x)                           (((x) >> 8) & 0x1)
+#define   C_028010_DECOMPRESS_Z_ON_FLUSH                              0xFFFFFEFF
+#define   S_028010_DISABLE_REG_SNOOP(x)                               (((x) & 0x1) << 9)
+#define   G_028010_DISABLE_REG_SNOOP(x)                               (((x) >> 9) & 0x1)
+#define   C_028010_DISABLE_REG_SNOOP                                  0xFFFFFDFF
+#define   S_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE(x)                 (((x) & 0x1) << 10)
+#define   G_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE(x)                 (((x) >> 10) & 0x1)
+#define   C_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE                    0xFFFFFBFF
+#define R_028014_DB_HTILE_DATA_BASE                                     0x028014
+#define R_028020_DB_DEPTH_BOUNDS_MIN                                    0x028020
+#define R_028024_DB_DEPTH_BOUNDS_MAX                                    0x028024
+#define R_028028_DB_STENCIL_CLEAR                                       0x028028
+#define   S_028028_CLEAR(x)                                           (((x) & 0xFF) << 0)
+#define   G_028028_CLEAR(x)                                           (((x) >> 0) & 0xFF)
+#define   C_028028_CLEAR                                              0xFFFFFF00
+#define R_02802C_DB_DEPTH_CLEAR                                         0x02802C
+#define R_028030_PA_SC_SCREEN_SCISSOR_TL                                0x028030
+#define   S_028030_TL_X(x)                                            (((x) & 0xFFFF) << 0)
+#define   G_028030_TL_X(x)                                            (((x) >> 0) & 0xFFFF)
+#define   C_028030_TL_X                                               0xFFFF0000
+#define   S_028030_TL_Y(x)                                            (((x) & 0xFFFF) << 16)
+#define   G_028030_TL_Y(x)                                            (((x) >> 16) & 0xFFFF)
+#define   C_028030_TL_Y                                               0x0000FFFF
+#define R_028034_PA_SC_SCREEN_SCISSOR_BR                                0x028034
+#define   S_028034_BR_X(x)                                            (((x) & 0xFFFF) << 0)
+#define   G_028034_BR_X(x)                                            (((x) >> 0) & 0xFFFF)
+#define   C_028034_BR_X                                               0xFFFF0000
+#define   S_028034_BR_Y(x)                                            (((x) & 0xFFFF) << 16)
+#define   G_028034_BR_Y(x)                                            (((x) >> 16) & 0xFFFF)
+#define   C_028034_BR_Y                                               0x0000FFFF
+#define R_02803C_DB_DEPTH_INFO                                          0x02803C
+#define   S_02803C_ADDR5_SWIZZLE_MASK(x)                              (((x) & 0x0F) << 0)
+#define   G_02803C_ADDR5_SWIZZLE_MASK(x)                              (((x) >> 0) & 0x0F)
+#define   C_02803C_ADDR5_SWIZZLE_MASK                                 0xFFFFFFF0
+#define R_028040_DB_Z_INFO                                              0x028040
+#define   S_028040_FORMAT(x)                                          (((x) & 0x03) << 0)
+#define   G_028040_FORMAT(x)                                          (((x) >> 0) & 0x03)
+#define   C_028040_FORMAT                                             0xFFFFFFFC
+#define     V_028040_Z_INVALID                                      0x00
+#define     V_028040_Z_16                                           0x01
+#define     V_028040_Z_24                                           0x02 /* deprecated */
+#define     V_028040_Z_32_FLOAT                                     0x03
+#define   S_028040_NUM_SAMPLES(x)                                     (((x) & 0x03) << 2)
+#define   G_028040_NUM_SAMPLES(x)                                     (((x) >> 2) & 0x03)
+#define   C_028040_NUM_SAMPLES                                        0xFFFFFFF3
+#define   S_028040_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20)
+#define   G_028040_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07)
+#define   C_028040_TILE_MODE_INDEX                                    0xFF8FFFFF
+#define   S_028040_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
+#define   G_028040_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
+#define   C_028040_ALLOW_EXPCLEAR                                     0xF7FFFFFF
+#define   S_028040_READ_SIZE(x)                                       (((x) & 0x1) << 28)
+#define   G_028040_READ_SIZE(x)                                       (((x) >> 28) & 0x1)
+#define   C_028040_READ_SIZE                                          0xEFFFFFFF
+#define   S_028040_TILE_SURFACE_ENABLE(x)                             (((x) & 0x1) << 29)
+#define   G_028040_TILE_SURFACE_ENABLE(x)                             (((x) >> 29) & 0x1)
+#define   C_028040_TILE_SURFACE_ENABLE                                0xDFFFFFFF
+#define   S_028040_ZRANGE_PRECISION(x)                                (((x) & 0x1) << 31)
+#define   G_028040_ZRANGE_PRECISION(x)                                (((x) >> 31) & 0x1)
+#define   C_028040_ZRANGE_PRECISION                                   0x7FFFFFFF
+#define R_028044_DB_STENCIL_INFO                                        0x028044
+#define   S_028044_FORMAT(x)                                          (((x) & 0x1) << 0)
+#define   G_028044_FORMAT(x)                                          (((x) >> 0) & 0x1)
+#define   C_028044_FORMAT                                             0xFFFFFFFE
+#define   S_028044_TILE_MODE_INDEX(x)                                 (((x) & 0x07) << 20)
+#define   G_028044_TILE_MODE_INDEX(x)                                 (((x) >> 20) & 0x07)
+#define   C_028044_TILE_MODE_INDEX                                    0xFF8FFFFF
+#define   S_028044_ALLOW_EXPCLEAR(x)                                  (((x) & 0x1) << 27)
+#define   G_028044_ALLOW_EXPCLEAR(x)                                  (((x) >> 27) & 0x1)
+#define   C_028044_ALLOW_EXPCLEAR                                     0xF7FFFFFF
+#define   S_028044_TILE_STENCIL_DISABLE(x)                            (((x) & 0x1) << 29)
+#define   G_028044_TILE_STENCIL_DISABLE(x)                            (((x) >> 29) & 0x1)
+#define   C_028044_TILE_STENCIL_DISABLE                               0xDFFFFFFF
+#define R_028048_DB_Z_READ_BASE                                         0x028048
+#define R_02804C_DB_STENCIL_READ_BASE                                   0x02804C
+#define R_028050_DB_Z_WRITE_BASE                                        0x028050
+#define R_028054_DB_STENCIL_WRITE_BASE                                  0x028054
+#define R_028058_DB_DEPTH_SIZE                                          0x028058
+#define   S_028058_PITCH_TILE_MAX(x)                                  (((x) & 0x7FF) << 0)
+#define   G_028058_PITCH_TILE_MAX(x)                                  (((x) >> 0) & 0x7FF)
+#define   C_028058_PITCH_TILE_MAX                                     0xFFFFF800
+#define   S_028058_HEIGHT_TILE_MAX(x)                                 (((x) & 0x7FF) << 11)
+#define   G_028058_HEIGHT_TILE_MAX(x)                                 (((x) >> 11) & 0x7FF)
+#define   C_028058_HEIGHT_TILE_MAX                                    0xFFC007FF
+#define R_02805C_DB_DEPTH_SLICE                                         0x02805C
+#define   S_02805C_SLICE_TILE_MAX(x)                                  (((x) & 0x3FFFFF) << 0)
+#define   G_02805C_SLICE_TILE_MAX(x)                                  (((x) >> 0) & 0x3FFFFF)
+#define   C_02805C_SLICE_TILE_MAX                                     0xFFC00000
+#define R_028080_TA_BC_BASE_ADDR                                        0x028080
+#define R_028200_PA_SC_WINDOW_OFFSET                                    0x028200
+#define   S_028200_WINDOW_X_OFFSET(x)                                 (((x) & 0xFFFF) << 0)
+#define   G_028200_WINDOW_X_OFFSET(x)                                 (((x) >> 0) & 0xFFFF)
+#define   C_028200_WINDOW_X_OFFSET                                    0xFFFF0000
+#define   S_028200_WINDOW_Y_OFFSET(x)                                 (((x) & 0xFFFF) << 16)
+#define   G_028200_WINDOW_Y_OFFSET(x)                                 (((x) >> 16) & 0xFFFF)
+#define   C_028200_WINDOW_Y_OFFSET                                    0x0000FFFF
+#define R_028204_PA_SC_WINDOW_SCISSOR_TL                                0x028204
+#define   S_028204_TL_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028204_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028204_TL_X                                               0xFFFF8000
+#define   S_028204_TL_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028204_TL_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028204_TL_Y                                               0x8000FFFF
+#define   S_028204_WINDOW_OFFSET_DISABLE(x)                           (((x) & 0x1) << 31)
+#define   G_028204_WINDOW_OFFSET_DISABLE(x)                           (((x) >> 31) & 0x1)
+#define   C_028204_WINDOW_OFFSET_DISABLE                              0x7FFFFFFF
+#define R_028208_PA_SC_WINDOW_SCISSOR_BR                                0x028208
+#define   S_028208_BR_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028208_BR_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028208_BR_X                                               0xFFFF8000
+#define   S_028208_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028208_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028208_BR_Y                                               0x8000FFFF
+#define R_02820C_PA_SC_CLIPRECT_RULE                                    0x02820C
+#define   S_02820C_CLIP_RULE(x)                                       (((x) & 0xFFFF) << 0)
+#define   G_02820C_CLIP_RULE(x)                                       (((x) >> 0) & 0xFFFF)
+#define   C_02820C_CLIP_RULE                                          0xFFFF0000
+#define R_028210_PA_SC_CLIPRECT_0_TL                                    0x028210
+#define   S_028210_TL_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028210_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028210_TL_X                                               0xFFFF8000
+#define   S_028210_TL_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028210_TL_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028210_TL_Y                                               0x8000FFFF
+#define R_028214_PA_SC_CLIPRECT_0_BR                                    0x028214
+#define   S_028214_BR_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028214_BR_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028214_BR_X                                               0xFFFF8000
+#define   S_028214_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028214_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028214_BR_Y                                               0x8000FFFF
+#define R_028218_PA_SC_CLIPRECT_1_TL                                    0x028218
+#define R_02821C_PA_SC_CLIPRECT_1_BR                                    0x02821C
+#define R_028220_PA_SC_CLIPRECT_2_TL                                    0x028220
+#define R_028224_PA_SC_CLIPRECT_2_BR                                    0x028224
+#define R_028228_PA_SC_CLIPRECT_3_TL                                    0x028228
+#define R_02822C_PA_SC_CLIPRECT_3_BR                                    0x02822C
+#define R_028230_PA_SC_EDGERULE                                         0x028230
+#define   S_028230_ER_TRI(x)                                          (((x) & 0x0F) << 0)
+#define   G_028230_ER_TRI(x)                                          (((x) >> 0) & 0x0F)
+#define   C_028230_ER_TRI                                             0xFFFFFFF0
+#define   S_028230_ER_POINT(x)                                        (((x) & 0x0F) << 4)
+#define   G_028230_ER_POINT(x)                                        (((x) >> 4) & 0x0F)
+#define   C_028230_ER_POINT                                           0xFFFFFF0F
+#define   S_028230_ER_RECT(x)                                         (((x) & 0x0F) << 8)
+#define   G_028230_ER_RECT(x)                                         (((x) >> 8) & 0x0F)
+#define   C_028230_ER_RECT                                            0xFFFFF0FF
+#define   S_028230_ER_LINE_LR(x)                                      (((x) & 0x3F) << 12)
+#define   G_028230_ER_LINE_LR(x)                                      (((x) >> 12) & 0x3F)
+#define   C_028230_ER_LINE_LR                                         0xFFFC0FFF
+#define   S_028230_ER_LINE_RL(x)                                      (((x) & 0x3F) << 18)
+#define   G_028230_ER_LINE_RL(x)                                      (((x) >> 18) & 0x3F)
+#define   C_028230_ER_LINE_RL                                         0xFF03FFFF
+#define   S_028230_ER_LINE_TB(x)                                      (((x) & 0x0F) << 24)
+#define   G_028230_ER_LINE_TB(x)                                      (((x) >> 24) & 0x0F)
+#define   C_028230_ER_LINE_TB                                         0xF0FFFFFF
+#define   S_028230_ER_LINE_BT(x)                                      (((x) & 0x0F) << 28)
+#define   G_028230_ER_LINE_BT(x)                                      (((x) >> 28) & 0x0F)
+#define   C_028230_ER_LINE_BT                                         0x0FFFFFFF
+#define R_028234_PA_SU_HARDWARE_SCREEN_OFFSET                           0x028234
+#define   S_028234_HW_SCREEN_OFFSET_X(x)                              (((x) & 0x1FF) << 0)
+#define   G_028234_HW_SCREEN_OFFSET_X(x)                              (((x) >> 0) & 0x1FF)
+#define   C_028234_HW_SCREEN_OFFSET_X                                 0xFFFFFE00
+#define   S_028234_HW_SCREEN_OFFSET_Y(x)                              (((x) & 0x1FF) << 16)
+#define   G_028234_HW_SCREEN_OFFSET_Y(x)                              (((x) >> 16) & 0x1FF)
+#define   C_028234_HW_SCREEN_OFFSET_Y                                 0xFE00FFFF
+#define R_028238_CB_TARGET_MASK                                         0x028238
+#define   S_028238_TARGET0_ENABLE(x)                                  (((x) & 0x0F) << 0)
+#define   G_028238_TARGET0_ENABLE(x)                                  (((x) >> 0) & 0x0F)
+#define   C_028238_TARGET0_ENABLE                                     0xFFFFFFF0
+#define   S_028238_TARGET1_ENABLE(x)                                  (((x) & 0x0F) << 4)
+#define   G_028238_TARGET1_ENABLE(x)                                  (((x) >> 4) & 0x0F)
+#define   C_028238_TARGET1_ENABLE                                     0xFFFFFF0F
+#define   S_028238_TARGET2_ENABLE(x)                                  (((x) & 0x0F) << 8)
+#define   G_028238_TARGET2_ENABLE(x)                                  (((x) >> 8) & 0x0F)
+#define   C_028238_TARGET2_ENABLE                                     0xFFFFF0FF
+#define   S_028238_TARGET3_ENABLE(x)                                  (((x) & 0x0F) << 12)
+#define   G_028238_TARGET3_ENABLE(x)                                  (((x) >> 12) & 0x0F)
+#define   C_028238_TARGET3_ENABLE                                     0xFFFF0FFF
+#define   S_028238_TARGET4_ENABLE(x)                                  (((x) & 0x0F) << 16)
+#define   G_028238_TARGET4_ENABLE(x)                                  (((x) >> 16) & 0x0F)
+#define   C_028238_TARGET4_ENABLE                                     0xFFF0FFFF
+#define   S_028238_TARGET5_ENABLE(x)                                  (((x) & 0x0F) << 20)
+#define   G_028238_TARGET5_ENABLE(x)                                  (((x) >> 20) & 0x0F)
+#define   C_028238_TARGET5_ENABLE                                     0xFF0FFFFF
+#define   S_028238_TARGET6_ENABLE(x)                                  (((x) & 0x0F) << 24)
+#define   G_028238_TARGET6_ENABLE(x)                                  (((x) >> 24) & 0x0F)
+#define   C_028238_TARGET6_ENABLE                                     0xF0FFFFFF
+#define   S_028238_TARGET7_ENABLE(x)                                  (((x) & 0x0F) << 28)
+#define   G_028238_TARGET7_ENABLE(x)                                  (((x) >> 28) & 0x0F)
+#define   C_028238_TARGET7_ENABLE                                     0x0FFFFFFF
+#define R_02823C_CB_SHADER_MASK                                         0x02823C
+#define   S_02823C_OUTPUT0_ENABLE(x)                                  (((x) & 0x0F) << 0)
+#define   G_02823C_OUTPUT0_ENABLE(x)                                  (((x) >> 0) & 0x0F)
+#define   C_02823C_OUTPUT0_ENABLE                                     0xFFFFFFF0
+#define   S_02823C_OUTPUT1_ENABLE(x)                                  (((x) & 0x0F) << 4)
+#define   G_02823C_OUTPUT1_ENABLE(x)                                  (((x) >> 4) & 0x0F)
+#define   C_02823C_OUTPUT1_ENABLE                                     0xFFFFFF0F
+#define   S_02823C_OUTPUT2_ENABLE(x)                                  (((x) & 0x0F) << 8)
+#define   G_02823C_OUTPUT2_ENABLE(x)                                  (((x) >> 8) & 0x0F)
+#define   C_02823C_OUTPUT2_ENABLE                                     0xFFFFF0FF
+#define   S_02823C_OUTPUT3_ENABLE(x)                                  (((x) & 0x0F) << 12)
+#define   G_02823C_OUTPUT3_ENABLE(x)                                  (((x) >> 12) & 0x0F)
+#define   C_02823C_OUTPUT3_ENABLE                                     0xFFFF0FFF
+#define   S_02823C_OUTPUT4_ENABLE(x)                                  (((x) & 0x0F) << 16)
+#define   G_02823C_OUTPUT4_ENABLE(x)                                  (((x) >> 16) & 0x0F)
+#define   C_02823C_OUTPUT4_ENABLE                                     0xFFF0FFFF
+#define   S_02823C_OUTPUT5_ENABLE(x)                                  (((x) & 0x0F) << 20)
+#define   G_02823C_OUTPUT5_ENABLE(x)                                  (((x) >> 20) & 0x0F)
+#define   C_02823C_OUTPUT5_ENABLE                                     0xFF0FFFFF
+#define   S_02823C_OUTPUT6_ENABLE(x)                                  (((x) & 0x0F) << 24)
+#define   G_02823C_OUTPUT6_ENABLE(x)                                  (((x) >> 24) & 0x0F)
+#define   C_02823C_OUTPUT6_ENABLE                                     0xF0FFFFFF
+#define   S_02823C_OUTPUT7_ENABLE(x)                                  (((x) & 0x0F) << 28)
+#define   G_02823C_OUTPUT7_ENABLE(x)                                  (((x) >> 28) & 0x0F)
+#define   C_02823C_OUTPUT7_ENABLE                                     0x0FFFFFFF
+#define R_028240_PA_SC_GENERIC_SCISSOR_TL                               0x028240
+#define   S_028240_TL_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028240_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028240_TL_X                                               0xFFFF8000
+#define   S_028240_TL_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028240_TL_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028240_TL_Y                                               0x8000FFFF
+#define   S_028240_WINDOW_OFFSET_DISABLE(x)                           (((x) & 0x1) << 31)
+#define   G_028240_WINDOW_OFFSET_DISABLE(x)                           (((x) >> 31) & 0x1)
+#define   C_028240_WINDOW_OFFSET_DISABLE                              0x7FFFFFFF
+#define R_028244_PA_SC_GENERIC_SCISSOR_BR                               0x028244
+#define   S_028244_BR_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028244_BR_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028244_BR_X                                               0xFFFF8000
+#define   S_028244_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028244_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028244_BR_Y                                               0x8000FFFF
+#define R_028250_PA_SC_VPORT_SCISSOR_0_TL                               0x028250
+#define   S_028250_TL_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028250_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028250_TL_X                                               0xFFFF8000
+#define   S_028250_TL_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028250_TL_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028250_TL_Y                                               0x8000FFFF
+#define   S_028250_WINDOW_OFFSET_DISABLE(x)                           (((x) & 0x1) << 31)
+#define   G_028250_WINDOW_OFFSET_DISABLE(x)                           (((x) >> 31) & 0x1)
+#define   C_028250_WINDOW_OFFSET_DISABLE                              0x7FFFFFFF
+#define R_028254_PA_SC_VPORT_SCISSOR_0_BR                               0x028254
+#define   S_028254_BR_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028254_BR_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028254_BR_X                                               0xFFFF8000
+#define   S_028254_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028254_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028254_BR_Y                                               0x8000FFFF
+#define R_0282D0_PA_SC_VPORT_ZMIN_0                                     0x0282D0
+#define R_0282D4_PA_SC_VPORT_ZMAX_0                                     0x0282D4
+#define R_028350_PA_SC_RASTER_CONFIG                                    0x028350
+#define   S_028350_RB_MAP_PKR0(x)                                     (((x) & 0x03) << 0)
+#define   G_028350_RB_MAP_PKR0(x)                                     (((x) >> 0) & 0x03)
+#define   C_028350_RB_MAP_PKR0                                        0xFFFFFFFC
+#define     V_028350_RASTER_CONFIG_RB_MAP_0                         0x00
+#define     V_028350_RASTER_CONFIG_RB_MAP_1                         0x01
+#define     V_028350_RASTER_CONFIG_RB_MAP_2                         0x02
+#define     V_028350_RASTER_CONFIG_RB_MAP_3                         0x03
+#define   S_028350_RB_MAP_PKR1(x)                                     (((x) & 0x03) << 2)
+#define   G_028350_RB_MAP_PKR1(x)                                     (((x) >> 2) & 0x03)
+#define   C_028350_RB_MAP_PKR1                                        0xFFFFFFF3
+#define     V_028350_RASTER_CONFIG_RB_MAP_0                         0x00
+#define     V_028350_RASTER_CONFIG_RB_MAP_1                         0x01
+#define     V_028350_RASTER_CONFIG_RB_MAP_2                         0x02
+#define     V_028350_RASTER_CONFIG_RB_MAP_3                         0x03
+#define   S_028350_RB_XSEL2(x)                                        (((x) & 0x03) << 4)
+#define   G_028350_RB_XSEL2(x)                                        (((x) >> 4) & 0x03)
+#define   C_028350_RB_XSEL2                                           0xFFFFFFCF
+#define     V_028350_RASTER_CONFIG_RB_XSEL2_0                       0x00
+#define     V_028350_RASTER_CONFIG_RB_XSEL2_1                       0x01
+#define     V_028350_RASTER_CONFIG_RB_XSEL2_2                       0x02
+#define     V_028350_RASTER_CONFIG_RB_XSEL2_3                       0x03
+#define   S_028350_RB_XSEL(x)                                         (((x) & 0x1) << 6)
+#define   G_028350_RB_XSEL(x)                                         (((x) >> 6) & 0x1)
+#define   C_028350_RB_XSEL                                            0xFFFFFFBF
+#define   S_028350_RB_YSEL(x)                                         (((x) & 0x1) << 7)
+#define   G_028350_RB_YSEL(x)                                         (((x) >> 7) & 0x1)
+#define   C_028350_RB_YSEL                                            0xFFFFFF7F
+#define   S_028350_PKR_MAP(x)                                         (((x) & 0x03) << 8)
+#define   G_028350_PKR_MAP(x)                                         (((x) >> 8) & 0x03)
+#define   C_028350_PKR_MAP                                            0xFFFFFCFF
+#define     V_028350_RASTER_CONFIG_PKR_MAP_0                        0x00
+#define     V_028350_RASTER_CONFIG_PKR_MAP_1                        0x01
+#define     V_028350_RASTER_CONFIG_PKR_MAP_2                        0x02
+#define     V_028350_RASTER_CONFIG_PKR_MAP_3                        0x03
+#define   S_028350_PKR_XSEL(x)                                        (((x) & 0x03) << 10)
+#define   G_028350_PKR_XSEL(x)                                        (((x) >> 10) & 0x03)
+#define   C_028350_PKR_XSEL                                           0xFFFFF3FF
+#define     V_028350_RASTER_CONFIG_PKR_XSEL_0                       0x00
+#define     V_028350_RASTER_CONFIG_PKR_XSEL_1                       0x01
+#define     V_028350_RASTER_CONFIG_PKR_XSEL_2                       0x02
+#define     V_028350_RASTER_CONFIG_PKR_XSEL_3                       0x03
+#define   S_028350_PKR_YSEL(x)                                        (((x) & 0x03) << 12)
+#define   G_028350_PKR_YSEL(x)                                        (((x) >> 12) & 0x03)
+#define   C_028350_PKR_YSEL                                           0xFFFFCFFF
+#define     V_028350_RASTER_CONFIG_PKR_YSEL_0                       0x00
+#define     V_028350_RASTER_CONFIG_PKR_YSEL_1                       0x01
+#define     V_028350_RASTER_CONFIG_PKR_YSEL_2                       0x02
+#define     V_028350_RASTER_CONFIG_PKR_YSEL_3                       0x03
+#define   S_028350_SC_MAP(x)                                          (((x) & 0x03) << 16)
+#define   G_028350_SC_MAP(x)                                          (((x) >> 16) & 0x03)
+#define   C_028350_SC_MAP                                             0xFFFCFFFF
+#define     V_028350_RASTER_CONFIG_SC_MAP_0                         0x00
+#define     V_028350_RASTER_CONFIG_SC_MAP_1                         0x01
+#define     V_028350_RASTER_CONFIG_SC_MAP_2                         0x02
+#define     V_028350_RASTER_CONFIG_SC_MAP_3                         0x03
+#define   S_028350_SC_XSEL(x)                                         (((x) & 0x03) << 18)
+#define   G_028350_SC_XSEL(x)                                         (((x) >> 18) & 0x03)
+#define   C_028350_SC_XSEL                                            0xFFF3FFFF
+#define     V_028350_RASTER_CONFIG_SC_XSEL_8_WIDE_TILE              0x00
+#define     V_028350_RASTER_CONFIG_SC_XSEL_16_WIDE_TILE             0x01
+#define     V_028350_RASTER_CONFIG_SC_XSEL_32_WIDE_TILE             0x02
+#define     V_028350_RASTER_CONFIG_SC_XSEL_64_WIDE_TILE             0x03
+#define   S_028350_SC_YSEL(x)                                         (((x) & 0x03) << 20)
+#define   G_028350_SC_YSEL(x)                                         (((x) >> 20) & 0x03)
+#define   C_028350_SC_YSEL                                            0xFFCFFFFF
+#define     V_028350_RASTER_CONFIG_SC_YSEL_8_WIDE_TILE              0x00
+#define     V_028350_RASTER_CONFIG_SC_YSEL_16_WIDE_TILE             0x01
+#define     V_028350_RASTER_CONFIG_SC_YSEL_32_WIDE_TILE             0x02
+#define     V_028350_RASTER_CONFIG_SC_YSEL_64_WIDE_TILE             0x03
+#define   S_028350_SE_MAP(x)                                          (((x) & 0x03) << 24)
+#define   G_028350_SE_MAP(x)                                          (((x) >> 24) & 0x03)
+#define   C_028350_SE_MAP                                             0xFCFFFFFF
+#define     V_028350_RASTER_CONFIG_SE_MAP_0                         0x00
+#define     V_028350_RASTER_CONFIG_SE_MAP_1                         0x01
+#define     V_028350_RASTER_CONFIG_SE_MAP_2                         0x02
+#define     V_028350_RASTER_CONFIG_SE_MAP_3                         0x03
+#define   S_028350_SE_XSEL(x)                                         (((x) & 0x03) << 26)
+#define   G_028350_SE_XSEL(x)                                         (((x) >> 26) & 0x03)
+#define   C_028350_SE_XSEL                                            0xF3FFFFFF
+#define     V_028350_RASTER_CONFIG_SE_XSEL_8_WIDE_TILE              0x00
+#define     V_028350_RASTER_CONFIG_SE_XSEL_16_WIDE_TILE             0x01
+#define     V_028350_RASTER_CONFIG_SE_XSEL_32_WIDE_TILE             0x02
+#define     V_028350_RASTER_CONFIG_SE_XSEL_64_WIDE_TILE             0x03
+#define   S_028350_SE_YSEL(x)                                         (((x) & 0x03) << 28)
+#define   G_028350_SE_YSEL(x)                                         (((x) >> 28) & 0x03)
+#define   C_028350_SE_YSEL                                            0xCFFFFFFF
+#define     V_028350_RASTER_CONFIG_SE_YSEL_8_WIDE_TILE              0x00
+#define     V_028350_RASTER_CONFIG_SE_YSEL_16_WIDE_TILE             0x01
+#define     V_028350_RASTER_CONFIG_SE_YSEL_32_WIDE_TILE             0x02
+#define     V_028350_RASTER_CONFIG_SE_YSEL_64_WIDE_TILE             0x03
+#define R_028400_VGT_MAX_VTX_INDX                                       0x028400
+#define R_028404_VGT_MIN_VTX_INDX                                       0x028404
+#define R_028408_VGT_INDX_OFFSET                                        0x028408
+#define R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX                           0x02840C
+#define R_028414_CB_BLEND_RED                                           0x028414
+#define R_028418_CB_BLEND_GREEN                                         0x028418
+#define R_02841C_CB_BLEND_BLUE                                          0x02841C
+#define R_028420_CB_BLEND_ALPHA                                         0x028420
+#define R_02842C_DB_STENCIL_CONTROL                                     0x02842C
+#define   S_02842C_STENCILFAIL(x)                                     (((x) & 0x0F) << 0)
+#define   G_02842C_STENCILFAIL(x)                                     (((x) >> 0) & 0x0F)
+#define   C_02842C_STENCILFAIL                                        0xFFFFFFF0
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define   S_02842C_STENCILZPASS(x)                                    (((x) & 0x0F) << 4)
+#define   G_02842C_STENCILZPASS(x)                                    (((x) >> 4) & 0x0F)
+#define   C_02842C_STENCILZPASS                                       0xFFFFFF0F
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define   S_02842C_STENCILZFAIL(x)                                    (((x) & 0x0F) << 8)
+#define   G_02842C_STENCILZFAIL(x)                                    (((x) >> 8) & 0x0F)
+#define   C_02842C_STENCILZFAIL                                       0xFFFFF0FF
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define   S_02842C_STENCILFAIL_BF(x)                                  (((x) & 0x0F) << 12)
+#define   G_02842C_STENCILFAIL_BF(x)                                  (((x) >> 12) & 0x0F)
+#define   C_02842C_STENCILFAIL_BF                                     0xFFFF0FFF
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define   S_02842C_STENCILZPASS_BF(x)                                 (((x) & 0x0F) << 16)
+#define   G_02842C_STENCILZPASS_BF(x)                                 (((x) >> 16) & 0x0F)
+#define   C_02842C_STENCILZPASS_BF                                    0xFFF0FFFF
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define   S_02842C_STENCILZFAIL_BF(x)                                 (((x) & 0x0F) << 20)
+#define   G_02842C_STENCILZFAIL_BF(x)                                 (((x) >> 20) & 0x0F)
+#define   C_02842C_STENCILZFAIL_BF                                    0xFF0FFFFF
+#define     V_02842C_STENCIL_KEEP                                   0x00
+#define     V_02842C_STENCIL_ZERO                                   0x01
+#define     V_02842C_STENCIL_ONES                                   0x02
+#define     V_02842C_STENCIL_REPLACE_TEST                           0x03
+#define     V_02842C_STENCIL_REPLACE_OP                             0x04
+#define     V_02842C_STENCIL_ADD_CLAMP                              0x05
+#define     V_02842C_STENCIL_SUB_CLAMP                              0x06
+#define     V_02842C_STENCIL_INVERT                                 0x07
+#define     V_02842C_STENCIL_ADD_WRAP                               0x08
+#define     V_02842C_STENCIL_SUB_WRAP                               0x09
+#define     V_02842C_STENCIL_AND                                    0x0A
+#define     V_02842C_STENCIL_OR                                     0x0B
+#define     V_02842C_STENCIL_XOR                                    0x0C
+#define     V_02842C_STENCIL_NAND                                   0x0D
+#define     V_02842C_STENCIL_NOR                                    0x0E
+#define     V_02842C_STENCIL_XNOR                                   0x0F
+#define R_028430_DB_STENCILREFMASK                                      0x028430
+#define   S_028430_STENCILTESTVAL(x)                                  (((x) & 0xFF) << 0)
+#define   G_028430_STENCILTESTVAL(x)                                  (((x) >> 0) & 0xFF)
+#define   C_028430_STENCILTESTVAL                                     0xFFFFFF00
+#define   S_028430_STENCILMASK(x)                                     (((x) & 0xFF) << 8)
+#define   G_028430_STENCILMASK(x)                                     (((x) >> 8) & 0xFF)
+#define   C_028430_STENCILMASK                                        0xFFFF00FF
+#define   S_028430_STENCILWRITEMASK(x)                                (((x) & 0xFF) << 16)
+#define   G_028430_STENCILWRITEMASK(x)                                (((x) >> 16) & 0xFF)
+#define   C_028430_STENCILWRITEMASK                                   0xFF00FFFF
+#define   S_028430_STENCILOPVAL(x)                                    (((x) & 0xFF) << 24)
+#define   G_028430_STENCILOPVAL(x)                                    (((x) >> 24) & 0xFF)
+#define   C_028430_STENCILOPVAL                                       0x00FFFFFF
+#define R_028434_DB_STENCILREFMASK_BF                                   0x028434
+#define   S_028434_STENCILTESTVAL_BF(x)                               (((x) & 0xFF) << 0)
+#define   G_028434_STENCILTESTVAL_BF(x)                               (((x) >> 0) & 0xFF)
+#define   C_028434_STENCILTESTVAL_BF                                  0xFFFFFF00
+#define   S_028434_STENCILMASK_BF(x)                                  (((x) & 0xFF) << 8)
+#define   G_028434_STENCILMASK_BF(x)                                  (((x) >> 8) & 0xFF)
+#define   C_028434_STENCILMASK_BF                                     0xFFFF00FF
+#define   S_028434_STENCILWRITEMASK_BF(x)                             (((x) & 0xFF) << 16)
+#define   G_028434_STENCILWRITEMASK_BF(x)                             (((x) >> 16) & 0xFF)
+#define   C_028434_STENCILWRITEMASK_BF                                0xFF00FFFF
+#define   S_028434_STENCILOPVAL_BF(x)                                 (((x) & 0xFF) << 24)
+#define   G_028434_STENCILOPVAL_BF(x)                                 (((x) >> 24) & 0xFF)
+#define   C_028434_STENCILOPVAL_BF                                    0x00FFFFFF
+#define R_02843C_PA_CL_VPORT_XSCALE_0                                   0x02843C
+#define R_028440_PA_CL_VPORT_XOFFSET_0                                  0x028440
+#define R_028444_PA_CL_VPORT_YSCALE_0                                   0x028444
+#define R_028448_PA_CL_VPORT_YOFFSET_0                                  0x028448
+#define R_02844C_PA_CL_VPORT_ZSCALE_0                                   0x02844C
+#define R_028450_PA_CL_VPORT_ZOFFSET_0                                  0x028450
+#define R_0285BC_PA_CL_UCP_0_X                                          0x0285BC
+#define R_0285C0_PA_CL_UCP_0_Y                                          0x0285C0
+#define R_0285C4_PA_CL_UCP_0_Z                                          0x0285C4
+#define R_0285C8_PA_CL_UCP_0_W                                          0x0285C8
+#define R_0285CC_PA_CL_UCP_1_X                                          0x0285CC
+#define R_0285D0_PA_CL_UCP_1_Y                                          0x0285D0
+#define R_0285D4_PA_CL_UCP_1_Z                                          0x0285D4
+#define R_0285D8_PA_CL_UCP_1_W                                          0x0285D8
+#define R_0285DC_PA_CL_UCP_2_X                                          0x0285DC
+#define R_0285E0_PA_CL_UCP_2_Y                                          0x0285E0
+#define R_0285E4_PA_CL_UCP_2_Z                                          0x0285E4
+#define R_0285E8_PA_CL_UCP_2_W                                          0x0285E8
+#define R_0285EC_PA_CL_UCP_3_X                                          0x0285EC
+#define R_0285F0_PA_CL_UCP_3_Y                                          0x0285F0
+#define R_0285F4_PA_CL_UCP_3_Z                                          0x0285F4
+#define R_0285F8_PA_CL_UCP_3_W                                          0x0285F8
+#define R_0285FC_PA_CL_UCP_4_X                                          0x0285FC
+#define R_028600_PA_CL_UCP_4_Y                                          0x028600
+#define R_028604_PA_CL_UCP_4_Z                                          0x028604
+#define R_028608_PA_CL_UCP_4_W                                          0x028608
+#define R_02860C_PA_CL_UCP_5_X                                          0x02860C
+#define R_028610_PA_CL_UCP_5_Y                                          0x028610
+#define R_028614_PA_CL_UCP_5_Z                                          0x028614
+#define R_028618_PA_CL_UCP_5_W                                          0x028618
+#define R_028644_SPI_PS_INPUT_CNTL_0                                    0x028644
+#define   S_028644_OFFSET(x)                                          (((x) & 0x3F) << 0)
+#define   G_028644_OFFSET(x)                                          (((x) >> 0) & 0x3F)
+#define   C_028644_OFFSET                                             0xFFFFFFC0
+#define   S_028644_DEFAULT_VAL(x)                                     (((x) & 0x03) << 8)
+#define   G_028644_DEFAULT_VAL(x)                                     (((x) >> 8) & 0x03)
+#define   C_028644_DEFAULT_VAL                                        0xFFFFFCFF
+#define     V_028644_X_0_0F                                         0x00
+#define   S_028644_FLAT_SHADE(x)                                      (((x) & 0x1) << 10)
+#define   G_028644_FLAT_SHADE(x)                                      (((x) >> 10) & 0x1)
+#define   C_028644_FLAT_SHADE                                         0xFFFFFBFF
+#define   S_028644_CYL_WRAP(x)                                        (((x) & 0x0F) << 13)
+#define   G_028644_CYL_WRAP(x)                                        (((x) >> 13) & 0x0F)
+#define   C_028644_CYL_WRAP                                           0xFFFE1FFF
+#define   S_028644_PT_SPRITE_TEX(x)                                   (((x) & 0x1) << 17)
+#define   G_028644_PT_SPRITE_TEX(x)                                   (((x) >> 17) & 0x1)
+#define   C_028644_PT_SPRITE_TEX                                      0xFFFDFFFF
+#define R_028648_SPI_PS_INPUT_CNTL_1                                    0x028648
+#define R_02864C_SPI_PS_INPUT_CNTL_2                                    0x02864C
+#define R_028650_SPI_PS_INPUT_CNTL_3                                    0x028650
+#define R_028654_SPI_PS_INPUT_CNTL_4                                    0x028654
+#define R_028658_SPI_PS_INPUT_CNTL_5                                    0x028658
+#define R_02865C_SPI_PS_INPUT_CNTL_6                                    0x02865C
+#define R_028660_SPI_PS_INPUT_CNTL_7                                    0x028660
+#define R_028664_SPI_PS_INPUT_CNTL_8                                    0x028664
+#define R_028668_SPI_PS_INPUT_CNTL_9                                    0x028668
+#define R_02866C_SPI_PS_INPUT_CNTL_10                                   0x02866C
+#define R_028670_SPI_PS_INPUT_CNTL_11                                   0x028670
+#define R_028674_SPI_PS_INPUT_CNTL_12                                   0x028674
+#define R_028678_SPI_PS_INPUT_CNTL_13                                   0x028678
+#define R_02867C_SPI_PS_INPUT_CNTL_14                                   0x02867C
+#define R_028680_SPI_PS_INPUT_CNTL_15                                   0x028680
+#define R_028684_SPI_PS_INPUT_CNTL_16                                   0x028684
+#define R_028688_SPI_PS_INPUT_CNTL_17                                   0x028688
+#define R_02868C_SPI_PS_INPUT_CNTL_18                                   0x02868C
+#define R_028690_SPI_PS_INPUT_CNTL_19                                   0x028690
+#define R_028694_SPI_PS_INPUT_CNTL_20                                   0x028694
+#define R_028698_SPI_PS_INPUT_CNTL_21                                   0x028698
+#define R_02869C_SPI_PS_INPUT_CNTL_22                                   0x02869C
+#define R_0286A0_SPI_PS_INPUT_CNTL_23                                   0x0286A0
+#define R_0286A4_SPI_PS_INPUT_CNTL_24                                   0x0286A4
+#define R_0286A8_SPI_PS_INPUT_CNTL_25                                   0x0286A8
+#define R_0286AC_SPI_PS_INPUT_CNTL_26                                   0x0286AC
+#define R_0286B0_SPI_PS_INPUT_CNTL_27                                   0x0286B0
+#define R_0286B4_SPI_PS_INPUT_CNTL_28                                   0x0286B4
+#define R_0286B8_SPI_PS_INPUT_CNTL_29                                   0x0286B8
+#define R_0286BC_SPI_PS_INPUT_CNTL_30                                   0x0286BC
+#define R_0286C0_SPI_PS_INPUT_CNTL_31                                   0x0286C0
+#define R_0286C4_SPI_VS_OUT_CONFIG                                      0x0286C4
+#define   S_0286C4_VS_EXPORT_COUNT(x)                                 (((x) & 0x1F) << 1)
+#define   G_0286C4_VS_EXPORT_COUNT(x)                                 (((x) >> 1) & 0x1F)
+#define   C_0286C4_VS_EXPORT_COUNT                                    0xFFFFFFC1
+#define   S_0286C4_VS_HALF_PACK(x)                                    (((x) & 0x1) << 6)
+#define   G_0286C4_VS_HALF_PACK(x)                                    (((x) >> 6) & 0x1)
+#define   C_0286C4_VS_HALF_PACK                                       0xFFFFFFBF
+#define   S_0286C4_VS_EXPORTS_FOG(x)                                  (((x) & 0x1) << 7)
+#define   G_0286C4_VS_EXPORTS_FOG(x)                                  (((x) >> 7) & 0x1)
+#define   C_0286C4_VS_EXPORTS_FOG                                     0xFFFFFF7F
+#define   S_0286C4_VS_OUT_FOG_VEC_ADDR(x)                             (((x) & 0x1F) << 8)
+#define   G_0286C4_VS_OUT_FOG_VEC_ADDR(x)                             (((x) >> 8) & 0x1F)
+#define   C_0286C4_VS_OUT_FOG_VEC_ADDR                                0xFFFFE0FF
+#define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
+#define   S_0286CC_PERSP_SAMPLE_ENA(x)                                (((x) & 0x1) << 0)
+#define   G_0286CC_PERSP_SAMPLE_ENA(x)                                (((x) >> 0) & 0x1)
+#define   C_0286CC_PERSP_SAMPLE_ENA                                   0xFFFFFFFE
+#define   S_0286CC_PERSP_CENTER_ENA(x)                                (((x) & 0x1) << 1)
+#define   G_0286CC_PERSP_CENTER_ENA(x)                                (((x) >> 1) & 0x1)
+#define   C_0286CC_PERSP_CENTER_ENA                                   0xFFFFFFFD
+#define   S_0286CC_PERSP_CENTROID_ENA(x)                              (((x) & 0x1) << 2)
+#define   G_0286CC_PERSP_CENTROID_ENA(x)                              (((x) >> 2) & 0x1)
+#define   C_0286CC_PERSP_CENTROID_ENA                                 0xFFFFFFFB
+#define   S_0286CC_PERSP_PULL_MODEL_ENA(x)                            (((x) & 0x1) << 3)
+#define   G_0286CC_PERSP_PULL_MODEL_ENA(x)                            (((x) >> 3) & 0x1)
+#define   C_0286CC_PERSP_PULL_MODEL_ENA                               0xFFFFFFF7
+#define   S_0286CC_LINEAR_SAMPLE_ENA(x)                               (((x) & 0x1) << 4)
+#define   G_0286CC_LINEAR_SAMPLE_ENA(x)                               (((x) >> 4) & 0x1)
+#define   C_0286CC_LINEAR_SAMPLE_ENA                                  0xFFFFFFEF
+#define   S_0286CC_LINEAR_CENTER_ENA(x)                               (((x) & 0x1) << 5)
+#define   G_0286CC_LINEAR_CENTER_ENA(x)                               (((x) >> 5) & 0x1)
+#define   C_0286CC_LINEAR_CENTER_ENA                                  0xFFFFFFDF
+#define   S_0286CC_LINEAR_CENTROID_ENA(x)                             (((x) & 0x1) << 6)
+#define   G_0286CC_LINEAR_CENTROID_ENA(x)                             (((x) >> 6) & 0x1)
+#define   C_0286CC_LINEAR_CENTROID_ENA                                0xFFFFFFBF
+#define   S_0286CC_LINE_STIPPLE_TEX_ENA(x)                            (((x) & 0x1) << 7)
+#define   G_0286CC_LINE_STIPPLE_TEX_ENA(x)                            (((x) >> 7) & 0x1)
+#define   C_0286CC_LINE_STIPPLE_TEX_ENA                               0xFFFFFF7F
+#define   S_0286CC_POS_X_FLOAT_ENA(x)                                 (((x) & 0x1) << 8)
+#define   G_0286CC_POS_X_FLOAT_ENA(x)                                 (((x) >> 8) & 0x1)
+#define   C_0286CC_POS_X_FLOAT_ENA                                    0xFFFFFEFF
+#define   S_0286CC_POS_Y_FLOAT_ENA(x)                                 (((x) & 0x1) << 9)
+#define   G_0286CC_POS_Y_FLOAT_ENA(x)                                 (((x) >> 9) & 0x1)
+#define   C_0286CC_POS_Y_FLOAT_ENA                                    0xFFFFFDFF
+#define   S_0286CC_POS_Z_FLOAT_ENA(x)                                 (((x) & 0x1) << 10)
+#define   G_0286CC_POS_Z_FLOAT_ENA(x)                                 (((x) >> 10) & 0x1)
+#define   C_0286CC_POS_Z_FLOAT_ENA                                    0xFFFFFBFF
+#define   S_0286CC_POS_W_FLOAT_ENA(x)                                 (((x) & 0x1) << 11)
+#define   G_0286CC_POS_W_FLOAT_ENA(x)                                 (((x) >> 11) & 0x1)
+#define   C_0286CC_POS_W_FLOAT_ENA                                    0xFFFFF7FF
+#define   S_0286CC_FRONT_FACE_ENA(x)                                  (((x) & 0x1) << 12)
+#define   G_0286CC_FRONT_FACE_ENA(x)                                  (((x) >> 12) & 0x1)
+#define   C_0286CC_FRONT_FACE_ENA                                     0xFFFFEFFF
+#define   S_0286CC_ANCILLARY_ENA(x)                                   (((x) & 0x1) << 13)
+#define   G_0286CC_ANCILLARY_ENA(x)                                   (((x) >> 13) & 0x1)
+#define   C_0286CC_ANCILLARY_ENA                                      0xFFFFDFFF
+#define   S_0286CC_SAMPLE_COVERAGE_ENA(x)                             (((x) & 0x1) << 14)
+#define   G_0286CC_SAMPLE_COVERAGE_ENA(x)                             (((x) >> 14) & 0x1)
+#define   C_0286CC_SAMPLE_COVERAGE_ENA                                0xFFFFBFFF
+#define   S_0286CC_POS_FIXED_PT_ENA(x)                                (((x) & 0x1) << 15)
+#define   G_0286CC_POS_FIXED_PT_ENA(x)                                (((x) >> 15) & 0x1)
+#define   C_0286CC_POS_FIXED_PT_ENA                                   0xFFFF7FFF
+#define R_0286D0_SPI_PS_INPUT_ADDR                                      0x0286D0
+#define   S_0286D0_PERSP_SAMPLE_ENA(x)                                (((x) & 0x1) << 0)
+#define   G_0286D0_PERSP_SAMPLE_ENA(x)                                (((x) >> 0) & 0x1)
+#define   C_0286D0_PERSP_SAMPLE_ENA                                   0xFFFFFFFE
+#define   S_0286D0_PERSP_CENTER_ENA(x)                                (((x) & 0x1) << 1)
+#define   G_0286D0_PERSP_CENTER_ENA(x)                                (((x) >> 1) & 0x1)
+#define   C_0286D0_PERSP_CENTER_ENA                                   0xFFFFFFFD
+#define   S_0286D0_PERSP_CENTROID_ENA(x)                              (((x) & 0x1) << 2)
+#define   G_0286D0_PERSP_CENTROID_ENA(x)                              (((x) >> 2) & 0x1)
+#define   C_0286D0_PERSP_CENTROID_ENA                                 0xFFFFFFFB
+#define   S_0286D0_PERSP_PULL_MODEL_ENA(x)                            (((x) & 0x1) << 3)
+#define   G_0286D0_PERSP_PULL_MODEL_ENA(x)                            (((x) >> 3) & 0x1)
+#define   C_0286D0_PERSP_PULL_MODEL_ENA                               0xFFFFFFF7
+#define   S_0286D0_LINEAR_SAMPLE_ENA(x)                               (((x) & 0x1) << 4)
+#define   G_0286D0_LINEAR_SAMPLE_ENA(x)                               (((x) >> 4) & 0x1)
+#define   C_0286D0_LINEAR_SAMPLE_ENA                                  0xFFFFFFEF
+#define   S_0286D0_LINEAR_CENTER_ENA(x)                               (((x) & 0x1) << 5)
+#define   G_0286D0_LINEAR_CENTER_ENA(x)                               (((x) >> 5) & 0x1)
+#define   C_0286D0_LINEAR_CENTER_ENA                                  0xFFFFFFDF
+#define   S_0286D0_LINEAR_CENTROID_ENA(x)                             (((x) & 0x1) << 6)
+#define   G_0286D0_LINEAR_CENTROID_ENA(x)                             (((x) >> 6) & 0x1)
+#define   C_0286D0_LINEAR_CENTROID_ENA                                0xFFFFFFBF
+#define   S_0286D0_LINE_STIPPLE_TEX_ENA(x)                            (((x) & 0x1) << 7)
+#define   G_0286D0_LINE_STIPPLE_TEX_ENA(x)                            (((x) >> 7) & 0x1)
+#define   C_0286D0_LINE_STIPPLE_TEX_ENA                               0xFFFFFF7F
+#define   S_0286D0_POS_X_FLOAT_ENA(x)                                 (((x) & 0x1) << 8)
+#define   G_0286D0_POS_X_FLOAT_ENA(x)                                 (((x) >> 8) & 0x1)
+#define   C_0286D0_POS_X_FLOAT_ENA                                    0xFFFFFEFF
+#define   S_0286D0_POS_Y_FLOAT_ENA(x)                                 (((x) & 0x1) << 9)
+#define   G_0286D0_POS_Y_FLOAT_ENA(x)                                 (((x) >> 9) & 0x1)
+#define   C_0286D0_POS_Y_FLOAT_ENA                                    0xFFFFFDFF
+#define   S_0286D0_POS_Z_FLOAT_ENA(x)                                 (((x) & 0x1) << 10)
+#define   G_0286D0_POS_Z_FLOAT_ENA(x)                                 (((x) >> 10) & 0x1)
+#define   C_0286D0_POS_Z_FLOAT_ENA                                    0xFFFFFBFF
+#define   S_0286D0_POS_W_FLOAT_ENA(x)                                 (((x) & 0x1) << 11)
+#define   G_0286D0_POS_W_FLOAT_ENA(x)                                 (((x) >> 11) & 0x1)
+#define   C_0286D0_POS_W_FLOAT_ENA                                    0xFFFFF7FF
+#define   S_0286D0_FRONT_FACE_ENA(x)                                  (((x) & 0x1) << 12)
+#define   G_0286D0_FRONT_FACE_ENA(x)                                  (((x) >> 12) & 0x1)
+#define   C_0286D0_FRONT_FACE_ENA                                     0xFFFFEFFF
+#define   S_0286D0_ANCILLARY_ENA(x)                                   (((x) & 0x1) << 13)
+#define   G_0286D0_ANCILLARY_ENA(x)                                   (((x) >> 13) & 0x1)
+#define   C_0286D0_ANCILLARY_ENA                                      0xFFFFDFFF
+#define   S_0286D0_SAMPLE_COVERAGE_ENA(x)                             (((x) & 0x1) << 14)
+#define   G_0286D0_SAMPLE_COVERAGE_ENA(x)                             (((x) >> 14) & 0x1)
+#define   C_0286D0_SAMPLE_COVERAGE_ENA                                0xFFFFBFFF
+#define   S_0286D0_POS_FIXED_PT_ENA(x)                                (((x) & 0x1) << 15)
+#define   G_0286D0_POS_FIXED_PT_ENA(x)                                (((x) >> 15) & 0x1)
+#define   C_0286D0_POS_FIXED_PT_ENA                                   0xFFFF7FFF
+#define R_0286D4_SPI_INTERP_CONTROL_0                                   0x0286D4
+#define   S_0286D4_FLAT_SHADE_ENA(x)                                  (((x) & 0x1) << 0)
+#define   G_0286D4_FLAT_SHADE_ENA(x)                                  (((x) >> 0) & 0x1)
+#define   C_0286D4_FLAT_SHADE_ENA                                     0xFFFFFFFE
+#define   S_0286D4_PNT_SPRITE_ENA(x)                                  (((x) & 0x1) << 1)
+#define   G_0286D4_PNT_SPRITE_ENA(x)                                  (((x) >> 1) & 0x1)
+#define   C_0286D4_PNT_SPRITE_ENA                                     0xFFFFFFFD
+#define   S_0286D4_PNT_SPRITE_OVRD_X(x)                               (((x) & 0x07) << 2)
+#define   G_0286D4_PNT_SPRITE_OVRD_X(x)                               (((x) >> 2) & 0x07)
+#define   C_0286D4_PNT_SPRITE_OVRD_X                                  0xFFFFFFE3
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_0                           0x00
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_1                           0x01
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_S                           0x02
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_T                           0x03
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_NONE                        0x04
+#define   S_0286D4_PNT_SPRITE_OVRD_Y(x)                               (((x) & 0x07) << 5)
+#define   G_0286D4_PNT_SPRITE_OVRD_Y(x)                               (((x) >> 5) & 0x07)
+#define   C_0286D4_PNT_SPRITE_OVRD_Y                                  0xFFFFFF1F
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_0                           0x00
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_1                           0x01
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_S                           0x02
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_T                           0x03
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_NONE                        0x04
+#define   S_0286D4_PNT_SPRITE_OVRD_Z(x)                               (((x) & 0x07) << 8)
+#define   G_0286D4_PNT_SPRITE_OVRD_Z(x)                               (((x) >> 8) & 0x07)
+#define   C_0286D4_PNT_SPRITE_OVRD_Z                                  0xFFFFF8FF
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_0                           0x00
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_1                           0x01
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_S                           0x02
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_T                           0x03
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_NONE                        0x04
+#define   S_0286D4_PNT_SPRITE_OVRD_W(x)                               (((x) & 0x07) << 11)
+#define   G_0286D4_PNT_SPRITE_OVRD_W(x)                               (((x) >> 11) & 0x07)
+#define   C_0286D4_PNT_SPRITE_OVRD_W                                  0xFFFFC7FF
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_0                           0x00
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_1                           0x01
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_S                           0x02
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_T                           0x03
+#define     V_0286D4_SPI_PNT_SPRITE_SEL_NONE                        0x04
+#define   S_0286D4_PNT_SPRITE_TOP_1(x)                                (((x) & 0x1) << 14)
+#define   G_0286D4_PNT_SPRITE_TOP_1(x)                                (((x) >> 14) & 0x1)
+#define   C_0286D4_PNT_SPRITE_TOP_1                                   0xFFFFBFFF
+#define R_0286D8_SPI_PS_IN_CONTROL                                      0x0286D8
+#define   S_0286D8_NUM_INTERP(x)                                      (((x) & 0x3F) << 0)
+#define   G_0286D8_NUM_INTERP(x)                                      (((x) >> 0) & 0x3F)
+#define   C_0286D8_NUM_INTERP                                         0xFFFFFFC0
+#define   S_0286D8_PARAM_GEN(x)                                       (((x) & 0x1) << 6)
+#define   G_0286D8_PARAM_GEN(x)                                       (((x) >> 6) & 0x1)
+#define   C_0286D8_PARAM_GEN                                          0xFFFFFFBF
+#define   S_0286D8_FOG_ADDR(x)                                        (((x) & 0x7F) << 7)
+#define   G_0286D8_FOG_ADDR(x)                                        (((x) >> 7) & 0x7F)
+#define   C_0286D8_FOG_ADDR                                           0xFFFFC07F
+#define   S_0286D8_BC_OPTIMIZE_DISABLE(x)                             (((x) & 0x1) << 14)
+#define   G_0286D8_BC_OPTIMIZE_DISABLE(x)                             (((x) >> 14) & 0x1)
+#define   C_0286D8_BC_OPTIMIZE_DISABLE                                0xFFFFBFFF
+#define   S_0286D8_PASS_FOG_THROUGH_PS(x)                             (((x) & 0x1) << 15)
+#define   G_0286D8_PASS_FOG_THROUGH_PS(x)                             (((x) >> 15) & 0x1)
+#define   C_0286D8_PASS_FOG_THROUGH_PS                                0xFFFF7FFF
+#define R_0286E0_SPI_BARYC_CNTL                                         0x0286E0
+#define   S_0286E0_PERSP_CENTER_CNTL(x)                               (((x) & 0x1) << 0)
+#define   G_0286E0_PERSP_CENTER_CNTL(x)                               (((x) >> 0) & 0x1)
+#define   C_0286E0_PERSP_CENTER_CNTL                                  0xFFFFFFFE
+#define   S_0286E0_PERSP_CENTROID_CNTL(x)                             (((x) & 0x1) << 4)
+#define   G_0286E0_PERSP_CENTROID_CNTL(x)                             (((x) >> 4) & 0x1)
+#define   C_0286E0_PERSP_CENTROID_CNTL                                0xFFFFFFEF
+#define   S_0286E0_LINEAR_CENTER_CNTL(x)                              (((x) & 0x1) << 8)
+#define   G_0286E0_LINEAR_CENTER_CNTL(x)                              (((x) >> 8) & 0x1)
+#define   C_0286E0_LINEAR_CENTER_CNTL                                 0xFFFFFEFF
+#define   S_0286E0_LINEAR_CENTROID_CNTL(x)                            (((x) & 0x1) << 12)
+#define   G_0286E0_LINEAR_CENTROID_CNTL(x)                            (((x) >> 12) & 0x1)
+#define   C_0286E0_LINEAR_CENTROID_CNTL                               0xFFFFEFFF
+#define   S_0286E0_POS_FLOAT_LOCATION(x)                              (((x) & 0x03) << 16)
+#define   G_0286E0_POS_FLOAT_LOCATION(x)                              (((x) >> 16) & 0x03)
+#define   C_0286E0_POS_FLOAT_LOCATION                                 0xFFFCFFFF
+#define     V_0286E0_X_CALCULATE_PER_PIXEL_FLOATING_POINT_POSITION_AT 0x00
+#define   S_0286E0_POS_FLOAT_ULC(x)                                   (((x) & 0x1) << 20)
+#define   G_0286E0_POS_FLOAT_ULC(x)                                   (((x) >> 20) & 0x1)
+#define   C_0286E0_POS_FLOAT_ULC                                      0xFFEFFFFF
+#define   S_0286E0_FRONT_FACE_ALL_BITS(x)                             (((x) & 0x1) << 24)
+#define   G_0286E0_FRONT_FACE_ALL_BITS(x)                             (((x) >> 24) & 0x1)
+#define   C_0286E0_FRONT_FACE_ALL_BITS                                0xFEFFFFFF
+#define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8
+#define   S_0286E8_WAVES(x)                                           (((x) & 0xFFF) << 0)
+#define   G_0286E8_WAVES(x)                                           (((x) >> 0) & 0xFFF)
+#define   C_0286E8_WAVES                                              0xFFFFF000
+#define   S_0286E8_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+#define   G_0286E8_WAVESIZE(x)                                        (((x) >> 12) & 0x1FFF)
+#define   C_0286E8_WAVESIZE                                           0xFE000FFF
+#define R_028704_SPI_WAVE_MGMT_1                                        0x028704
+#define   S_028704_NUM_PS_WAVES(x)                                    (((x) & 0x3F) << 0)
+#define   G_028704_NUM_PS_WAVES(x)                                    (((x) >> 0) & 0x3F)
+#define   C_028704_NUM_PS_WAVES                                       0xFFFFFFC0
+#define   S_028704_NUM_VS_WAVES(x)                                    (((x) & 0x3F) << 6)
+#define   G_028704_NUM_VS_WAVES(x)                                    (((x) >> 6) & 0x3F)
+#define   C_028704_NUM_VS_WAVES                                       0xFFFFF03F
+#define   S_028704_NUM_GS_WAVES(x)                                    (((x) & 0x3F) << 12)
+#define   G_028704_NUM_GS_WAVES(x)                                    (((x) >> 12) & 0x3F)
+#define   C_028704_NUM_GS_WAVES                                       0xFFFC0FFF
+#define   S_028704_NUM_ES_WAVES(x)                                    (((x) & 0x3F) << 18)
+#define   G_028704_NUM_ES_WAVES(x)                                    (((x) >> 18) & 0x3F)
+#define   C_028704_NUM_ES_WAVES                                       0xFF03FFFF
+#define   S_028704_NUM_HS_WAVES(x)                                    (((x) & 0x3F) << 24)
+#define   G_028704_NUM_HS_WAVES(x)                                    (((x) >> 24) & 0x3F)
+#define   C_028704_NUM_HS_WAVES                                       0xC0FFFFFF
+#define R_028708_SPI_WAVE_MGMT_2                                        0x028708
+#define   S_028708_NUM_LS_WAVES(x)                                    (((x) & 0x3F) << 0)
+#define   G_028708_NUM_LS_WAVES(x)                                    (((x) >> 0) & 0x3F)
+#define   C_028708_NUM_LS_WAVES                                       0xFFFFFFC0
+#define R_02870C_SPI_SHADER_POS_FORMAT                                  0x02870C
+#define   S_02870C_POS0_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 0)
+#define   G_02870C_POS0_EXPORT_FORMAT(x)                              (((x) >> 0) & 0x0F)
+#define   C_02870C_POS0_EXPORT_FORMAT                                 0xFFFFFFF0
+#define     V_02870C_SPI_SHADER_NONE                                0x00
+#define     V_02870C_SPI_SHADER_1COMP                               0x01
+#define     V_02870C_SPI_SHADER_2COMP                               0x02
+#define     V_02870C_SPI_SHADER_4COMPRESS                           0x03
+#define     V_02870C_SPI_SHADER_4COMP                               0x04
+#define   S_02870C_POS1_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 4)
+#define   G_02870C_POS1_EXPORT_FORMAT(x)                              (((x) >> 4) & 0x0F)
+#define   C_02870C_POS1_EXPORT_FORMAT                                 0xFFFFFF0F
+#define     V_02870C_SPI_SHADER_NONE                                0x00
+#define     V_02870C_SPI_SHADER_1COMP                               0x01
+#define     V_02870C_SPI_SHADER_2COMP                               0x02
+#define     V_02870C_SPI_SHADER_4COMPRESS                           0x03
+#define     V_02870C_SPI_SHADER_4COMP                               0x04
+#define   S_02870C_POS2_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 8)
+#define   G_02870C_POS2_EXPORT_FORMAT(x)                              (((x) >> 8) & 0x0F)
+#define   C_02870C_POS2_EXPORT_FORMAT                                 0xFFFFF0FF
+#define     V_02870C_SPI_SHADER_NONE                                0x00
+#define     V_02870C_SPI_SHADER_1COMP                               0x01
+#define     V_02870C_SPI_SHADER_2COMP                               0x02
+#define     V_02870C_SPI_SHADER_4COMPRESS                           0x03
+#define     V_02870C_SPI_SHADER_4COMP                               0x04
+#define   S_02870C_POS3_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 12)
+#define   G_02870C_POS3_EXPORT_FORMAT(x)                              (((x) >> 12) & 0x0F)
+#define   C_02870C_POS3_EXPORT_FORMAT                                 0xFFFF0FFF
+#define     V_02870C_SPI_SHADER_NONE                                0x00
+#define     V_02870C_SPI_SHADER_1COMP                               0x01
+#define     V_02870C_SPI_SHADER_2COMP                               0x02
+#define     V_02870C_SPI_SHADER_4COMPRESS                           0x03
+#define     V_02870C_SPI_SHADER_4COMP                               0x04
+#define R_028710_SPI_SHADER_Z_FORMAT                                    0x028710
+#define   S_028710_Z_EXPORT_FORMAT(x)                                 (((x) & 0x0F) << 0)
+#define   G_028710_Z_EXPORT_FORMAT(x)                                 (((x) >> 0) & 0x0F)
+#define   C_028710_Z_EXPORT_FORMAT                                    0xFFFFFFF0
+#define     V_028710_SPI_SHADER_ZERO                                0x00
+#define     V_028710_SPI_SHADER_32_R                                0x01
+#define     V_028710_SPI_SHADER_32_GR                               0x02
+#define     V_028710_SPI_SHADER_32_AR                               0x03
+#define     V_028710_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028710_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028710_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028710_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028710_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028710_SPI_SHADER_32_ABGR                             0x09
+#define R_028714_SPI_SHADER_COL_FORMAT                                  0x028714
+#define   S_028714_COL0_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 0)
+#define   G_028714_COL0_EXPORT_FORMAT(x)                              (((x) >> 0) & 0x0F)
+#define   C_028714_COL0_EXPORT_FORMAT                                 0xFFFFFFF0
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL1_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 4)
+#define   G_028714_COL1_EXPORT_FORMAT(x)                              (((x) >> 4) & 0x0F)
+#define   C_028714_COL1_EXPORT_FORMAT                                 0xFFFFFF0F
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL2_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 8)
+#define   G_028714_COL2_EXPORT_FORMAT(x)                              (((x) >> 8) & 0x0F)
+#define   C_028714_COL2_EXPORT_FORMAT                                 0xFFFFF0FF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL3_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 12)
+#define   G_028714_COL3_EXPORT_FORMAT(x)                              (((x) >> 12) & 0x0F)
+#define   C_028714_COL3_EXPORT_FORMAT                                 0xFFFF0FFF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL4_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 16)
+#define   G_028714_COL4_EXPORT_FORMAT(x)                              (((x) >> 16) & 0x0F)
+#define   C_028714_COL4_EXPORT_FORMAT                                 0xFFF0FFFF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL5_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 20)
+#define   G_028714_COL5_EXPORT_FORMAT(x)                              (((x) >> 20) & 0x0F)
+#define   C_028714_COL5_EXPORT_FORMAT                                 0xFF0FFFFF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL6_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 24)
+#define   G_028714_COL6_EXPORT_FORMAT(x)                              (((x) >> 24) & 0x0F)
+#define   C_028714_COL6_EXPORT_FORMAT                                 0xF0FFFFFF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define   S_028714_COL7_EXPORT_FORMAT(x)                              (((x) & 0x0F) << 28)
+#define   G_028714_COL7_EXPORT_FORMAT(x)                              (((x) >> 28) & 0x0F)
+#define   C_028714_COL7_EXPORT_FORMAT                                 0x0FFFFFFF
+#define     V_028714_SPI_SHADER_ZERO                                0x00
+#define     V_028714_SPI_SHADER_32_R                                0x01
+#define     V_028714_SPI_SHADER_32_GR                               0x02
+#define     V_028714_SPI_SHADER_32_AR                               0x03
+#define     V_028714_SPI_SHADER_FP16_ABGR                           0x04
+#define     V_028714_SPI_SHADER_UNORM16_ABGR                        0x05
+#define     V_028714_SPI_SHADER_SNORM16_ABGR                        0x06
+#define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
+#define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
+#define     V_028714_SPI_SHADER_32_ABGR                             0x09
+#define R_028780_CB_BLEND0_CONTROL                                      0x028780
+#define   S_028780_COLOR_SRCBLEND(x)                                  (((x) & 0x1F) << 0)
+#define   G_028780_COLOR_SRCBLEND(x)                                  (((x) >> 0) & 0x1F)
+#define   C_028780_COLOR_SRCBLEND                                     0xFFFFFFE0
+#define     V_028780_BLEND_ZERO                                     0x00
+#define     V_028780_BLEND_ONE                                      0x01
+#define     V_028780_BLEND_SRC_COLOR                                0x02
+#define     V_028780_BLEND_ONE_MINUS_SRC_COLOR                      0x03
+#define     V_028780_BLEND_SRC_ALPHA                                0x04
+#define     V_028780_BLEND_ONE_MINUS_SRC_ALPHA                      0x05
+#define     V_028780_BLEND_DST_ALPHA                                0x06
+#define     V_028780_BLEND_ONE_MINUS_DST_ALPHA                      0x07
+#define     V_028780_BLEND_DST_COLOR                                0x08
+#define     V_028780_BLEND_ONE_MINUS_DST_COLOR                      0x09
+#define     V_028780_BLEND_SRC_ALPHA_SATURATE                       0x0A
+#define     V_028780_BLEND_CONSTANT_COLOR                           0x0D
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR                 0x0E
+#define     V_028780_BLEND_SRC1_COLOR                               0x0F
+#define     V_028780_BLEND_INV_SRC1_COLOR                           0x10
+#define     V_028780_BLEND_SRC1_ALPHA                               0x11
+#define     V_028780_BLEND_INV_SRC1_ALPHA                           0x12
+#define     V_028780_BLEND_CONSTANT_ALPHA                           0x13
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA                 0x14
+#define   S_028780_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 5)
+#define   G_028780_COLOR_COMB_FCN(x)                                  (((x) >> 5) & 0x07)
+#define   C_028780_COLOR_COMB_FCN                                     0xFFFFFF1F
+#define     V_028780_COMB_DST_PLUS_SRC                              0x00
+#define     V_028780_COMB_SRC_MINUS_DST                             0x01
+#define     V_028780_COMB_MIN_DST_SRC                               0x02
+#define     V_028780_COMB_MAX_DST_SRC                               0x03
+#define     V_028780_COMB_DST_MINUS_SRC                             0x04
+#define   S_028780_COLOR_DESTBLEND(x)                                 (((x) & 0x1F) << 8)
+#define   G_028780_COLOR_DESTBLEND(x)                                 (((x) >> 8) & 0x1F)
+#define   C_028780_COLOR_DESTBLEND                                    0xFFFFE0FF
+#define     V_028780_BLEND_ZERO                                     0x00
+#define     V_028780_BLEND_ONE                                      0x01
+#define     V_028780_BLEND_SRC_COLOR                                0x02
+#define     V_028780_BLEND_ONE_MINUS_SRC_COLOR                      0x03
+#define     V_028780_BLEND_SRC_ALPHA                                0x04
+#define     V_028780_BLEND_ONE_MINUS_SRC_ALPHA                      0x05
+#define     V_028780_BLEND_DST_ALPHA                                0x06
+#define     V_028780_BLEND_ONE_MINUS_DST_ALPHA                      0x07
+#define     V_028780_BLEND_DST_COLOR                                0x08
+#define     V_028780_BLEND_ONE_MINUS_DST_COLOR                      0x09
+#define     V_028780_BLEND_SRC_ALPHA_SATURATE                       0x0A
+#define     V_028780_BLEND_CONSTANT_COLOR                           0x0D
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR                 0x0E
+#define     V_028780_BLEND_SRC1_COLOR                               0x0F
+#define     V_028780_BLEND_INV_SRC1_COLOR                           0x10
+#define     V_028780_BLEND_SRC1_ALPHA                               0x11
+#define     V_028780_BLEND_INV_SRC1_ALPHA                           0x12
+#define     V_028780_BLEND_CONSTANT_ALPHA                           0x13
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA                 0x14
+#define   S_028780_ALPHA_SRCBLEND(x)                                  (((x) & 0x1F) << 16)
+#define   G_028780_ALPHA_SRCBLEND(x)                                  (((x) >> 16) & 0x1F)
+#define   C_028780_ALPHA_SRCBLEND                                     0xFFE0FFFF
+#define     V_028780_BLEND_ZERO                                     0x00
+#define     V_028780_BLEND_ONE                                      0x01
+#define     V_028780_BLEND_SRC_COLOR                                0x02
+#define     V_028780_BLEND_ONE_MINUS_SRC_COLOR                      0x03
+#define     V_028780_BLEND_SRC_ALPHA                                0x04
+#define     V_028780_BLEND_ONE_MINUS_SRC_ALPHA                      0x05
+#define     V_028780_BLEND_DST_ALPHA                                0x06
+#define     V_028780_BLEND_ONE_MINUS_DST_ALPHA                      0x07
+#define     V_028780_BLEND_DST_COLOR                                0x08
+#define     V_028780_BLEND_ONE_MINUS_DST_COLOR                      0x09
+#define     V_028780_BLEND_SRC_ALPHA_SATURATE                       0x0A
+#define     V_028780_BLEND_CONSTANT_COLOR                           0x0D
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR                 0x0E
+#define     V_028780_BLEND_SRC1_COLOR                               0x0F
+#define     V_028780_BLEND_INV_SRC1_COLOR                           0x10
+#define     V_028780_BLEND_SRC1_ALPHA                               0x11
+#define     V_028780_BLEND_INV_SRC1_ALPHA                           0x12
+#define     V_028780_BLEND_CONSTANT_ALPHA                           0x13
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA                 0x14
+#define   S_028780_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 21)
+#define   G_028780_ALPHA_COMB_FCN(x)                                  (((x) >> 21) & 0x07)
+#define   C_028780_ALPHA_COMB_FCN                                     0xFF1FFFFF
+#define     V_028780_COMB_DST_PLUS_SRC                              0x00
+#define     V_028780_COMB_SRC_MINUS_DST                             0x01
+#define     V_028780_COMB_MIN_DST_SRC                               0x02
+#define     V_028780_COMB_MAX_DST_SRC                               0x03
+#define     V_028780_COMB_DST_MINUS_SRC                             0x04
+#define   S_028780_ALPHA_DESTBLEND(x)                                 (((x) & 0x1F) << 24)
+#define   G_028780_ALPHA_DESTBLEND(x)                                 (((x) >> 24) & 0x1F)
+#define   C_028780_ALPHA_DESTBLEND                                    0xE0FFFFFF
+#define     V_028780_BLEND_ZERO                                     0x00
+#define     V_028780_BLEND_ONE                                      0x01
+#define     V_028780_BLEND_SRC_COLOR                                0x02
+#define     V_028780_BLEND_ONE_MINUS_SRC_COLOR                      0x03
+#define     V_028780_BLEND_SRC_ALPHA                                0x04
+#define     V_028780_BLEND_ONE_MINUS_SRC_ALPHA                      0x05
+#define     V_028780_BLEND_DST_ALPHA                                0x06
+#define     V_028780_BLEND_ONE_MINUS_DST_ALPHA                      0x07
+#define     V_028780_BLEND_DST_COLOR                                0x08
+#define     V_028780_BLEND_ONE_MINUS_DST_COLOR                      0x09
+#define     V_028780_BLEND_SRC_ALPHA_SATURATE                       0x0A
+#define     V_028780_BLEND_CONSTANT_COLOR                           0x0D
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR                 0x0E
+#define     V_028780_BLEND_SRC1_COLOR                               0x0F
+#define     V_028780_BLEND_INV_SRC1_COLOR                           0x10
+#define     V_028780_BLEND_SRC1_ALPHA                               0x11
+#define     V_028780_BLEND_INV_SRC1_ALPHA                           0x12
+#define     V_028780_BLEND_CONSTANT_ALPHA                           0x13
+#define     V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA                 0x14
+#define   S_028780_SEPARATE_ALPHA_BLEND(x)                            (((x) & 0x1) << 29)
+#define   G_028780_SEPARATE_ALPHA_BLEND(x)                            (((x) >> 29) & 0x1)
+#define   C_028780_SEPARATE_ALPHA_BLEND                               0xDFFFFFFF
+#define   S_028780_ENABLE(x)                                          (((x) & 0x1) << 30)
+#define   G_028780_ENABLE(x)                                          (((x) >> 30) & 0x1)
+#define   C_028780_ENABLE                                             0xBFFFFFFF
+#define   S_028780_DISABLE_ROP3(x)                                    (((x) & 0x1) << 31)
+#define   G_028780_DISABLE_ROP3(x)                                    (((x) >> 31) & 0x1)
+#define   C_028780_DISABLE_ROP3                                       0x7FFFFFFF
+#define R_028784_CB_BLEND1_CONTROL                                      0x028784
+#define R_028788_CB_BLEND2_CONTROL                                      0x028788
+#define R_02878C_CB_BLEND3_CONTROL                                      0x02878C
+#define R_028790_CB_BLEND4_CONTROL                                      0x028790
+#define R_028794_CB_BLEND5_CONTROL                                      0x028794
+#define R_028798_CB_BLEND6_CONTROL                                      0x028798
+#define R_02879C_CB_BLEND7_CONTROL                                      0x02879C
+#define R_0287D4_PA_CL_POINT_X_RAD                                      0x0287D4
+#define R_0287D8_PA_CL_POINT_Y_RAD                                      0x0287D8
+#define R_0287DC_PA_CL_POINT_SIZE                                       0x0287DC
+#define R_0287E0_PA_CL_POINT_CULL_RAD                                   0x0287E0
+#define R_0287E4_VGT_DMA_BASE_HI                                        0x0287E4
+#define   S_0287E4_BASE_ADDR(x)                                       (((x) & 0xFF) << 0)
+#define   G_0287E4_BASE_ADDR(x)                                       (((x) >> 0) & 0xFF)
+#define   C_0287E4_BASE_ADDR                                          0xFFFFFF00
+#define R_0287E8_VGT_DMA_BASE                                           0x0287E8
+#define R_0287F0_VGT_DRAW_INITIATOR                                     0x0287F0
+#define   S_0287F0_SOURCE_SELECT(x)                                   (((x) & 0x03) << 0)
+#define   G_0287F0_SOURCE_SELECT(x)                                   (((x) >> 0) & 0x03)
+#define   C_0287F0_SOURCE_SELECT                                      0xFFFFFFFC
+#define     V_0287F0_DI_SRC_SEL_DMA                                 0x00
+#define     V_0287F0_DI_SRC_SEL_IMMEDIATE                           0x01
+#define     V_0287F0_DI_SRC_SEL_AUTO_INDEX                          0x02
+#define     V_0287F0_DI_SRC_SEL_RESERVED                            0x03
+#define   S_0287F0_MAJOR_MODE(x)                                      (((x) & 0x03) << 2)
+#define   G_0287F0_MAJOR_MODE(x)                                      (((x) >> 2) & 0x03)
+#define   C_0287F0_MAJOR_MODE                                         0xFFFFFFF3
+#define     V_0287F0_DI_MAJOR_MODE_0                                0x00
+#define     V_0287F0_DI_MAJOR_MODE_1                                0x01
+#define   S_0287F0_NOT_EOP(x)                                         (((x) & 0x1) << 5)
+#define   G_0287F0_NOT_EOP(x)                                         (((x) >> 5) & 0x1)
+#define   C_0287F0_NOT_EOP                                            0xFFFFFFDF
+#define   S_0287F0_USE_OPAQUE(x)                                      (((x) & 0x1) << 6)
+#define   G_0287F0_USE_OPAQUE(x)                                      (((x) >> 6) & 0x1)
+#define   C_0287F0_USE_OPAQUE                                         0xFFFFFFBF
+#define R_0287F4_VGT_IMMED_DATA                                         0x0287F4
+#define R_028800_DB_DEPTH_CONTROL                                       0x028800
+#define   S_028800_STENCIL_ENABLE(x)                                  (((x) & 0x1) << 0)
+#define   G_028800_STENCIL_ENABLE(x)                                  (((x) >> 0) & 0x1)
+#define   C_028800_STENCIL_ENABLE                                     0xFFFFFFFE
+#define   S_028800_Z_ENABLE(x)                                        (((x) & 0x1) << 1)
+#define   G_028800_Z_ENABLE(x)                                        (((x) >> 1) & 0x1)
+#define   C_028800_Z_ENABLE                                           0xFFFFFFFD
+#define   S_028800_Z_WRITE_ENABLE(x)                                  (((x) & 0x1) << 2)
+#define   G_028800_Z_WRITE_ENABLE(x)                                  (((x) >> 2) & 0x1)
+#define   C_028800_Z_WRITE_ENABLE                                     0xFFFFFFFB
+#define   S_028800_DEPTH_BOUNDS_ENABLE(x)                             (((x) & 0x1) << 3)
+#define   G_028800_DEPTH_BOUNDS_ENABLE(x)                             (((x) >> 3) & 0x1)
+#define   C_028800_DEPTH_BOUNDS_ENABLE                                0xFFFFFFF7
+#define   S_028800_ZFUNC(x)                                           (((x) & 0x07) << 4)
+#define   G_028800_ZFUNC(x)                                           (((x) >> 4) & 0x07)
+#define   C_028800_ZFUNC                                              0xFFFFFF8F
+#define     V_028800_FRAG_NEVER                                     0x00
+#define     V_028800_FRAG_LESS                                      0x01
+#define     V_028800_FRAG_EQUAL                                     0x02
+#define     V_028800_FRAG_LEQUAL                                    0x03
+#define     V_028800_FRAG_GREATER                                   0x04
+#define     V_028800_FRAG_NOTEQUAL                                  0x05
+#define     V_028800_FRAG_GEQUAL                                    0x06
+#define     V_028800_FRAG_ALWAYS                                    0x07
+#define   S_028800_BACKFACE_ENABLE(x)                                 (((x) & 0x1) << 7)
+#define   G_028800_BACKFACE_ENABLE(x)                                 (((x) >> 7) & 0x1)
+#define   C_028800_BACKFACE_ENABLE                                    0xFFFFFF7F
+#define   S_028800_STENCILFUNC(x)                                     (((x) & 0x07) << 8)
+#define   G_028800_STENCILFUNC(x)                                     (((x) >> 8) & 0x07)
+#define   C_028800_STENCILFUNC                                        0xFFFFF8FF
+#define     V_028800_REF_NEVER                                      0x00
+#define     V_028800_REF_LESS                                       0x01
+#define     V_028800_REF_EQUAL                                      0x02
+#define     V_028800_REF_LEQUAL                                     0x03
+#define     V_028800_REF_GREATER                                    0x04
+#define     V_028800_REF_NOTEQUAL                                   0x05
+#define     V_028800_REF_GEQUAL                                     0x06
+#define     V_028800_REF_ALWAYS                                     0x07
+#define   S_028800_STENCILFUNC_BF(x)                                  (((x) & 0x07) << 20)
+#define   G_028800_STENCILFUNC_BF(x)                                  (((x) >> 20) & 0x07)
+#define   C_028800_STENCILFUNC_BF                                     0xFF8FFFFF
+#define     V_028800_REF_NEVER                                      0x00
+#define     V_028800_REF_LESS                                       0x01
+#define     V_028800_REF_EQUAL                                      0x02
+#define     V_028800_REF_LEQUAL                                     0x03
+#define     V_028800_REF_GREATER                                    0x04
+#define     V_028800_REF_NOTEQUAL                                   0x05
+#define     V_028800_REF_GEQUAL                                     0x06
+#define     V_028800_REF_ALWAYS                                     0x07
+#define   S_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL(x)               (((x) & 0x1) << 30)
+#define   G_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL(x)               (((x) >> 30) & 0x1)
+#define   C_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL                  0xBFFFFFFF
+#define   S_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x)              (((x) & 0x1) << 31)
+#define   G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x)              (((x) >> 31) & 0x1)
+#define   C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS                 0x7FFFFFFF
+#define R_028804_DB_EQAA                                                0x028804
+#define R_028808_CB_COLOR_CONTROL                                       0x028808
+#define   S_028808_DEGAMMA_ENABLE(x)                                  (((x) & 0x1) << 3)
+#define   G_028808_DEGAMMA_ENABLE(x)                                  (((x) >> 3) & 0x1)
+#define   C_028808_DEGAMMA_ENABLE                                     0xFFFFFFF7
+#define   S_028808_MODE(x)                                            (((x) & 0x07) << 4)
+#define   G_028808_MODE(x)                                            (((x) >> 4) & 0x07)
+#define   C_028808_MODE                                               0xFFFFFF8F
+#define     V_028808_CB_DISABLE                                     0x00
+#define     V_028808_CB_NORMAL                                      0x01
+#define     V_028808_CB_ELIMINATE_FAST_CLEAR                        0x02
+#define     V_028808_CB_RESOLVE                                     0x03
+#define     V_028808_CB_FMASK_DECOMPRESS                            0x05
+#define   S_028808_ROP3(x)                                            (((x) & 0xFF) << 16)
+#define   G_028808_ROP3(x)                                            (((x) >> 16) & 0xFF)
+#define   C_028808_ROP3                                               0xFF00FFFF
+#define     V_028808_X_0X00                                         0x00
+#define     V_028808_X_0X05                                         0x05
+#define     V_028808_X_0X0A                                         0x0A
+#define     V_028808_X_0X0F                                         0x0F
+#define     V_028808_X_0X11                                         0x11
+#define     V_028808_X_0X22                                         0x22
+#define     V_028808_X_0X33                                         0x33
+#define     V_028808_X_0X44                                         0x44
+#define     V_028808_X_0X50                                         0x50
+#define     V_028808_X_0X55                                         0x55
+#define     V_028808_X_0X5A                                         0x5A
+#define     V_028808_X_0X5F                                         0x5F
+#define     V_028808_X_0X66                                         0x66
+#define     V_028808_X_0X77                                         0x77
+#define     V_028808_X_0X88                                         0x88
+#define     V_028808_X_0X99                                         0x99
+#define     V_028808_X_0XA0                                         0xA0
+#define     V_028808_X_0XA5                                         0xA5
+#define     V_028808_X_0XAA                                         0xAA
+#define     V_028808_X_0XAF                                         0xAF
+#define     V_028808_X_0XBB                                         0xBB
+#define     V_028808_X_0XCC                                         0xCC
+#define     V_028808_X_0XDD                                         0xDD
+#define     V_028808_X_0XEE                                         0xEE
+#define     V_028808_X_0XF0                                         0xF0
+#define     V_028808_X_0XF5                                         0xF5
+#define     V_028808_X_0XFA                                         0xFA
+#define     V_028808_X_0XFF                                         0xFF
+#define R_02880C_DB_SHADER_CONTROL                                      0x02880C
+#define   S_02880C_Z_EXPORT_ENABLE(x)                                 (((x) & 0x1) << 0)
+#define   G_02880C_Z_EXPORT_ENABLE(x)                                 (((x) >> 0) & 0x1)
+#define   C_02880C_Z_EXPORT_ENABLE                                    0xFFFFFFFE
+#define   S_02880C_STENCIL_TEST_VAL_EXPORT_ENAB(x)                    (((x) & 0x1) << 1)
+#define   G_02880C_STENCIL_TEST_VAL_EXPORT_ENAB(x)                    (((x) >> 1) & 0x1)
+#define   C_02880C_STENCIL_TEST_VAL_EXPORT_ENAB                       0xFFFFFFFD
+#define   S_02880C_STENCIL_OP_VAL_EXPORT_ENABLE(x)                    (((x) & 0x1) << 2)
+#define   G_02880C_STENCIL_OP_VAL_EXPORT_ENABLE(x)                    (((x) >> 2) & 0x1)
+#define   C_02880C_STENCIL_OP_VAL_EXPORT_ENABLE                       0xFFFFFFFB
+#define   S_02880C_Z_ORDER(x)                                         (((x) & 0x03) << 4)
+#define   G_02880C_Z_ORDER(x)                                         (((x) >> 4) & 0x03)
+#define   C_02880C_Z_ORDER                                            0xFFFFFFCF
+#define     V_02880C_LATE_Z                                         0x00
+#define     V_02880C_EARLY_Z_THEN_LATE_Z                            0x01
+#define     V_02880C_RE_Z                                           0x02
+#define     V_02880C_EARLY_Z_THEN_RE_Z                              0x03
+#define   S_02880C_KILL_ENABLE(x)                                     (((x) & 0x1) << 6)
+#define   G_02880C_KILL_ENABLE(x)                                     (((x) >> 6) & 0x1)
+#define   C_02880C_KILL_ENABLE                                        0xFFFFFFBF
+#define   S_02880C_COVERAGE_TO_MASK_ENABLE(x)                         (((x) & 0x1) << 7)
+#define   G_02880C_COVERAGE_TO_MASK_ENABLE(x)                         (((x) >> 7) & 0x1)
+#define   C_02880C_COVERAGE_TO_MASK_ENABLE                            0xFFFFFF7F
+#define   S_02880C_MASK_EXPORT_ENABLE(x)                              (((x) & 0x1) << 8)
+#define   G_02880C_MASK_EXPORT_ENABLE(x)                              (((x) >> 8) & 0x1)
+#define   C_02880C_MASK_EXPORT_ENABLE                                 0xFFFFFEFF
+#define   S_02880C_EXEC_ON_HIER_FAIL(x)                               (((x) & 0x1) << 9)
+#define   G_02880C_EXEC_ON_HIER_FAIL(x)                               (((x) >> 9) & 0x1)
+#define   C_02880C_EXEC_ON_HIER_FAIL                                  0xFFFFFDFF
+#define   S_02880C_EXEC_ON_NOOP(x)                                    (((x) & 0x1) << 10)
+#define   G_02880C_EXEC_ON_NOOP(x)                                    (((x) >> 10) & 0x1)
+#define   C_02880C_EXEC_ON_NOOP                                       0xFFFFFBFF
+#define   S_02880C_ALPHA_TO_MASK_DISABLE(x)                           (((x) & 0x1) << 11)
+#define   G_02880C_ALPHA_TO_MASK_DISABLE(x)                           (((x) >> 11) & 0x1)
+#define   C_02880C_ALPHA_TO_MASK_DISABLE                              0xFFFFF7FF
+#define   S_02880C_DEPTH_BEFORE_SHADER(x)                             (((x) & 0x1) << 12)
+#define   G_02880C_DEPTH_BEFORE_SHADER(x)                             (((x) >> 12) & 0x1)
+#define   C_02880C_DEPTH_BEFORE_SHADER                                0xFFFFEFFF
+#define R_028810_PA_CL_CLIP_CNTL                                        0x028810
+#define   S_028810_UCP_ENA_0(x)                                       (((x) & 0x1) << 0)
+#define   G_028810_UCP_ENA_0(x)                                       (((x) >> 0) & 0x1)
+#define   C_028810_UCP_ENA_0                                          0xFFFFFFFE
+#define   S_028810_UCP_ENA_1(x)                                       (((x) & 0x1) << 1)
+#define   G_028810_UCP_ENA_1(x)                                       (((x) >> 1) & 0x1)
+#define   C_028810_UCP_ENA_1                                          0xFFFFFFFD
+#define   S_028810_UCP_ENA_2(x)                                       (((x) & 0x1) << 2)
+#define   G_028810_UCP_ENA_2(x)                                       (((x) >> 2) & 0x1)
+#define   C_028810_UCP_ENA_2                                          0xFFFFFFFB
+#define   S_028810_UCP_ENA_3(x)                                       (((x) & 0x1) << 3)
+#define   G_028810_UCP_ENA_3(x)                                       (((x) >> 3) & 0x1)
+#define   C_028810_UCP_ENA_3                                          0xFFFFFFF7
+#define   S_028810_UCP_ENA_4(x)                                       (((x) & 0x1) << 4)
+#define   G_028810_UCP_ENA_4(x)                                       (((x) >> 4) & 0x1)
+#define   C_028810_UCP_ENA_4                                          0xFFFFFFEF
+#define   S_028810_UCP_ENA_5(x)                                       (((x) & 0x1) << 5)
+#define   G_028810_UCP_ENA_5(x)                                       (((x) >> 5) & 0x1)
+#define   C_028810_UCP_ENA_5                                          0xFFFFFFDF
+#define   S_028810_PS_UCP_Y_SCALE_NEG(x)                              (((x) & 0x1) << 13)
+#define   G_028810_PS_UCP_Y_SCALE_NEG(x)                              (((x) >> 13) & 0x1)
+#define   C_028810_PS_UCP_Y_SCALE_NEG                                 0xFFFFDFFF
+#define   S_028810_PS_UCP_MODE(x)                                     (((x) & 0x03) << 14)
+#define   G_028810_PS_UCP_MODE(x)                                     (((x) >> 14) & 0x03)
+#define   C_028810_PS_UCP_MODE                                        0xFFFF3FFF
+#define   S_028810_CLIP_DISABLE(x)                                    (((x) & 0x1) << 16)
+#define   G_028810_CLIP_DISABLE(x)                                    (((x) >> 16) & 0x1)
+#define   C_028810_CLIP_DISABLE                                       0xFFFEFFFF
+#define   S_028810_UCP_CULL_ONLY_ENA(x)                               (((x) & 0x1) << 17)
+#define   G_028810_UCP_CULL_ONLY_ENA(x)                               (((x) >> 17) & 0x1)
+#define   C_028810_UCP_CULL_ONLY_ENA                                  0xFFFDFFFF
+#define   S_028810_BOUNDARY_EDGE_FLAG_ENA(x)                          (((x) & 0x1) << 18)
+#define   G_028810_BOUNDARY_EDGE_FLAG_ENA(x)                          (((x) >> 18) & 0x1)
+#define   C_028810_BOUNDARY_EDGE_FLAG_ENA                             0xFFFBFFFF
+#define   S_028810_DX_CLIP_SPACE_DEF(x)                               (((x) & 0x1) << 19)
+#define   G_028810_DX_CLIP_SPACE_DEF(x)                               (((x) >> 19) & 0x1)
+#define   C_028810_DX_CLIP_SPACE_DEF                                  0xFFF7FFFF
+#define   S_028810_DIS_CLIP_ERR_DETECT(x)                             (((x) & 0x1) << 20)
+#define   G_028810_DIS_CLIP_ERR_DETECT(x)                             (((x) >> 20) & 0x1)
+#define   C_028810_DIS_CLIP_ERR_DETECT                                0xFFEFFFFF
+#define   S_028810_VTX_KILL_OR(x)                                     (((x) & 0x1) << 21)
+#define   G_028810_VTX_KILL_OR(x)                                     (((x) >> 21) & 0x1)
+#define   C_028810_VTX_KILL_OR                                        0xFFDFFFFF
+#define   S_028810_DX_RASTERIZATION_KILL(x)                           (((x) & 0x1) << 22)
+#define   G_028810_DX_RASTERIZATION_KILL(x)                           (((x) >> 22) & 0x1)
+#define   C_028810_DX_RASTERIZATION_KILL                              0xFFBFFFFF
+#define   S_028810_DX_LINEAR_ATTR_CLIP_ENA(x)                         (((x) & 0x1) << 24)
+#define   G_028810_DX_LINEAR_ATTR_CLIP_ENA(x)                         (((x) >> 24) & 0x1)
+#define   C_028810_DX_LINEAR_ATTR_CLIP_ENA                            0xFEFFFFFF
+#define   S_028810_VTE_VPORT_PROVOKE_DISABLE(x)                       (((x) & 0x1) << 25)
+#define   G_028810_VTE_VPORT_PROVOKE_DISABLE(x)                       (((x) >> 25) & 0x1)
+#define   C_028810_VTE_VPORT_PROVOKE_DISABLE                          0xFDFFFFFF
+#define   S_028810_ZCLIP_NEAR_DISABLE(x)                              (((x) & 0x1) << 26)
+#define   G_028810_ZCLIP_NEAR_DISABLE(x)                              (((x) >> 26) & 0x1)
+#define   C_028810_ZCLIP_NEAR_DISABLE                                 0xFBFFFFFF
+#define   S_028810_ZCLIP_FAR_DISABLE(x)                               (((x) & 0x1) << 27)
+#define   G_028810_ZCLIP_FAR_DISABLE(x)                               (((x) >> 27) & 0x1)
+#define   C_028810_ZCLIP_FAR_DISABLE                                  0xF7FFFFFF
+#define R_028814_PA_SU_SC_MODE_CNTL                                     0x028814
+#define   S_028814_CULL_FRONT(x)                                      (((x) & 0x1) << 0)
+#define   G_028814_CULL_FRONT(x)                                      (((x) >> 0) & 0x1)
+#define   C_028814_CULL_FRONT                                         0xFFFFFFFE
+#define   S_028814_CULL_BACK(x)                                       (((x) & 0x1) << 1)
+#define   G_028814_CULL_BACK(x)                                       (((x) >> 1) & 0x1)
+#define   C_028814_CULL_BACK                                          0xFFFFFFFD
+#define   S_028814_FACE(x)                                            (((x) & 0x1) << 2)
+#define   G_028814_FACE(x)                                            (((x) >> 2) & 0x1)
+#define   C_028814_FACE                                               0xFFFFFFFB
+#define   S_028814_POLY_MODE(x)                                       (((x) & 0x03) << 3)
+#define   G_028814_POLY_MODE(x)                                       (((x) >> 3) & 0x03)
+#define   C_028814_POLY_MODE                                          0xFFFFFFE7
+#define     V_028814_X_DISABLE_POLY_MODE                            0x00
+#define     V_028814_X_DUAL_MODE                                    0x01
+#define   S_028814_POLYMODE_FRONT_PTYPE(x)                            (((x) & 0x07) << 5)
+#define   G_028814_POLYMODE_FRONT_PTYPE(x)                            (((x) >> 5) & 0x07)
+#define   C_028814_POLYMODE_FRONT_PTYPE                               0xFFFFFF1F
+#define     V_028814_X_DRAW_POINTS                                  0x00
+#define     V_028814_X_DRAW_LINES                                   0x01
+#define     V_028814_X_DRAW_TRIANGLES                               0x02
+#define   S_028814_POLYMODE_BACK_PTYPE(x)                             (((x) & 0x07) << 8)
+#define   G_028814_POLYMODE_BACK_PTYPE(x)                             (((x) >> 8) & 0x07)
+#define   C_028814_POLYMODE_BACK_PTYPE                                0xFFFFF8FF
+#define     V_028814_X_DRAW_POINTS                                  0x00
+#define     V_028814_X_DRAW_LINES                                   0x01
+#define     V_028814_X_DRAW_TRIANGLES                               0x02
+#define   S_028814_POLY_OFFSET_FRONT_ENABLE(x)                        (((x) & 0x1) << 11)
+#define   G_028814_POLY_OFFSET_FRONT_ENABLE(x)                        (((x) >> 11) & 0x1)
+#define   C_028814_POLY_OFFSET_FRONT_ENABLE                           0xFFFFF7FF
+#define   S_028814_POLY_OFFSET_BACK_ENABLE(x)                         (((x) & 0x1) << 12)
+#define   G_028814_POLY_OFFSET_BACK_ENABLE(x)                         (((x) >> 12) & 0x1)
+#define   C_028814_POLY_OFFSET_BACK_ENABLE                            0xFFFFEFFF
+#define   S_028814_POLY_OFFSET_PARA_ENABLE(x)                         (((x) & 0x1) << 13)
+#define   G_028814_POLY_OFFSET_PARA_ENABLE(x)                         (((x) >> 13) & 0x1)
+#define   C_028814_POLY_OFFSET_PARA_ENABLE                            0xFFFFDFFF
+#define   S_028814_VTX_WINDOW_OFFSET_ENABLE(x)                        (((x) & 0x1) << 16)
+#define   G_028814_VTX_WINDOW_OFFSET_ENABLE(x)                        (((x) >> 16) & 0x1)
+#define   C_028814_VTX_WINDOW_OFFSET_ENABLE                           0xFFFEFFFF
+#define   S_028814_PROVOKING_VTX_LAST(x)                              (((x) & 0x1) << 19)
+#define   G_028814_PROVOKING_VTX_LAST(x)                              (((x) >> 19) & 0x1)
+#define   C_028814_PROVOKING_VTX_LAST                                 0xFFF7FFFF
+#define   S_028814_PERSP_CORR_DIS(x)                                  (((x) & 0x1) << 20)
+#define   G_028814_PERSP_CORR_DIS(x)                                  (((x) >> 20) & 0x1)
+#define   C_028814_PERSP_CORR_DIS                                     0xFFEFFFFF
+#define   S_028814_MULTI_PRIM_IB_ENA(x)                               (((x) & 0x1) << 21)
+#define   G_028814_MULTI_PRIM_IB_ENA(x)                               (((x) >> 21) & 0x1)
+#define   C_028814_MULTI_PRIM_IB_ENA                                  0xFFDFFFFF
+#define R_028818_PA_CL_VTE_CNTL                                         0x028818
+#define   S_028818_VPORT_X_SCALE_ENA(x)                               (((x) & 0x1) << 0)
+#define   G_028818_VPORT_X_SCALE_ENA(x)                               (((x) >> 0) & 0x1)
+#define   C_028818_VPORT_X_SCALE_ENA                                  0xFFFFFFFE
+#define   S_028818_VPORT_X_OFFSET_ENA(x)                              (((x) & 0x1) << 1)
+#define   G_028818_VPORT_X_OFFSET_ENA(x)                              (((x) >> 1) & 0x1)
+#define   C_028818_VPORT_X_OFFSET_ENA                                 0xFFFFFFFD
+#define   S_028818_VPORT_Y_SCALE_ENA(x)                               (((x) & 0x1) << 2)
+#define   G_028818_VPORT_Y_SCALE_ENA(x)                               (((x) >> 2) & 0x1)
+#define   C_028818_VPORT_Y_SCALE_ENA                                  0xFFFFFFFB
+#define   S_028818_VPORT_Y_OFFSET_ENA(x)                              (((x) & 0x1) << 3)
+#define   G_028818_VPORT_Y_OFFSET_ENA(x)                              (((x) >> 3) & 0x1)
+#define   C_028818_VPORT_Y_OFFSET_ENA                                 0xFFFFFFF7
+#define   S_028818_VPORT_Z_SCALE_ENA(x)                               (((x) & 0x1) << 4)
+#define   G_028818_VPORT_Z_SCALE_ENA(x)                               (((x) >> 4) & 0x1)
+#define   C_028818_VPORT_Z_SCALE_ENA                                  0xFFFFFFEF
+#define   S_028818_VPORT_Z_OFFSET_ENA(x)                              (((x) & 0x1) << 5)
+#define   G_028818_VPORT_Z_OFFSET_ENA(x)                              (((x) >> 5) & 0x1)
+#define   C_028818_VPORT_Z_OFFSET_ENA                                 0xFFFFFFDF
+#define   S_028818_VTX_XY_FMT(x)                                      (((x) & 0x1) << 8)
+#define   G_028818_VTX_XY_FMT(x)                                      (((x) >> 8) & 0x1)
+#define   C_028818_VTX_XY_FMT                                         0xFFFFFEFF
+#define   S_028818_VTX_Z_FMT(x)                                       (((x) & 0x1) << 9)
+#define   G_028818_VTX_Z_FMT(x)                                       (((x) >> 9) & 0x1)
+#define   C_028818_VTX_Z_FMT                                          0xFFFFFDFF
+#define   S_028818_VTX_W0_FMT(x)                                      (((x) & 0x1) << 10)
+#define   G_028818_VTX_W0_FMT(x)                                      (((x) >> 10) & 0x1)
+#define   C_028818_VTX_W0_FMT                                         0xFFFFFBFF
+#define R_02881C_PA_CL_VS_OUT_CNTL                                      0x02881C
+#define   S_02881C_CLIP_DIST_ENA_0(x)                                 (((x) & 0x1) << 0)
+#define   G_02881C_CLIP_DIST_ENA_0(x)                                 (((x) >> 0) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_0                                    0xFFFFFFFE
+#define   S_02881C_CLIP_DIST_ENA_1(x)                                 (((x) & 0x1) << 1)
+#define   G_02881C_CLIP_DIST_ENA_1(x)                                 (((x) >> 1) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_1                                    0xFFFFFFFD
+#define   S_02881C_CLIP_DIST_ENA_2(x)                                 (((x) & 0x1) << 2)
+#define   G_02881C_CLIP_DIST_ENA_2(x)                                 (((x) >> 2) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_2                                    0xFFFFFFFB
+#define   S_02881C_CLIP_DIST_ENA_3(x)                                 (((x) & 0x1) << 3)
+#define   G_02881C_CLIP_DIST_ENA_3(x)                                 (((x) >> 3) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_3                                    0xFFFFFFF7
+#define   S_02881C_CLIP_DIST_ENA_4(x)                                 (((x) & 0x1) << 4)
+#define   G_02881C_CLIP_DIST_ENA_4(x)                                 (((x) >> 4) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_4                                    0xFFFFFFEF
+#define   S_02881C_CLIP_DIST_ENA_5(x)                                 (((x) & 0x1) << 5)
+#define   G_02881C_CLIP_DIST_ENA_5(x)                                 (((x) >> 5) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_5                                    0xFFFFFFDF
+#define   S_02881C_CLIP_DIST_ENA_6(x)                                 (((x) & 0x1) << 6)
+#define   G_02881C_CLIP_DIST_ENA_6(x)                                 (((x) >> 6) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_6                                    0xFFFFFFBF
+#define   S_02881C_CLIP_DIST_ENA_7(x)                                 (((x) & 0x1) << 7)
+#define   G_02881C_CLIP_DIST_ENA_7(x)                                 (((x) >> 7) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_7                                    0xFFFFFF7F
+#define   S_02881C_CULL_DIST_ENA_0(x)                                 (((x) & 0x1) << 8)
+#define   G_02881C_CULL_DIST_ENA_0(x)                                 (((x) >> 8) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_0                                    0xFFFFFEFF
+#define   S_02881C_CULL_DIST_ENA_1(x)                                 (((x) & 0x1) << 9)
+#define   G_02881C_CULL_DIST_ENA_1(x)                                 (((x) >> 9) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_1                                    0xFFFFFDFF
+#define   S_02881C_CULL_DIST_ENA_2(x)                                 (((x) & 0x1) << 10)
+#define   G_02881C_CULL_DIST_ENA_2(x)                                 (((x) >> 10) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_2                                    0xFFFFFBFF
+#define   S_02881C_CULL_DIST_ENA_3(x)                                 (((x) & 0x1) << 11)
+#define   G_02881C_CULL_DIST_ENA_3(x)                                 (((x) >> 11) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_3                                    0xFFFFF7FF
+#define   S_02881C_CULL_DIST_ENA_4(x)                                 (((x) & 0x1) << 12)
+#define   G_02881C_CULL_DIST_ENA_4(x)                                 (((x) >> 12) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_4                                    0xFFFFEFFF
+#define   S_02881C_CULL_DIST_ENA_5(x)                                 (((x) & 0x1) << 13)
+#define   G_02881C_CULL_DIST_ENA_5(x)                                 (((x) >> 13) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_5                                    0xFFFFDFFF
+#define   S_02881C_CULL_DIST_ENA_6(x)                                 (((x) & 0x1) << 14)
+#define   G_02881C_CULL_DIST_ENA_6(x)                                 (((x) >> 14) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_6                                    0xFFFFBFFF
+#define   S_02881C_CULL_DIST_ENA_7(x)                                 (((x) & 0x1) << 15)
+#define   G_02881C_CULL_DIST_ENA_7(x)                                 (((x) >> 15) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_7                                    0xFFFF7FFF
+#define   S_02881C_USE_VTX_POINT_SIZE(x)                              (((x) & 0x1) << 16)
+#define   G_02881C_USE_VTX_POINT_SIZE(x)                              (((x) >> 16) & 0x1)
+#define   C_02881C_USE_VTX_POINT_SIZE                                 0xFFFEFFFF
+#define   S_02881C_USE_VTX_EDGE_FLAG(x)                               (((x) & 0x1) << 17)
+#define   G_02881C_USE_VTX_EDGE_FLAG(x)                               (((x) >> 17) & 0x1)
+#define   C_02881C_USE_VTX_EDGE_FLAG                                  0xFFFDFFFF
+#define   S_02881C_USE_VTX_RENDER_TARGET_INDX(x)                      (((x) & 0x1) << 18)
+#define   G_02881C_USE_VTX_RENDER_TARGET_INDX(x)                      (((x) >> 18) & 0x1)
+#define   C_02881C_USE_VTX_RENDER_TARGET_INDX                         0xFFFBFFFF
+#define   S_02881C_USE_VTX_VIEWPORT_INDX(x)                           (((x) & 0x1) << 19)
+#define   G_02881C_USE_VTX_VIEWPORT_INDX(x)                           (((x) >> 19) & 0x1)
+#define   C_02881C_USE_VTX_VIEWPORT_INDX                              0xFFF7FFFF
+#define   S_02881C_USE_VTX_KILL_FLAG(x)                               (((x) & 0x1) << 20)
+#define   G_02881C_USE_VTX_KILL_FLAG(x)                               (((x) >> 20) & 0x1)
+#define   C_02881C_USE_VTX_KILL_FLAG                                  0xFFEFFFFF
+#define   S_02881C_VS_OUT_MISC_VEC_ENA(x)                             (((x) & 0x1) << 21)
+#define   G_02881C_VS_OUT_MISC_VEC_ENA(x)                             (((x) >> 21) & 0x1)
+#define   C_02881C_VS_OUT_MISC_VEC_ENA                                0xFFDFFFFF
+#define   S_02881C_VS_OUT_CCDIST0_VEC_ENA(x)                          (((x) & 0x1) << 22)
+#define   G_02881C_VS_OUT_CCDIST0_VEC_ENA(x)                          (((x) >> 22) & 0x1)
+#define   C_02881C_VS_OUT_CCDIST0_VEC_ENA                             0xFFBFFFFF
+#define   S_02881C_VS_OUT_CCDIST1_VEC_ENA(x)                          (((x) & 0x1) << 23)
+#define   G_02881C_VS_OUT_CCDIST1_VEC_ENA(x)                          (((x) >> 23) & 0x1)
+#define   C_02881C_VS_OUT_CCDIST1_VEC_ENA                             0xFF7FFFFF
+#define   S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(x)                        (((x) & 0x1) << 24)
+#define   G_02881C_VS_OUT_MISC_SIDE_BUS_ENA(x)                        (((x) >> 24) & 0x1)
+#define   C_02881C_VS_OUT_MISC_SIDE_BUS_ENA                           0xFEFFFFFF
+#define   S_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) & 0x1) << 25)
+#define   G_02881C_USE_VTX_GS_CUT_FLAG(x)                             (((x) >> 25) & 0x1)
+#define   C_02881C_USE_VTX_GS_CUT_FLAG                                0xFDFFFFFF
+#define R_028820_PA_CL_NANINF_CNTL                                      0x028820
+#define   S_028820_VTE_XY_INF_DISCARD(x)                              (((x) & 0x1) << 0)
+#define   G_028820_VTE_XY_INF_DISCARD(x)                              (((x) >> 0) & 0x1)
+#define   C_028820_VTE_XY_INF_DISCARD                                 0xFFFFFFFE
+#define   S_028820_VTE_Z_INF_DISCARD(x)                               (((x) & 0x1) << 1)
+#define   G_028820_VTE_Z_INF_DISCARD(x)                               (((x) >> 1) & 0x1)
+#define   C_028820_VTE_Z_INF_DISCARD                                  0xFFFFFFFD
+#define   S_028820_VTE_W_INF_DISCARD(x)                               (((x) & 0x1) << 2)
+#define   G_028820_VTE_W_INF_DISCARD(x)                               (((x) >> 2) & 0x1)
+#define   C_028820_VTE_W_INF_DISCARD                                  0xFFFFFFFB
+#define   S_028820_VTE_0XNANINF_IS_0(x)                               (((x) & 0x1) << 3)
+#define   G_028820_VTE_0XNANINF_IS_0(x)                               (((x) >> 3) & 0x1)
+#define   C_028820_VTE_0XNANINF_IS_0                                  0xFFFFFFF7
+#define   S_028820_VTE_XY_NAN_RETAIN(x)                               (((x) & 0x1) << 4)
+#define   G_028820_VTE_XY_NAN_RETAIN(x)                               (((x) >> 4) & 0x1)
+#define   C_028820_VTE_XY_NAN_RETAIN                                  0xFFFFFFEF
+#define   S_028820_VTE_Z_NAN_RETAIN(x)                                (((x) & 0x1) << 5)
+#define   G_028820_VTE_Z_NAN_RETAIN(x)                                (((x) >> 5) & 0x1)
+#define   C_028820_VTE_Z_NAN_RETAIN                                   0xFFFFFFDF
+#define   S_028820_VTE_W_NAN_RETAIN(x)                                (((x) & 0x1) << 6)
+#define   G_028820_VTE_W_NAN_RETAIN(x)                                (((x) >> 6) & 0x1)
+#define   C_028820_VTE_W_NAN_RETAIN                                   0xFFFFFFBF
+#define   S_028820_VTE_W_RECIP_NAN_IS_0(x)                            (((x) & 0x1) << 7)
+#define   G_028820_VTE_W_RECIP_NAN_IS_0(x)                            (((x) >> 7) & 0x1)
+#define   C_028820_VTE_W_RECIP_NAN_IS_0                               0xFFFFFF7F
+#define   S_028820_VS_XY_NAN_TO_INF(x)                                (((x) & 0x1) << 8)
+#define   G_028820_VS_XY_NAN_TO_INF(x)                                (((x) >> 8) & 0x1)
+#define   C_028820_VS_XY_NAN_TO_INF                                   0xFFFFFEFF
+#define   S_028820_VS_XY_INF_RETAIN(x)                                (((x) & 0x1) << 9)
+#define   G_028820_VS_XY_INF_RETAIN(x)                                (((x) >> 9) & 0x1)
+#define   C_028820_VS_XY_INF_RETAIN                                   0xFFFFFDFF
+#define   S_028820_VS_Z_NAN_TO_INF(x)                                 (((x) & 0x1) << 10)
+#define   G_028820_VS_Z_NAN_TO_INF(x)                                 (((x) >> 10) & 0x1)
+#define   C_028820_VS_Z_NAN_TO_INF                                    0xFFFFFBFF
+#define   S_028820_VS_Z_INF_RETAIN(x)                                 (((x) & 0x1) << 11)
+#define   G_028820_VS_Z_INF_RETAIN(x)                                 (((x) >> 11) & 0x1)
+#define   C_028820_VS_Z_INF_RETAIN                                    0xFFFFF7FF
+#define   S_028820_VS_W_NAN_TO_INF(x)                                 (((x) & 0x1) << 12)
+#define   G_028820_VS_W_NAN_TO_INF(x)                                 (((x) >> 12) & 0x1)
+#define   C_028820_VS_W_NAN_TO_INF                                    0xFFFFEFFF
+#define   S_028820_VS_W_INF_RETAIN(x)                                 (((x) & 0x1) << 13)
+#define   G_028820_VS_W_INF_RETAIN(x)                                 (((x) >> 13) & 0x1)
+#define   C_028820_VS_W_INF_RETAIN                                    0xFFFFDFFF
+#define   S_028820_VS_CLIP_DIST_INF_DISCARD(x)                        (((x) & 0x1) << 14)
+#define   G_028820_VS_CLIP_DIST_INF_DISCARD(x)                        (((x) >> 14) & 0x1)
+#define   C_028820_VS_CLIP_DIST_INF_DISCARD                           0xFFFFBFFF
+#define   S_028820_VTE_NO_OUTPUT_NEG_0(x)                             (((x) & 0x1) << 20)
+#define   G_028820_VTE_NO_OUTPUT_NEG_0(x)                             (((x) >> 20) & 0x1)
+#define   C_028820_VTE_NO_OUTPUT_NEG_0                                0xFFEFFFFF
+#define R_028824_PA_SU_LINE_STIPPLE_CNTL                                0x028824
+#define   S_028824_LINE_STIPPLE_RESET(x)                              (((x) & 0x03) << 0)
+#define   G_028824_LINE_STIPPLE_RESET(x)                              (((x) >> 0) & 0x03)
+#define   C_028824_LINE_STIPPLE_RESET                                 0xFFFFFFFC
+#define   S_028824_EXPAND_FULL_LENGTH(x)                              (((x) & 0x1) << 2)
+#define   G_028824_EXPAND_FULL_LENGTH(x)                              (((x) >> 2) & 0x1)
+#define   C_028824_EXPAND_FULL_LENGTH                                 0xFFFFFFFB
+#define   S_028824_FRACTIONAL_ACCUM(x)                                (((x) & 0x1) << 3)
+#define   G_028824_FRACTIONAL_ACCUM(x)                                (((x) >> 3) & 0x1)
+#define   C_028824_FRACTIONAL_ACCUM                                   0xFFFFFFF7
+#define   S_028824_DIAMOND_ADJUST(x)                                  (((x) & 0x1) << 4)
+#define   G_028824_DIAMOND_ADJUST(x)                                  (((x) >> 4) & 0x1)
+#define   C_028824_DIAMOND_ADJUST                                     0xFFFFFFEF
+#define R_028828_PA_SU_LINE_STIPPLE_SCALE                               0x028828
+#define R_02882C_PA_SU_PRIM_FILTER_CNTL                                 0x02882C
+#define   S_02882C_TRIANGLE_FILTER_DISABLE(x)                         (((x) & 0x1) << 0)
+#define   G_02882C_TRIANGLE_FILTER_DISABLE(x)                         (((x) >> 0) & 0x1)
+#define   C_02882C_TRIANGLE_FILTER_DISABLE                            0xFFFFFFFE
+#define   S_02882C_LINE_FILTER_DISABLE(x)                             (((x) & 0x1) << 1)
+#define   G_02882C_LINE_FILTER_DISABLE(x)                             (((x) >> 1) & 0x1)
+#define   C_02882C_LINE_FILTER_DISABLE                                0xFFFFFFFD
+#define   S_02882C_POINT_FILTER_DISABLE(x)                            (((x) & 0x1) << 2)
+#define   G_02882C_POINT_FILTER_DISABLE(x)                            (((x) >> 2) & 0x1)
+#define   C_02882C_POINT_FILTER_DISABLE                               0xFFFFFFFB
+#define   S_02882C_RECTANGLE_FILTER_DISABLE(x)                        (((x) & 0x1) << 3)
+#define   G_02882C_RECTANGLE_FILTER_DISABLE(x)                        (((x) >> 3) & 0x1)
+#define   C_02882C_RECTANGLE_FILTER_DISABLE                           0xFFFFFFF7
+#define   S_02882C_TRIANGLE_EXPAND_ENA(x)                             (((x) & 0x1) << 4)
+#define   G_02882C_TRIANGLE_EXPAND_ENA(x)                             (((x) >> 4) & 0x1)
+#define   C_02882C_TRIANGLE_EXPAND_ENA                                0xFFFFFFEF
+#define   S_02882C_LINE_EXPAND_ENA(x)                                 (((x) & 0x1) << 5)
+#define   G_02882C_LINE_EXPAND_ENA(x)                                 (((x) >> 5) & 0x1)
+#define   C_02882C_LINE_EXPAND_ENA                                    0xFFFFFFDF
+#define   S_02882C_POINT_EXPAND_ENA(x)                                (((x) & 0x1) << 6)
+#define   G_02882C_POINT_EXPAND_ENA(x)                                (((x) >> 6) & 0x1)
+#define   C_02882C_POINT_EXPAND_ENA                                   0xFFFFFFBF
+#define   S_02882C_RECTANGLE_EXPAND_ENA(x)                            (((x) & 0x1) << 7)
+#define   G_02882C_RECTANGLE_EXPAND_ENA(x)                            (((x) >> 7) & 0x1)
+#define   C_02882C_RECTANGLE_EXPAND_ENA                               0xFFFFFF7F
+#define   S_02882C_PRIM_EXPAND_CONSTANT(x)                            (((x) & 0xFF) << 8)
+#define   G_02882C_PRIM_EXPAND_CONSTANT(x)                            (((x) >> 8) & 0xFF)
+#define   C_02882C_PRIM_EXPAND_CONSTANT                               0xFFFF00FF
+#define R_028A00_PA_SU_POINT_SIZE                                       0x028A00
+#define   S_028A00_HEIGHT(x)                                          (((x) & 0xFFFF) << 0)
+#define   G_028A00_HEIGHT(x)                                          (((x) >> 0) & 0xFFFF)
+#define   C_028A00_HEIGHT                                             0xFFFF0000
+#define   S_028A00_WIDTH(x)                                           (((x) & 0xFFFF) << 16)
+#define   G_028A00_WIDTH(x)                                           (((x) >> 16) & 0xFFFF)
+#define   C_028A00_WIDTH                                              0x0000FFFF
+#define R_028A04_PA_SU_POINT_MINMAX                                     0x028A04
+#define   S_028A04_MIN_SIZE(x)                                        (((x) & 0xFFFF) << 0)
+#define   G_028A04_MIN_SIZE(x)                                        (((x) >> 0) & 0xFFFF)
+#define   C_028A04_MIN_SIZE                                           0xFFFF0000
+#define   S_028A04_MAX_SIZE(x)                                        (((x) & 0xFFFF) << 16)
+#define   G_028A04_MAX_SIZE(x)                                        (((x) >> 16) & 0xFFFF)
+#define   C_028A04_MAX_SIZE                                           0x0000FFFF
+#define R_028A08_PA_SU_LINE_CNTL                                        0x028A08
+#define   S_028A08_WIDTH(x)                                           (((x) & 0xFFFF) << 0)
+#define   G_028A08_WIDTH(x)                                           (((x) >> 0) & 0xFFFF)
+#define   C_028A08_WIDTH                                              0xFFFF0000
+#define R_028A0C_PA_SC_LINE_STIPPLE                                     0x028A0C
+#define   S_028A0C_LINE_PATTERN(x)                                    (((x) & 0xFFFF) << 0)
+#define   G_028A0C_LINE_PATTERN(x)                                    (((x) >> 0) & 0xFFFF)
+#define   C_028A0C_LINE_PATTERN                                       0xFFFF0000
+#define   S_028A0C_REPEAT_COUNT(x)                                    (((x) & 0xFF) << 16)
+#define   G_028A0C_REPEAT_COUNT(x)                                    (((x) >> 16) & 0xFF)
+#define   C_028A0C_REPEAT_COUNT                                       0xFF00FFFF
+#define   S_028A0C_PATTERN_BIT_ORDER(x)                               (((x) & 0x1) << 28)
+#define   G_028A0C_PATTERN_BIT_ORDER(x)                               (((x) >> 28) & 0x1)
+#define   C_028A0C_PATTERN_BIT_ORDER                                  0xEFFFFFFF
+#define   S_028A0C_AUTO_RESET_CNTL(x)                                 (((x) & 0x03) << 29)
+#define   G_028A0C_AUTO_RESET_CNTL(x)                                 (((x) >> 29) & 0x03)
+#define   C_028A0C_AUTO_RESET_CNTL                                    0x9FFFFFFF
+#define R_028A10_VGT_OUTPUT_PATH_CNTL                                   0x028A10
+#define   S_028A10_PATH_SELECT(x)                                     (((x) & 0x07) << 0)
+#define   G_028A10_PATH_SELECT(x)                                     (((x) >> 0) & 0x07)
+#define   C_028A10_PATH_SELECT                                        0xFFFFFFF8
+#define     V_028A10_VGT_OUTPATH_VTX_REUSE                          0x00
+#define     V_028A10_VGT_OUTPATH_TESS_EN                            0x01
+#define     V_028A10_VGT_OUTPATH_PASSTHRU                           0x02
+#define     V_028A10_VGT_OUTPATH_GS_BLOCK                           0x03
+#define     V_028A10_VGT_OUTPATH_HS_BLOCK                           0x04
+#define R_028A14_VGT_HOS_CNTL                                           0x028A14
+#define   S_028A14_TESS_MODE(x)                                       (((x) & 0x03) << 0)
+#define   G_028A14_TESS_MODE(x)                                       (((x) >> 0) & 0x03)
+#define   C_028A14_TESS_MODE                                          0xFFFFFFFC
+#define R_028A18_VGT_HOS_MAX_TESS_LEVEL                                 0x028A18
+#define R_028A1C_VGT_HOS_MIN_TESS_LEVEL                                 0x028A1C
+#define R_028A20_VGT_HOS_REUSE_DEPTH                                    0x028A20
+#define   S_028A20_REUSE_DEPTH(x)                                     (((x) & 0xFF) << 0)
+#define   G_028A20_REUSE_DEPTH(x)                                     (((x) >> 0) & 0xFF)
+#define   C_028A20_REUSE_DEPTH                                        0xFFFFFF00
+#define R_028A24_VGT_GROUP_PRIM_TYPE                                    0x028A24
+#define   S_028A24_PRIM_TYPE(x)                                       (((x) & 0x1F) << 0)
+#define   G_028A24_PRIM_TYPE(x)                                       (((x) >> 0) & 0x1F)
+#define   C_028A24_PRIM_TYPE                                          0xFFFFFFE0
+#define     V_028A24_VGT_GRP_3D_POINT                               0x00
+#define     V_028A24_VGT_GRP_3D_LINE                                0x01
+#define     V_028A24_VGT_GRP_3D_TRI                                 0x02
+#define     V_028A24_VGT_GRP_3D_RECT                                0x03
+#define     V_028A24_VGT_GRP_3D_QUAD                                0x04
+#define     V_028A24_VGT_GRP_2D_COPY_RECT_V0                        0x05
+#define     V_028A24_VGT_GRP_2D_COPY_RECT_V1                        0x06
+#define     V_028A24_VGT_GRP_2D_COPY_RECT_V2                        0x07
+#define     V_028A24_VGT_GRP_2D_COPY_RECT_V3                        0x08
+#define     V_028A24_VGT_GRP_2D_FILL_RECT                           0x09
+#define     V_028A24_VGT_GRP_2D_LINE                                0x0A
+#define     V_028A24_VGT_GRP_2D_TRI                                 0x0B
+#define     V_028A24_VGT_GRP_PRIM_INDEX_LINE                        0x0C
+#define     V_028A24_VGT_GRP_PRIM_INDEX_TRI                         0x0D
+#define     V_028A24_VGT_GRP_PRIM_INDEX_QUAD                        0x0E
+#define     V_028A24_VGT_GRP_3D_LINE_ADJ                            0x0F
+#define     V_028A24_VGT_GRP_3D_TRI_ADJ                             0x10
+#define     V_028A24_VGT_GRP_3D_PATCH                               0x11
+#define   S_028A24_RETAIN_ORDER(x)                                    (((x) & 0x1) << 14)
+#define   G_028A24_RETAIN_ORDER(x)                                    (((x) >> 14) & 0x1)
+#define   C_028A24_RETAIN_ORDER                                       0xFFFFBFFF
+#define   S_028A24_RETAIN_QUADS(x)                                    (((x) & 0x1) << 15)
+#define   G_028A24_RETAIN_QUADS(x)                                    (((x) >> 15) & 0x1)
+#define   C_028A24_RETAIN_QUADS                                       0xFFFF7FFF
+#define   S_028A24_PRIM_ORDER(x)                                      (((x) & 0x07) << 16)
+#define   G_028A24_PRIM_ORDER(x)                                      (((x) >> 16) & 0x07)
+#define   C_028A24_PRIM_ORDER                                         0xFFF8FFFF
+#define     V_028A24_VGT_GRP_LIST                                   0x00
+#define     V_028A24_VGT_GRP_STRIP                                  0x01
+#define     V_028A24_VGT_GRP_FAN                                    0x02
+#define     V_028A24_VGT_GRP_LOOP                                   0x03
+#define     V_028A24_VGT_GRP_POLYGON                                0x04
+#define R_028A28_VGT_GROUP_FIRST_DECR                                   0x028A28
+#define   S_028A28_FIRST_DECR(x)                                      (((x) & 0x0F) << 0)
+#define   G_028A28_FIRST_DECR(x)                                      (((x) >> 0) & 0x0F)
+#define   C_028A28_FIRST_DECR                                         0xFFFFFFF0
+#define R_028A2C_VGT_GROUP_DECR                                         0x028A2C
+#define   S_028A2C_DECR(x)                                            (((x) & 0x0F) << 0)
+#define   G_028A2C_DECR(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028A2C_DECR                                               0xFFFFFFF0
+#define R_028A30_VGT_GROUP_VECT_0_CNTL                                  0x028A30
+#define   S_028A30_COMP_X_EN(x)                                       (((x) & 0x1) << 0)
+#define   G_028A30_COMP_X_EN(x)                                       (((x) >> 0) & 0x1)
+#define   C_028A30_COMP_X_EN                                          0xFFFFFFFE
+#define   S_028A30_COMP_Y_EN(x)                                       (((x) & 0x1) << 1)
+#define   G_028A30_COMP_Y_EN(x)                                       (((x) >> 1) & 0x1)
+#define   C_028A30_COMP_Y_EN                                          0xFFFFFFFD
+#define   S_028A30_COMP_Z_EN(x)                                       (((x) & 0x1) << 2)
+#define   G_028A30_COMP_Z_EN(x)                                       (((x) >> 2) & 0x1)
+#define   C_028A30_COMP_Z_EN                                          0xFFFFFFFB
+#define   S_028A30_COMP_W_EN(x)                                       (((x) & 0x1) << 3)
+#define   G_028A30_COMP_W_EN(x)                                       (((x) >> 3) & 0x1)
+#define   C_028A30_COMP_W_EN                                          0xFFFFFFF7
+#define   S_028A30_STRIDE(x)                                          (((x) & 0xFF) << 8)
+#define   G_028A30_STRIDE(x)                                          (((x) >> 8) & 0xFF)
+#define   C_028A30_STRIDE                                             0xFFFF00FF
+#define   S_028A30_SHIFT(x)                                           (((x) & 0xFF) << 16)
+#define   G_028A30_SHIFT(x)                                           (((x) >> 16) & 0xFF)
+#define   C_028A30_SHIFT                                              0xFF00FFFF
+#define R_028A34_VGT_GROUP_VECT_1_CNTL                                  0x028A34
+#define   S_028A34_COMP_X_EN(x)                                       (((x) & 0x1) << 0)
+#define   G_028A34_COMP_X_EN(x)                                       (((x) >> 0) & 0x1)
+#define   C_028A34_COMP_X_EN                                          0xFFFFFFFE
+#define   S_028A34_COMP_Y_EN(x)                                       (((x) & 0x1) << 1)
+#define   G_028A34_COMP_Y_EN(x)                                       (((x) >> 1) & 0x1)
+#define   C_028A34_COMP_Y_EN                                          0xFFFFFFFD
+#define   S_028A34_COMP_Z_EN(x)                                       (((x) & 0x1) << 2)
+#define   G_028A34_COMP_Z_EN(x)                                       (((x) >> 2) & 0x1)
+#define   C_028A34_COMP_Z_EN                                          0xFFFFFFFB
+#define   S_028A34_COMP_W_EN(x)                                       (((x) & 0x1) << 3)
+#define   G_028A34_COMP_W_EN(x)                                       (((x) >> 3) & 0x1)
+#define   C_028A34_COMP_W_EN                                          0xFFFFFFF7
+#define   S_028A34_STRIDE(x)                                          (((x) & 0xFF) << 8)
+#define   G_028A34_STRIDE(x)                                          (((x) >> 8) & 0xFF)
+#define   C_028A34_STRIDE                                             0xFFFF00FF
+#define   S_028A34_SHIFT(x)                                           (((x) & 0xFF) << 16)
+#define   G_028A34_SHIFT(x)                                           (((x) >> 16) & 0xFF)
+#define   C_028A34_SHIFT                                              0xFF00FFFF
+#define R_028A38_VGT_GROUP_VECT_0_FMT_CNTL                              0x028A38
+#define   S_028A38_X_CONV(x)                                          (((x) & 0x0F) << 0)
+#define   G_028A38_X_CONV(x)                                          (((x) >> 0) & 0x0F)
+#define   C_028A38_X_CONV                                             0xFFFFFFF0
+#define     V_028A38_VGT_GRP_INDEX_16                               0x00
+#define     V_028A38_VGT_GRP_INDEX_32                               0x01
+#define     V_028A38_VGT_GRP_UINT_16                                0x02
+#define     V_028A38_VGT_GRP_UINT_32                                0x03
+#define     V_028A38_VGT_GRP_SINT_16                                0x04
+#define     V_028A38_VGT_GRP_SINT_32                                0x05
+#define     V_028A38_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A38_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A38_X_OFFSET(x)                                        (((x) & 0x0F) << 4)
+#define   G_028A38_X_OFFSET(x)                                        (((x) >> 4) & 0x0F)
+#define   C_028A38_X_OFFSET                                           0xFFFFFF0F
+#define   S_028A38_Y_CONV(x)                                          (((x) & 0x0F) << 8)
+#define   G_028A38_Y_CONV(x)                                          (((x) >> 8) & 0x0F)
+#define   C_028A38_Y_CONV                                             0xFFFFF0FF
+#define     V_028A38_VGT_GRP_INDEX_16                               0x00
+#define     V_028A38_VGT_GRP_INDEX_32                               0x01
+#define     V_028A38_VGT_GRP_UINT_16                                0x02
+#define     V_028A38_VGT_GRP_UINT_32                                0x03
+#define     V_028A38_VGT_GRP_SINT_16                                0x04
+#define     V_028A38_VGT_GRP_SINT_32                                0x05
+#define     V_028A38_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A38_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A38_Y_OFFSET(x)                                        (((x) & 0x0F) << 12)
+#define   G_028A38_Y_OFFSET(x)                                        (((x) >> 12) & 0x0F)
+#define   C_028A38_Y_OFFSET                                           0xFFFF0FFF
+#define   S_028A38_Z_CONV(x)                                          (((x) & 0x0F) << 16)
+#define   G_028A38_Z_CONV(x)                                          (((x) >> 16) & 0x0F)
+#define   C_028A38_Z_CONV                                             0xFFF0FFFF
+#define     V_028A38_VGT_GRP_INDEX_16                               0x00
+#define     V_028A38_VGT_GRP_INDEX_32                               0x01
+#define     V_028A38_VGT_GRP_UINT_16                                0x02
+#define     V_028A38_VGT_GRP_UINT_32                                0x03
+#define     V_028A38_VGT_GRP_SINT_16                                0x04
+#define     V_028A38_VGT_GRP_SINT_32                                0x05
+#define     V_028A38_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A38_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A38_Z_OFFSET(x)                                        (((x) & 0x0F) << 20)
+#define   G_028A38_Z_OFFSET(x)                                        (((x) >> 20) & 0x0F)
+#define   C_028A38_Z_OFFSET                                           0xFF0FFFFF
+#define   S_028A38_W_CONV(x)                                          (((x) & 0x0F) << 24)
+#define   G_028A38_W_CONV(x)                                          (((x) >> 24) & 0x0F)
+#define   C_028A38_W_CONV                                             0xF0FFFFFF
+#define     V_028A38_VGT_GRP_INDEX_16                               0x00
+#define     V_028A38_VGT_GRP_INDEX_32                               0x01
+#define     V_028A38_VGT_GRP_UINT_16                                0x02
+#define     V_028A38_VGT_GRP_UINT_32                                0x03
+#define     V_028A38_VGT_GRP_SINT_16                                0x04
+#define     V_028A38_VGT_GRP_SINT_32                                0x05
+#define     V_028A38_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A38_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A38_W_OFFSET(x)                                        (((x) & 0x0F) << 28)
+#define   G_028A38_W_OFFSET(x)                                        (((x) >> 28) & 0x0F)
+#define   C_028A38_W_OFFSET                                           0x0FFFFFFF
+#define R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL                              0x028A3C
+#define   S_028A3C_X_CONV(x)                                          (((x) & 0x0F) << 0)
+#define   G_028A3C_X_CONV(x)                                          (((x) >> 0) & 0x0F)
+#define   C_028A3C_X_CONV                                             0xFFFFFFF0
+#define     V_028A3C_VGT_GRP_INDEX_16                               0x00
+#define     V_028A3C_VGT_GRP_INDEX_32                               0x01
+#define     V_028A3C_VGT_GRP_UINT_16                                0x02
+#define     V_028A3C_VGT_GRP_UINT_32                                0x03
+#define     V_028A3C_VGT_GRP_SINT_16                                0x04
+#define     V_028A3C_VGT_GRP_SINT_32                                0x05
+#define     V_028A3C_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A3C_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A3C_X_OFFSET(x)                                        (((x) & 0x0F) << 4)
+#define   G_028A3C_X_OFFSET(x)                                        (((x) >> 4) & 0x0F)
+#define   C_028A3C_X_OFFSET                                           0xFFFFFF0F
+#define   S_028A3C_Y_CONV(x)                                          (((x) & 0x0F) << 8)
+#define   G_028A3C_Y_CONV(x)                                          (((x) >> 8) & 0x0F)
+#define   C_028A3C_Y_CONV                                             0xFFFFF0FF
+#define     V_028A3C_VGT_GRP_INDEX_16                               0x00
+#define     V_028A3C_VGT_GRP_INDEX_32                               0x01
+#define     V_028A3C_VGT_GRP_UINT_16                                0x02
+#define     V_028A3C_VGT_GRP_UINT_32                                0x03
+#define     V_028A3C_VGT_GRP_SINT_16                                0x04
+#define     V_028A3C_VGT_GRP_SINT_32                                0x05
+#define     V_028A3C_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A3C_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A3C_Y_OFFSET(x)                                        (((x) & 0x0F) << 12)
+#define   G_028A3C_Y_OFFSET(x)                                        (((x) >> 12) & 0x0F)
+#define   C_028A3C_Y_OFFSET                                           0xFFFF0FFF
+#define   S_028A3C_Z_CONV(x)                                          (((x) & 0x0F) << 16)
+#define   G_028A3C_Z_CONV(x)                                          (((x) >> 16) & 0x0F)
+#define   C_028A3C_Z_CONV                                             0xFFF0FFFF
+#define     V_028A3C_VGT_GRP_INDEX_16                               0x00
+#define     V_028A3C_VGT_GRP_INDEX_32                               0x01
+#define     V_028A3C_VGT_GRP_UINT_16                                0x02
+#define     V_028A3C_VGT_GRP_UINT_32                                0x03
+#define     V_028A3C_VGT_GRP_SINT_16                                0x04
+#define     V_028A3C_VGT_GRP_SINT_32                                0x05
+#define     V_028A3C_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A3C_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A3C_Z_OFFSET(x)                                        (((x) & 0x0F) << 20)
+#define   G_028A3C_Z_OFFSET(x)                                        (((x) >> 20) & 0x0F)
+#define   C_028A3C_Z_OFFSET                                           0xFF0FFFFF
+#define   S_028A3C_W_CONV(x)                                          (((x) & 0x0F) << 24)
+#define   G_028A3C_W_CONV(x)                                          (((x) >> 24) & 0x0F)
+#define   C_028A3C_W_CONV                                             0xF0FFFFFF
+#define     V_028A3C_VGT_GRP_INDEX_16                               0x00
+#define     V_028A3C_VGT_GRP_INDEX_32                               0x01
+#define     V_028A3C_VGT_GRP_UINT_16                                0x02
+#define     V_028A3C_VGT_GRP_UINT_32                                0x03
+#define     V_028A3C_VGT_GRP_SINT_16                                0x04
+#define     V_028A3C_VGT_GRP_SINT_32                                0x05
+#define     V_028A3C_VGT_GRP_FLOAT_32                               0x06
+#define     V_028A3C_VGT_GRP_AUTO_PRIM                              0x07
+#define     V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT                      0x08
+#define   S_028A3C_W_OFFSET(x)                                        (((x) & 0x0F) << 28)
+#define   G_028A3C_W_OFFSET(x)                                        (((x) >> 28) & 0x0F)
+#define   C_028A3C_W_OFFSET                                           0x0FFFFFFF
+#define R_028A40_VGT_GS_MODE                                            0x028A40
+#define   S_028A40_MODE(x)                                            (((x) & 0x07) << 0)
+#define   G_028A40_MODE(x)                                            (((x) >> 0) & 0x07)
+#define   C_028A40_MODE                                               0xFFFFFFF8
+#define     V_028A40_GS_OFF                                         0x00
+#define     V_028A40_GS_SCENARIO_A                                  0x01
+#define     V_028A40_GS_SCENARIO_B                                  0x02
+#define     V_028A40_GS_SCENARIO_G                                  0x03
+#define     V_028A40_GS_SCENARIO_C                                  0x04
+#define     V_028A40_SPRITE_EN                                      0x05
+#define   S_028A40_CUT_MODE(x)                                        (((x) & 0x03) << 4)
+#define   G_028A40_CUT_MODE(x)                                        (((x) >> 4) & 0x03)
+#define   C_028A40_CUT_MODE                                           0xFFFFFFCF
+#define     V_028A40_GS_CUT_1024                                    0x00
+#define     V_028A40_GS_CUT_512                                     0x01
+#define     V_028A40_GS_CUT_256                                     0x02
+#define     V_028A40_GS_CUT_128                                     0x03
+#define   S_028A40_GS_C_PACK_EN(x)                                    (((x) & 0x1) << 11)
+#define   G_028A40_GS_C_PACK_EN(x)                                    (((x) >> 11) & 0x1)
+#define   C_028A40_GS_C_PACK_EN                                       0xFFFFF7FF
+#define   S_028A40_ES_PASSTHRU(x)                                     (((x) & 0x1) << 13)
+#define   G_028A40_ES_PASSTHRU(x)                                     (((x) >> 13) & 0x1)
+#define   C_028A40_ES_PASSTHRU                                        0xFFFFDFFF
+#define   S_028A40_COMPUTE_MODE(x)                                    (((x) & 0x1) << 14)
+#define   G_028A40_COMPUTE_MODE(x)                                    (((x) >> 14) & 0x1)
+#define   C_028A40_COMPUTE_MODE                                       0xFFFFBFFF
+#define   S_028A40_FAST_COMPUTE_MODE(x)                               (((x) & 0x1) << 15)
+#define   G_028A40_FAST_COMPUTE_MODE(x)                               (((x) >> 15) & 0x1)
+#define   C_028A40_FAST_COMPUTE_MODE                                  0xFFFF7FFF
+#define   S_028A40_ELEMENT_INFO_EN(x)                                 (((x) & 0x1) << 16)
+#define   G_028A40_ELEMENT_INFO_EN(x)                                 (((x) >> 16) & 0x1)
+#define   C_028A40_ELEMENT_INFO_EN                                    0xFFFEFFFF
+#define   S_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) & 0x1) << 17)
+#define   G_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) >> 17) & 0x1)
+#define   C_028A40_PARTIAL_THD_AT_EOI                                 0xFFFDFFFF
+#define   S_028A40_SUPPRESS_CUTS(x)                                   (((x) & 0x1) << 18)
+#define   G_028A40_SUPPRESS_CUTS(x)                                   (((x) >> 18) & 0x1)
+#define   C_028A40_SUPPRESS_CUTS                                      0xFFFBFFFF
+#define   S_028A40_ES_WRITE_OPTIMIZE(x)                               (((x) & 0x1) << 19)
+#define   G_028A40_ES_WRITE_OPTIMIZE(x)                               (((x) >> 19) & 0x1)
+#define   C_028A40_ES_WRITE_OPTIMIZE                                  0xFFF7FFFF
+#define   S_028A40_GS_WRITE_OPTIMIZE(x)                               (((x) & 0x1) << 20)
+#define   G_028A40_GS_WRITE_OPTIMIZE(x)                               (((x) >> 20) & 0x1)
+#define   C_028A40_GS_WRITE_OPTIMIZE                                  0xFFEFFFFF
+#define R_028A48_PA_SC_MODE_CNTL_0                                      0x028A48
+#define   S_028A48_MSAA_ENABLE(x)                                     (((x) & 0x1) << 0)
+#define   G_028A48_MSAA_ENABLE(x)                                     (((x) >> 0) & 0x1)
+#define   C_028A48_MSAA_ENABLE                                        0xFFFFFFFE
+#define   S_028A48_VPORT_SCISSOR_ENABLE(x)                            (((x) & 0x1) << 1)
+#define   G_028A48_VPORT_SCISSOR_ENABLE(x)                            (((x) >> 1) & 0x1)
+#define   C_028A48_VPORT_SCISSOR_ENABLE                               0xFFFFFFFD
+#define   S_028A48_LINE_STIPPLE_ENABLE(x)                             (((x) & 0x1) << 2)
+#define   G_028A48_LINE_STIPPLE_ENABLE(x)                             (((x) >> 2) & 0x1)
+#define   C_028A48_LINE_STIPPLE_ENABLE                                0xFFFFFFFB
+#define   S_028A48_SEND_UNLIT_STILES_TO_PKR(x)                        (((x) & 0x1) << 3)
+#define   G_028A48_SEND_UNLIT_STILES_TO_PKR(x)                        (((x) >> 3) & 0x1)
+#define   C_028A48_SEND_UNLIT_STILES_TO_PKR                           0xFFFFFFF7
+#define R_028A4C_PA_SC_MODE_CNTL_1                                      0x028A4C
+#define   S_028A4C_WALK_SIZE(x)                                       (((x) & 0x1) << 0)
+#define   G_028A4C_WALK_SIZE(x)                                       (((x) >> 0) & 0x1)
+#define   C_028A4C_WALK_SIZE                                          0xFFFFFFFE
+#define   S_028A4C_WALK_ALIGNMENT(x)                                  (((x) & 0x1) << 1)
+#define   G_028A4C_WALK_ALIGNMENT(x)                                  (((x) >> 1) & 0x1)
+#define   C_028A4C_WALK_ALIGNMENT                                     0xFFFFFFFD
+#define   S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(x)                        (((x) & 0x1) << 2)
+#define   G_028A4C_WALK_ALIGN8_PRIM_FITS_ST(x)                        (((x) >> 2) & 0x1)
+#define   C_028A4C_WALK_ALIGN8_PRIM_FITS_ST                           0xFFFFFFFB
+#define   S_028A4C_WALK_FENCE_ENABLE(x)                               (((x) & 0x1) << 3)
+#define   G_028A4C_WALK_FENCE_ENABLE(x)                               (((x) >> 3) & 0x1)
+#define   C_028A4C_WALK_FENCE_ENABLE                                  0xFFFFFFF7
+#define   S_028A4C_WALK_FENCE_SIZE(x)                                 (((x) & 0x07) << 4)
+#define   G_028A4C_WALK_FENCE_SIZE(x)                                 (((x) >> 4) & 0x07)
+#define   C_028A4C_WALK_FENCE_SIZE                                    0xFFFFFF8F
+#define   S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(x)                     (((x) & 0x1) << 7)
+#define   G_028A4C_SUPERTILE_WALK_ORDER_ENABLE(x)                     (((x) >> 7) & 0x1)
+#define   C_028A4C_SUPERTILE_WALK_ORDER_ENABLE                        0xFFFFFF7F
+#define   S_028A4C_TILE_WALK_ORDER_ENABLE(x)                          (((x) & 0x1) << 8)
+#define   G_028A4C_TILE_WALK_ORDER_ENABLE(x)                          (((x) >> 8) & 0x1)
+#define   C_028A4C_TILE_WALK_ORDER_ENABLE                             0xFFFFFEFF
+#define   S_028A4C_TILE_COVER_DISABLE(x)                              (((x) & 0x1) << 9)
+#define   G_028A4C_TILE_COVER_DISABLE(x)                              (((x) >> 9) & 0x1)
+#define   C_028A4C_TILE_COVER_DISABLE                                 0xFFFFFDFF
+#define   S_028A4C_TILE_COVER_NO_SCISSOR(x)                           (((x) & 0x1) << 10)
+#define   G_028A4C_TILE_COVER_NO_SCISSOR(x)                           (((x) >> 10) & 0x1)
+#define   C_028A4C_TILE_COVER_NO_SCISSOR                              0xFFFFFBFF
+#define   S_028A4C_ZMM_LINE_EXTENT(x)                                 (((x) & 0x1) << 11)
+#define   G_028A4C_ZMM_LINE_EXTENT(x)                                 (((x) >> 11) & 0x1)
+#define   C_028A4C_ZMM_LINE_EXTENT                                    0xFFFFF7FF
+#define   S_028A4C_ZMM_LINE_OFFSET(x)                                 (((x) & 0x1) << 12)
+#define   G_028A4C_ZMM_LINE_OFFSET(x)                                 (((x) >> 12) & 0x1)
+#define   C_028A4C_ZMM_LINE_OFFSET                                    0xFFFFEFFF
+#define   S_028A4C_ZMM_RECT_EXTENT(x)                                 (((x) & 0x1) << 13)
+#define   G_028A4C_ZMM_RECT_EXTENT(x)                                 (((x) >> 13) & 0x1)
+#define   C_028A4C_ZMM_RECT_EXTENT                                    0xFFFFDFFF
+#define   S_028A4C_KILL_PIX_POST_HI_Z(x)                              (((x) & 0x1) << 14)
+#define   G_028A4C_KILL_PIX_POST_HI_Z(x)                              (((x) >> 14) & 0x1)
+#define   C_028A4C_KILL_PIX_POST_HI_Z                                 0xFFFFBFFF
+#define   S_028A4C_KILL_PIX_POST_DETAIL_MASK(x)                       (((x) & 0x1) << 15)
+#define   G_028A4C_KILL_PIX_POST_DETAIL_MASK(x)                       (((x) >> 15) & 0x1)
+#define   C_028A4C_KILL_PIX_POST_DETAIL_MASK                          0xFFFF7FFF
+#define   S_028A4C_PS_ITER_SAMPLE(x)                                  (((x) & 0x1) << 16)
+#define   G_028A4C_PS_ITER_SAMPLE(x)                                  (((x) >> 16) & 0x1)
+#define   C_028A4C_PS_ITER_SAMPLE                                     0xFFFEFFFF
+#define   S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) & 0x1) << 17)
+#define   G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x)                   (((x) >> 17) & 0x1)
+#define   C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC                      0xFFFDFFFF
+#define   S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) & 0x1) << 25)
+#define   G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)                         (((x) >> 25) & 0x1)
+#define   C_028A4C_FORCE_EOV_CNTDWN_ENABLE                            0xFDFFFFFF
+#define   S_028A4C_FORCE_EOV_REZ_ENABLE(x)                            (((x) & 0x1) << 26)
+#define   G_028A4C_FORCE_EOV_REZ_ENABLE(x)                            (((x) >> 26) & 0x1)
+#define   C_028A4C_FORCE_EOV_REZ_ENABLE                               0xFBFFFFFF
+#define   S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(x)                   (((x) & 0x1) << 27)
+#define   G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(x)                   (((x) >> 27) & 0x1)
+#define   C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE                      0xF7FFFFFF
+#define   S_028A4C_OUT_OF_ORDER_WATER_MARK(x)                         (((x) & 0x07) << 28)
+#define   G_028A4C_OUT_OF_ORDER_WATER_MARK(x)                         (((x) >> 28) & 0x07)
+#define   C_028A4C_OUT_OF_ORDER_WATER_MARK                            0x8FFFFFFF
+#define R_028A50_VGT_ENHANCE                                            0x028A50
+#define R_028A54_VGT_GS_PER_ES                                          0x028A54
+#define   S_028A54_GS_PER_ES(x)                                       (((x) & 0x7FF) << 0)
+#define   G_028A54_GS_PER_ES(x)                                       (((x) >> 0) & 0x7FF)
+#define   C_028A54_GS_PER_ES                                          0xFFFFF800
+#define R_028A58_VGT_ES_PER_GS                                          0x028A58
+#define   S_028A58_ES_PER_GS(x)                                       (((x) & 0x7FF) << 0)
+#define   G_028A58_ES_PER_GS(x)                                       (((x) >> 0) & 0x7FF)
+#define   C_028A58_ES_PER_GS                                          0xFFFFF800
+#define R_028A5C_VGT_GS_PER_VS                                          0x028A5C
+#define   S_028A5C_GS_PER_VS(x)                                       (((x) & 0x0F) << 0)
+#define   G_028A5C_GS_PER_VS(x)                                       (((x) >> 0) & 0x0F)
+#define   C_028A5C_GS_PER_VS                                          0xFFFFFFF0
+#define R_028A60_VGT_GSVS_RING_OFFSET_1                                 0x028A60
+#define   S_028A60_OFFSET(x)                                          (((x) & 0x7FFF) << 0)
+#define   G_028A60_OFFSET(x)                                          (((x) >> 0) & 0x7FFF)
+#define   C_028A60_OFFSET                                             0xFFFF8000
+#define R_028A64_VGT_GSVS_RING_OFFSET_2                                 0x028A64
+#define   S_028A64_OFFSET(x)                                          (((x) & 0x7FFF) << 0)
+#define   G_028A64_OFFSET(x)                                          (((x) >> 0) & 0x7FFF)
+#define   C_028A64_OFFSET                                             0xFFFF8000
+#define R_028A68_VGT_GSVS_RING_OFFSET_3                                 0x028A68
+#define   S_028A68_OFFSET(x)                                          (((x) & 0x7FFF) << 0)
+#define   G_028A68_OFFSET(x)                                          (((x) >> 0) & 0x7FFF)
+#define   C_028A68_OFFSET                                             0xFFFF8000
+#define R_028A6C_VGT_GS_OUT_PRIM_TYPE                                   0x028A6C
+#define   S_028A6C_OUTPRIM_TYPE(x)                                    (((x) & 0x3F) << 0)
+#define   G_028A6C_OUTPRIM_TYPE(x)                                    (((x) >> 0) & 0x3F)
+#define   C_028A6C_OUTPRIM_TYPE                                       0xFFFFFFC0
+#define   S_028A6C_OUTPRIM_TYPE_1(x)                                  (((x) & 0x3F) << 8)
+#define   G_028A6C_OUTPRIM_TYPE_1(x)                                  (((x) >> 8) & 0x3F)
+#define   C_028A6C_OUTPRIM_TYPE_1                                     0xFFFFC0FF
+#define   S_028A6C_OUTPRIM_TYPE_2(x)                                  (((x) & 0x3F) << 16)
+#define   G_028A6C_OUTPRIM_TYPE_2(x)                                  (((x) >> 16) & 0x3F)
+#define   C_028A6C_OUTPRIM_TYPE_2                                     0xFFC0FFFF
+#define   S_028A6C_OUTPRIM_TYPE_3(x)                                  (((x) & 0x3F) << 22)
+#define   G_028A6C_OUTPRIM_TYPE_3(x)                                  (((x) >> 22) & 0x3F)
+#define   C_028A6C_OUTPRIM_TYPE_3                                     0xF03FFFFF
+#define   S_028A6C_UNIQUE_TYPE_PER_STREAM(x)                          (((x) & 0x1) << 31)
+#define   G_028A6C_UNIQUE_TYPE_PER_STREAM(x)                          (((x) >> 31) & 0x1)
+#define   C_028A6C_UNIQUE_TYPE_PER_STREAM                             0x7FFFFFFF
+#define R_028A70_IA_ENHANCE                                             0x028A70
+#define R_028A74_VGT_DMA_SIZE                                           0x028A74
+#define R_028A78_VGT_DMA_MAX_SIZE                                       0x028A78
+#define R_028A7C_VGT_DMA_INDEX_TYPE                                     0x028A7C
+#define   S_028A7C_INDEX_TYPE(x)                                      (((x) & 0x03) << 0)
+#define   G_028A7C_INDEX_TYPE(x)                                      (((x) >> 0) & 0x03)
+#define   C_028A7C_INDEX_TYPE                                         0xFFFFFFFC
+#define     V_028A7C_VGT_INDEX_16                                   0x00
+#define     V_028A7C_VGT_INDEX_32                                   0x01
+#define   S_028A7C_SWAP_MODE(x)                                       (((x) & 0x03) << 2)
+#define   G_028A7C_SWAP_MODE(x)                                       (((x) >> 2) & 0x03)
+#define   C_028A7C_SWAP_MODE                                          0xFFFFFFF3
+#define     V_028A7C_VGT_DMA_SWAP_NONE                              0x00
+#define     V_028A7C_VGT_DMA_SWAP_16_BIT                            0x01
+#define     V_028A7C_VGT_DMA_SWAP_32_BIT                            0x02
+#define     V_028A7C_VGT_DMA_SWAP_WORD                              0x03
+#define R_028A84_VGT_PRIMITIVEID_EN                                     0x028A84
+#define   S_028A84_PRIMITIVEID_EN(x)                                  (((x) & 0x1) << 0)
+#define   G_028A84_PRIMITIVEID_EN(x)                                  (((x) >> 0) & 0x1)
+#define   C_028A84_PRIMITIVEID_EN                                     0xFFFFFFFE
+#define   S_028A84_DISABLE_RESET_ON_EOI(x)                            (((x) & 0x1) << 1)
+#define   G_028A84_DISABLE_RESET_ON_EOI(x)                            (((x) >> 1) & 0x1)
+#define   C_028A84_DISABLE_RESET_ON_EOI                               0xFFFFFFFD
+#define R_028A88_VGT_DMA_NUM_INSTANCES                                  0x028A88
+#define R_028A8C_VGT_PRIMITIVEID_RESET                                  0x028A8C
+#define R_028A90_VGT_EVENT_INITIATOR                                    0x028A90
+#define   S_028A90_EVENT_TYPE(x)                                      (((x) & 0x3F) << 0)
+#define   G_028A90_EVENT_TYPE(x)                                      (((x) >> 0) & 0x3F)
+#define   C_028A90_EVENT_TYPE                                         0xFFFFFFC0
+#define     V_028A90_SAMPLE_STREAMOUTSTATS1                         0x01
+#define     V_028A90_SAMPLE_STREAMOUTSTATS2                         0x02
+#define     V_028A90_SAMPLE_STREAMOUTSTATS3                         0x03
+#define     V_028A90_CACHE_FLUSH_TS                                 0x04
+#define     V_028A90_CONTEXT_DONE                                   0x05
+#define     V_028A90_CACHE_FLUSH                                    0x06
+#define     V_028A90_CS_PARTIAL_FLUSH                               0x07
+#define     V_028A90_VGT_STREAMOUT_SYNC                             0x08
+#define     V_028A90_VGT_STREAMOUT_RESET                            0x0A
+#define     V_028A90_END_OF_PIPE_INCR_DE                            0x0B
+#define     V_028A90_END_OF_PIPE_IB_END                             0x0C
+#define     V_028A90_RST_PIX_CNT                                    0x0D
+#define     V_028A90_VS_PARTIAL_FLUSH                               0x0F
+#define     V_028A90_PS_PARTIAL_FLUSH                               0x10
+#define     V_028A90_FLUSH_HS_OUTPUT                                0x11
+#define     V_028A90_FLUSH_LS_OUTPUT                                0x12
+#define     V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT                   0x14
+#define     V_028A90_ZPASS_DONE                                     0x15
+#define     V_028A90_CACHE_FLUSH_AND_INV_EVENT                      0x16
+#define     V_028A90_PERFCOUNTER_START                              0x17
+#define     V_028A90_PERFCOUNTER_STOP                               0x18
+#define     V_028A90_PIPELINESTAT_START                             0x19
+#define     V_028A90_PIPELINESTAT_STOP                              0x1A
+#define     V_028A90_PERFCOUNTER_SAMPLE                             0x1B
+#define     V_028A90_FLUSH_ES_OUTPUT                                0x1C
+#define     V_028A90_FLUSH_GS_OUTPUT                                0x1D
+#define     V_028A90_SAMPLE_PIPELINESTAT                            0x1E
+#define     V_028A90_SO_VGTSTREAMOUT_FLUSH                          0x1F
+#define     V_028A90_SAMPLE_STREAMOUTSTATS                          0x20
+#define     V_028A90_RESET_VTX_CNT                                  0x21
+#define     V_028A90_BLOCK_CONTEXT_DONE                             0x22
+#define     V_028A90_CS_CONTEXT_DONE                                0x23
+#define     V_028A90_VGT_FLUSH                                      0x24
+#define     V_028A90_SC_SEND_DB_VPZ                                 0x27
+#define     V_028A90_BOTTOM_OF_PIPE_TS                              0x28
+#define     V_028A90_DB_CACHE_FLUSH_AND_INV                         0x2A
+#define     V_028A90_FLUSH_AND_INV_DB_DATA_TS                       0x2B
+#define     V_028A90_FLUSH_AND_INV_DB_META                          0x2C
+#define     V_028A90_FLUSH_AND_INV_CB_DATA_TS                       0x2D
+#define     V_028A90_FLUSH_AND_INV_CB_META                          0x2E
+#define     V_028A90_CS_DONE                                        0x2F
+#define     V_028A90_PS_DONE                                        0x30
+#define     V_028A90_FLUSH_AND_INV_CB_PIXEL_DATA                    0x31
+#define     V_028A90_THREAD_TRACE_START                             0x33
+#define     V_028A90_THREAD_TRACE_STOP                              0x34
+#define     V_028A90_THREAD_TRACE_MARKER                            0x35
+#define     V_028A90_THREAD_TRACE_FLUSH                             0x36
+#define     V_028A90_THREAD_TRACE_FINISH                            0x37
+#define   S_028A90_ADDRESS_HI(x)                                      (((x) & 0x1FF) << 18)
+#define   G_028A90_ADDRESS_HI(x)                                      (((x) >> 18) & 0x1FF)
+#define   C_028A90_ADDRESS_HI                                         0xF803FFFF
+#define   S_028A90_EXTENDED_EVENT(x)                                  (((x) & 0x1) << 27)
+#define   G_028A90_EXTENDED_EVENT(x)                                  (((x) >> 27) & 0x1)
+#define   C_028A90_EXTENDED_EVENT                                     0xF7FFFFFF
+#define R_028A94_VGT_MULTI_PRIM_IB_RESET_EN                             0x028A94
+#define   S_028A94_RESET_EN(x)                                        (((x) & 0x1) << 0)
+#define   G_028A94_RESET_EN(x)                                        (((x) >> 0) & 0x1)
+#define   C_028A94_RESET_EN                                           0xFFFFFFFE
+#define R_028AA0_VGT_INSTANCE_STEP_RATE_0                               0x028AA0
+#define R_028AA4_VGT_INSTANCE_STEP_RATE_1                               0x028AA4
+#define R_028AA8_IA_MULTI_VGT_PARAM                                     0x028AA8
+#define   S_028AA8_PRIMGROUP_SIZE(x)                                  (((x) & 0xFFFF) << 0)
+#define   G_028AA8_PRIMGROUP_SIZE(x)                                  (((x) >> 0) & 0xFFFF)
+#define   C_028AA8_PRIMGROUP_SIZE                                     0xFFFF0000
+#define   S_028AA8_PARTIAL_VS_WAVE_ON(x)                              (((x) & 0x1) << 16)
+#define   G_028AA8_PARTIAL_VS_WAVE_ON(x)                              (((x) >> 16) & 0x1)
+#define   C_028AA8_PARTIAL_VS_WAVE_ON                                 0xFFFEFFFF
+#define   S_028AA8_SWITCH_ON_EOP(x)                                   (((x) & 0x1) << 17)
+#define   G_028AA8_SWITCH_ON_EOP(x)                                   (((x) >> 17) & 0x1)
+#define   C_028AA8_SWITCH_ON_EOP                                      0xFFFDFFFF
+#define   S_028AA8_PARTIAL_ES_WAVE_ON(x)                              (((x) & 0x1) << 18)
+#define   G_028AA8_PARTIAL_ES_WAVE_ON(x)                              (((x) >> 18) & 0x1)
+#define   C_028AA8_PARTIAL_ES_WAVE_ON                                 0xFFFBFFFF
+#define   S_028AA8_SWITCH_ON_EOI(x)                                   (((x) & 0x1) << 19)
+#define   G_028AA8_SWITCH_ON_EOI(x)                                   (((x) >> 19) & 0x1)
+#define   C_028AA8_SWITCH_ON_EOI                                      0xFFF7FFFF
+#define R_028AAC_VGT_ESGS_RING_ITEMSIZE                                 0x028AAC
+#define   S_028AAC_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028AAC_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028AAC_ITEMSIZE                                           0xFFFF8000
+#define R_028AB0_VGT_GSVS_RING_ITEMSIZE                                 0x028AB0
+#define   S_028AB0_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028AB0_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028AB0_ITEMSIZE                                           0xFFFF8000
+#define R_028AB4_VGT_REUSE_OFF                                          0x028AB4
+#define   S_028AB4_REUSE_OFF(x)                                       (((x) & 0x1) << 0)
+#define   G_028AB4_REUSE_OFF(x)                                       (((x) >> 0) & 0x1)
+#define   C_028AB4_REUSE_OFF                                          0xFFFFFFFE
+#define R_028AB8_VGT_VTX_CNT_EN                                         0x028AB8
+#define   S_028AB8_VTX_CNT_EN(x)                                      (((x) & 0x1) << 0)
+#define   G_028AB8_VTX_CNT_EN(x)                                      (((x) >> 0) & 0x1)
+#define   C_028AB8_VTX_CNT_EN                                         0xFFFFFFFE
+#define R_028ABC_DB_HTILE_SURFACE                                       0x028ABC
+#define   S_028ABC_LINEAR(x)                                          (((x) & 0x1) << 0)
+#define   G_028ABC_LINEAR(x)                                          (((x) >> 0) & 0x1)
+#define   C_028ABC_LINEAR                                             0xFFFFFFFE
+#define   S_028ABC_FULL_CACHE(x)                                      (((x) & 0x1) << 1)
+#define   G_028ABC_FULL_CACHE(x)                                      (((x) >> 1) & 0x1)
+#define   C_028ABC_FULL_CACHE                                         0xFFFFFFFD
+#define   S_028ABC_HTILE_USES_PRELOAD_WIN(x)                          (((x) & 0x1) << 2)
+#define   G_028ABC_HTILE_USES_PRELOAD_WIN(x)                          (((x) >> 2) & 0x1)
+#define   C_028ABC_HTILE_USES_PRELOAD_WIN                             0xFFFFFFFB
+#define   S_028ABC_PRELOAD(x)                                         (((x) & 0x1) << 3)
+#define   G_028ABC_PRELOAD(x)                                         (((x) >> 3) & 0x1)
+#define   C_028ABC_PRELOAD                                            0xFFFFFFF7
+#define   S_028ABC_PREFETCH_WIDTH(x)                                  (((x) & 0x3F) << 4)
+#define   G_028ABC_PREFETCH_WIDTH(x)                                  (((x) >> 4) & 0x3F)
+#define   C_028ABC_PREFETCH_WIDTH                                     0xFFFFFC0F
+#define   S_028ABC_PREFETCH_HEIGHT(x)                                 (((x) & 0x3F) << 10)
+#define   G_028ABC_PREFETCH_HEIGHT(x)                                 (((x) >> 10) & 0x3F)
+#define   C_028ABC_PREFETCH_HEIGHT                                    0xFFFF03FF
+#define   S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) & 0x1) << 16)
+#define   G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x)                         (((x) >> 16) & 0x1)
+#define   C_028ABC_DST_OUTSIDE_ZERO_TO_ONE                            0xFFFEFFFF
+#define R_028AC0_DB_SRESULTS_COMPARE_STATE0                             0x028AC0
+#define   S_028AC0_COMPAREFUNC0(x)                                    (((x) & 0x07) << 0)
+#define   G_028AC0_COMPAREFUNC0(x)                                    (((x) >> 0) & 0x07)
+#define   C_028AC0_COMPAREFUNC0                                       0xFFFFFFF8
+#define     V_028AC0_REF_NEVER                                      0x00
+#define     V_028AC0_REF_LESS                                       0x01
+#define     V_028AC0_REF_EQUAL                                      0x02
+#define     V_028AC0_REF_LEQUAL                                     0x03
+#define     V_028AC0_REF_GREATER                                    0x04
+#define     V_028AC0_REF_NOTEQUAL                                   0x05
+#define     V_028AC0_REF_GEQUAL                                     0x06
+#define     V_028AC0_REF_ALWAYS                                     0x07
+#define   S_028AC0_COMPAREVALUE0(x)                                   (((x) & 0xFF) << 4)
+#define   G_028AC0_COMPAREVALUE0(x)                                   (((x) >> 4) & 0xFF)
+#define   C_028AC0_COMPAREVALUE0                                      0xFFFFF00F
+#define   S_028AC0_COMPAREMASK0(x)                                    (((x) & 0xFF) << 12)
+#define   G_028AC0_COMPAREMASK0(x)                                    (((x) >> 12) & 0xFF)
+#define   C_028AC0_COMPAREMASK0                                       0xFFF00FFF
+#define   S_028AC0_ENABLE0(x)                                         (((x) & 0x1) << 24)
+#define   G_028AC0_ENABLE0(x)                                         (((x) >> 24) & 0x1)
+#define   C_028AC0_ENABLE0                                            0xFEFFFFFF
+#define R_028AC4_DB_SRESULTS_COMPARE_STATE1                             0x028AC4
+#define   S_028AC4_COMPAREFUNC1(x)                                    (((x) & 0x07) << 0)
+#define   G_028AC4_COMPAREFUNC1(x)                                    (((x) >> 0) & 0x07)
+#define   C_028AC4_COMPAREFUNC1                                       0xFFFFFFF8
+#define     V_028AC4_REF_NEVER                                      0x00
+#define     V_028AC4_REF_LESS                                       0x01
+#define     V_028AC4_REF_EQUAL                                      0x02
+#define     V_028AC4_REF_LEQUAL                                     0x03
+#define     V_028AC4_REF_GREATER                                    0x04
+#define     V_028AC4_REF_NOTEQUAL                                   0x05
+#define     V_028AC4_REF_GEQUAL                                     0x06
+#define     V_028AC4_REF_ALWAYS                                     0x07
+#define   S_028AC4_COMPAREVALUE1(x)                                   (((x) & 0xFF) << 4)
+#define   G_028AC4_COMPAREVALUE1(x)                                   (((x) >> 4) & 0xFF)
+#define   C_028AC4_COMPAREVALUE1                                      0xFFFFF00F
+#define   S_028AC4_COMPAREMASK1(x)                                    (((x) & 0xFF) << 12)
+#define   G_028AC4_COMPAREMASK1(x)                                    (((x) >> 12) & 0xFF)
+#define   C_028AC4_COMPAREMASK1                                       0xFFF00FFF
+#define   S_028AC4_ENABLE1(x)                                         (((x) & 0x1) << 24)
+#define   G_028AC4_ENABLE1(x)                                         (((x) >> 24) & 0x1)
+#define   C_028AC4_ENABLE1                                            0xFEFFFFFF
+#define R_028AC8_DB_PRELOAD_CONTROL                                     0x028AC8
+#define   S_028AC8_START_X(x)                                         (((x) & 0xFF) << 0)
+#define   G_028AC8_START_X(x)                                         (((x) >> 0) & 0xFF)
+#define   C_028AC8_START_X                                            0xFFFFFF00
+#define   S_028AC8_START_Y(x)                                         (((x) & 0xFF) << 8)
+#define   G_028AC8_START_Y(x)                                         (((x) >> 8) & 0xFF)
+#define   C_028AC8_START_Y                                            0xFFFF00FF
+#define   S_028AC8_MAX_X(x)                                           (((x) & 0xFF) << 16)
+#define   G_028AC8_MAX_X(x)                                           (((x) >> 16) & 0xFF)
+#define   C_028AC8_MAX_X                                              0xFF00FFFF
+#define   S_028AC8_MAX_Y(x)                                           (((x) & 0xFF) << 24)
+#define   G_028AC8_MAX_Y(x)                                           (((x) >> 24) & 0xFF)
+#define   C_028AC8_MAX_Y                                              0x00FFFFFF
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0                              0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0                               0x028AD4
+#define   S_028AD4_STRIDE(x)                                          (((x) & 0x3FF) << 0)
+#define   G_028AD4_STRIDE(x)                                          (((x) >> 0) & 0x3FF)
+#define   C_028AD4_STRIDE                                             0xFFFFFC00
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0                            0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1                              0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1                               0x028AE4
+#define   S_028AE4_STRIDE(x)                                          (((x) & 0x3FF) << 0)
+#define   G_028AE4_STRIDE(x)                                          (((x) >> 0) & 0x3FF)
+#define   C_028AE4_STRIDE                                             0xFFFFFC00
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1                            0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2                              0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2                               0x028AF4
+#define   S_028AF4_STRIDE(x)                                          (((x) & 0x3FF) << 0)
+#define   G_028AF4_STRIDE(x)                                          (((x) >> 0) & 0x3FF)
+#define   C_028AF4_STRIDE                                             0xFFFFFC00
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2                            0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3                              0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3                               0x028B04
+#define   S_028B04_STRIDE(x)                                          (((x) & 0x3FF) << 0)
+#define   G_028B04_STRIDE(x)                                          (((x) >> 0) & 0x3FF)
+#define   C_028B04_STRIDE                                             0xFFFFFC00
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3                            0x028B0C
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET                         0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE             0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE                  0x028B30
+#define   S_028B30_VERTEX_STRIDE(x)                                   (((x) & 0x1FF) << 0)
+#define   G_028B30_VERTEX_STRIDE(x)                                   (((x) >> 0) & 0x1FF)
+#define   C_028B30_VERTEX_STRIDE                                      0xFFFFFE00
+#define R_028B38_VGT_GS_MAX_VERT_OUT                                    0x028B38
+#define   S_028B38_MAX_VERT_OUT(x)                                    (((x) & 0x7FF) << 0)
+#define   G_028B38_MAX_VERT_OUT(x)                                    (((x) >> 0) & 0x7FF)
+#define   C_028B38_MAX_VERT_OUT                                       0xFFFFF800
+#define R_028B54_VGT_SHADER_STAGES_EN                                   0x028B54
+#define   S_028B54_LS_EN(x)                                           (((x) & 0x03) << 0)
+#define   G_028B54_LS_EN(x)                                           (((x) >> 0) & 0x03)
+#define   C_028B54_LS_EN                                              0xFFFFFFFC
+#define     V_028B54_LS_STAGE_OFF                                   0x00
+#define     V_028B54_LS_STAGE_ON                                    0x01
+#define     V_028B54_CS_STAGE_ON                                    0x02
+#define   S_028B54_HS_EN(x)                                           (((x) & 0x1) << 2)
+#define   G_028B54_HS_EN(x)                                           (((x) >> 2) & 0x1)
+#define   C_028B54_HS_EN                                              0xFFFFFFFB
+#define   S_028B54_ES_EN(x)                                           (((x) & 0x03) << 3)
+#define   G_028B54_ES_EN(x)                                           (((x) >> 3) & 0x03)
+#define   C_028B54_ES_EN                                              0xFFFFFFE7
+#define     V_028B54_ES_STAGE_OFF                                   0x00
+#define     V_028B54_ES_STAGE_DS                                    0x01
+#define     V_028B54_ES_STAGE_REAL                                  0x02
+#define   S_028B54_GS_EN(x)                                           (((x) & 0x1) << 5)
+#define   G_028B54_GS_EN(x)                                           (((x) >> 5) & 0x1)
+#define   C_028B54_GS_EN                                              0xFFFFFFDF
+#define   S_028B54_VS_EN(x)                                           (((x) & 0x03) << 6)
+#define   G_028B54_VS_EN(x)                                           (((x) >> 6) & 0x03)
+#define   C_028B54_VS_EN                                              0xFFFFFF3F
+#define     V_028B54_VS_STAGE_REAL                                  0x00
+#define     V_028B54_VS_STAGE_DS                                    0x01
+#define     V_028B54_VS_STAGE_COPY_SHADER                           0x02
+#define   S_028B54_DYNAMIC_HS(x)                                      (((x) & 0x1) << 8)
+#define   G_028B54_DYNAMIC_HS(x)                                      (((x) >> 8) & 0x1)
+#define   C_028B54_DYNAMIC_HS                                         0xFFFFFEFF
+#define R_028B58_VGT_LS_HS_CONFIG                                       0x028B58
+#define   S_028B58_NUM_PATCHES(x)                                     (((x) & 0xFF) << 0)
+#define   G_028B58_NUM_PATCHES(x)                                     (((x) >> 0) & 0xFF)
+#define   C_028B58_NUM_PATCHES                                        0xFFFFFF00
+#define   S_028B58_HS_NUM_INPUT_CP(x)                                 (((x) & 0x3F) << 8)
+#define   G_028B58_HS_NUM_INPUT_CP(x)                                 (((x) >> 8) & 0x3F)
+#define   C_028B58_HS_NUM_INPUT_CP                                    0xFFFFC0FF
+#define   S_028B58_HS_NUM_OUTPUT_CP(x)                                (((x) & 0x3F) << 14)
+#define   G_028B58_HS_NUM_OUTPUT_CP(x)                                (((x) >> 14) & 0x3F)
+#define   C_028B58_HS_NUM_OUTPUT_CP                                   0xFFF03FFF
+#define R_028B5C_VGT_GS_VERT_ITEMSIZE                                   0x028B5C
+#define   S_028B5C_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028B5C_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028B5C_ITEMSIZE                                           0xFFFF8000
+#define R_028B60_VGT_GS_VERT_ITEMSIZE_1                                 0x028B60
+#define   S_028B60_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028B60_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028B60_ITEMSIZE                                           0xFFFF8000
+#define R_028B64_VGT_GS_VERT_ITEMSIZE_2                                 0x028B64
+#define   S_028B64_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028B64_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028B64_ITEMSIZE                                           0xFFFF8000
+#define R_028B68_VGT_GS_VERT_ITEMSIZE_3                                 0x028B68
+#define   S_028B68_ITEMSIZE(x)                                        (((x) & 0x7FFF) << 0)
+#define   G_028B68_ITEMSIZE(x)                                        (((x) >> 0) & 0x7FFF)
+#define   C_028B68_ITEMSIZE                                           0xFFFF8000
+#define R_028B6C_VGT_TF_PARAM                                           0x028B6C
+#define   S_028B6C_TYPE(x)                                            (((x) & 0x03) << 0)
+#define   G_028B6C_TYPE(x)                                            (((x) >> 0) & 0x03)
+#define   C_028B6C_TYPE                                               0xFFFFFFFC
+#define     V_028B6C_TESS_ISOLINE                                   0x00
+#define     V_028B6C_TESS_TRIANGLE                                  0x01
+#define     V_028B6C_TESS_QUAD                                      0x02
+#define   S_028B6C_PARTITIONING(x)                                    (((x) & 0x07) << 2)
+#define   G_028B6C_PARTITIONING(x)                                    (((x) >> 2) & 0x07)
+#define   C_028B6C_PARTITIONING                                       0xFFFFFFE3
+#define     V_028B6C_PART_INTEGER                                   0x00
+#define     V_028B6C_PART_POW2                                      0x01
+#define     V_028B6C_PART_FRAC_ODD                                  0x02
+#define     V_028B6C_PART_FRAC_EVEN                                 0x03
+#define   S_028B6C_TOPOLOGY(x)                                        (((x) & 0x07) << 5)
+#define   G_028B6C_TOPOLOGY(x)                                        (((x) >> 5) & 0x07)
+#define   C_028B6C_TOPOLOGY                                           0xFFFFFF1F
+#define     V_028B6C_OUTPUT_POINT                                   0x00
+#define     V_028B6C_OUTPUT_LINE                                    0x01
+#define     V_028B6C_OUTPUT_TRIANGLE_CW                             0x02
+#define     V_028B6C_OUTPUT_TRIANGLE_CCW                            0x03
+#define   S_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) & 0x1) << 8)
+#define   G_028B6C_RESERVED_REDUC_AXIS(x)                             (((x) >> 8) & 0x1)
+#define   C_028B6C_RESERVED_REDUC_AXIS                                0xFFFFFEFF
+#define   S_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) & 0x0F) << 10)
+#define   G_028B6C_NUM_DS_WAVES_PER_SIMD(x)                           (((x) >> 10) & 0x0F)
+#define   C_028B6C_NUM_DS_WAVES_PER_SIMD                              0xFFFFC3FF
+#define   S_028B6C_DISABLE_DONUTS(x)                                  (((x) & 0x1) << 14)
+#define   G_028B6C_DISABLE_DONUTS(x)                                  (((x) >> 14) & 0x1)
+#define   C_028B6C_DISABLE_DONUTS                                     0xFFFFBFFF
+#define R_028B70_DB_ALPHA_TO_MASK                                       0x028B70
+#define   S_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) & 0x1) << 0)
+#define   G_028B70_ALPHA_TO_MASK_ENABLE(x)                            (((x) >> 0) & 0x1)
+#define   C_028B70_ALPHA_TO_MASK_ENABLE                               0xFFFFFFFE
+#define   S_028B70_ALPHA_TO_MASK_OFFSET0(x)                           (((x) & 0x03) << 8)
+#define   G_028B70_ALPHA_TO_MASK_OFFSET0(x)                           (((x) >> 8) & 0x03)
+#define   C_028B70_ALPHA_TO_MASK_OFFSET0                              0xFFFFFCFF
+#define   S_028B70_ALPHA_TO_MASK_OFFSET1(x)                           (((x) & 0x03) << 10)
+#define   G_028B70_ALPHA_TO_MASK_OFFSET1(x)                           (((x) >> 10) & 0x03)
+#define   C_028B70_ALPHA_TO_MASK_OFFSET1                              0xFFFFF3FF
+#define   S_028B70_ALPHA_TO_MASK_OFFSET2(x)                           (((x) & 0x03) << 12)
+#define   G_028B70_ALPHA_TO_MASK_OFFSET2(x)                           (((x) >> 12) & 0x03)
+#define   C_028B70_ALPHA_TO_MASK_OFFSET2                              0xFFFFCFFF
+#define   S_028B70_ALPHA_TO_MASK_OFFSET3(x)                           (((x) & 0x03) << 14)
+#define   G_028B70_ALPHA_TO_MASK_OFFSET3(x)                           (((x) >> 14) & 0x03)
+#define   C_028B70_ALPHA_TO_MASK_OFFSET3                              0xFFFF3FFF
+#define   S_028B70_OFFSET_ROUND(x)                                    (((x) & 0x1) << 16)
+#define   G_028B70_OFFSET_ROUND(x)                                    (((x) >> 16) & 0x1)
+#define   C_028B70_OFFSET_ROUND                                       0xFFFEFFFF
+#define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL                          0x028B78
+#define   S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(x)                     (((x) & 0xFF) << 0)
+#define   G_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(x)                     (((x) >> 0) & 0xFF)
+#define   C_028B78_POLY_OFFSET_NEG_NUM_DB_BITS                        0xFFFFFF00
+#define   S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(x)                     (((x) & 0x1) << 8)
+#define   G_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(x)                     (((x) >> 8) & 0x1)
+#define   C_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT                        0xFFFFFEFF
+#define R_028B7C_PA_SU_POLY_OFFSET_CLAMP                                0x028B7C
+#define R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE                          0x028B80
+#define R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET                         0x028B84
+#define R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE                           0x028B88
+#define R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET                          0x028B8C
+#define R_028B90_VGT_GS_INSTANCE_CNT                                    0x028B90
+#define   S_028B90_ENABLE(x)                                          (((x) & 0x1) << 0)
+#define   G_028B90_ENABLE(x)                                          (((x) >> 0) & 0x1)
+#define   C_028B90_ENABLE                                             0xFFFFFFFE
+#define   S_028B90_CNT(x)                                             (((x) & 0x7F) << 2)
+#define   G_028B90_CNT(x)                                             (((x) >> 2) & 0x7F)
+#define   C_028B90_CNT                                                0xFFFFFE03
+#define R_028B94_VGT_STRMOUT_CONFIG                                     0x028B94
+#define   S_028B94_STREAMOUT_0_EN(x)                                  (((x) & 0x1) << 0)
+#define   G_028B94_STREAMOUT_0_EN(x)                                  (((x) >> 0) & 0x1)
+#define   C_028B94_STREAMOUT_0_EN                                     0xFFFFFFFE
+#define   S_028B94_STREAMOUT_1_EN(x)                                  (((x) & 0x1) << 1)
+#define   G_028B94_STREAMOUT_1_EN(x)                                  (((x) >> 1) & 0x1)
+#define   C_028B94_STREAMOUT_1_EN                                     0xFFFFFFFD
+#define   S_028B94_STREAMOUT_2_EN(x)                                  (((x) & 0x1) << 2)
+#define   G_028B94_STREAMOUT_2_EN(x)                                  (((x) >> 2) & 0x1)
+#define   C_028B94_STREAMOUT_2_EN                                     0xFFFFFFFB
+#define   S_028B94_STREAMOUT_3_EN(x)                                  (((x) & 0x1) << 3)
+#define   G_028B94_STREAMOUT_3_EN(x)                                  (((x) >> 3) & 0x1)
+#define   C_028B94_STREAMOUT_3_EN                                     0xFFFFFFF7
+#define   S_028B94_RAST_STREAM(x)                                     (((x) & 0x07) << 4)
+#define   G_028B94_RAST_STREAM(x)                                     (((x) >> 4) & 0x07)
+#define   C_028B94_RAST_STREAM                                        0xFFFFFF8F
+#define   S_028B94_RAST_STREAM_MASK(x)                                (((x) & 0x0F) << 8)
+#define   G_028B94_RAST_STREAM_MASK(x)                                (((x) >> 8) & 0x0F)
+#define   C_028B94_RAST_STREAM_MASK                                   0xFFFFF0FF
+#define   S_028B94_USE_RAST_STREAM_MASK(x)                            (((x) & 0x1) << 31)
+#define   G_028B94_USE_RAST_STREAM_MASK(x)                            (((x) >> 31) & 0x1)
+#define   C_028B94_USE_RAST_STREAM_MASK                               0x7FFFFFFF
+#define R_028B98_VGT_STRMOUT_BUFFER_CONFIG                              0x028B98
+#define   S_028B98_STREAM_0_BUFFER_EN(x)                              (((x) & 0x0F) << 0)
+#define   G_028B98_STREAM_0_BUFFER_EN(x)                              (((x) >> 0) & 0x0F)
+#define   C_028B98_STREAM_0_BUFFER_EN                                 0xFFFFFFF0
+#define   S_028B98_STREAM_1_BUFFER_EN(x)                              (((x) & 0x0F) << 4)
+#define   G_028B98_STREAM_1_BUFFER_EN(x)                              (((x) >> 4) & 0x0F)
+#define   C_028B98_STREAM_1_BUFFER_EN                                 0xFFFFFF0F
+#define   S_028B98_STREAM_2_BUFFER_EN(x)                              (((x) & 0x0F) << 8)
+#define   G_028B98_STREAM_2_BUFFER_EN(x)                              (((x) >> 8) & 0x0F)
+#define   C_028B98_STREAM_2_BUFFER_EN                                 0xFFFFF0FF
+#define   S_028B98_STREAM_3_BUFFER_EN(x)                              (((x) & 0x0F) << 12)
+#define   G_028B98_STREAM_3_BUFFER_EN(x)                              (((x) >> 12) & 0x0F)
+#define   C_028B98_STREAM_3_BUFFER_EN                                 0xFFFF0FFF
+#define R_028BD4_PA_SC_CENTROID_PRIORITY_0                              0x028BD4
+#define   S_028BD4_DISTANCE_0(x)                                      (((x) & 0x0F) << 0)
+#define   G_028BD4_DISTANCE_0(x)                                      (((x) >> 0) & 0x0F)
+#define   C_028BD4_DISTANCE_0                                         0xFFFFFFF0
+#define   S_028BD4_DISTANCE_1(x)                                      (((x) & 0x0F) << 4)
+#define   G_028BD4_DISTANCE_1(x)                                      (((x) >> 4) & 0x0F)
+#define   C_028BD4_DISTANCE_1                                         0xFFFFFF0F
+#define   S_028BD4_DISTANCE_2(x)                                      (((x) & 0x0F) << 8)
+#define   G_028BD4_DISTANCE_2(x)                                      (((x) >> 8) & 0x0F)
+#define   C_028BD4_DISTANCE_2                                         0xFFFFF0FF
+#define   S_028BD4_DISTANCE_3(x)                                      (((x) & 0x0F) << 12)
+#define   G_028BD4_DISTANCE_3(x)                                      (((x) >> 12) & 0x0F)
+#define   C_028BD4_DISTANCE_3                                         0xFFFF0FFF
+#define   S_028BD4_DISTANCE_4(x)                                      (((x) & 0x0F) << 16)
+#define   G_028BD4_DISTANCE_4(x)                                      (((x) >> 16) & 0x0F)
+#define   C_028BD4_DISTANCE_4                                         0xFFF0FFFF
+#define   S_028BD4_DISTANCE_5(x)                                      (((x) & 0x0F) << 20)
+#define   G_028BD4_DISTANCE_5(x)                                      (((x) >> 20) & 0x0F)
+#define   C_028BD4_DISTANCE_5                                         0xFF0FFFFF
+#define   S_028BD4_DISTANCE_6(x)                                      (((x) & 0x0F) << 24)
+#define   G_028BD4_DISTANCE_6(x)                                      (((x) >> 24) & 0x0F)
+#define   C_028BD4_DISTANCE_6                                         0xF0FFFFFF
+#define   S_028BD4_DISTANCE_7(x)                                      (((x) & 0x0F) << 28)
+#define   G_028BD4_DISTANCE_7(x)                                      (((x) >> 28) & 0x0F)
+#define   C_028BD4_DISTANCE_7                                         0x0FFFFFFF
+#define R_028BD8_PA_SC_CENTROID_PRIORITY_1                              0x028BD8
+#define   S_028BD8_DISTANCE_8(x)                                      (((x) & 0x0F) << 0)
+#define   G_028BD8_DISTANCE_8(x)                                      (((x) >> 0) & 0x0F)
+#define   C_028BD8_DISTANCE_8                                         0xFFFFFFF0
+#define   S_028BD8_DISTANCE_9(x)                                      (((x) & 0x0F) << 4)
+#define   G_028BD8_DISTANCE_9(x)                                      (((x) >> 4) & 0x0F)
+#define   C_028BD8_DISTANCE_9                                         0xFFFFFF0F
+#define   S_028BD8_DISTANCE_10(x)                                     (((x) & 0x0F) << 8)
+#define   G_028BD8_DISTANCE_10(x)                                     (((x) >> 8) & 0x0F)
+#define   C_028BD8_DISTANCE_10                                        0xFFFFF0FF
+#define   S_028BD8_DISTANCE_11(x)                                     (((x) & 0x0F) << 12)
+#define   G_028BD8_DISTANCE_11(x)                                     (((x) >> 12) & 0x0F)
+#define   C_028BD8_DISTANCE_11                                        0xFFFF0FFF
+#define   S_028BD8_DISTANCE_12(x)                                     (((x) & 0x0F) << 16)
+#define   G_028BD8_DISTANCE_12(x)                                     (((x) >> 16) & 0x0F)
+#define   C_028BD8_DISTANCE_12                                        0xFFF0FFFF
+#define   S_028BD8_DISTANCE_13(x)                                     (((x) & 0x0F) << 20)
+#define   G_028BD8_DISTANCE_13(x)                                     (((x) >> 20) & 0x0F)
+#define   C_028BD8_DISTANCE_13                                        0xFF0FFFFF
+#define   S_028BD8_DISTANCE_14(x)                                     (((x) & 0x0F) << 24)
+#define   G_028BD8_DISTANCE_14(x)                                     (((x) >> 24) & 0x0F)
+#define   C_028BD8_DISTANCE_14                                        0xF0FFFFFF
+#define   S_028BD8_DISTANCE_15(x)                                     (((x) & 0x0F) << 28)
+#define   G_028BD8_DISTANCE_15(x)                                     (((x) >> 28) & 0x0F)
+#define   C_028BD8_DISTANCE_15                                        0x0FFFFFFF
+#define R_028BDC_PA_SC_LINE_CNTL                                        0x028BDC
+#define   S_028BDC_EXPAND_LINE_WIDTH(x)                               (((x) & 0x1) << 9)
+#define   G_028BDC_EXPAND_LINE_WIDTH(x)                               (((x) >> 9) & 0x1)
+#define   C_028BDC_EXPAND_LINE_WIDTH                                  0xFFFFFDFF
+#define   S_028BDC_LAST_PIXEL(x)                                      (((x) & 0x1) << 10)
+#define   G_028BDC_LAST_PIXEL(x)                                      (((x) >> 10) & 0x1)
+#define   C_028BDC_LAST_PIXEL                                         0xFFFFFBFF
+#define   S_028BDC_PERPENDICULAR_ENDCAP_ENA(x)                        (((x) & 0x1) << 11)
+#define   G_028BDC_PERPENDICULAR_ENDCAP_ENA(x)                        (((x) >> 11) & 0x1)
+#define   C_028BDC_PERPENDICULAR_ENDCAP_ENA                           0xFFFFF7FF
+#define   S_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) & 0x1) << 12)
+#define   G_028BDC_DX10_DIAMOND_TEST_ENA(x)                           (((x) >> 12) & 0x1)
+#define   C_028BDC_DX10_DIAMOND_TEST_ENA                              0xFFFFEFFF
+#define R_028BE0_PA_SC_AA_CONFIG                                        0x028BE0
+#define   S_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) & 0x07) << 0)
+#define   G_028BE0_MSAA_NUM_SAMPLES(x)                                (((x) >> 0) & 0x07)
+#define   C_028BE0_MSAA_NUM_SAMPLES                                   0xFFFFFFF8
+#define   S_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) & 0x1) << 4)
+#define   G_028BE0_AA_MASK_CENTROID_DTMN(x)                           (((x) >> 4) & 0x1)
+#define   C_028BE0_AA_MASK_CENTROID_DTMN                              0xFFFFFFEF
+#define   S_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) & 0x0F) << 13)
+#define   G_028BE0_MAX_SAMPLE_DIST(x)                                 (((x) >> 13) & 0x0F)
+#define   C_028BE0_MAX_SAMPLE_DIST                                    0xFFFE1FFF
+#define   S_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) & 0x07) << 20)
+#define   G_028BE0_MSAA_EXPOSED_SAMPLES(x)                            (((x) >> 20) & 0x07)
+#define   C_028BE0_MSAA_EXPOSED_SAMPLES                               0xFF8FFFFF
+#define   S_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) & 0x03) << 24)
+#define   G_028BE0_DETAIL_TO_EXPOSED_MODE(x)                          (((x) >> 24) & 0x03)
+#define   C_028BE0_DETAIL_TO_EXPOSED_MODE                             0xFCFFFFFF
+#define R_028BE4_PA_SU_VTX_CNTL                                         0x028BE4
+#define   S_028BE4_PIX_CENTER(x)                                      (((x) & 0x1) << 0)
+#define   G_028BE4_PIX_CENTER(x)                                      (((x) >> 0) & 0x1)
+#define   C_028BE4_PIX_CENTER                                         0xFFFFFFFE
+#define   S_028BE4_ROUND_MODE(x)                                      (((x) & 0x03) << 1)
+#define   G_028BE4_ROUND_MODE(x)                                      (((x) >> 1) & 0x03)
+#define   C_028BE4_ROUND_MODE                                         0xFFFFFFF9
+#define     V_028BE4_X_TRUNCATE                                     0x00
+#define     V_028BE4_X_ROUND                                        0x01
+#define     V_028BE4_X_ROUND_TO_EVEN                                0x02
+#define     V_028BE4_X_ROUND_TO_ODD                                 0x03
+#define   S_028BE4_QUANT_MODE(x)                                      (((x) & 0x07) << 3)
+#define   G_028BE4_QUANT_MODE(x)                                      (((x) >> 3) & 0x07)
+#define   C_028BE4_QUANT_MODE                                         0xFFFFFFC7
+#define     V_028BE4_X_16_8_FIXED_POINT_1_16TH                      0x00
+#define     V_028BE4_X_16_8_FIXED_POINT_1_8TH                       0x01
+#define     V_028BE4_X_16_8_FIXED_POINT_1_4TH                       0x02
+#define     V_028BE4_X_16_8_FIXED_POINT_1_2                         0x03
+#define     V_028BE4_X_16_8_FIXED_POINT_1                           0x04
+#define     V_028BE4_X_16_8_FIXED_POINT_1_256TH                     0x05
+#define     V_028BE4_X_14_10_FIXED_POINT_1_1024TH                   0x06
+#define     V_028BE4_X_12_12_FIXED_POINT_1_4096TH                   0x07
+#define R_028BE8_PA_CL_GB_VERT_CLIP_ADJ                                 0x028BE8
+#define R_028BEC_PA_CL_GB_VERT_DISC_ADJ                                 0x028BEC
+#define R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ                                 0x028BF0
+#define R_028BF4_PA_CL_GB_HORZ_DISC_ADJ                                 0x028BF4
+#define R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0                      0x028BF8
+#define   S_028BF8_S0_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028BF8_S0_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028BF8_S0_X                                               0xFFFFFFF0
+#define   S_028BF8_S0_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028BF8_S0_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028BF8_S0_Y                                               0xFFFFFF0F
+#define   S_028BF8_S1_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028BF8_S1_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028BF8_S1_X                                               0xFFFFF0FF
+#define   S_028BF8_S1_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028BF8_S1_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028BF8_S1_Y                                               0xFFFF0FFF
+#define   S_028BF8_S2_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028BF8_S2_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028BF8_S2_X                                               0xFFF0FFFF
+#define   S_028BF8_S2_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028BF8_S2_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028BF8_S2_Y                                               0xFF0FFFFF
+#define   S_028BF8_S3_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028BF8_S3_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028BF8_S3_X                                               0xF0FFFFFF
+#define   S_028BF8_S3_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028BF8_S3_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028BF8_S3_Y                                               0x0FFFFFFF
+#define R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1                      0x028BFC
+#define   S_028BFC_S4_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028BFC_S4_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028BFC_S4_X                                               0xFFFFFFF0
+#define   S_028BFC_S4_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028BFC_S4_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028BFC_S4_Y                                               0xFFFFFF0F
+#define   S_028BFC_S5_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028BFC_S5_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028BFC_S5_X                                               0xFFFFF0FF
+#define   S_028BFC_S5_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028BFC_S5_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028BFC_S5_Y                                               0xFFFF0FFF
+#define   S_028BFC_S6_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028BFC_S6_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028BFC_S6_X                                               0xFFF0FFFF
+#define   S_028BFC_S6_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028BFC_S6_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028BFC_S6_Y                                               0xFF0FFFFF
+#define   S_028BFC_S7_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028BFC_S7_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028BFC_S7_X                                               0xF0FFFFFF
+#define   S_028BFC_S7_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028BFC_S7_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028BFC_S7_Y                                               0x0FFFFFFF
+#define R_028C00_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2                      0x028C00
+#define   S_028C00_S8_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C00_S8_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C00_S8_X                                               0xFFFFFFF0
+#define   S_028C00_S8_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C00_S8_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C00_S8_Y                                               0xFFFFFF0F
+#define   S_028C00_S9_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C00_S9_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C00_S9_X                                               0xFFFFF0FF
+#define   S_028C00_S9_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C00_S9_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C00_S9_Y                                               0xFFFF0FFF
+#define   S_028C00_S10_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C00_S10_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C00_S10_X                                              0xFFF0FFFF
+#define   S_028C00_S10_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C00_S10_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C00_S10_Y                                              0xFF0FFFFF
+#define   S_028C00_S11_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C00_S11_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C00_S11_X                                              0xF0FFFFFF
+#define   S_028C00_S11_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C00_S11_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C00_S11_Y                                              0x0FFFFFFF
+#define R_028C04_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3                      0x028C04
+#define   S_028C04_S12_X(x)                                           (((x) & 0x0F) << 0)
+#define   G_028C04_S12_X(x)                                           (((x) >> 0) & 0x0F)
+#define   C_028C04_S12_X                                              0xFFFFFFF0
+#define   S_028C04_S12_Y(x)                                           (((x) & 0x0F) << 4)
+#define   G_028C04_S12_Y(x)                                           (((x) >> 4) & 0x0F)
+#define   C_028C04_S12_Y                                              0xFFFFFF0F
+#define   S_028C04_S13_X(x)                                           (((x) & 0x0F) << 8)
+#define   G_028C04_S13_X(x)                                           (((x) >> 8) & 0x0F)
+#define   C_028C04_S13_X                                              0xFFFFF0FF
+#define   S_028C04_S13_Y(x)                                           (((x) & 0x0F) << 12)
+#define   G_028C04_S13_Y(x)                                           (((x) >> 12) & 0x0F)
+#define   C_028C04_S13_Y                                              0xFFFF0FFF
+#define   S_028C04_S14_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C04_S14_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C04_S14_X                                              0xFFF0FFFF
+#define   S_028C04_S14_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C04_S14_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C04_S14_Y                                              0xFF0FFFFF
+#define   S_028C04_S15_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C04_S15_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C04_S15_X                                              0xF0FFFFFF
+#define   S_028C04_S15_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C04_S15_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C04_S15_Y                                              0x0FFFFFFF
+#define R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0                      0x028C08
+#define   S_028C08_S0_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C08_S0_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C08_S0_X                                               0xFFFFFFF0
+#define   S_028C08_S0_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C08_S0_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C08_S0_Y                                               0xFFFFFF0F
+#define   S_028C08_S1_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C08_S1_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C08_S1_X                                               0xFFFFF0FF
+#define   S_028C08_S1_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C08_S1_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C08_S1_Y                                               0xFFFF0FFF
+#define   S_028C08_S2_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C08_S2_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C08_S2_X                                               0xFFF0FFFF
+#define   S_028C08_S2_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C08_S2_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C08_S2_Y                                               0xFF0FFFFF
+#define   S_028C08_S3_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C08_S3_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C08_S3_X                                               0xF0FFFFFF
+#define   S_028C08_S3_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C08_S3_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C08_S3_Y                                               0x0FFFFFFF
+#define R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1                      0x028C0C
+#define   S_028C0C_S4_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C0C_S4_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C0C_S4_X                                               0xFFFFFFF0
+#define   S_028C0C_S4_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C0C_S4_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C0C_S4_Y                                               0xFFFFFF0F
+#define   S_028C0C_S5_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C0C_S5_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C0C_S5_X                                               0xFFFFF0FF
+#define   S_028C0C_S5_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C0C_S5_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C0C_S5_Y                                               0xFFFF0FFF
+#define   S_028C0C_S6_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C0C_S6_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C0C_S6_X                                               0xFFF0FFFF
+#define   S_028C0C_S6_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C0C_S6_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C0C_S6_Y                                               0xFF0FFFFF
+#define   S_028C0C_S7_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C0C_S7_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C0C_S7_X                                               0xF0FFFFFF
+#define   S_028C0C_S7_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C0C_S7_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C0C_S7_Y                                               0x0FFFFFFF
+#define R_028C10_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2                      0x028C10
+#define   S_028C10_S8_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C10_S8_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C10_S8_X                                               0xFFFFFFF0
+#define   S_028C10_S8_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C10_S8_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C10_S8_Y                                               0xFFFFFF0F
+#define   S_028C10_S9_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C10_S9_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C10_S9_X                                               0xFFFFF0FF
+#define   S_028C10_S9_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C10_S9_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C10_S9_Y                                               0xFFFF0FFF
+#define   S_028C10_S10_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C10_S10_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C10_S10_X                                              0xFFF0FFFF
+#define   S_028C10_S10_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C10_S10_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C10_S10_Y                                              0xFF0FFFFF
+#define   S_028C10_S11_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C10_S11_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C10_S11_X                                              0xF0FFFFFF
+#define   S_028C10_S11_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C10_S11_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C10_S11_Y                                              0x0FFFFFFF
+#define R_028C14_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3                      0x028C14
+#define   S_028C14_S12_X(x)                                           (((x) & 0x0F) << 0)
+#define   G_028C14_S12_X(x)                                           (((x) >> 0) & 0x0F)
+#define   C_028C14_S12_X                                              0xFFFFFFF0
+#define   S_028C14_S12_Y(x)                                           (((x) & 0x0F) << 4)
+#define   G_028C14_S12_Y(x)                                           (((x) >> 4) & 0x0F)
+#define   C_028C14_S12_Y                                              0xFFFFFF0F
+#define   S_028C14_S13_X(x)                                           (((x) & 0x0F) << 8)
+#define   G_028C14_S13_X(x)                                           (((x) >> 8) & 0x0F)
+#define   C_028C14_S13_X                                              0xFFFFF0FF
+#define   S_028C14_S13_Y(x)                                           (((x) & 0x0F) << 12)
+#define   G_028C14_S13_Y(x)                                           (((x) >> 12) & 0x0F)
+#define   C_028C14_S13_Y                                              0xFFFF0FFF
+#define   S_028C14_S14_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C14_S14_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C14_S14_X                                              0xFFF0FFFF
+#define   S_028C14_S14_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C14_S14_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C14_S14_Y                                              0xFF0FFFFF
+#define   S_028C14_S15_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C14_S15_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C14_S15_X                                              0xF0FFFFFF
+#define   S_028C14_S15_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C14_S15_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C14_S15_Y                                              0x0FFFFFFF
+#define R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0                      0x028C18
+#define   S_028C18_S0_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C18_S0_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C18_S0_X                                               0xFFFFFFF0
+#define   S_028C18_S0_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C18_S0_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C18_S0_Y                                               0xFFFFFF0F
+#define   S_028C18_S1_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C18_S1_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C18_S1_X                                               0xFFFFF0FF
+#define   S_028C18_S1_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C18_S1_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C18_S1_Y                                               0xFFFF0FFF
+#define   S_028C18_S2_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C18_S2_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C18_S2_X                                               0xFFF0FFFF
+#define   S_028C18_S2_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C18_S2_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C18_S2_Y                                               0xFF0FFFFF
+#define   S_028C18_S3_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C18_S3_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C18_S3_X                                               0xF0FFFFFF
+#define   S_028C18_S3_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C18_S3_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C18_S3_Y                                               0x0FFFFFFF
+#define R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1                      0x028C1C
+#define   S_028C1C_S4_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C1C_S4_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C1C_S4_X                                               0xFFFFFFF0
+#define   S_028C1C_S4_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C1C_S4_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C1C_S4_Y                                               0xFFFFFF0F
+#define   S_028C1C_S5_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C1C_S5_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C1C_S5_X                                               0xFFFFF0FF
+#define   S_028C1C_S5_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C1C_S5_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C1C_S5_Y                                               0xFFFF0FFF
+#define   S_028C1C_S6_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C1C_S6_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C1C_S6_X                                               0xFFF0FFFF
+#define   S_028C1C_S6_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C1C_S6_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C1C_S6_Y                                               0xFF0FFFFF
+#define   S_028C1C_S7_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C1C_S7_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C1C_S7_X                                               0xF0FFFFFF
+#define   S_028C1C_S7_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C1C_S7_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C1C_S7_Y                                               0x0FFFFFFF
+#define R_028C20_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2                      0x028C20
+#define   S_028C20_S8_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C20_S8_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C20_S8_X                                               0xFFFFFFF0
+#define   S_028C20_S8_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C20_S8_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C20_S8_Y                                               0xFFFFFF0F
+#define   S_028C20_S9_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C20_S9_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C20_S9_X                                               0xFFFFF0FF
+#define   S_028C20_S9_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C20_S9_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C20_S9_Y                                               0xFFFF0FFF
+#define   S_028C20_S10_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C20_S10_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C20_S10_X                                              0xFFF0FFFF
+#define   S_028C20_S10_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C20_S10_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C20_S10_Y                                              0xFF0FFFFF
+#define   S_028C20_S11_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C20_S11_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C20_S11_X                                              0xF0FFFFFF
+#define   S_028C20_S11_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C20_S11_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C20_S11_Y                                              0x0FFFFFFF
+#define R_028C24_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3                      0x028C24
+#define   S_028C24_S12_X(x)                                           (((x) & 0x0F) << 0)
+#define   G_028C24_S12_X(x)                                           (((x) >> 0) & 0x0F)
+#define   C_028C24_S12_X                                              0xFFFFFFF0
+#define   S_028C24_S12_Y(x)                                           (((x) & 0x0F) << 4)
+#define   G_028C24_S12_Y(x)                                           (((x) >> 4) & 0x0F)
+#define   C_028C24_S12_Y                                              0xFFFFFF0F
+#define   S_028C24_S13_X(x)                                           (((x) & 0x0F) << 8)
+#define   G_028C24_S13_X(x)                                           (((x) >> 8) & 0x0F)
+#define   C_028C24_S13_X                                              0xFFFFF0FF
+#define   S_028C24_S13_Y(x)                                           (((x) & 0x0F) << 12)
+#define   G_028C24_S13_Y(x)                                           (((x) >> 12) & 0x0F)
+#define   C_028C24_S13_Y                                              0xFFFF0FFF
+#define   S_028C24_S14_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C24_S14_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C24_S14_X                                              0xFFF0FFFF
+#define   S_028C24_S14_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C24_S14_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C24_S14_Y                                              0xFF0FFFFF
+#define   S_028C24_S15_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C24_S15_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C24_S15_X                                              0xF0FFFFFF
+#define   S_028C24_S15_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C24_S15_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C24_S15_Y                                              0x0FFFFFFF
+#define R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0                      0x028C28
+#define   S_028C28_S0_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C28_S0_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C28_S0_X                                               0xFFFFFFF0
+#define   S_028C28_S0_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C28_S0_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C28_S0_Y                                               0xFFFFFF0F
+#define   S_028C28_S1_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C28_S1_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C28_S1_X                                               0xFFFFF0FF
+#define   S_028C28_S1_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C28_S1_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C28_S1_Y                                               0xFFFF0FFF
+#define   S_028C28_S2_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C28_S2_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C28_S2_X                                               0xFFF0FFFF
+#define   S_028C28_S2_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C28_S2_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C28_S2_Y                                               0xFF0FFFFF
+#define   S_028C28_S3_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C28_S3_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C28_S3_X                                               0xF0FFFFFF
+#define   S_028C28_S3_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C28_S3_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C28_S3_Y                                               0x0FFFFFFF
+#define R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1                      0x028C2C
+#define   S_028C2C_S4_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C2C_S4_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C2C_S4_X                                               0xFFFFFFF0
+#define   S_028C2C_S4_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C2C_S4_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C2C_S4_Y                                               0xFFFFFF0F
+#define   S_028C2C_S5_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C2C_S5_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C2C_S5_X                                               0xFFFFF0FF
+#define   S_028C2C_S5_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C2C_S5_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C2C_S5_Y                                               0xFFFF0FFF
+#define   S_028C2C_S6_X(x)                                            (((x) & 0x0F) << 16)
+#define   G_028C2C_S6_X(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028C2C_S6_X                                               0xFFF0FFFF
+#define   S_028C2C_S6_Y(x)                                            (((x) & 0x0F) << 20)
+#define   G_028C2C_S6_Y(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028C2C_S6_Y                                               0xFF0FFFFF
+#define   S_028C2C_S7_X(x)                                            (((x) & 0x0F) << 24)
+#define   G_028C2C_S7_X(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028C2C_S7_X                                               0xF0FFFFFF
+#define   S_028C2C_S7_Y(x)                                            (((x) & 0x0F) << 28)
+#define   G_028C2C_S7_Y(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028C2C_S7_Y                                               0x0FFFFFFF
+#define R_028C30_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2                      0x028C30
+#define   S_028C30_S8_X(x)                                            (((x) & 0x0F) << 0)
+#define   G_028C30_S8_X(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028C30_S8_X                                               0xFFFFFFF0
+#define   S_028C30_S8_Y(x)                                            (((x) & 0x0F) << 4)
+#define   G_028C30_S8_Y(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028C30_S8_Y                                               0xFFFFFF0F
+#define   S_028C30_S9_X(x)                                            (((x) & 0x0F) << 8)
+#define   G_028C30_S9_X(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028C30_S9_X                                               0xFFFFF0FF
+#define   S_028C30_S9_Y(x)                                            (((x) & 0x0F) << 12)
+#define   G_028C30_S9_Y(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028C30_S9_Y                                               0xFFFF0FFF
+#define   S_028C30_S10_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C30_S10_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C30_S10_X                                              0xFFF0FFFF
+#define   S_028C30_S10_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C30_S10_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C30_S10_Y                                              0xFF0FFFFF
+#define   S_028C30_S11_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C30_S11_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C30_S11_X                                              0xF0FFFFFF
+#define   S_028C30_S11_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C30_S11_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C30_S11_Y                                              0x0FFFFFFF
+#define R_028C34_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3                      0x028C34
+#define   S_028C34_S12_X(x)                                           (((x) & 0x0F) << 0)
+#define   G_028C34_S12_X(x)                                           (((x) >> 0) & 0x0F)
+#define   C_028C34_S12_X                                              0xFFFFFFF0
+#define   S_028C34_S12_Y(x)                                           (((x) & 0x0F) << 4)
+#define   G_028C34_S12_Y(x)                                           (((x) >> 4) & 0x0F)
+#define   C_028C34_S12_Y                                              0xFFFFFF0F
+#define   S_028C34_S13_X(x)                                           (((x) & 0x0F) << 8)
+#define   G_028C34_S13_X(x)                                           (((x) >> 8) & 0x0F)
+#define   C_028C34_S13_X                                              0xFFFFF0FF
+#define   S_028C34_S13_Y(x)                                           (((x) & 0x0F) << 12)
+#define   G_028C34_S13_Y(x)                                           (((x) >> 12) & 0x0F)
+#define   C_028C34_S13_Y                                              0xFFFF0FFF
+#define   S_028C34_S14_X(x)                                           (((x) & 0x0F) << 16)
+#define   G_028C34_S14_X(x)                                           (((x) >> 16) & 0x0F)
+#define   C_028C34_S14_X                                              0xFFF0FFFF
+#define   S_028C34_S14_Y(x)                                           (((x) & 0x0F) << 20)
+#define   G_028C34_S14_Y(x)                                           (((x) >> 20) & 0x0F)
+#define   C_028C34_S14_Y                                              0xFF0FFFFF
+#define   S_028C34_S15_X(x)                                           (((x) & 0x0F) << 24)
+#define   G_028C34_S15_X(x)                                           (((x) >> 24) & 0x0F)
+#define   C_028C34_S15_X                                              0xF0FFFFFF
+#define   S_028C34_S15_Y(x)                                           (((x) & 0x0F) << 28)
+#define   G_028C34_S15_Y(x)                                           (((x) >> 28) & 0x0F)
+#define   C_028C34_S15_Y                                              0x0FFFFFFF
+#define R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0                                0x028C38
+#define   S_028C38_AA_MASK_X0Y0(x)                                    (((x) & 0xFFFF) << 0)
+#define   G_028C38_AA_MASK_X0Y0(x)                                    (((x) >> 0) & 0xFFFF)
+#define   C_028C38_AA_MASK_X0Y0                                       0xFFFF0000
+#define   S_028C38_AA_MASK_X1Y0(x)                                    (((x) & 0xFFFF) << 16)
+#define   G_028C38_AA_MASK_X1Y0(x)                                    (((x) >> 16) & 0xFFFF)
+#define   C_028C38_AA_MASK_X1Y0                                       0x0000FFFF
+#define R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1                                0x028C3C
+#define   S_028C3C_AA_MASK_X0Y1(x)                                    (((x) & 0xFFFF) << 0)
+#define   G_028C3C_AA_MASK_X0Y1(x)                                    (((x) >> 0) & 0xFFFF)
+#define   C_028C3C_AA_MASK_X0Y1                                       0xFFFF0000
+#define   S_028C3C_AA_MASK_X1Y1(x)                                    (((x) & 0xFFFF) << 16)
+#define   G_028C3C_AA_MASK_X1Y1(x)                                    (((x) >> 16) & 0xFFFF)
+#define   C_028C3C_AA_MASK_X1Y1                                       0x0000FFFF
+#define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL                            0x028C58
+#define   S_028C58_VTX_REUSE_DEPTH(x)                                 (((x) & 0xFF) << 0)
+#define   G_028C58_VTX_REUSE_DEPTH(x)                                 (((x) >> 0) & 0xFF)
+#define   C_028C58_VTX_REUSE_DEPTH                                    0xFFFFFF00
+#define R_028C5C_VGT_OUT_DEALLOC_CNTL                                   0x028C5C
+#define   S_028C5C_DEALLOC_DIST(x)                                    (((x) & 0x7F) << 0)
+#define   G_028C5C_DEALLOC_DIST(x)                                    (((x) >> 0) & 0x7F)
+#define   C_028C5C_DEALLOC_DIST                                       0xFFFFFF80
+#define R_028C60_CB_COLOR0_BASE                                         0x028C60
+#define R_028C64_CB_COLOR0_PITCH                                        0x028C64
+#define   S_028C64_TILE_MAX(x)                                        (((x) & 0x7FF) << 0)
+#define   G_028C64_TILE_MAX(x)                                        (((x) >> 0) & 0x7FF)
+#define   C_028C64_TILE_MAX                                           0xFFFFF800
+#define R_028C68_CB_COLOR0_SLICE                                        0x028C68
+#define   S_028C68_TILE_MAX(x)                                        (((x) & 0x3FFFFF) << 0)
+#define   G_028C68_TILE_MAX(x)                                        (((x) >> 0) & 0x3FFFFF)
+#define   C_028C68_TILE_MAX                                           0xFFC00000
+#define R_028C6C_CB_COLOR0_VIEW                                         0x028C6C
+#define   S_028C6C_SLICE_START(x)                                     (((x) & 0x7FF) << 0)
+#define   G_028C6C_SLICE_START(x)                                     (((x) >> 0) & 0x7FF)
+#define   C_028C6C_SLICE_START                                        0xFFFFF800
+#define   S_028C6C_SLICE_MAX(x)                                       (((x) & 0x7FF) << 13)
+#define   G_028C6C_SLICE_MAX(x)                                       (((x) >> 13) & 0x7FF)
+#define   C_028C6C_SLICE_MAX                                          0xFF001FFF
+#define R_028C70_CB_COLOR0_INFO                                         0x028C70
+#define   S_028C70_ENDIAN(x)                                          (((x) & 0x03) << 0)
+#define   G_028C70_ENDIAN(x)                                          (((x) >> 0) & 0x03)
+#define   C_028C70_ENDIAN                                             0xFFFFFFFC
+#define     V_028C70_ENDIAN_NONE                                    0x00
+#define     V_028C70_ENDIAN_8IN16                                   0x01
+#define     V_028C70_ENDIAN_8IN32                                   0x02
+#define     V_028C70_ENDIAN_8IN64                                   0x03
+#define   S_028C70_FORMAT(x)                                          (((x) & 0x1F) << 2)
+#define   G_028C70_FORMAT(x)                                          (((x) >> 2) & 0x1F)
+#define   C_028C70_FORMAT                                             0xFFFFFF83
+#define     V_028C70_COLOR_INVALID                                  0x00
+#define     V_028C70_COLOR_8                                        0x01
+#define     V_028C70_COLOR_16                                       0x02
+#define     V_028C70_COLOR_8_8                                      0x03
+#define     V_028C70_COLOR_32                                       0x04
+#define     V_028C70_COLOR_16_16                                    0x05
+#define     V_028C70_COLOR_10_11_11                                 0x06
+#define     V_028C70_COLOR_11_11_10                                 0x07
+#define     V_028C70_COLOR_10_10_10_2                               0x08
+#define     V_028C70_COLOR_2_10_10_10                               0x09
+#define     V_028C70_COLOR_8_8_8_8                                  0x0A
+#define     V_028C70_COLOR_32_32                                    0x0B
+#define     V_028C70_COLOR_16_16_16_16                              0x0C
+#define     V_028C70_COLOR_32_32_32_32                              0x0E
+#define     V_028C70_COLOR_5_6_5                                    0x10
+#define     V_028C70_COLOR_1_5_5_5                                  0x11
+#define     V_028C70_COLOR_5_5_5_1                                  0x12
+#define     V_028C70_COLOR_4_4_4_4                                  0x13
+#define     V_028C70_COLOR_8_24                                     0x14
+#define     V_028C70_COLOR_24_8                                     0x15
+#define     V_028C70_COLOR_X24_8_32_FLOAT                           0x16
+#define   S_028C70_LINEAR_GENERAL(x)                                  (((x) & 0x1) << 7)
+#define   G_028C70_LINEAR_GENERAL(x)                                  (((x) >> 7) & 0x1)
+#define   C_028C70_LINEAR_GENERAL                                     0xFFFFFF7F
+#define   S_028C70_NUMBER_TYPE(x)                                     (((x) & 0x07) << 8)
+#define   G_028C70_NUMBER_TYPE(x)                                     (((x) >> 8) & 0x07)
+#define   C_028C70_NUMBER_TYPE                                        0xFFFFF8FF
+#define     V_028C70_NUMBER_UNORM                                   0x00
+#define     V_028C70_NUMBER_SNORM                                   0x01
+#define     V_028C70_NUMBER_UINT                                    0x04
+#define     V_028C70_NUMBER_SINT                                    0x05
+#define     V_028C70_NUMBER_SRGB                                    0x06
+#define     V_028C70_NUMBER_FLOAT                                   0x07
+#define   S_028C70_COMP_SWAP(x)                                       (((x) & 0x03) << 11)
+#define   G_028C70_COMP_SWAP(x)                                       (((x) >> 11) & 0x03)
+#define   C_028C70_COMP_SWAP                                          0xFFFFE7FF
+#define     V_028C70_SWAP_STD                                       0x00
+#define     V_028C70_SWAP_ALT                                       0x01
+#define     V_028C70_SWAP_STD_REV                                   0x02
+#define     V_028C70_SWAP_ALT_REV                                   0x03
+#define   S_028C70_FAST_CLEAR(x)                                      (((x) & 0x1) << 13)
+#define   G_028C70_FAST_CLEAR(x)                                      (((x) >> 13) & 0x1)
+#define   C_028C70_FAST_CLEAR                                         0xFFFFDFFF
+#define   S_028C70_COMPRESSION(x)                                     (((x) & 0x1) << 14)
+#define   G_028C70_COMPRESSION(x)                                     (((x) >> 14) & 0x1)
+#define   C_028C70_COMPRESSION                                        0xFFFFBFFF
+#define   S_028C70_BLEND_CLAMP(x)                                     (((x) & 0x1) << 15)
+#define   G_028C70_BLEND_CLAMP(x)                                     (((x) >> 15) & 0x1)
+#define   C_028C70_BLEND_CLAMP                                        0xFFFF7FFF
+#define   S_028C70_BLEND_BYPASS(x)                                    (((x) & 0x1) << 16)
+#define   G_028C70_BLEND_BYPASS(x)                                    (((x) >> 16) & 0x1)
+#define   C_028C70_BLEND_BYPASS                                       0xFFFEFFFF
+#define   S_028C70_SIMPLE_FLOAT(x)                                    (((x) & 0x1) << 17)
+#define   G_028C70_SIMPLE_FLOAT(x)                                    (((x) >> 17) & 0x1)
+#define   C_028C70_SIMPLE_FLOAT                                       0xFFFDFFFF
+#define   S_028C70_ROUND_MODE(x)                                      (((x) & 0x1) << 18)
+#define   G_028C70_ROUND_MODE(x)                                      (((x) >> 18) & 0x1)
+#define   C_028C70_ROUND_MODE                                         0xFFFBFFFF
+#define   S_028C70_CMASK_IS_LINEAR(x)                                 (((x) & 0x1) << 19)
+#define   G_028C70_CMASK_IS_LINEAR(x)                                 (((x) >> 19) & 0x1)
+#define   C_028C70_CMASK_IS_LINEAR                                    0xFFF7FFFF
+#define   S_028C70_BLEND_OPT_DONT_RD_DST(x)                           (((x) & 0x07) << 20)
+#define   G_028C70_BLEND_OPT_DONT_RD_DST(x)                           (((x) >> 20) & 0x07)
+#define   C_028C70_BLEND_OPT_DONT_RD_DST                              0xFF8FFFFF
+#define     V_028C70_FORCE_OPT_AUTO                                 0x00
+#define     V_028C70_FORCE_OPT_DISABLE                              0x01
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_0                    0x02
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_0                  0x03
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_0                 0x04
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_1                    0x05
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_1                  0x06
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_1                 0x07
+#define   S_028C70_BLEND_OPT_DISCARD_PIXEL(x)                         (((x) & 0x07) << 23)
+#define   G_028C70_BLEND_OPT_DISCARD_PIXEL(x)                         (((x) >> 23) & 0x07)
+#define   C_028C70_BLEND_OPT_DISCARD_PIXEL                            0xFC7FFFFF
+#define     V_028C70_FORCE_OPT_AUTO                                 0x00
+#define     V_028C70_FORCE_OPT_DISABLE                              0x01
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_0                    0x02
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_0                  0x03
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_0                 0x04
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_1                    0x05
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_1                  0x06
+#define     V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_1                 0x07
+#define R_028C74_CB_COLOR0_ATTRIB                                       0x028C74
+#define   S_028C74_TILE_MODE_INDEX(x)                                 (((x) & 0x1F) << 0)
+#define   G_028C74_TILE_MODE_INDEX(x)                                 (((x) >> 0) & 0x1F)
+#define   C_028C74_TILE_MODE_INDEX                                    0xFFFFFFE0
+#define   S_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) & 0x1F) << 5)
+#define   G_028C74_FMASK_TILE_MODE_INDEX(x)                           (((x) >> 5) & 0x1F)
+#define   C_028C74_FMASK_TILE_MODE_INDEX                              0xFFFFFC1F
+#define   S_028C74_NUM_SAMPLES(x)                                     (((x) & 0x07) << 12)
+#define   G_028C74_NUM_SAMPLES(x)                                     (((x) >> 12) & 0x07)
+#define   C_028C74_NUM_SAMPLES                                        0xFFFF8FFF
+#define   S_028C74_NUM_FRAGMENTS(x)                                   (((x) & 0x03) << 15)
+#define   G_028C74_NUM_FRAGMENTS(x)                                   (((x) >> 15) & 0x03)
+#define   C_028C74_NUM_FRAGMENTS                                      0xFFFE7FFF
+#define   S_028C74_FORCE_DST_ALPHA_1(x)                               (((x) & 0x1) << 17)
+#define   G_028C74_FORCE_DST_ALPHA_1(x)                               (((x) >> 17) & 0x1)
+#define   C_028C74_FORCE_DST_ALPHA_1                                  0xFFFDFFFF
+#define R_028C7C_CB_COLOR0_CMASK                                        0x028C7C
+#define R_028C80_CB_COLOR0_CMASK_SLICE                                  0x028C80
+#define   S_028C80_TILE_MAX(x)                                        (((x) & 0x3FFF) << 0)
+#define   G_028C80_TILE_MAX(x)                                        (((x) >> 0) & 0x3FFF)
+#define   C_028C80_TILE_MAX                                           0xFFFFC000
+#define R_028C84_CB_COLOR0_FMASK                                        0x028C84
+#define R_028C88_CB_COLOR0_FMASK_SLICE                                  0x028C88
+#define   S_028C88_TILE_MAX(x)                                        (((x) & 0x3FFFFF) << 0)
+#define   G_028C88_TILE_MAX(x)                                        (((x) >> 0) & 0x3FFFFF)
+#define   C_028C88_TILE_MAX                                           0xFFC00000
+#define R_028C8C_CB_COLOR0_CLEAR_WORD0                                  0x028C8C
+#define R_028C90_CB_COLOR0_CLEAR_WORD1                                  0x028C90
+#define R_028C9C_CB_COLOR1_BASE                                         0x028C9C
+#define R_028CA0_CB_COLOR1_PITCH                                        0x028CA0
+#define R_028CA4_CB_COLOR1_SLICE                                        0x028CA4
+#define R_028CA8_CB_COLOR1_VIEW                                         0x028CA8
+#define R_028CAC_CB_COLOR1_INFO                                         0x028CAC
+#define R_028CB0_CB_COLOR1_ATTRIB                                       0x028CB0
+#define R_028CB8_CB_COLOR1_CMASK                                        0x028CB8
+#define R_028CBC_CB_COLOR1_CMASK_SLICE                                  0x028CBC
+#define R_028CC0_CB_COLOR1_FMASK                                        0x028CC0
+#define R_028CC4_CB_COLOR1_FMASK_SLICE                                  0x028CC4
+#define R_028CC8_CB_COLOR1_CLEAR_WORD0                                  0x028CC8
+#define R_028CCC_CB_COLOR1_CLEAR_WORD1                                  0x028CCC
+#define R_028CD8_CB_COLOR2_BASE                                         0x028CD8
+#define R_028CDC_CB_COLOR2_PITCH                                        0x028CDC
+#define R_028CE0_CB_COLOR2_SLICE                                        0x028CE0
+#define R_028CE4_CB_COLOR2_VIEW                                         0x028CE4
+#define R_028CE8_CB_COLOR2_INFO                                         0x028CE8
+#define R_028CEC_CB_COLOR2_ATTRIB                                       0x028CEC
+#define R_028CF4_CB_COLOR2_CMASK                                        0x028CF4
+#define R_028CF8_CB_COLOR2_CMASK_SLICE                                  0x028CF8
+#define R_028CFC_CB_COLOR2_FMASK                                        0x028CFC
+#define R_028D00_CB_COLOR2_FMASK_SLICE                                  0x028D00
+#define R_028D04_CB_COLOR2_CLEAR_WORD0                                  0x028D04
+#define R_028D08_CB_COLOR2_CLEAR_WORD1                                  0x028D08
+#define R_028D14_CB_COLOR3_BASE                                         0x028D14
+#define R_028D18_CB_COLOR3_PITCH                                        0x028D18
+#define R_028D1C_CB_COLOR3_SLICE                                        0x028D1C
+#define R_028D20_CB_COLOR3_VIEW                                         0x028D20
+#define R_028D24_CB_COLOR3_INFO                                         0x028D24
+#define R_028D28_CB_COLOR3_ATTRIB                                       0x028D28
+#define R_028D30_CB_COLOR3_CMASK                                        0x028D30
+#define R_028D34_CB_COLOR3_CMASK_SLICE                                  0x028D34
+#define R_028D38_CB_COLOR3_FMASK                                        0x028D38
+#define R_028D3C_CB_COLOR3_FMASK_SLICE                                  0x028D3C
+#define R_028D40_CB_COLOR3_CLEAR_WORD0                                  0x028D40
+#define R_028D44_CB_COLOR3_CLEAR_WORD1                                  0x028D44
+#define R_028D50_CB_COLOR4_BASE                                         0x028D50
+#define R_028D54_CB_COLOR4_PITCH                                        0x028D54
+#define R_028D58_CB_COLOR4_SLICE                                        0x028D58
+#define R_028D5C_CB_COLOR4_VIEW                                         0x028D5C
+#define R_028D60_CB_COLOR4_INFO                                         0x028D60
+#define R_028D64_CB_COLOR4_ATTRIB                                       0x028D64
+#define R_028D6C_CB_COLOR4_CMASK                                        0x028D6C
+#define R_028D70_CB_COLOR4_CMASK_SLICE                                  0x028D70
+#define R_028D74_CB_COLOR4_FMASK                                        0x028D74
+#define R_028D78_CB_COLOR4_FMASK_SLICE                                  0x028D78
+#define R_028D7C_CB_COLOR4_CLEAR_WORD0                                  0x028D7C
+#define R_028D80_CB_COLOR4_CLEAR_WORD1                                  0x028D80
+#define R_028D8C_CB_COLOR5_BASE                                         0x028D8C
+#define R_028D90_CB_COLOR5_PITCH                                        0x028D90
+#define R_028D94_CB_COLOR5_SLICE                                        0x028D94
+#define R_028D98_CB_COLOR5_VIEW                                         0x028D98
+#define R_028D9C_CB_COLOR5_INFO                                         0x028D9C
+#define R_028DA0_CB_COLOR5_ATTRIB                                       0x028DA0
+#define R_028DA8_CB_COLOR5_CMASK                                        0x028DA8
+#define R_028DAC_CB_COLOR5_CMASK_SLICE                                  0x028DAC
+#define R_028DB0_CB_COLOR5_FMASK                                        0x028DB0
+#define R_028DB4_CB_COLOR5_FMASK_SLICE                                  0x028DB4
+#define R_028DB8_CB_COLOR5_CLEAR_WORD0                                  0x028DB8
+#define R_028DBC_CB_COLOR5_CLEAR_WORD1                                  0x028DBC
+#define R_028DC8_CB_COLOR6_BASE                                         0x028DC8
+#define R_028DCC_CB_COLOR6_PITCH                                        0x028DCC
+#define R_028DD0_CB_COLOR6_SLICE                                        0x028DD0
+#define R_028DD4_CB_COLOR6_VIEW                                         0x028DD4
+#define R_028DD8_CB_COLOR6_INFO                                         0x028DD8
+#define R_028DDC_CB_COLOR6_ATTRIB                                       0x028DDC
+#define R_028DE4_CB_COLOR6_CMASK                                        0x028DE4
+#define R_028DE8_CB_COLOR6_CMASK_SLICE                                  0x028DE8
+#define R_028DEC_CB_COLOR6_FMASK                                        0x028DEC
+#define R_028DF0_CB_COLOR6_FMASK_SLICE                                  0x028DF0
+#define R_028DF4_CB_COLOR6_CLEAR_WORD0                                  0x028DF4
+#define R_028DF8_CB_COLOR6_CLEAR_WORD1                                  0x028DF8
+#define R_028E04_CB_COLOR7_BASE                                         0x028E04
+#define R_028E08_CB_COLOR7_PITCH                                        0x028E08
+#define R_028E0C_CB_COLOR7_SLICE                                        0x028E0C
+#define R_028E10_CB_COLOR7_VIEW                                         0x028E10
+#define R_028E14_CB_COLOR7_INFO                                         0x028E14
+#define R_028E18_CB_COLOR7_ATTRIB                                       0x028E18
+#define R_028E20_CB_COLOR7_CMASK                                        0x028E20
+#define R_028E24_CB_COLOR7_CMASK_SLICE                                  0x028E24
+#define R_028E28_CB_COLOR7_FMASK                                        0x028E28
+#define R_028E2C_CB_COLOR7_FMASK_SLICE                                  0x028E2C
+#define R_028E30_CB_COLOR7_CLEAR_WORD0                                  0x028E30
+#define R_028E34_CB_COLOR7_CLEAR_WORD1                                  0x028E34
+
+#endif /* _SID_H */
+
diff --git a/src/gallium/targets/dri-radeonsi/Makefile b/src/gallium/targets/dri-radeonsi/Makefile
new file mode 100644 (file)
index 0000000..f76d71b
--- /dev/null
@@ -0,0 +1,26 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = radeonsi_dri.so
+
+PIPE_DRIVERS = \
+       $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a \
+       $(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \
+       $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+       $(TOP)/src/gallium/drivers/trace/libtrace.a \
+       $(TOP)/src/gallium/drivers/rbug/librbug.a \
+       $(TOP)/src/gallium/drivers/noop/libnoop.a
+
+C_SOURCES = \
+       target.c \
+       $(COMMON_GALLIUM_SOURCES) \
+       $(DRIVER_SOURCES)
+
+DRIVER_DEFINES = \
+       -DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_NOOP
+
+include ../Makefile.dri
+
+DRI_LIB_DEPS += -ldrm_radeon
+
+symlinks:
diff --git a/src/gallium/targets/dri-radeonsi/SConscript b/src/gallium/targets/dri-radeonsi/SConscript
new file mode 100644 (file)
index 0000000..2b5c151
--- /dev/null
@@ -0,0 +1,25 @@
+Import('*')
+
+env = drienv.Clone()
+
+env.Append(CPPDEFINES = ['GALLIUM_RBUG', 'GALLIUM_TRACE'])
+
+env.Prepend(LIBS = [
+    st_dri,
+    radeonwinsys,
+    radeonsi,
+    trace,
+    rbug,
+    mesa,
+    glsl,
+    gallium,
+    COMMON_DRI_DRM_OBJECTS
+])
+
+module = env.SharedLibrary(
+    target ='radeonsi_dri.so',
+    source = 'target.c',
+    SHLIBPREFIX = '',
+)
+
+env.Alias('dri-radeonsi', module)
diff --git a/src/gallium/targets/dri-radeonsi/target.c b/src/gallium/targets/dri-radeonsi/target.c
new file mode 100644 (file)
index 0000000..1350ba2
--- /dev/null
@@ -0,0 +1,40 @@
+#include "state_tracker/drm_driver.h"
+#include "target-helpers/inline_debug_helper.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "radeonsi/radeonsi_public.h"
+
+static struct pipe_screen *create_screen(int fd)
+{
+   struct radeon_winsys *radeon;
+   struct pipe_screen *screen;
+
+   radeon = radeon_drm_winsys_create(fd);
+   if (!radeon)
+      return NULL;
+
+   screen = radeonsi_screen_create(radeon);
+   if (!screen)
+      return NULL;
+
+   screen = debug_screen_wrap(screen);
+
+   return screen;
+}
+
+static const struct drm_conf_ret throttle_ret = {
+   .type = DRM_CONF_INT,
+   .val.val_int = 2,
+};
+
+static const struct drm_conf_ret *drm_configuration(enum drm_conf conf)
+{
+   switch (conf) {
+   case DRM_CONF_THROTTLE:
+      return &throttle_ret;
+   default:
+      break;
+   }
+   return NULL;
+}
+
+DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, drm_configuration)
index 21b6dc27921beb674d339dc4b88b24f46c435a3a..99c08120d4b3ff621ec3afc023c2d98f2d437071 100644 (file)
@@ -65,6 +65,9 @@ endif
 ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
 LOCAL_CFLAGS += -D_EGL_PIPE_R600=1
 endif
+ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -D_EGL_PIPE_RADEONSI=1
+endif
 ifneq ($(filter vmwgfx, $(MESA_GPU_DRIVERS)),)
 LOCAL_CFLAGS += -D_EGL_PIPE_VMWGFX=1
 endif
index 02a55eef160ee74c474942fe87151c65f3fea8a4..2c6656bce5e90547a1d1cbd196acf8821914111b 100644 (file)
@@ -130,6 +130,17 @@ egl_SYS += -ldrm_radeon
 endif
 endif
 
+# radeonsi
+ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
+ifneq ($(findstring radeonsi,$(GALLIUM_DRIVERS_DIRS)),)
+egl_CPPFLAGS += -D_EGL_PIPE_RADEONSI=1
+egl_LIBS += \
+       $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+       $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a
+egl_SYS += -ldrm_radeon
+endif
+endif
+
 # vmwgfx
 ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),)
 egl_CPPFLAGS += -D_EGL_PIPE_VMWGFX=1
index e657e9f2ff5bf822e6e90fc1946791c7a09afb96..d831b1107641b985894e005ea61e500af518212f 100644 (file)
@@ -98,11 +98,12 @@ if env['HAVE_DRM']:
         ])
 
     if env['HAVE_DRM_RADEON']:
-        env.Append(CPPDEFINES = ['_EGL_PIPE_R300', '_EGL_PIPE_R600'])
+        env.Append(CPPDEFINES = ['_EGL_PIPE_R300', '_EGL_PIPE_R600', '_EGL_PIPE_RADEONSI'])
         env.Prepend(LIBS = [
             radeonwinsys,
             r300,
             r600,
+            radeonsi,
         ])
 
     env.Append(CPPDEFINES = ['_EGL_PIPE_VMWGFX'])
index 887bcfd12c4862e275ee7264d0f01088bde14773..407c6a8f236c497fca605b106b4d634150403e30 100644 (file)
@@ -40,6 +40,8 @@
 #include "r300/r300_public.h"
 /* for r600 */
 #include "r600/r600_public.h"
+/* for radeonsi */
+#include "radeonsi/radeonsi_public.h"
 /* for vmwgfx */
 #include "svga/drm/svga_drm_public.h"
 #include "svga/svga_public.h"
@@ -131,6 +133,29 @@ pipe_r600_create_screen(int fd)
 #endif
 }
 
+static struct pipe_screen *
+pipe_radeonsi_create_screen(int fd)
+{
+#if _EGL_PIPE_RADEONSI
+   struct radeon_winsys *rw;
+   struct pipe_screen *screen;
+
+   rw = radeon_drm_winsys_create(fd);
+   if (!rw)
+      return NULL;
+
+   screen = radeonsi_screen_create(rw);
+   if (!screen)
+      return NULL;
+
+   screen = debug_screen_wrap(screen);
+
+   return screen;
+#else
+   return NULL;
+#endif
+}
+
 static struct pipe_screen *
 pipe_vmwgfx_create_screen(int fd)
 {
@@ -165,6 +190,8 @@ egl_pipe_create_drm_screen(const char *name, int fd)
       return pipe_r300_create_screen(fd);
    else if (strcmp(name, "r600") == 0)
       return pipe_r600_create_screen(fd);
+   else if (strcmp(name, "radeonsi") == 0)
+      return pipe_radeonsi_create_screen(fd);
    else if (strcmp(name, "vmwgfx") == 0)
       return pipe_vmwgfx_create_screen(fd);
    else
index 2737b7986cbfd3ccdfecb8b4cd2c796d4c25c26d..50970f9058e69ba0f0289d5a86d528770e525f3e 100644 (file)
@@ -80,6 +80,12 @@ r600_LIBS = \
        $(TOP)/src/gallium/drivers/r600/libr600.a
 r600_SYS += -ldrm_radeon
 
+# radeonsi pipe driver
+radeonsi_LIBS = \
+       $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+       $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a
+radeonsi_SYS += -ldrm_radeon
+
 # vmwgfx pipe driver
 vmwgfx_LIBS = \
        $(TOP)/src/gallium/winsys/svga/drm/libsvgadrm.a \
@@ -126,6 +132,13 @@ pipe_SOURCES += pipe_r600.c
 endif
 endif
 
+ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),)
+ifneq ($(findstring radeonsi,$(GALLIUM_DRIVERS_DIRS)),)
+_pipe_TARGETS_CC += $(PIPE_PREFIX)radeonsi.so
+pipe_SOURCES += pipe_radeonsi.c
+endif
+endif
+
 ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),)
 _pipe_TARGETS_CC += $(PIPE_PREFIX)vmwgfx.so
 pipe_SOURCES += pipe_vmwgfx.c
diff --git a/src/gallium/targets/gbm/pipe_radeonsi.c b/src/gallium/targets/gbm/pipe_radeonsi.c
new file mode 100644 (file)
index 0000000..bb57118
--- /dev/null
@@ -0,0 +1,26 @@
+#include "state_tracker/drm_driver.h"
+#include "target-helpers/inline_debug_helper.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "radeonsi/radeonsi_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct radeon_winsys *rw;
+   struct pipe_screen *screen;
+
+   rw = radeon_drm_winsys_create(fd);
+   if (!rw)
+      return NULL;
+
+   screen = radeonsi_screen_create(rw);
+   if (!screen)
+      return NULL;
+
+   screen = debug_screen_wrap(screen);
+
+   return screen;
+}
+
+PUBLIC
+DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/xorg-radeonsi/Makefile b/src/gallium/targets/xorg-radeonsi/Makefile
new file mode 100644 (file)
index 0000000..af5cf88
--- /dev/null
@@ -0,0 +1,24 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = radeonsi_drv.so
+
+C_SOURCES = \
+       target.c \
+       xorg.c
+
+DRIVER_DEFINES = \
+       -DHAVE_CONFIG_H -DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_GALAHAD
+
+DRIVER_PIPES = \
+       $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \
+       $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a \
+       $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \
+       $(TOP)/src/gallium/drivers/galahad/libgalahad.a \
+       $(TOP)/src/gallium/drivers/trace/libtrace.a \
+       $(TOP)/src/gallium/drivers/rbug/librbug.a
+
+DRIVER_LINKS = \
+       $(shell $(PKG_CONFIG) --libs libdrm)
+
+include ../Makefile.xorg
diff --git a/src/gallium/targets/xorg-radeonsi/target.c b/src/gallium/targets/xorg-radeonsi/target.c
new file mode 100644 (file)
index 0000000..c023c68
--- /dev/null
@@ -0,0 +1,26 @@
+
+#include "target-helpers/inline_debug_helper.h"
+#include "state_tracker/drm_driver.h"
+#include "radeon/drm/radeon_drm_public.h"
+#include "radeonsi/radeonsi_public.h"
+
+static struct pipe_screen *
+create_screen(int fd)
+{
+   struct radeon_winsys *sws;
+   struct pipe_screen *screen;
+
+   sws = radeon_drm_winsys_create(fd);
+   if (!sws)
+      return NULL;
+
+   screen = radeonsi_screen_create(sws);
+   if (!screen)
+      return NULL;
+
+   screen = debug_screen_wrap(screen);
+
+   return screen;
+}
+
+DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, NULL)
diff --git a/src/gallium/targets/xorg-radeonsi/xorg.c b/src/gallium/targets/xorg-radeonsi/xorg.c
new file mode 100644 (file)
index 0000000..3db9f31
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ *
+ * Author: Alan Hourihane <alanh@tungstengraphics.com>
+ * Author: Jakob Bornecrantz <wallbraker@gmail.com>
+ * Author: Corbin Simpson <MostAwesomedude@gmail.com>
+ *
+ */
+
+#include "../../state_trackers/xorg/xorg_winsys.h"
+
+static void radeonsi_xorg_identify(int flags);
+static Bool radeonsi_xorg_pci_probe(DriverPtr driver,
+                                int entity_num,
+                                struct pci_device *device,
+                                intptr_t match_data);
+
+static const struct pci_id_match radeonsi_xorg_device_match[] = {
+    {0x1002, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY, 0, 0, 0},
+    {0, 0, 0},
+};
+
+static SymTabRec radeonsi_xorg_chipsets[] = {
+    {PCI_MATCH_ANY, "AMD Southern Islands Graphics Chipset"},
+    {-1, NULL}
+};
+
+static PciChipsets radeonsi_xorg_pci_devices[] = {
+    {PCI_MATCH_ANY, PCI_MATCH_ANY, NULL},
+    {-1, -1, NULL}
+};
+
+static XF86ModuleVersionInfo radeonsi_xorg_version = {
+    "radeonsi",
+    MODULEVENDORSTRING,
+    MODINFOSTRING1,
+    MODINFOSTRING2,
+    XORG_VERSION_CURRENT,
+    0, 1, 0, /* major, minor, patch */
+    ABI_CLASS_VIDEODRV,
+    ABI_VIDEODRV_VERSION,
+    MOD_CLASS_VIDEODRV,
+    {0, 0, 0, 0}
+};
+
+/*
+ * Xorg driver exported structures
+ */
+
+_X_EXPORT DriverRec radeonsi_driver = {
+    1,
+    "radeonsi",
+    radeonsi_xorg_identify,
+    NULL,
+    xorg_tracker_available_options,
+    NULL,
+    0,
+    NULL,
+    radeonsi_xorg_device_match,
+    radeonsi_xorg_pci_probe
+};
+
+static MODULESETUPPROTO(radeonsi_xorg_setup);
+
+_X_EXPORT XF86ModuleData radeonsiModuleData = {
+    &radeonsi_xorg_version,
+    radeonsi_xorg_setup,
+    NULL
+};
+
+/*
+ * Xorg driver functions
+ */
+
+static pointer
+radeonsi_xorg_setup(pointer module, pointer opts, int *errmaj, int *errmin)
+{
+    static Bool setupDone = 0;
+
+    /* This module should be loaded only once, but check to be sure.
+     */
+    if (!setupDone) {
+       setupDone = 1;
+       xf86AddDriver(&radeonsi_driver, module, HaveDriverFuncs);
+
+       /*
+        * The return value must be non-NULL on success even though there
+        * is no TearDownProc.
+        */
+       return (pointer) 1;
+    } else {
+       if (errmaj)
+           *errmaj = LDR_ONCEONLY;
+       return NULL;
+    }
+}
+
+static void
+radeonsi_xorg_identify(int flags)
+{
+    xf86PrintChipsets("radeonsi", "Driver for AMD Radeon SI Gallium with KMS",
+                     radeonsi_xorg_chipsets);
+}
+
+static Bool
+radeonsi_xorg_pci_probe(DriverPtr driver,
+         int entity_num, struct pci_device *device, intptr_t match_data)
+{
+    ScrnInfoPtr scrn = NULL;
+    EntityInfoPtr entity;
+
+    scrn = xf86ConfigPciEntity(scrn, 0, entity_num, radeonsi_xorg_pci_devices,
+                              NULL, NULL, NULL, NULL, NULL);
+    if (scrn != NULL) {
+       scrn->driverVersion = 1;
+       scrn->driverName = "radeonsi";
+       scrn->name = "RADEONSI";
+       scrn->Probe = NULL;
+
+       entity = xf86GetEntityInfo(entity_num);
+
+       /* Use all the functions from the xorg tracker */
+       xorg_tracker_set_functions(scrn);
+    }
+    return scrn != NULL;
+}
index c29dca394f06db21780f42e190dbe3e816d272ce..4d343b8489bcd5d7c7bc8d258a8a3835e263998f 100644 (file)
@@ -226,6 +226,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         ws->gen = R600;
         break;
 
+#define CHIPSET(pci_id, name, family) case pci_id:
+#include "pci_ids/radeonsi_pci_ids.h"
+#undef CHIPSET
+        ws->gen = SI;
+        break;
+
     default:
         fprintf(stderr, "radeon: Invalid PCI ID.\n");
         return FALSE;
@@ -256,7 +262,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
                                   &ws->info.r300_num_z_pipes))
             return FALSE;
     }
-    else if (ws->gen == R600) {
+    else if (ws->gen >= R600) {
         if (ws->info.drm_minor >= 9 &&
             !radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_BACKENDS,
                                   "num backends",
index 6ac86bcfabb8a354058c6e9dcb383dad26dfbb6c..22983072fbb551af7205a468222ebaf66deca581 100644 (file)
@@ -35,7 +35,8 @@
 
 enum radeon_generation {
     R300,
-    R600
+    R600,
+    SI
 };
 
 struct radeon_drm_winsys {