Merge branch 'mesa_7_7_branch'
[mesa.git] / src / mesa / drivers / dri / r600 / r700_assembler.c
index 158c5fa5491d0026dfbf270e977b8bd84f9b8e20..c01b2fbb1464ccb42a85eec40a1496ebd21dba20 100644 (file)
@@ -32,6 +32,7 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "shader/prog_parameter.h"
 
 #include "radeon_debug.h"
 #include "r600_context.h"
 #define USE_CF_FOR_CONTINUE_BREAK 1
 #define USE_CF_FOR_POP_AFTER      1
 
+struct prog_instruction noise1_insts[12] = { 
+    {OPCODE_BGNSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 2, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{8, 0, 0, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 4, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{8, 0, 585, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 8, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_SGT , {{0, 0, 585, 0, 0, 0}, {8, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 1, 1, 0, 8, 1672, 0}, 1, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_IF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 7, 0, 0}, 0, 0, 0, 1, 0, 0, 0, 15, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 1755, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_ENDIF , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_MOV , {{0, 0, 1170, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {0, 0, 1, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_RET , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}, 
+    {OPCODE_ENDSUB , {{13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}, {13, 0, 1672, 0, 0, 0}}, {13, 0, 15, 0, 8, 1672, 0}, 0, 0, 0, 1, 0, 0, 0, -1, 0, 0, 0}
+};
+float noise1_const[2][4] = {
+    {0.300000f, 0.900000f, 0.500000f, 0.300000f}
+};
+
+COMPILED_SUB noise1_presub = {
+    &(noise1_insts[0]),
+    12, 
+    2, 
+    1, 
+    0, 
+    &(noise1_const[0]), 
+    SWIZZLE_X, 
+    SWIZZLE_X, 
+    SWIZZLE_X, 
+    SWIZZLE_X,
+    {0,0,0},
+    0 
+};
+
 BITS addrmode_PVSDST(PVSDST * pPVSDST)
 {
        return pPVSDST->addrmode0 | ((BITS)pPVSDST->addrmode1 << 1);
@@ -330,14 +364,14 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
     return(format);
 }
 
-unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
+unsigned int r700GetNumOperands(GLuint opcode, GLuint nIsOp3) 
 {
-    if(pAsm->D.dst.op3)
+    if(nIsOp3 > 0)
     {
         return 3;
     }
 
-    switch (pAsm->D.dst.opcode)
+    switch (opcode)
     {
     case SQ_OP2_INST_ADD:
     case SQ_OP2_INST_KILLE:
@@ -378,7 +412,7 @@ unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
         return 1;
         
     default: radeon_error(
-                   "Need instruction operand number for %x.\n", pAsm->D.dst.opcode);
+                   "Need instruction operand number for %x.\n", opcode); 
     };
 
     return 3;
@@ -500,12 +534,20 @@ int Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700
 
     pAsm->unCFflags = 0;
 
+    pAsm->presubs           = NULL;
+    pAsm->unPresubArraySize = 0;
+    pAsm->unNumPresub       = 0;
+    pAsm->unCurNumILInsts   = 0;
+
+    pAsm->unVetTexBits      = 0;
+
     return 0;
 }
 
 GLboolean IsTex(gl_inst_opcode Opcode)
 {
-    if( (OPCODE_TEX==Opcode) || (OPCODE_TXP==Opcode) || (OPCODE_TXB==Opcode) )
+    if( (OPCODE_TEX==Opcode) || (OPCODE_TXP==Opcode) || (OPCODE_TXB==Opcode) ||
+        (OPCODE_DDX==Opcode) || (OPCODE_DDY==Opcode) )
     {
         return GL_TRUE;
     }
@@ -849,6 +891,7 @@ GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
                                        GLubyte             element,
                                        GLuint              _signed,
                                        GLboolean           normalize,
+                                       GLenum              format,
                                        VTX_FETCH_METHOD  * pFetchMethod)
 {
     GLuint client_size_inbyte;
@@ -897,10 +940,21 @@ GLboolean assemble_vfetch_instruction2(r700_AssemblerBase* pAsm,
        vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
        vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;
 
-       vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
-       vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
-       vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
-       vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+       if(format == GL_BGRA)
+       {
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_Z;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_X;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+       }
+       else
+       {
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (size < 1) ? SQ_SEL_0 : SQ_SEL_X;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (size < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (size < 3) ? SQ_SEL_0 : SQ_SEL_Z;
+               vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (size < 4) ? SQ_SEL_1 : SQ_SEL_W;
+
+       }
 
        vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
     vfetch_instruction_ptr->m_Word1.f.data_format      = data_format;
@@ -1220,7 +1274,15 @@ GLboolean assemble_src(r700_AssemblerBase *pAsm,
             }
 
             pAsm->S[fld].src.rtype = SRC_REG_CONSTANT;
-            pAsm->S[fld].src.reg   = pILInst->SrcReg[src].Index;
+            if(pILInst->SrcReg[src].Index < 0)
+            {
+                WARN_ONCE("Negative register offsets not supported yet!\n");
+                pAsm->S[fld].src.reg  = 0;
+            } 
+            else
+            {
+                pAsm->S[fld].src.reg = pILInst->SrcReg[src].Index;
+            }
             break;      
         case PROGRAM_INPUT:
             setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE); 
@@ -1292,6 +1354,15 @@ GLboolean assemble_dst(r700_AssemblerBase *pAsm)
     pAsm->D.dst.writez = (pILInst->DstReg.WriteMask >> 2) & 0x1;
     pAsm->D.dst.writew = (pILInst->DstReg.WriteMask >> 3) & 0x1;
   
+    if(pILInst->SaturateMode == SATURATE_ZERO_ONE)
+    {
+        pAsm->D2.dst2.SaturateMode = 1;
+    }
+    else
+    {
+        pAsm->D2.dst2.SaturateMode = 0;
+    }
+
     return GL_TRUE;
 }
 
@@ -1364,43 +1435,65 @@ GLboolean tex_src(r700_AssemblerBase *pAsm)
             pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
             break;
         case PROGRAM_INPUT:
-            switch (pILInst->SrcReg[0].Index)
+            if(SPT_VP == pAsm->currentShaderType)
+            {
+                switch (pILInst->SrcReg[0].Index)
+                {
+                    case VERT_ATTRIB_TEX0:
+                    case VERT_ATTRIB_TEX1:
+                    case VERT_ATTRIB_TEX2:
+                    case VERT_ATTRIB_TEX3:
+                    case VERT_ATTRIB_TEX4:
+                    case VERT_ATTRIB_TEX5:
+                    case VERT_ATTRIB_TEX6:
+                    case VERT_ATTRIB_TEX7:
+                        bValidTexCoord = GL_TRUE;
+                        pAsm->S[0].src.reg   =
+                            pAsm->ucVP_AttributeMap[pILInst->SrcReg[0].Index];
+                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
+                        break;
+                }
+            }
+            else
             {
-                case FRAG_ATTRIB_WPOS:
-                case FRAG_ATTRIB_COL0:
-                case FRAG_ATTRIB_COL1:
-                case FRAG_ATTRIB_FOGC:
-                case FRAG_ATTRIB_TEX0:
-                case FRAG_ATTRIB_TEX1:
-                case FRAG_ATTRIB_TEX2:
-                case FRAG_ATTRIB_TEX3:
-                case FRAG_ATTRIB_TEX4:
-                case FRAG_ATTRIB_TEX5:
-                case FRAG_ATTRIB_TEX6:
-                case FRAG_ATTRIB_TEX7:
-                    bValidTexCoord = GL_TRUE;
+                switch (pILInst->SrcReg[0].Index)
+                {
+                    case FRAG_ATTRIB_WPOS:
+                    case FRAG_ATTRIB_COL0:
+                    case FRAG_ATTRIB_COL1:
+                    case FRAG_ATTRIB_FOGC:
+                    case FRAG_ATTRIB_TEX0:
+                    case FRAG_ATTRIB_TEX1:
+                    case FRAG_ATTRIB_TEX2:
+                    case FRAG_ATTRIB_TEX3:
+                    case FRAG_ATTRIB_TEX4:
+                    case FRAG_ATTRIB_TEX5:
+                    case FRAG_ATTRIB_TEX6:
+                    case FRAG_ATTRIB_TEX7:
+                        bValidTexCoord = GL_TRUE;
+                        pAsm->S[0].src.reg   =
+                            pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
+                        pAsm->S[0].src.rtype = SRC_REG_INPUT;
+                        break;
+                    case FRAG_ATTRIB_FACE:
+                        fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
+                        break;
+                    case FRAG_ATTRIB_PNTC:
+                        fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
+                        break;
+                }
+
+                if( (pILInst->SrcReg[0].Index >= FRAG_ATTRIB_VAR0) ||
+                    (pILInst->SrcReg[0].Index < FRAG_ATTRIB_MAX) )
+                {
+                                   bValidTexCoord = GL_TRUE;
                     pAsm->S[0].src.reg   =
                         pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
                     pAsm->S[0].src.rtype = SRC_REG_INPUT;
-                    break;
-                case FRAG_ATTRIB_FACE:
-                    fprintf(stderr, "FRAG_ATTRIB_FACE unsupported\n");
-                    break;
-                case FRAG_ATTRIB_PNTC:
-                    fprintf(stderr, "FRAG_ATTRIB_PNTC unsupported\n");
-                    break;
-            }
-
-            if( (pILInst->SrcReg[0].Index >= FRAG_ATTRIB_VAR0) ||
-                (pILInst->SrcReg[0].Index < FRAG_ATTRIB_MAX) )
-            {
-                               bValidTexCoord = GL_TRUE;
-                pAsm->S[0].src.reg   =
-                    pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
-                pAsm->S[0].src.rtype = SRC_REG_INPUT;
+                }
             }
 
-        break;
+            break;
         }
     }
 
@@ -1445,8 +1538,17 @@ GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalize
     tex_instruction_ptr->m_Word0.f.tex_inst         = pAsm->D.dst.opcode;
     tex_instruction_ptr->m_Word0.f.bc_frac_mode     = 0x0;
     tex_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+    tex_instruction_ptr->m_Word0.f.alt_const        = 0;
 
-    tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+    if(SPT_VP == pAsm->currentShaderType)
+    {
+        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg + VERT_ATTRIB_MAX;
+        pAsm->unVetTexBits |= 1 << texture_unit_source->reg;
+    }
+    else
+    {
+        tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+    }
 
     tex_instruction_ptr->m_Word1.f.lod_bias     = 0x0;
     if (normalized) {
@@ -1465,7 +1567,6 @@ GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalize
     tex_instruction_ptr->m_Word2.f.offset_x   = 0x0;
     tex_instruction_ptr->m_Word2.f.offset_y   = 0x0;
     tex_instruction_ptr->m_Word2.f.offset_z   = 0x0;
-
     tex_instruction_ptr->m_Word2.f.sampler_id = texture_unit_source->reg;
 
     // dst
@@ -1724,7 +1825,7 @@ GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
     }
     else 
     {
-        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count++;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count += (GetInstructionSize(alu_instruction_ptr->m_ShaderInstType) / 2);
     }
 
     // If this clause constains any instruction that is forward dependent on a TEX instruction, 
@@ -2001,7 +2102,7 @@ GLboolean check_scalar(r700_AssemblerBase* pAsm,
 
     GLuint swizzle_key;
 
-    GLuint number_of_operands = r700GetNumOperands(pAsm);
+    GLuint number_of_operands = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
 
     for (src=0; src<number_of_operands; src++) 
     {
@@ -2090,7 +2191,7 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
 
     GLuint swizzle_key;
 
-    GLuint number_of_operands = r700GetNumOperands(pAsm);
+    GLuint number_of_operands = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
 
     for (src=0; src<number_of_operands; src++) 
     {
@@ -2159,6 +2260,10 @@ GLboolean check_vector(r700_AssemblerBase* pAsm,
 
 GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 {
+    R700ALUInstruction            * alu_instruction_ptr;
+    R700ALUInstructionHalfLiteral * alu_instruction_ptr_hl;
+    R700ALUInstructionFullLiteral * alu_instruction_ptr_fl;
+
     GLuint    number_of_scalar_operations;
     GLboolean is_single_scalar_operation;
     GLuint    scalar_channel_index;
@@ -2167,7 +2272,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
     int    current_source_index;
     GLuint contiguous_slots_needed;
 
-    GLuint    uNumSrc = r700GetNumOperands(pAsm);
+    GLuint    uNumSrc = r700GetNumOperands(pAsm->D.dst.opcode, pAsm->D.dst.op3);
     //GLuint    channel_swizzle, j;
     //GLuint    chan_counter[4] = {0, 0, 0, 0};
     //PVSSRC *  pSource[3];
@@ -2224,23 +2329,44 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 
     contiguous_slots_needed = 0;
 
-    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) 
+    if(!is_single_scalar_operation
     {
         contiguous_slots_needed = 4;
     }
 
+    contiguous_slots_needed += pAsm->D2.dst2.literal_slots;
+
     initialize(pAsm);    
 
     for (scalar_channel_index=0;
             scalar_channel_index < number_of_scalar_operations; 
                 scalar_channel_index++) 
     {
-        R700ALUInstruction* alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
-        if (alu_instruction_ptr == NULL) 
-               {
-                       return GL_FALSE;
-               }
-        Init_R700ALUInstruction(alu_instruction_ptr);
+        if(scalar_channel_index == (number_of_scalar_operations-1))
+        {
+            switch(pAsm->D2.dst2.literal_slots)
+            {
+            case 0:
+                alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+                Init_R700ALUInstruction(alu_instruction_ptr);
+                break;
+            case 1:
+                alu_instruction_ptr_hl = (R700ALUInstructionHalfLiteral*) CALLOC_STRUCT(R700ALUInstructionHalfLiteral);
+                Init_R700ALUInstructionHalfLiteral(alu_instruction_ptr_hl, pAsm->C[0].f, pAsm->C[1].f);
+                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_hl;
+                break;
+            case 2:
+                alu_instruction_ptr_fl = (R700ALUInstructionFullLiteral*) CALLOC_STRUCT(R700ALUInstructionFullLiteral);
+                Init_R700ALUInstructionFullLiteral(alu_instruction_ptr_fl,pAsm->C[0].f, pAsm->C[1].f, pAsm->C[2].f, pAsm->C[3].f);
+                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_fl;
+            break;
+            };
+        }
+        else
+        {
+            alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+            Init_R700ALUInstruction(alu_instruction_ptr);
+        }
         
         //src 0
         current_source_index = 0;
@@ -2270,7 +2396,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
         }
 
         //other bits
-        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_AR_X;
+        alu_instruction_ptr->m_Word0.f.index_mode = pAsm->D2.dst2.index_mode;
 
         if(   (is_single_scalar_operation == GL_TRUE) 
            || (GL_TRUE == bSplitInst) )
@@ -2282,9 +2408,17 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
         }
 
-        alu_instruction_ptr->m_Word0.f.pred_sel                = 0x0;
-        alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;  
-        alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+        alu_instruction_ptr->m_Word0.f.pred_sel = (pAsm->D.dst.pred_inv > 0) ? 1 : 0;
+        if(1 == pAsm->D.dst.predicated)
+        {
+            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x1;
+            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x1;
+        }
+        else
+        {
+            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
+            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+        }
 
         // dst
         if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
@@ -2323,7 +2457,7 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
 
         alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
 
-        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->pILInst[pAsm->uiCurInst].SaturateMode;
+        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->D2.dst2.SaturateMode;
 
         if (pAsm->D.dst.op3) 
         {            
@@ -2350,8 +2484,8 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             {
                 alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;
 
-                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = pAsm->S[0].src.abs;
+                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = pAsm->S[1].src.abs;
 
                 //alu_instruction_ptr->m_Word1_OP2.f6.update_execute_mask = 0x0;
                 //alu_instruction_ptr->m_Word1_OP2.f6.update_pred         = 0x0;
@@ -2379,8 +2513,8 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             {
                 alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;
 
-                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = pAsm->S[0].src.abs;
+                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = pAsm->S[1].src.abs;
 
                 //alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
                 //alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
@@ -2430,686 +2564,125 @@ GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
             }
         }
 
-        contiguous_slots_needed = 0;
+        contiguous_slots_needed -= 1;
     }
 
     return GL_TRUE;
 }
 
-GLboolean assemble_alu_instruction2(r700_AssemblerBase *pAsm)
+GLboolean next_ins(r700_AssemblerBase *pAsm)
 {
-    GLuint    number_of_scalar_operations;
-    GLboolean is_single_scalar_operation;
-    GLuint    scalar_channel_index;
-
-    PVSSRC * pcurrent_source;
-    int    current_source_index;
-    GLuint contiguous_slots_needed;
-
-    GLuint    uNumSrc = r700GetNumOperands(pAsm);
-    
-    GLboolean bSplitInst = GL_FALSE;
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
 
-    if (1 == pAsm->D.dst.math) 
+    if( GL_TRUE == pAsm->is_tex )
     {
-        is_single_scalar_operation = GL_TRUE;
-        number_of_scalar_operations = 1;
+           if (pILInst->TexSrcTarget == TEXTURE_RECT_INDEX) {
+                   if( GL_FALSE == assemble_tex_instruction(pAsm, GL_FALSE) ) 
+                   {
+                           radeon_error("Error assembling TEX instruction\n");
+                           return GL_FALSE;
+                   }
+           } else {
+                   if( GL_FALSE == assemble_tex_instruction(pAsm, GL_TRUE) ) 
+                   {
+                           radeon_error("Error assembling TEX instruction\n");
+                           return GL_FALSE;
+                   }
+           }
     }
     else 
+    {   //ALU      
+        if( GL_FALSE == assemble_alu_instruction(pAsm) ) 
+        {
+            radeon_error("Error assembling ALU instruction\n");
+            return GL_FALSE;
+        }
+    } 
+      
+    if(pAsm->D.dst.rtype == DST_REG_OUT) 
     {
-        is_single_scalar_operation = GL_FALSE;
-        number_of_scalar_operations = 4;
+        if(pAsm->D.dst.op3) 
+        {        
+            // There is no mask for OP3 instructions, so all channels are written        
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
+        }
+        else 
+        {
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] 
+               |= (unsigned char)pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask;
+        }
     }
+    
+    //reset for next inst.
+    pAsm->D.bits    = 0;
+    pAsm->D2.bits   = 0;
+    pAsm->S[0].bits = 0;
+    pAsm->S[1].bits = 0;
+    pAsm->S[2].bits = 0;
+    pAsm->is_tex = GL_FALSE;
+    pAsm->need_tex_barrier = GL_FALSE;
+    pAsm->D2.bits = 0;
+    pAsm->C[0].bits = pAsm->C[1].bits = pAsm->C[2].bits = pAsm->C[3].bits = 0;
+    return GL_TRUE;
+}
 
-    contiguous_slots_needed = 0;
+GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode)
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // opcode  tmp.x,    a.x
+    // MOV     dst,      tmp.x
+
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math = 1;
 
-    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) ) 
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
     {
-        contiguous_slots_needed = 4;
+        return GL_FALSE;
     }
 
-    initialize(pAsm);    
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
 
-    for (scalar_channel_index=0;
-            scalar_channel_index < number_of_scalar_operations; 
-                scalar_channel_index++) 
+    // Now replicate result to all necessary channels in destination
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
     {
-        R700ALUInstruction* alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
-        if (alu_instruction_ptr == NULL) 
-        {
-            return GL_FALSE;
-        }
-        Init_R700ALUInstruction(alu_instruction_ptr);
-        
-        //src 0
-        current_source_index = 0;
-        pcurrent_source = &(pAsm->S[0].src);
+        return GL_FALSE;
+    }
 
-        if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                         current_source_index,
-                                         pcurrent_source, 
-                                         scalar_channel_index) )     
-        {
-            return GL_FALSE;
-        }
-   
-        if (uNumSrc > 1) 
-        {            
-            // Process source 1            
-            current_source_index = 1;
-            pcurrent_source = &(pAsm->S[current_source_index].src);
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
 
-            if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                             current_source_index,
-                                             pcurrent_source, 
-                                             scalar_channel_index) ) 
-            {
-                return GL_FALSE;
-            }
-        }
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
 
-        //other bits
-        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_LOOP;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
 
-        if(   (is_single_scalar_operation == GL_TRUE) 
-           || (GL_TRUE == bSplitInst) )
-        {
-            alu_instruction_ptr->m_Word0.f.last = 1;
-        }
-        else 
-        {
-            alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
-        }
+    return GL_TRUE;
+}
 
-        alu_instruction_ptr->m_Word0.f.pred_sel = (pAsm->D.dst.pred_inv > 0) ? 1 : 0;
-        if(1 == pAsm->D.dst.predicated)
-        {
-            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x1;  
-            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x1; 
-        }
-        else
-        {
-            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;  
-            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0; 
-        }
-       
-        // dst
-        if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
-            (pAsm->D.dst.rtype == DST_REG_OUT) ) 
-        {
-            alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
-        }
-        else 
-        {
-            radeon_error("Only temp destination registers supported for ALU dest regs.\n");
-            return GL_FALSE;
-        }
+GLboolean assemble_ABS(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
 
-        alu_instruction_ptr->m_Word1.f.dst_rel  = SQ_ABSOLUTE;  //D.rtype
-
-        if ( is_single_scalar_operation == GL_TRUE ) 
-        {
-            // Override scalar_channel_index since only one scalar value will be written
-            if(pAsm->D.dst.writex) 
-            {
-                scalar_channel_index = 0;
-            }
-            else if(pAsm->D.dst.writey) 
-            {
-                scalar_channel_index = 1;
-            }
-            else if(pAsm->D.dst.writez) 
-            {
-                scalar_channel_index = 2;
-            }
-            else if(pAsm->D.dst.writew) 
-            {
-                scalar_channel_index = 3;
-            }
-        }
-
-        alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
-
-        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->D2.dst2.SaturateMode;
-
-        if (pAsm->D.dst.op3) 
-        {            
-            //op3
-
-            alu_instruction_ptr->m_Word1_OP3.f.alu_inst = pAsm->D.dst.opcode;
-
-            //There's 3rd src for op3
-            current_source_index = 2;
-            pcurrent_source = &(pAsm->S[current_source_index].src);
-
-            if ( GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                              current_source_index,
-                                              pcurrent_source, 
-                                              scalar_channel_index) ) 
-            {
-                return GL_FALSE;
-            }
-        }
-        else 
-        {
-            //op2
-            if (pAsm->bR6xx)
-            {
-                alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;
-
-                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = 0x0;
-
-                //alu_instruction_ptr->m_Word1_OP2.f6.update_execute_mask = 0x0;
-                //alu_instruction_ptr->m_Word1_OP2.f6.update_pred         = 0x0;
-                switch (scalar_channel_index) 
-                {
-                    case 0: 
-                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writex; 
-                        break;
-                    case 1: 
-                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writey; 
-                        break;
-                    case 2: 
-                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writez; 
-                        break;
-                    case 3: 
-                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writew; 
-                        break;
-                    default: 
-                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = 1; //SQ_SEL_MASK;
-                        break;
-                }            
-                alu_instruction_ptr->m_Word1_OP2.f6.omod               = SQ_ALU_OMOD_OFF;
-            }
-            else
-            {
-                alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;
-
-                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = 0x0;
-
-                //alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
-                //alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
-                switch (scalar_channel_index) 
-                {
-                    case 0: 
-                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writex; 
-                        break;
-                    case 1: 
-                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writey; 
-                        break;
-                    case 2: 
-                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writez; 
-                        break;
-                    case 3: 
-                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writew; 
-                        break;
-                    default: 
-                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = 1; //SQ_SEL_MASK;
-                        break;
-                }            
-                alu_instruction_ptr->m_Word1_OP2.f.omod               = SQ_ALU_OMOD_OFF;
-            }
-        }
-
-        if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
-        {
-            return GL_FALSE;
-        }
-
-        /*
-         * Judge the type of current instruction, is it vector or scalar 
-         * instruction.
-         */        
-        if (is_single_scalar_operation) 
-        {
-            if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
-            {
-                return GL_FALSE;
-            }
-        }
-        else 
-        {
-            if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
-            {
-                return GL_FALSE; 
-            }
-        }
-
-        contiguous_slots_needed = 0;
-    }
-
-    return GL_TRUE;
-}
-
-GLboolean assemble_alu_instruction_literal(r700_AssemblerBase *pAsm, GLfloat * pLiteral)
-{
-    R700ALUInstruction            * alu_instruction_ptr;
-    R700ALUInstructionHalfLiteral * alu_instruction_ptr_hl;
-    R700ALUInstructionFullLiteral * alu_instruction_ptr_fl;
-
-    GLuint    number_of_scalar_operations;
-    GLboolean is_single_scalar_operation;
-    GLuint    scalar_channel_index;
-
-    GLuint   contiguous_slots_needed;
-    GLuint   lastInstruction;
-    GLuint   not_masked[4];
-
-    GLuint    uNumSrc = r700GetNumOperands(pAsm);
-    
-    GLboolean bSplitInst = GL_FALSE;
-
-    number_of_scalar_operations = 0;
-    contiguous_slots_needed     = 0;
-
-    if(1 == pAsm->D.dst.writew)
-    {
-        lastInstruction = 3;
-        number_of_scalar_operations++;
-        not_masked[3] = 1;
-    }
-    else
-    {
-        not_masked[3] = 0;
-    }
-    if(1 == pAsm->D.dst.writez)
-    {
-        lastInstruction = 2;
-        number_of_scalar_operations++;
-        not_masked[2] = 1;
-    }
-    else
-    {
-        not_masked[2] = 0;
-    }
-    if(1 == pAsm->D.dst.writey)
-    {
-        lastInstruction = 1;
-        number_of_scalar_operations++;
-        not_masked[1] = 1;
-    }
-    else
-    {
-        not_masked[1] = 0;
-    }
-    if(1 == pAsm->D.dst.writex)
-    {
-        lastInstruction = 0;
-        number_of_scalar_operations++;
-        not_masked[0] = 1;
-    }
-    else
-    {
-        not_masked[0] = 0;
-    }
-    
-    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) ) 
-    {
-        contiguous_slots_needed = 4;
-    }
-    else
-    {
-        contiguous_slots_needed = number_of_scalar_operations;
-    }
-
-    if(1 == pAsm->D2.dst2.literal)
-    {
-        contiguous_slots_needed += 1;
-    }
-    else if(2 == pAsm->D2.dst2.literal)
-    {
-        contiguous_slots_needed += 2;
-    }
-
-    initialize(pAsm);    
-
-    for (scalar_channel_index=0; scalar_channel_index < 4; scalar_channel_index++) 
-    {
-        if(0 == not_masked[scalar_channel_index])
-        {
-            continue;
-        }
-
-        if(scalar_channel_index == lastInstruction)
-        {
-            switch (pAsm->D2.dst2.literal)
-            {
-            case 0:
-                alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
-                if (alu_instruction_ptr == NULL) 
-                       {
-                               return GL_FALSE;
-                       }
-                Init_R700ALUInstruction(alu_instruction_ptr);
-                break;
-            case 1:
-                alu_instruction_ptr_hl = (R700ALUInstructionHalfLiteral*) CALLOC_STRUCT(R700ALUInstructionHalfLiteral);
-                if (alu_instruction_ptr_hl == NULL) 
-                       {
-                               return GL_FALSE;
-                       }
-                Init_R700ALUInstructionHalfLiteral(alu_instruction_ptr_hl, pLiteral[0], pLiteral[1]);
-                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_hl;
-                break;
-            case 2:
-                alu_instruction_ptr_fl = (R700ALUInstructionFullLiteral*) CALLOC_STRUCT(R700ALUInstructionFullLiteral);
-                if (alu_instruction_ptr_fl == NULL) 
-                       {
-                               return GL_FALSE;
-                       }
-                Init_R700ALUInstructionFullLiteral(alu_instruction_ptr_fl, pLiteral[0], pLiteral[1], pLiteral[2], pLiteral[3]);
-                alu_instruction_ptr = (R700ALUInstruction*)alu_instruction_ptr_fl;
-                break;
-            default:
-                break;
-            };
-        }
-        else
-        {
-            alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
-            if (alu_instruction_ptr == NULL) 
-                   {
-                           return GL_FALSE;
-                   }
-            Init_R700ALUInstruction(alu_instruction_ptr);
-        }
-
-        //src 0
-        if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                         0,
-                                         &(pAsm->S[0].src), 
-                                         scalar_channel_index) )     
-        {
-            return GL_FALSE;
-        }
-   
-        if (uNumSrc > 1) 
-        {            
-            // Process source 1            
-            if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                             1,
-                                             &(pAsm->S[1].src), 
-                                             scalar_channel_index) ) 
-            {
-                return GL_FALSE;
-            }
-        }
-
-        //other bits
-        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_LOOP;
-
-        if(scalar_channel_index == lastInstruction)
-        {
-            alu_instruction_ptr->m_Word0.f.last = 1;
-        }
-
-        alu_instruction_ptr->m_Word0.f.pred_sel = 0x0;
-        if(1 == pAsm->D.dst.predicated)
-        {            
-            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x1;  
-            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x1; 
-        }
-        else
-        {
-            alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0;  
-            alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0; 
-        }
-
-        // dst
-        if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
-            (pAsm->D.dst.rtype == DST_REG_OUT) ) 
-        {
-            alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
-        }
-        else 
-        {
-            radeon_error("Only temp destination registers supported for ALU dest regs.\n");
-            return GL_FALSE;
-        }
-
-        alu_instruction_ptr->m_Word1.f.dst_rel  = SQ_ABSOLUTE;  //D.rtype
-
-        alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
-
-        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->D2.dst2.SaturateMode;
-
-        if (pAsm->D.dst.op3) 
-        {            
-            //op3
-            alu_instruction_ptr->m_Word1_OP3.f.alu_inst = pAsm->D.dst.opcode;
-
-            //There's 3rd src for op3
-            if ( GL_FALSE == assemble_alu_src(alu_instruction_ptr,
-                                              2,
-                                              &(pAsm->S[2].src), 
-                                              scalar_channel_index) ) 
-            {
-                return GL_FALSE;
-            }
-        }
-        else 
-        {
-            //op2
-            if (pAsm->bR6xx)
-            {
-                alu_instruction_ptr->m_Word1_OP2.f6.alu_inst   = pAsm->D.dst.opcode;
-                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs   = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs   = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f6.write_mask = 1;           
-                alu_instruction_ptr->m_Word1_OP2.f6.omod       = SQ_ALU_OMOD_OFF;
-            }
-            else
-            {
-                alu_instruction_ptr->m_Word1_OP2.f.alu_inst    = pAsm->D.dst.opcode;
-                alu_instruction_ptr->m_Word1_OP2.f.src0_abs    = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f.src1_abs    = 0x0;
-                alu_instruction_ptr->m_Word1_OP2.f.write_mask  = 1;                        
-                alu_instruction_ptr->m_Word1_OP2.f.omod        = SQ_ALU_OMOD_OFF;
-            }
-        }
-
-        if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
-        {
-            return GL_FALSE;
-        }
-  
-        if (1 == number_of_scalar_operations) 
-        {
-            if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
-            {
-                return GL_FALSE;
-            }
-        }
-        else 
-        {
-            if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
-            {
-                return GL_FALSE;
-            }
-        }
-
-        contiguous_slots_needed -= 2;
-    }
-
-    return GL_TRUE;
-}
-
-GLboolean next_ins(r700_AssemblerBase *pAsm)
-{
-    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
-
-    if( GL_TRUE == pAsm->is_tex )
-    {
-           if (pILInst->TexSrcTarget == TEXTURE_RECT_INDEX) {
-                   if( GL_FALSE == assemble_tex_instruction(pAsm, GL_FALSE) ) 
-                   {
-                           radeon_error("Error assembling TEX instruction\n");
-                           return GL_FALSE;
-                   }
-           } else {
-                   if( GL_FALSE == assemble_tex_instruction(pAsm, GL_TRUE) ) 
-                   {
-                           radeon_error("Error assembling TEX instruction\n");
-                           return GL_FALSE;
-                   }
-           }
-    }
-    else 
-    {   //ALU      
-        if( GL_FALSE == assemble_alu_instruction(pAsm) ) 
-        {
-            radeon_error("Error assembling ALU instruction\n");
-            return GL_FALSE;
-        }
-    } 
-      
-    if(pAsm->D.dst.rtype == DST_REG_OUT) 
-    {
-        if(pAsm->D.dst.op3) 
-        {        
-            // There is no mask for OP3 instructions, so all channels are written        
-            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
-        }
-        else 
-        {
-            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] 
-               |= (unsigned char)pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask;
-        }
-    }
-    
-    //reset for next inst.
-    pAsm->D.bits    = 0;
-    pAsm->D2.bits   = 0;
-    pAsm->S[0].bits = 0;
-    pAsm->S[1].bits = 0;
-    pAsm->S[2].bits = 0;
-    pAsm->is_tex = GL_FALSE;
-    pAsm->need_tex_barrier = GL_FALSE;
-
-    return GL_TRUE;
-}
-
-GLboolean next_ins2(r700_AssemblerBase *pAsm)
-{
-    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
-
-    //ALU      
-    if( GL_FALSE == assemble_alu_instruction2(pAsm) ) 
-    {
-        radeon_error("Error assembling ALU instruction\n");
-        return GL_FALSE;
-    }
-     
-    if(pAsm->D.dst.rtype == DST_REG_OUT) 
-    {
-        if(pAsm->D.dst.op3) 
-        {        
-            // There is no mask for OP3 instructions, so all channels are written        
-            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
-        }
-        else 
-        {
-            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] 
-               |= (unsigned char)pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask;
-        }
-    }
-    
-    //reset for next inst.
-    pAsm->D.bits    = 0;
-    pAsm->D2.bits   = 0;
-    pAsm->S[0].bits = 0;
-    pAsm->S[1].bits = 0;
-    pAsm->S[2].bits = 0;
-    pAsm->is_tex = GL_FALSE;
-    pAsm->need_tex_barrier = GL_FALSE;
-
-    pAsm->D2.bits = 0;
-
-    return GL_TRUE;
-}
-
-/* not work yet */
-GLboolean next_ins_literal(r700_AssemblerBase *pAsm, GLfloat * pLiteral)
-{
-    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
-
-    //ALU      
-    if( GL_FALSE == assemble_alu_instruction_literal(pAsm, pLiteral) ) 
-    {
-        radeon_error("Error assembling ALU instruction\n");
-        return GL_FALSE;
-    }
-    
-    //reset for next inst.
-    pAsm->D.bits    = 0;
-    pAsm->D2.bits   = 0;
-    pAsm->S[0].bits = 0;
-    pAsm->S[1].bits = 0;
-    pAsm->S[2].bits = 0;
-    pAsm->is_tex = GL_FALSE;
-    pAsm->need_tex_barrier = GL_FALSE;
-    return GL_TRUE;
-}
-
-GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode)
-{
-    BITS tmp;
-
-    checkop1(pAsm);
-
-    tmp = gethelpr(pAsm);
-
-    // opcode  tmp.x,    a.x
-    // MOV     dst,      tmp.x
-
-    pAsm->D.dst.opcode = opcode;
-    pAsm->D.dst.math = 1;
-
-    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
-    pAsm->D.dst.reg    = tmp;
-    pAsm->D.dst.writex = 1;
-
-    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
-    {
-        return GL_FALSE;
-    }
-
-    if ( GL_FALSE == next_ins(pAsm) ) 
-    {
-        return GL_FALSE;
-    }
-
-    // Now replicate result to all necessary channels in destination
-    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
-
-    if( GL_FALSE == assemble_dst(pAsm) )
-    {
-        return GL_FALSE;
-    }
-
-    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
-    pAsm->S[0].src.reg   = tmp;
-
-    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
-    noneg_PVSSRC(&(pAsm->S[0].src));
-
-    if( GL_FALSE == next_ins(pAsm) )
-    {
-        return GL_FALSE;
-    }
-
-    return GL_TRUE;
-}
-
-GLboolean assemble_ABS(r700_AssemblerBase *pAsm)
-{
-    checkop1(pAsm);
-
-    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;  
+    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;  
 
     if( GL_FALSE == assemble_dst(pAsm) )
     {
@@ -3273,18 +2846,53 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
         noneg_PVSSRC(&(pAsm->S[0].src));
         noswizzle_PVSSRC(&(pAsm->S[0].src));
 
-        if( GL_FALSE == next_ins(pAsm) )
-        {
-            return GL_FALSE;
-        }
-    }
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
+{
+    int tmp;
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    assemble_src(pAsm, 0, -1);
+
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    pAsm->D2.dst2.literal_slots = 1;
+    pAsm->C[0].f = 1/(3.1415926535 * 2);
+    pAsm->C[1].f = 0.0F;
+    next_ins(pAsm);
+
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math = 1;
+
+    assemble_dst(pAsm);
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    next_ins(pAsm);
 
+    //TODO - replicate if more channels set in WriteMask
     return GL_TRUE;
-}
 
-GLboolean assemble_COS(r700_AssemblerBase *pAsm)
-{
-    return assemble_math_function(pAsm, SQ_OP2_INST_COS);
 }
  
 GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
@@ -4400,77 +4008,67 @@ GLboolean assemble_RSQ(r700_AssemblerBase *pAsm)
     return assemble_math_function(pAsm, SQ_OP2_INST_RECIPSQRT_IEEE);
 }
  
-GLboolean assemble_SIN(r700_AssemblerBase *pAsm) 
-{
-    return assemble_math_function(pAsm, SQ_OP2_INST_SIN);
-}
 GLboolean assemble_SCS(r700_AssemblerBase *pAsm) 
 {
     BITS tmp;
 
-       checkop1(pAsm);
-
-       tmp = gethelpr(pAsm);
-
-       // COS tmp.x,    a.x
-       pAsm->D.dst.opcode = SQ_OP2_INST_COS;
-       pAsm->D.dst.math = 1;
+    checkop1(pAsm);
 
-       setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-       pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-       pAsm->D.dst.reg = tmp;
-       pAsm->D.dst.writex = 1;
+    tmp = gethelpr(pAsm);
+    /* tmp.x = src /2*PI */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
 
-       if( GL_FALSE == assemble_src(pAsm, 0, -1) )
-       {
-               return GL_FALSE;
-       }
+    assemble_src(pAsm, 0, -1);
 
-       if ( GL_FALSE == next_ins(pAsm) )
-       {
-               return GL_FALSE;
-       }
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    pAsm->D2.dst2.literal_slots = 1;
+    pAsm->C[0].f = 1/(3.1415926535 * 2);
+    pAsm->C[1].f = 0.0F;
 
-       // SIN tmp.y,    a.x
-       pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
-       pAsm->D.dst.math = 1;
+    next_ins(pAsm);
 
-       setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-       pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-       pAsm->D.dst.reg = tmp;
-       pAsm->D.dst.writey = 1;
+    // COS dst.x,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_COS;
+    pAsm->D.dst.math = 1;
 
-       if( GL_FALSE == assemble_src(pAsm, 0, -1) )
-       {
-               return GL_FALSE;
-       }
+    assemble_dst(pAsm);
+    /* mask y */
+    pAsm->D.dst.writey = 0;
 
-       if( GL_FALSE == next_ins(pAsm) )
-       {
-               return GL_FALSE;
-       }
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
 
-       // MOV dst.mask,     tmp
-       pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
 
-       if( GL_FALSE == assemble_dst(pAsm) )
-       {
-               return GL_FALSE;
-       }
+    // SIN dst.y,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
+    pAsm->D.dst.math = 1;
 
-       setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-       pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
-       pAsm->S[0].src.reg = tmp;
+    assemble_dst(pAsm);
+    /* mask x */
+    pAsm->D.dst.writex = 0;
 
-       noswizzle_PVSSRC(&(pAsm->S[0].src));
-       pAsm->S[0].src.swizzlez = SQ_SEL_0;
-       pAsm->S[0].src.swizzlew = SQ_SEL_0;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
 
-       if ( GL_FALSE == next_ins(pAsm) )
-       {
-               return GL_FALSE;
-       }
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
 
     return GL_TRUE;
 }
@@ -4537,7 +4135,7 @@ GLboolean assemble_LOGIC_PRED(r700_AssemblerBase *pAsm, BITS opcode)
     pAsm->S[1].src.swizzlez = SQ_SEL_0;
     pAsm->S[1].src.swizzlew = SQ_SEL_0;
 
-    if( GL_FALSE == next_ins2(pAsm) ) 
+    if( GL_FALSE == next_ins(pAsm) ) 
     {
            return GL_FALSE;
     }
@@ -4721,24 +4319,6 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
             return GL_FALSE;
         }
  
-        /* tmp1.z = ABS(tmp1.z) dont have abs support in assembler currently
-         * have to do explicit instruction
-         */
-        pAsm->D.dst.opcode = SQ_OP2_INST_MAX;
-        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-        pAsm->D.dst.reg   = tmp1;
-        pAsm->D.dst.writez = 1;
-
-        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[0].src.reg = tmp1;
-       noswizzle_PVSSRC(&(pAsm->S[0].src));
-        pAsm->S[1].bits = pAsm->S[0].bits;
-        flipneg_PVSSRC(&(pAsm->S[1].src));
-        
-        next_ins(pAsm);
-
         /* tmp1.z = RCP_e(|tmp1.z|) */
         pAsm->D.dst.opcode = SQ_OP2_INST_RECIP_IEEE;
         pAsm->D.dst.math = 1;
@@ -4751,13 +4331,13 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
         pAsm->S[0].src.reg = tmp1;
         pAsm->S[0].src.swizzlex = SQ_SEL_Z;
+        pAsm->S[0].src.abs = 1;
 
         next_ins(pAsm);
 
         /* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
          * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
          * muladd has no writemask, have to use another temp 
-         * also no support for imm constants, so add 1 here
          */
         pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
         pAsm->D.dst.op3    = 1;
@@ -4774,30 +4354,12 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[1].src.reg   = tmp1;
         setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z);
         setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
-        pAsm->S[2].src.rtype = SRC_REG_TEMPORARY;
+        /* immediate c 1.5 */
+        pAsm->D2.dst2.literal_slots = 1;
+        pAsm->C[0].f = 1.5F;
+        pAsm->S[2].src.rtype = SRC_REC_LITERAL;
         pAsm->S[2].src.reg   = tmp1;
-        setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_1);
-
-        next_ins(pAsm);
-
-        /* ADD the remaining .5 */
-        pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
-        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
-        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
-        pAsm->D.dst.reg   = tmp2;
-        pAsm->D.dst.writex = 1;
-        pAsm->D.dst.writey = 1;
-        pAsm->D.dst.writez = 0;
-        pAsm->D.dst.writew = 0;
-
-        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
-        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[0].src.reg   = tmp2;
-        noswizzle_PVSSRC(&(pAsm->S[0].src));
-        setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
-        pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
-        pAsm->S[1].src.reg   = 252; // SQ_ALU_SRC_0_5 
-        noswizzle_PVSSRC(&(pAsm->S[1].src));
+        setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_X);
 
         next_ins(pAsm);
 
@@ -4822,13 +4384,23 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
 
     }
 
-    if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_TXB)
-    {
-        pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_L;
-    }
-    else
+    switch(pAsm->pILInst[pAsm->uiCurInst].Opcode)
     {
-        pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+        case OPCODE_DDX:
+            /* will these need WQM(1) on CF inst ? */
+            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_H;
+            break;
+        case OPCODE_DDY:
+            pAsm->D.dst.opcode = SQ_TEX_INST_GET_GRADIENTS_V;
+            break;
+        case OPCODE_TXB:
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_L;
+            break;
+        default:
+            if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE_C;
+            else
+                pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
     }
 
     pAsm->is_tex = GL_TRUE;
@@ -4874,11 +4446,46 @@ GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
         pAsm->S[0].src.swizzlew = SQ_SEL_Y;
     }
  
+    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+    {
+        /* compare value goes to w chan ? */
+        pAsm->S[0].src.swizzlew = SQ_SEL_Z;
+    }
+
     if ( GL_FALSE == next_ins(pAsm) )
         {
             return GL_FALSE;
         }
 
+    /* add ARB shadow ambient but clamp to 0..1 */
+    if(pAsm->pILInst[pAsm->uiCurInst].TexShadow == 1)
+    {
+       /* ADD_SAT dst,  dst,  ambient[texunit] */
+       pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+       if( GL_FALSE == assemble_dst(pAsm) )
+       {
+           return GL_FALSE;
+       }
+       pAsm->D2.dst2.SaturateMode = 1;
+
+       pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+       pAsm->S[0].src.reg = pAsm->D.dst.reg;
+       noswizzle_PVSSRC(&(pAsm->S[0].src));
+       noneg_PVSSRC(&(pAsm->S[0].src));
+
+       pAsm->S[1].src.rtype = SRC_REG_CONSTANT;
+       pAsm->S[1].src.reg = pAsm->shadow_regs[pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit];
+       noswizzle_PVSSRC(&(pAsm->S[1].src));
+       noneg_PVSSRC(&(pAsm->S[1].src));
+
+       if( GL_FALSE == next_ins(pAsm) )
+       {
+           return GL_FALSE;
+       }
+
+    }
+
     return GL_TRUE;
 }
 
@@ -5483,22 +5090,22 @@ void add_return_inst(r700_AssemblerBase *pAsm)
 {
     if(GL_FALSE == add_cf_instruction(pAsm) )
     {
-        return GL_FALSE;
+        return;
     }
     //pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 1;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.pop_count        = 0;
-    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_const         = 0x0;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
 
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.end_of_program   = 0x0;
-    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0; 
+    pAsm->cf_current_cf_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_RETURN;
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
 
     pAsm->cf_current_cf_clause_ptr->m_Word1.f.barrier          = 0x1;
 }
 
-GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex)
+GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex, GLuint uiIL_Shift)
 {
     /* Put in sub */
     if( (pAsm->unSubArrayPointer + 1) > pAsm->unSubArraySize )
@@ -5513,7 +5120,7 @@ GLboolean assemble_BGNSUB(r700_AssemblerBase *pAsm, GLint nILindex)
         pAsm->unSubArraySize += 10;
     }
 
-    pAsm->subs[pAsm->unSubArrayPointer].subIL_Offset = nILindex;
+    pAsm->subs[pAsm->unSubArrayPointer].subIL_Offset = nILindex + uiIL_Shift;
     pAsm->subs[pAsm->unSubArrayPointer].lstCFInstructions_local.pHead=NULL;  
     pAsm->subs[pAsm->unSubArrayPointer].lstCFInstructions_local.pTail=NULL;  
     pAsm->subs[pAsm->unSubArrayPointer].lstCFInstructions_local.uNumOfNode=0;
@@ -5604,9 +5211,13 @@ GLboolean assemble_RET(r700_AssemblerBase *pAsm)
 
 GLboolean assemble_CAL(r700_AssemblerBase *pAsm, 
                        GLint nILindex,
+                       GLuint uiIL_Shift,
                        GLuint uiNumberInsts,
-                       struct prog_instruction *pILInst)
+                       struct prog_instruction *pILInst,
+                       PRESUB_DESC * pPresubDesc)
 {
+    GLint uiIL_Offset;
+
     pAsm->alu_x_opcode = SQ_CF_INST_ALU;
 
     if(GL_FALSE == add_cf_instruction(pAsm) )
@@ -5639,8 +5250,12 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
         pAsm->unCallerArraySize += 10;
     }
     
-    pAsm->callers[pAsm->unCallerArrayPointer].subIL_Offset = nILindex;
-    pAsm->callers[pAsm->unCallerArrayPointer].cf_ptr       = pAsm->cf_current_cf_clause_ptr; 
+    uiIL_Offset = nILindex + uiIL_Shift;
+    pAsm->callers[pAsm->unCallerArrayPointer].subIL_Offset = uiIL_Offset; 
+    pAsm->callers[pAsm->unCallerArrayPointer].cf_ptr       = pAsm->cf_current_cf_clause_ptr;
+    
+    pAsm->callers[pAsm->unCallerArrayPointer].finale_cf_ptr  = NULL; 
+    pAsm->callers[pAsm->unCallerArrayPointer].prelude_cf_ptr = NULL; 
 
     pAsm->unCallerArrayPointer++;
 
@@ -5650,7 +5265,7 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
     GLboolean bRet;
     for(j=0; j<pAsm->unSubArrayPointer; j++)
     {
-        if(nILindex == pAsm->subs[j].subIL_Offset)
+        if(uiIL_Offset == pAsm->subs[j].subIL_Offset)
         {   /* compiled before */
 
             max = pAsm->subs[j].unStackDepthMax 
@@ -5668,7 +5283,7 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
     pAsm->callers[pAsm->unCallerArrayPointer - 1].subDescIndex = pAsm->unSubArrayPointer;
     unSubID = pAsm->unSubArrayPointer;
 
-    bRet = AssembleInstr(nILindex, uiNumberInsts, pILInst, pAsm);
+    bRet = AssembleInstr(nILindex, uiIL_Shift, uiNumberInsts, pILInst, pAsm);
 
     if(GL_TRUE == bRet)
     {
@@ -5678,6 +5293,8 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
         {
             pAsm->CALLSTACK[pAsm->CALLSP].max = max;
         }
+
+        pAsm->subs[unSubID].pPresubDesc = pPresubDesc;
     }
 
     return bRet;
@@ -5685,7 +5302,7 @@ GLboolean assemble_CAL(r700_AssemblerBase *pAsm,
 
 GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
 {
-    GLfloat fLiteral[2] = {0.1, 0.0};
+    /*GLfloat fLiteral[2] = {0.1, 0.0};*/
 
     pAsm->D.dst.opcode   = SQ_OP2_INST_MOV;
     pAsm->D.dst.op3      = 0;
@@ -5695,11 +5312,12 @@ GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
     pAsm->D.dst.writey   = 0;
     pAsm->D.dst.writez   = 0;
     pAsm->D.dst.writew   = 0;
-    pAsm->D2.dst2.literal      = 1;
+    pAsm->D2.dst2.literal_slots      = 1;
     pAsm->D2.dst2.SaturateMode = SATURATE_OFF;
     pAsm->D.dst.predicated     = 0;
     /* in reloc where dislink flag init inst, only one slot alu inst is handled. */
     pAsm->D.dst.math           = 1; /* TODO : not math really, but one channel op, more generic alu assembler needed */
+    pAsm->D2.dst2.index_mode = SQ_INDEX_LOOP; /* Check this ! */
 #if 0
     pAsm->S[0].src.rtype = SRC_REC_LITERAL;
     //pAsm->S[0].src.reg   = 0;
@@ -5724,7 +5342,7 @@ GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
     pAsm->S[0].src.swizzlez = flagValue;
     pAsm->S[0].src.swizzlew = flagValue;
 
-    if( GL_FALSE == next_ins2(pAsm) )
+    if( GL_FALSE == next_ins(pAsm) )
     {
         return GL_FALSE;
     }
@@ -5735,7 +5353,7 @@ GLboolean setRetInLoopFlag(r700_AssemblerBase *pAsm, GLuint flagValue)
 
 GLboolean testFlag(r700_AssemblerBase *pAsm)
 {
-    GLfloat fLiteral[2] = {0.1, 0.0};
+    /*GLfloat fLiteral[2] = {0.1, 0.0};*/
 
     //Test flag
     GLuint tmp = gethelpr(pAsm);
@@ -5749,9 +5367,10 @@ GLboolean testFlag(r700_AssemblerBase *pAsm)
     pAsm->D.dst.writey   = 0;
     pAsm->D.dst.writez   = 0;
     pAsm->D.dst.writew   = 0;
-    pAsm->D2.dst2.literal      = 1;
+    pAsm->D2.dst2.literal_slots      = 1;
     pAsm->D2.dst2.SaturateMode = SATURATE_OFF;
     pAsm->D.dst.predicated     = 1;
+    pAsm->D2.dst2.index_mode = SQ_INDEX_LOOP; /* Check this ! */
 
     pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
     pAsm->S[0].src.reg   = pAsm->flag_reg_index;
@@ -5785,7 +5404,7 @@ GLboolean testFlag(r700_AssemblerBase *pAsm)
     pAsm->S[1].src.swizzlez = SQ_SEL_1;
     pAsm->S[1].src.swizzlew = SQ_SEL_1;
 
-    if( GL_FALSE == next_ins2(pAsm) )
+    if( GL_FALSE == next_ins(pAsm) )
     {
         return GL_FALSE;
     }
@@ -5841,6 +5460,7 @@ GLboolean breakLoopOnFlag(r700_AssemblerBase *pAsm, GLuint unFCSP)
 }
 
 GLboolean AssembleInstr(GLuint uiFirstInst,
+                        GLuint uiIL_Shift,
                         GLuint uiNumberInsts,
                         struct prog_instruction *pILInst, 
                                                r700_AssemblerBase *pR700AsmCode)
@@ -5914,7 +5534,7 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
                 return GL_FALSE;
             break;  
         case OPCODE_COS: 
-            if ( GL_FALSE == assemble_COS(pR700AsmCode) ) 
+            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_COS) ) 
                 return GL_FALSE;
             break;  
 
@@ -5996,6 +5616,26 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
         case OPCODE_MUL: 
             if ( GL_FALSE == assemble_MUL(pR700AsmCode) ) 
                 return GL_FALSE;
+            break;
+            
+        case OPCODE_NOISE1:
+            {                                               
+                callPreSub(pR700AsmCode, 
+                           GLSL_NOISE1,                         
+                           &noise1_presub,                                                  
+                           pILInst->DstReg.Index + pR700AsmCode->starting_temp_register_number, 
+                           1); 
+                radeon_error("noise1: not yet supported shader instruction\n");
+            };
+            break; 
+        case OPCODE_NOISE2: 
+            radeon_error("noise2: not yet supported shader instruction\n");
+            break; 
+        case OPCODE_NOISE3: 
+            radeon_error("noise3: not yet supported shader instruction\n");
+            break; 
+        case OPCODE_NOISE4: 
+            radeon_error("noise4: not yet supported shader instruction\n");
             break; 
 
         case OPCODE_POW: 
@@ -6011,7 +5651,7 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
                 return GL_FALSE;
             break;  
         case OPCODE_SIN: 
-            if ( GL_FALSE == assemble_SIN(pR700AsmCode) ) 
+            if ( GL_FALSE == assemble_TRIG(pR700AsmCode, SQ_OP2_INST_SIN) ) 
                 return GL_FALSE;
             break;  
         case OPCODE_SCS: 
@@ -6108,7 +5748,8 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
                 }
             }
             break;
-
+        case OPCODE_DDX:
+        case OPCODE_DDY:
         case OPCODE_TEX: 
         case OPCODE_TXB:  
         case OPCODE_TXP: 
@@ -6126,11 +5767,11 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
                 return GL_FALSE;
             break;  
 
-        case OPCODE_IF   : 
+        case OPCODE_IF:
             {                
                 GLboolean bHasElse = GL_FALSE;
 
-                if(pILInst[pILInst[i].BranchTarget - 1].Opcode == OPCODE_ELSE)
+                if(pILInst[pILInst[i].BranchTarget].Opcode == OPCODE_ELSE)
                 {
                     bHasElse = GL_TRUE;
                 }
@@ -6181,7 +5822,7 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
             break;
 
         case OPCODE_BGNSUB:
-            if( GL_FALSE == assemble_BGNSUB(pR700AsmCode, i) )
+            if( GL_FALSE == assemble_BGNSUB(pR700AsmCode, i, uiIL_Shift) )
             {
                 return GL_FALSE;
             }
@@ -6196,9 +5837,11 @@ GLboolean AssembleInstr(GLuint uiFirstInst,
         
         case OPCODE_CAL:
             if( GL_FALSE == assemble_CAL(pR700AsmCode, 
-                                         pILInst[i].BranchTarget,                                         
+                                         pILInst[i].BranchTarget,
+                                         uiIL_Shift,
                                          uiNumberInsts,
-                                         pILInst) )
+                                         pILInst,
+                                         NULL) )
             {
                 return GL_FALSE;
             }
@@ -6235,7 +5878,7 @@ GLboolean InitShaderProgram(r700_AssemblerBase * pAsm)
     return GL_TRUE;
 }
 
-GLboolean RelocProgram(r700_AssemblerBase * pAsm)
+GLboolean RelocProgram(r700_AssemblerBase * pAsm, struct gl_program * pILProg)
 {
     GLuint i;
     GLuint unCFoffset;
@@ -6245,6 +5888,12 @@ GLboolean RelocProgram(r700_AssemblerBase * pAsm)
     R700ShaderInstruction *        pInst;
     R700ControlFlowGenericClause * pCFInst;
 
+    R700ControlFlowALUClause * pCF_ALU;
+    R700ALUInstruction       * pALU;
+    GLuint                     unConstOffset = 0;
+    GLuint                     unRegOffset;
+    GLuint                     unMinRegIndex;
+
     plstCFmain = pAsm->CALLSTACK[0].plstCFInstructions_local;
 
     /* remove flags init if they are not used */
@@ -6290,6 +5939,11 @@ GLboolean RelocProgram(r700_AssemblerBase * pAsm)
 
     unCFoffset = plstCFmain->uNumOfNode;
 
+    if(NULL != pILProg->Parameters)
+    {        
+        unConstOffset = pILProg->Parameters->NumParameters;
+    }
+
     /* Reloc subs */
     for(i=0; i<pAsm->unSubArrayPointer; i++)
     {
@@ -6327,6 +5981,84 @@ GLboolean RelocProgram(r700_AssemblerBase * pAsm)
             pInst = pInst->pNextInst;
         };
 
+        if(NULL != pAsm->subs[i].pPresubDesc)
+        {
+            GLuint                     uNumSrc;            
+            
+            unMinRegIndex  = pAsm->subs[i].pPresubDesc->pCompiledSub->MinRegIndex;
+            unRegOffset    = pAsm->subs[i].pPresubDesc->maxStartReg;            
+            unConstOffset += pAsm->subs[i].pPresubDesc->unConstantsStart;
+
+            pInst = plstCFsub->pHead;
+            while(pInst)
+            {
+                if(SIT_CF_ALU == pInst->m_ShaderInstType)
+                {
+                    pCF_ALU = (R700ControlFlowALUClause *)pInst;
+
+                    pALU = pCF_ALU->m_pLinkedALUInstruction;
+                    for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                    {
+                        pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;
+
+                        if(pALU->m_Word0.f.src0_sel < SQ_ALU_SRC_GPR_SIZE)
+                        {   
+                            pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
+                        }
+                        else if(pALU->m_Word0.f.src0_sel >= SQ_ALU_SRC_CFILE_BASE)
+                        {   
+                            pALU->m_Word0.f.src0_sel += unConstOffset;
+                        }
+
+                        if( ((pALU->m_Word1.val >> SQ_ALU_WORD1_OP3_ALU_INST_SHIFT) & 0x0000001F) 
+                            >= SQ_OP3_INST_MUL_LIT )
+                        {   /* op3 : 3 srcs */
+                            if(pALU->m_Word1_OP3.f.src2_sel < SQ_ALU_SRC_GPR_SIZE)
+                            {   
+                                pALU->m_Word1_OP3.f.src2_sel = pALU->m_Word1_OP3.f.src2_sel + unRegOffset - unMinRegIndex;
+                            }
+                            else if(pALU->m_Word1_OP3.f.src2_sel >= SQ_ALU_SRC_CFILE_BASE)
+                            {   
+                                pALU->m_Word1_OP3.f.src2_sel += unConstOffset;
+                            }    
+                            if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
+                            {   
+                                pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
+                            }
+                            else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
+                            {   
+                                pALU->m_Word0.f.src1_sel += unConstOffset;
+                            }                                 
+                        }
+                        else
+                        {
+                            if(pAsm->bR6xx)
+                            {
+                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f6.alu_inst, 0);
+                            }
+                            else
+                            {
+                                uNumSrc = r700GetNumOperands(pALU->m_Word1_OP2.f.alu_inst, 0);
+                            }
+                            if(2 == uNumSrc)
+                            {   /* 2 srcs */
+                                if(pALU->m_Word0.f.src1_sel < SQ_ALU_SRC_GPR_SIZE)
+                                {   
+                                    pALU->m_Word0.f.src1_sel = pALU->m_Word0.f.src1_sel + unRegOffset - unMinRegIndex;
+                                }
+                                else if(pALU->m_Word0.f.src1_sel >= SQ_ALU_SRC_CFILE_BASE)
+                                {   
+                                    pALU->m_Word0.f.src1_sel += unConstOffset;
+                                }                                  
+                            }                            
+                        }
+                        pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                    }                    
+                }             
+                pInst = pInst->pNextInst;
+            };
+        }
+
         /* Put sub into main */
         plstCFmain->pTail->pNextInst = plstCFsub->pHead;
         plstCFmain->pTail            = plstCFsub->pTail;
@@ -6340,11 +6072,216 @@ GLboolean RelocProgram(r700_AssemblerBase * pAsm)
     {
         pAsm->callers[i].cf_ptr->m_Word0.f.addr
             = pAsm->subs[pAsm->callers[i].subDescIndex].unCFoffset; 
+
+        if(NULL != pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc)
+        {                 
+            unMinRegIndex = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->pCompiledSub->MinRegIndex;
+            unRegOffset = pAsm->subs[pAsm->callers[i].subDescIndex].pPresubDesc->maxStartReg;
+
+            if(NULL != pAsm->callers[i].prelude_cf_ptr)
+            {                
+                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].prelude_cf_ptr);
+                pALU = pCF_ALU->m_pLinkedALUInstruction;
+                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                {
+                    pALU->m_Word1.f.dst_gpr = pALU->m_Word1.f.dst_gpr + unRegOffset - unMinRegIndex;
+                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                }
+            }
+            if(NULL != pAsm->callers[i].finale_cf_ptr)
+            {
+                pCF_ALU = (R700ControlFlowALUClause * )(pAsm->callers[i].finale_cf_ptr);
+                pALU = pCF_ALU->m_pLinkedALUInstruction;
+                for(int j=0; j<=pCF_ALU->m_Word1.f.count; j++)
+                {
+                    pALU->m_Word0.f.src0_sel = pALU->m_Word0.f.src0_sel + unRegOffset - unMinRegIndex;
+                    pALU = (R700ALUInstruction*)(pALU->pNextInst);
+                }
+            }
+        }
     }
 
     return GL_TRUE;
 }
 
+GLboolean callPreSub(r700_AssemblerBase* pAsm, 
+                         LOADABLE_SCRIPT_SIGNITURE scriptSigniture,                          
+                         COMPILED_SUB * pCompiledSub,                                               
+                         GLshort uOutReg,
+                         GLshort uNumValidSrc)
+{
+    /* save assemble context */
+    GLuint starting_temp_register_number_save;
+    GLuint number_used_registers_save;
+    GLuint uFirstHelpReg_save;
+    GLuint uHelpReg_save;
+    GLuint uiCurInst_save;
+    struct prog_instruction *pILInst_save;
+    PRESUB_DESC * pPresubDesc;
+    GLboolean     bRet;
+    int i;
+
+    R700ControlFlowGenericClause* prelude_cf_ptr = NULL;
+
+    /* copy srcs to presub inputs */
+    pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    for(i=0; i<uNumValidSrc; i++)
+    {
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = pCompiledSub->srcRegIndex[i];
+        pAsm->D.dst.writex = 1;
+        pAsm->D.dst.writey = 1;
+        pAsm->D.dst.writez = 1;
+        pAsm->D.dst.writew = 1;
+
+        if( GL_FALSE == assemble_src(pAsm, i, 0) )
+        {
+            return GL_FALSE;
+        }
+
+        next_ins(pAsm);
+    }
+    if(uNumValidSrc > 0)
+    {
+        prelude_cf_ptr     = pAsm->cf_current_alu_clause_ptr;
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    }
+
+    /* browse thro existing presubs. */
+    for(i=0; i<pAsm->unNumPresub; i++)
+    {
+        if(pAsm->presubs[i].sptSigniture == scriptSigniture)
+        {
+            break;
+        }
+    }
+
+    if(i == pAsm->unNumPresub)
+    {   /* not loaded yet */
+        /* save assemble context */
+        number_used_registers_save         = pAsm->number_used_registers;
+        uFirstHelpReg_save                 = pAsm->uFirstHelpReg;
+        uHelpReg_save                      = pAsm->uHelpReg;
+        starting_temp_register_number_save = pAsm->starting_temp_register_number;
+        pILInst_save                       = pAsm->pILInst;
+        uiCurInst_save                     = pAsm->uiCurInst;
+
+        /* alloc in presub */
+        if( (pAsm->unNumPresub + 1) > pAsm->unPresubArraySize )
+        {
+            pAsm->presubs = (PRESUB_DESC*)_mesa_realloc( (void *)pAsm->presubs,
+                                      sizeof(PRESUB_DESC) * pAsm->unPresubArraySize,
+                                      sizeof(PRESUB_DESC) * (pAsm->unPresubArraySize + 4) );
+            if(NULL == pAsm->presubs)
+            {
+                radeon_error("No memeory to allocate built in shader function description structures. \n");
+                return GL_FALSE;
+            }
+            pAsm->unPresubArraySize += 4;
+        }
+        
+        pPresubDesc = &(pAsm->presubs[i]);
+        pPresubDesc->sptSigniture = scriptSigniture;
+
+        /* constants offsets need to be final resolved at reloc. */
+        if(0 == pAsm->unNumPresub)
+        {
+            pPresubDesc->unConstantsStart = 0; 
+        }
+        else
+        {
+            pPresubDesc->unConstantsStart =  pAsm->presubs[i-1].unConstantsStart
+                                           + pAsm->presubs[i-1].pCompiledSub->NumParameters;
+        }
+
+        pPresubDesc->pCompiledSub = pCompiledSub;
+
+        pPresubDesc->subIL_Shift = pAsm->unCurNumILInsts;
+        pPresubDesc->maxStartReg  = uFirstHelpReg_save;
+        pAsm->unCurNumILInsts    += pCompiledSub->NumInstructions;
+
+        pAsm->unNumPresub++;
+
+        /* setup new assemble context */
+        pAsm->starting_temp_register_number = 0;
+        pAsm->number_used_registers = pCompiledSub->NumTemporaries;
+        pAsm->uFirstHelpReg         = pAsm->number_used_registers;
+        pAsm->uHelpReg              = pAsm->uFirstHelpReg;
+
+        bRet = assemble_CAL(pAsm, 
+                            0, 
+                            pPresubDesc->subIL_Shift, 
+                            pCompiledSub->NumInstructions,
+                            pCompiledSub->Instructions,
+                            pPresubDesc);
+
+        
+        pPresubDesc->number_used_registers = pAsm->number_used_registers;        
+
+        /* restore assemble context */
+        pAsm->number_used_registers         = number_used_registers_save; 
+        pAsm->uFirstHelpReg                 = uFirstHelpReg_save;
+        pAsm->uHelpReg                      = uHelpReg_save;
+        pAsm->starting_temp_register_number = starting_temp_register_number_save;
+        pAsm->pILInst                       = pILInst_save; 
+        pAsm->uiCurInst                     = uiCurInst_save;
+    }
+    else
+    {   /* was loaded */
+        pPresubDesc = &(pAsm->presubs[i]);  
+        
+        bRet = assemble_CAL(pAsm, 
+                            0, 
+                            pPresubDesc->subIL_Shift, 
+                            pCompiledSub->NumInstructions,
+                            pCompiledSub->Instructions,
+                            pPresubDesc);
+    }
+
+    if(GL_FALSE == bRet)
+    {
+        radeon_error("Shader presub assemble failed. \n");
+    }
+    else
+    {
+        /* copy presub output to real dst */ 
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = pCompiledSub->dstRegIndex;
+        pAsm->S[0].src.swizzlex = pCompiledSub->outputSwizzleX;
+        pAsm->S[0].src.swizzley = pCompiledSub->outputSwizzleY;
+        pAsm->S[0].src.swizzlez = pCompiledSub->outputSwizzleZ;
+        pAsm->S[0].src.swizzlew = pCompiledSub->outputSwizzleW;
+
+        next_ins(pAsm);        
+
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].finale_cf_ptr  = pAsm->cf_current_alu_clause_ptr;
+        pAsm->callers[pAsm->unCallerArrayPointer - 1].prelude_cf_ptr = prelude_cf_ptr;
+        pAsm->alu_x_opcode = SQ_CF_INST_ALU;
+    }
+
+    if( (pPresubDesc->number_used_registers + pAsm->uFirstHelpReg) > pAsm->number_used_registers )
+    {
+        pAsm->number_used_registers = pPresubDesc->number_used_registers + pAsm->uFirstHelpReg;
+    }
+    if(pAsm->uFirstHelpReg > pPresubDesc->maxStartReg)
+    {
+        pPresubDesc->maxStartReg = pAsm->uFirstHelpReg;
+    }
+
+    return bRet;
+}
+
 GLboolean Process_Export(r700_AssemblerBase* pAsm,
                          GLuint type,
                          GLuint export_starting_index,
@@ -6702,6 +6639,11 @@ GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode)
         FREE(pR700AsmCode->callers);
     }
 
+    if(NULL != pR700AsmCode->presubs)
+    {
+        FREE(pR700AsmCode->presubs);
+    }
+
     return GL_TRUE;
 }