src/mesa/x86/mmx_blend.S

   1         ;
   2 /*
   3  * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
   4  */
   5
   6
   7 #ifdef USE_MMX_ASM
   8 #include "matypes.h"
   9
   10 /* integer multiplication - alpha plus one
   11  * (the scaled products are left in MA1/MA2; MP1/MP2 and MX1 are only read)
   12  * makes the following approximation to the division (Sree)
   13  *
   14  *   rgb*a/255 ~= (rgb*(a+1)) >> 8
   15  *
   16  * which is the fastest method that satisfies the following OpenGL criteria
   17  *
   18  *   0*0 = 0 and 255*255 = 255
   19  *
   20  * note that MX1 is a register holding the 0xffffffffffffffff constant, which can be easily obtained with
   21  *
   22  *   PCMPEQW    ( MX1, MX1 )
   23  */
   24 #define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
   25     PSUBW      ( MX1, MA1 )                     /*   a1 + 1  |   a1 + 1  |   a1 + 1  |   a1 + 1  */     ;\
   26     PMULLW     ( MP1, MA1 )                     /*               t1 = p1*(a1 + 1)                */     ;\
   27                                                                                                         ;\
   28 TWO(PSUBW      ( MX1, MA2 ))                    /*   a2 + 1  |   a2 + 1  |   a2 + 1  |   a2 + 1  */     ;\
   29 TWO(PMULLW     ( MP2, MA2 ))                    /*               t2 = p2*(a2 + 1)                */     ;\
   30                                                                                                         ;\
   31     PSRLW      ( CONST(8), MA1 )                /*             t1 >> 8 ~= p1*a1/255              */     ;\
   32 TWO(PSRLW      ( CONST(8), MA2 ))               /*             t2 >> 8 ~= p2*a2/255              */
  33
  34
   35 /* integer multiplication - geometric series
   36  * (results are left in MA1/MA2; MP1/MP2 are overwritten with the raw products)
   37  * takes the geometric series approximation to the division
   38  *
   39  *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
   40  *
   41  * in this case just the first two terms to fit in 16bit arithmetic
   42  *
   43  *   t/255 ~= (t + (t >> 8)) >> 8
   44  *
   45  * note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
   46  * so the special case a = 255 must be accounted for, or roundoff must be used
   47  */
   48 #define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
   49     PMULLW     ( MP1, MA1 )                     /*                  t1 = p1*a1                   */     ;\
   50 TWO(PMULLW     ( MP2, MA2 ))                    /*                  t2 = p2*a2                   */     ;\
   51                                                                                                         ;\
   52     MOVQ       ( MA1, MP1 )                     /*          MP1 = t1 (p1 is clobbered)           */     ;\
   53     PSRLW      ( CONST(8), MA1 )                /*                    t1 >> 8                    */     ;\
   54                                                                                                         ;\
   55 TWO(MOVQ       ( MA2, MP2 ))                    /*          MP2 = t2 (p2 is clobbered)           */     ;\
   56 TWO(PSRLW      ( CONST(8), MA2 ))               /*                    t2 >> 8                    */     ;\
   57                                                                                                         ;\
   58     PADDW      ( MP1, MA1 )                     /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */     ;\
   59     PSRLW      ( CONST(8), MA1 )                /*    sa1    |    sb1    |    sg1    |    sr1    */     ;\
   60                                                                                                         ;\
   61 TWO(PADDW      ( MP2, MA2 ))                    /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */     ;\
   62 TWO(PSRLW      ( CONST(8), MA2 ))               /*    sa2    |    sb2    |    sg2    |    sr2    */
  63
  64
   65 /* integer multiplication - geometric series plus rounding
   66  * (results are left in MA1/MA2; MP1/MP2 are overwritten, M80 is only read)
   67  * when using a geometric series division, instead of truncating the result
   68  * use roundoff in the approximation (Jim Blinn)
   69  *
   70  *   t = rgb*a + 0x80
   71  *
   72  * achieving the exact results
   73  *
   74  * note that M80 is a register with the 0x0080008000800080 constant
   75  */
   76 #define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
   77     PMULLW     ( MP1, MA1 )                     /*                  t1 = p1*a1                   */     ;\
   78     PADDW      ( M80, MA1 )                     /*                 t1 += 0x80                    */     ;\
   79                                                                                                         ;\
   80 TWO(PMULLW     ( MP2, MA2 ))                    /*                  t2 = p2*a2                   */     ;\
   81 TWO(PADDW      ( M80, MA2 ))                    /*                 t2 += 0x80                    */     ;\
   82                                                                                                         ;\
   83     MOVQ       ( MA1, MP1 )                     /*          MP1 = t1 (p1 is clobbered)           */     ;\
   84     PSRLW      ( CONST(8), MA1 )                /*                    t1 >> 8                    */     ;\
   85                                                                                                         ;\
   86 TWO(MOVQ       ( MA2, MP2 ))                    /*          MP2 = t2 (p2 is clobbered)           */     ;\
   87 TWO(PSRLW      ( CONST(8), MA2 ))               /*                    t2 >> 8                    */     ;\
   88                                                                                                         ;\
   89     PADDW      ( MP1, MA1 )                     /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */     ;\
   90     PSRLW      ( CONST(8), MA1 )                /*    sa1    |    sb1    |    sg1    |    sr1    */     ;\
   91                                                                                                         ;\
   92 TWO(PADDW      ( MP2, MA2 ))                    /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */     ;\
   93 TWO(PSRLW      ( CONST(8), MA2 ))               /*    sa2    |    sb2    |    sg2    |    sr2    */
  94
  95
   96 /* linear interpolation - geometric series (truncating): s ~= q + (p - q)*a/255
   97  * (results are left in MA1/MA2; MP1/MP2 and MQ1/MQ2 are overwritten) */
   98 #define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
   99     PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */     ;\
  100     PSLLW      ( CONST(8), MQ1 )                /*                    q1 << 8                    */     ;\
  101     PMULLW     ( MP1, MA1 )                     /*              t1 = (p1 - q1)*pa1               */     ;\
  102                                                                                                         ;\
  103 TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */     ;\
  104 TWO(PSLLW      ( CONST(8), MQ2 ))               /*                    q2 << 8                    */     ;\
  105 TWO(PMULLW     ( MP2, MA2 ))                    /*              t2 = (p2 - q2)*pa2               */     ;\
  106                                                                                                         ;\
  107     MOVQ       ( MA1, MP1 )                     /*               MP1 = copy of t1                */     ;\
  108     PSRLW      ( CONST(8), MA1 )                /*                    t1 >> 8                    */     ;\
  109                                                                                                         ;\
  110 TWO(MOVQ       ( MA2, MP2 ))                    /*               MP2 = copy of t2                */     ;\
  111 TWO(PSRLW      ( CONST(8), MA2 ))               /*                    t2 >> 8                    */     ;\
  112                                                                                                         ;\
  113     PADDW      ( MP1, MA1 )                     /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */     ;\
  114 TWO(PADDW      ( MP2, MA2 ))                    /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */     ;\
  115                                                                                                         ;\
  116     PADDW      ( MQ1, MA1 )                     /*              (t1/255 + q1) << 8               */     ;\
  117 TWO(PADDW      ( MQ2, MA2 ))                    /*              (t2/255 + q2) << 8               */     ;\
  118                                                                                                         ;\
  119     PSRLW      ( CONST(8), MA1 )                /*    sa1    |    sb1    |    sg1    |    sr1    */     ;\
  120 TWO(PSRLW      ( CONST(8), MA2 ))               /*    sa2    |    sb2    |    sg2    |    sr2    */
 121
 122
  123 /* linear interpolation - geometric series with roundoff
  124  * (computes s ~= q + (p - q)*a/255 into MA1/MA2; MP1/MP2 and MQ1/MQ2 are overwritten)
  125  * this is a generalization of Blinn's formula to signed arithmetic
  126  *
  127  * note that M80 is a register with the 0x0080008000800080 constant
  128  */
  129 #define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
  130     PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */     ;\
  131     PSLLW      ( CONST(8), MQ1 )                /*                    q1 << 8                    */     ;\
  132     PMULLW     ( MP1, MA1 )                     /*              t1 = (p1 - q1)*pa1               */     ;\
  133                                                                                                         ;\
  134 TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */     ;\
  135 TWO(PSLLW      ( CONST(8), MQ2 ))               /*                    q2 << 8                    */     ;\
  136 TWO(PMULLW     ( MP2, MA2 ))                    /*              t2 = (p2 - q2)*pa2               */     ;\
  137                                                                                                         ;\
  138     PSRLW      ( CONST(15), MP1 )               /*                 q1 > p1 ? 1 : 0               */     ;\
  139 TWO(PSRLW      ( CONST(15), MP2 ))              /*                 q2 > p2 ? 1 : 0               */     ;\
  140                                                                                                         ;\
  141     PSLLW      ( CONST(8), MP1 )                /*             q1 > p1 ? 0x100 : 0               */     ;\
  142 TWO(PSLLW      ( CONST(8), MP2 ))               /*             q2 > p2 ? 0x100 : 0               */     ;\
  143                                                                                                         ;\
  144     PSUBW      ( MP1, MA1 )                     /*                  t1 -=? 0x100                 */     ;\
  145 TWO(PSUBW      ( MP2, MA2 ))                    /*                  t2 -=? 0x100                 */     ;\
  146                                                                                                         ;\
  147     PADDW      ( M80, MA1 )                     /*                 t1 += 0x80                    */     ;\
  148 TWO(PADDW      ( M80, MA2 ))                    /*                 t2 += 0x80                    */     ;\
  149                                                                                                         ;\
  150     MOVQ       ( MA1, MP1 )                     /*               MP1 = copy of t1                */     ;\
  151     PSRLW      ( CONST(8), MA1 )                /*                    t1 >> 8                    */     ;\
  152                                                                                                         ;\
  153 TWO(MOVQ       ( MA2, MP2 ))                    /*               MP2 = copy of t2                */     ;\
  154 TWO(PSRLW      ( CONST(8), MA2 ))               /*                    t2 >> 8                    */     ;\
  155                                                                                                         ;\
  156     PADDW      ( MP1, MA1 )                     /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */     ;\
  157 TWO(PADDW      ( MP2, MA2 ))                    /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */     ;\
  158                                                                                                         ;\
  159     PADDW      ( MQ1, MA1 )                     /*              (t1/255 + q1) << 8               */     ;\
  160 TWO(PADDW      ( MQ2, MA2 ))                    /*              (t2/255 + q2) << 8               */     ;\
  161                                                                                                         ;\
  162     PSRLW      ( CONST(8), MA1 )                /*    sa1    |    sb1    |    sg1    |    sr1    */     ;\
  163 TWO(PSRLW      ( CONST(8), MA2 ))               /*    sa2    |    sb2    |    sg2    |    sr2    */
 164
 165
  166 /* linear interpolation - geometric series with correction
  167  * (computes s ~= q + (p - q)*a/255 into MA1/MA2; MP1/MP2 and MQ1/MQ2 are overwritten)
  168  * instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
  169  *
  170  *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
  171  *
  172  * note that although it is faster than rounding off, it doesn't always give the exact results
  173  */
  174 #define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
  175     PSUBW      ( MQ1, MP1 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */     ;\
  176     PSLLW      ( CONST(8), MQ1 )                /*                    q1 << 8                    */     ;\
  177     PMULLW     ( MP1, MA1 )                     /*              t1 = (p1 - q1)*pa1               */     ;\
  178                                                                                                         ;\
  179 TWO(PSUBW      ( MQ2, MP2 ))                    /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */     ;\
  180 TWO(PSLLW      ( CONST(8), MQ2 ))               /*                    q2 << 8                    */     ;\
  181 TWO(PMULLW     ( MP2, MA2 ))                    /*              t2 = (p2 - q2)*pa2               */     ;\
  182                                                                                                         ;\
  183     MOVQ       ( MA1, MP1 )                     /*               MP1 = copy of t1                */     ;\
  184     PSRLW      ( CONST(8), MA1 )                /*                    t1 >> 8                    */     ;\
  185                                                                                                         ;\
  186 TWO(MOVQ       ( MA2, MP2 ))                    /*               MP2 = copy of t2                */     ;\
  187 TWO(PSRLW      ( CONST(8), MA2 ))               /*                    t2 >> 8                    */     ;\
  188                                                                                                         ;\
  189     PADDW      ( MA1, MP1 )                     /*        t1 + (t1 >> 8) ~= (t1/255) << 8        */     ;\
  190     PSRLW      ( CONST(7), MA1 )                /*          (t1 >> 8) >> 7 = t1 >> 15            */     ;\
  191                                                                                                         ;\
  192 TWO(PADDW      ( MA2, MP2 ))                    /*        t2 + (t2 >> 8) ~= (t2/255) << 8        */     ;\
  193 TWO(PSRLW      ( CONST(7), MA2 ))               /*          (t2 >> 8) >> 7 = t2 >> 15            */     ;\
  194                                                                                                         ;\
  195     PADDW      ( MP1, MA1 )                     /*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */     ;\
  196 TWO(PADDW      ( MP2, MA2 ))                    /*  t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8  */     ;\
  197                                                                                                         ;\
  198     PADDW      ( MQ1, MA1 )                     /*              (t1/255 + q1) << 8               */     ;\
  199 TWO(PADDW      ( MQ2, MA2 ))                    /*              (t2/255 + q2) << 8               */     ;\
  200                                                                                                         ;\
  201     PSRLW      ( CONST(8), MA1 )                /*    sa1    |    sb1    |    sg1    |    sr1    */     ;\
  202 TWO(PSRLW      ( CONST(8), MA2 ))               /*    sa2    |    sb2    |    sg2    |    sr2    */
 203
 204
  205 /* common blending setup code
  206  * (GMB_LOAD reads the packed pixels: rgba into MPP as "p", dest into MQQ as "q")
  207  * note that M00 is a register with the 0x0000000000000000 constant, which can be easily obtained with
  208  *
  209  *   PXOR      ( M00, M00 )
  210  */
  211 #define GMB_LOAD(rgba, dest, MPP, MQQ) \
  212 ONE(MOVD       ( REGIND(rgba), MPP ))           /*     |     |     |     | pa1 | pb1 | pg1 | pr1 */     ;\
  213 ONE(MOVD       ( REGIND(dest), MQQ ))           /*     |     |     |     | qa1 | qb1 | qg1 | qr1 */     ;\
  214                                                                                                         ;\
  215 TWO(MOVQ       ( REGIND(rgba), MPP ))           /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */     ;\
  216 TWO(MOVQ       ( REGIND(dest), MQQ ))           /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
 217
  218 #define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) /* widen each 8-bit channel to a 16-bit lane by interleaving with the zero register M00 */ \
  219 TWO(MOVQ       ( MP1, MP2 ))                    /*    MP2 = copy of MP1 (for the high pixel)     */     ;\
  220 TWO(MOVQ       ( MQ1, MQ2 ))                    /*    MQ2 = copy of MQ1 (for the high pixel)     */     ;\
  221                                                                                                         ;\
  222     PUNPCKLBW  ( M00, MQ1 )                     /*    qa1    |    qb1    |    qg1    |    qr1    */     ;\
  223 TWO(PUNPCKHBW  ( M00, MQ2 ))                    /*    qa2    |    qb2    |    qg2    |    qr2    */     ;\
  224     PUNPCKLBW  ( M00, MP1 )                     /*    pa1    |    pb1    |    pg1    |    pr1    */     ;\
  225 TWO(PUNPCKHBW  ( M00, MP2 ))                    /*    pa2    |    pb2    |    pg2    |    pr2    */
 226
  227 #define GMB_ALPHA(MP1, MA1, MP2, MA2) /* MA1/MA2 = alpha lane of MP1/MP2 replicated into all four 16-bit lanes; MP1/MP2 are only read */ \
  228     MOVQ       ( MP1, MA1 )                     /*     MA1 = copy of p1 (all four channels)      */     ;\
  229 TWO(MOVQ       ( MP2, MA2 ))                    /*     MA2 = copy of p2 (all four channels)      */     ;\
  230                                                                                                         ;\
  231     PUNPCKHWD  ( MA1, MA1 )                     /*    pa1    |    pa1    |    pb1    |    pb1    */     ;\
  232 TWO(PUNPCKHWD  ( MA2, MA2 ))                    /*    pa2    |    pa2    |    pb2    |    pb2    */     ;\
  233     PUNPCKHDQ  ( MA1, MA1 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */     ;\
  234 TWO(PUNPCKHDQ  ( MA2, MA2 ))                    /*    pa2    |    pa2    |    pa2    |    pa2    */
 235
 236 #define GMB_PACK( MS1, MS2 ) \
 237     PACKUSWB   ( MS2, MS1 )                     /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */     ;\
 238
  239 #define GMB_STORE(rgba, MSS ) /* write the packed result in MSS to the memory addressed by rgba: 32 bits on the ONE() line, 64 bits on the TWO() line */ \
  240 ONE(MOVD       ( MSS, REGIND(rgba) ))           /*     |     |     |     | sa1 | sb1 | sg1 | sr1 */     ;\
  241 TWO(MOVQ       ( MSS, REGIND(rgba) ))           /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
 242
 243 /* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
 244  * Replace data segment constants with text-segment
 245  * constants (via pushl/movq)
 246     SEG_DATA
 247
 248 ALIGNDATA8
 249 const_0080:
 250     D_LONG 0x00800080, 0x00800080
 251
 252 const_80:
 253     D_LONG 0x80808080, 0x80808080
 254 */
  255 #define const_0080_l 0x00800080 /* low  dword of the 0x0080-per-16-bit-word constant (pushed by the modulate INIT) */
  256 #define const_0080_h 0x00800080 /* high dword of the same constant */
  257 #define const_80_l 0x80808080   /* low  dword of the 0x80-per-byte constant (pushed by the min/max INIT) */
  258 #define const_80_h 0x80808080   /* high dword of the same constant */
  259
  260     SEG_TEXT                    /* section macro from the included headers - presumably selects the text section; confirm in matypes.h/assyntax.h */
 261
 262
  263 /* Blend transparency function: rgba ~= dest + (rgba - dest)*alpha(rgba)/255 per channel
  264  * (TAG/LLTAG/INIT/MAIN are defined just before the include below - presumably the template consumes them) */
  265
  266 #define TAG(x) CONCAT(x,_transparency)
  267 #define LLTAG(x) LLBL2(x,_transparency)
  268
  269 #define INIT \
  270     PXOR       ( MM0, MM0 )                     /*   0x0000  |   0x0000  |   0x0000  |   0x0000  */
  271
  272 #define MAIN( rgba, dest ) \
  273     GMB_LOAD( rgba, dest, MM1, MM2 )            /*   MM1 = rgba pixel(s), MM2 = dest pixel(s)    */     ;\
  274     GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )       /*  bytes -> words: p in MM1/MM4, q in MM2/MM5   */     ;\
  275     GMB_ALPHA( MM1, MM3, MM4, MM6 )             /*   MM3/MM6 = p alpha broadcast to every word   */     ;\
  276     GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 ) /*   MM3/MM6 = q + (p - q)*alpha/255 (approx.)   */    ;\
  277     GMB_PACK( MM3, MM6 )                        /* words -> saturated bytes, both pixels in MM3  */     ;\
  278     GMB_STORE( rgba, MM3 )                      /*          result is written over rgba          */
  279
  280 #include "mmx_blendtmp.h"
 281
 282
  283 /* Blend add function: rgba = min(rgba + dest, 255) per byte (unsigned saturating add)
  284  *
  285  * FIXME: Add some loop unrolling here...
  286  */
  287
  288 #define TAG(x) CONCAT(x,_add)
  289 #define LLTAG(x) LLBL2(x,_add)
  290
  291 #define INIT
  292
  293 #define MAIN( rgba, dest ) \
  294 ONE(MOVD       ( REGIND(rgba), MM1 ))           /*     |     |     |     | pa1 | pb1 | pg1 | pr1 */     ;\
  295 ONE(MOVD       ( REGIND(dest), MM2 ))           /*     |     |     |     | qa1 | qb1 | qg1 | qr1 */     ;\
  296 ONE(PADDUSB    ( MM2, MM1 ))                    /* unsigned saturating add: s = min(p + q, 255)  */     ;\
  297 ONE(MOVD       ( MM1, REGIND(rgba) ))           /*     |     |     |     | sa1 | sb1 | sg1 | sr1 */     ;\
  298                                                                                                         ;\
  299 TWO(MOVQ       ( REGIND(rgba), MM1 ))           /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */     ;\
  300 TWO(PADDUSB    ( REGIND(dest), MM1 ))           /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */     ;\
  301 TWO(MOVQ       ( MM1, REGIND(rgba) ))           /*       both result pixels overwrite rgba       */
  302
  303 #include "mmx_blendtmp.h"
 304
 305
  306 /* Blend min function: rgba = min(rgba, dest) per byte, with p = rgba and q = dest in the comments below
  307  */
  308
  309 #define TAG(x) CONCAT(x,_min)
  310 #define LLTAG(x) LLBL2(x,_min)
  311
  312 /* Kevin F. Quinn 2nd July 2006
  313  * Replace data segment constants with text-segment instructions
  314 #define INIT \
  315     MOVQ       ( CONTENT(const_80), MM7 )
  316  */
  317 #define INIT \
  318     PUSH_L     ( CONST(const_80_h) )            /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/     ;\
  319     PUSH_L     ( CONST(const_80_l) )            /*  pushed last -> lowest address -> low dword   */     ;\
  320     MOVQ       ( REGIND(ESP), MM7 )             /*         MM7 = the 8 bytes just pushed         */     ;\
  321     ADD_L      ( CONST(8), ESP)                 /*      release the 8 bytes of stack again       */
  322
  323 #define MAIN( rgba, dest ) \
  324     GMB_LOAD( rgba, dest, MM1, MM2 )            /*   MM1 = rgba pixel(s), MM2 = dest pixel(s)    */     ;\
  325     MOVQ       ( MM1, MM3 )                     /*   MM3 = copy of p, used only for the compare  */     ;\
  326     MOVQ       ( MM2, MM4 )                     /*   MM4 = copy of q, becomes the select mask    */     ;\
  327     PXOR       ( MM7, MM3 )                     /*              unsigned -> signed               */     ;\
  328     PXOR       ( MM7, MM4 )                     /*              unsigned -> signed               */     ;\
  329     PCMPGTB    ( MM3, MM4 )                     /*                 q > p ? 0xff : 0x00           */     ;\
  330     PAND       ( MM4, MM1 )                     /*                 q > p ? p : 0                 */     ;\
  331     PANDN      ( MM2, MM4 )                     /*                 q > p ? 0 : q                 */     ;\
  332     POR        ( MM1, MM4 )                     /*                 q > p ? p : q                 */     ;\
  333     GMB_STORE( rgba, MM4 )                      /*          result is written over rgba          */
  334
  335 #include "mmx_blendtmp.h"
 336
 337
 338 /* Blend max function
 339  */
 340
 341 #define TAG(x) CONCAT(x,_max)
 342 #define LLTAG(x) LLBL2(x,_max)
 343
 344 /* Kevin F. Quinn 2nd July 2006
 345  * Replace data segment constants with text-segment instructions
 346 #define INIT \
 347     MOVQ       ( CONTENT(const_80), MM7 )
 348  */
 349 #define INIT \
 350     PUSH_L     ( CONST(const_80_l) )            /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/     ;\
 351     PUSH_L     ( CONST(const_80_h) )                                                                    ;\
 352     MOVQ       ( REGIND(ESP), MM7 )                                                                     ;\
 353     ADD_L      ( CONST(8), ESP)
 354
 355 #define MAIN( rgba, dest ) \
 356     GMB_LOAD( rgba, dest, MM1, MM2 )                                                                    ;\
 357     MOVQ       ( MM1, MM3 )                                                                             ;\
 358     MOVQ       ( MM2, MM4 )                                                                             ;\
 359     PXOR       ( MM7, MM3 )                     /*              unsigned -> signed               */     ;\
 360     PXOR       ( MM7, MM4 )                     /*              unsigned -> signed               */     ;\
 361     PCMPGTB    ( MM3, MM4 )                     /*                 q > p ? 0xff : 0x00           */     ;\
 362     PAND       ( MM4, MM2 )                     /*                 q > p ? q : 0                 */     ;\
 363     PANDN      ( MM1, MM4 )                     /*                 q > p ? 0 : p                 */     ;\
 364     POR        ( MM2, MM4 )                     /*                 q > p ? p : q                 */     ;\
 365     GMB_STORE( rgba, MM4 )
 366
 367 #include "mmx_blendtmp.h"
 368
 369
 370 /* Blend modulate function
 371  */
 372
 373 #define TAG(x) CONCAT(x,_modulate)
 374 #define LLTAG(x) LLBL2(x,_modulate)
 375
 376 /* Kevin F. Quinn 2nd July 2006
 377  * Replace data segment constants with text-segment instructions
 378 #define INIT \
 379     MOVQ       ( CONTENT(const_0080), MM7 )
 380  */
 381 #define INIT \
 382     PXOR       ( MM0, MM0 )                     /*   0x0000  |   0x0000  |   0x0000  |   0x0000  */     ;\
 383     PUSH_L     ( CONST(const_0080_l) )  /*   0x0080  |   0x0080  |   0x0080  |   0x0080  */     ;\
 384     PUSH_L     ( CONST(const_0080_h) )                                                          ;\
 385     MOVQ       ( REGIND(ESP), MM7 )                                                                     ;\
 386     ADD_L      ( CONST(8), ESP)
 387
 388 #define MAIN( rgba, dest ) \
 389     GMB_LOAD( rgba, dest, MM1, MM2 )                                                                    ;\
 390     GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )                                                               ;\
 391     GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )                                                             ;\
 392     GMB_PACK( MM2, MM5 )                                                                                ;\
 393     GMB_STORE( rgba, MM2 )
 394
 395 #include "mmx_blendtmp.h"
 396
  397 #endif /* USE_MMX_ASM */
  398
  399 #if defined (__ELF__) && defined (__linux__)
  400         .section .note.GNU-stack,"",%progbits   /* empty GNU-stack note: this object does not request an executable stack */
  401 #endif /* __ELF__ && __linux__ */