/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        .section .rodata
        .align 16
        .type mask, @object
        .size mask, 32
mask:
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0xff00ff00
        .long 0x00ff0000
        .long 0x00ff0000
        .long 0x00ff0000
        .long 0x00ff0000
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        pushl $0xff00ff00 ;\
        mvins (%esp), m1 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        pushl $0x00ff0000 ;\
        mvins (%esp), m2 ;\
        addl $32, %esp
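
/* The first mask (0xff00ff00) keeps the alpha and green bytes, which do not
 * move; the second (0x00ff0000) isolates the red and blue bytes, which trade
 * places.  In C terms, the swizzle performed by the routines below is
 * roughly the following (an illustrative sketch only, not part of the
 * build):
 *
 *     uint32_t x = *(const uint32_t *) src;          // BGRA bytes, 0xAARRGGBB
 *     *(uint32_t *) dst = (x & 0xff00ff00)           // A and G stay put
 *                       | ((x & 0x00ff0000) >> 16)   // R moves down
 *                       | ((x << 16) & 0x00ff0000);  // B moves up
 */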

/* I implemented these as macros because they appear in quite a few places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl (%ebx), %eax ; \
        addl $4, %ebx ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
        addl $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl (%ebx), %eax ; \
        bswap %eax /* ARGB -> BGRA */ ; \
        rorl $8, %eax /* BGRA -> ABGR */ ; \
        movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \

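/* The scalar macros above do the same R/B swap with two byte rotations.
 * In C terms (again just an illustrative sketch):
 *
 *     uint32_t x = *(const uint32_t *) src;      // 0xAARRGGBB
 *     x = __builtin_bswap32(x);                  // 0xBBGGRRAA
 *     *(uint32_t *) dst = (x >> 8) | (x << 24);  // 0xAABBGGRR
 */
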
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        pushl %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask, %mm1
        movq mask+16, %mm2
 */
        LOAD_MASK(movq,%mm1,%mm2)

        movl 8(%esp), %ebx   /* source pointer */
        movl 16(%esp), %edx  /* number of pixels to copy */
        movl 12(%esp), %ecx  /* destination pointer */

        testl %edx, %edx
        jle .L20             /* Bail if there's nothing to do. */

        movl %ebx, %eax

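        /* Convert one pixel with the scalar code first if that is what it
         * takes to bring the (4-byte aligned) source pointer up to an
         * 8-byte boundary, so that the movq loads in the loop below are
         * aligned.
         */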
        negl %eax
        sarl $2, %eax
        andl $1, %eax
        je .L17

        subl %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl %edx, %eax
        shrl %eax
        jmp .L18
.L19:
        movq (%ebx), %mm0
        addl $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
        subl $1, %eax
.L18:
        jne .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl $1, %edx
        je .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl %ebx
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        pushl %esi
        pushl %ebx
        pushl %ebp

#ifdef USE_INNER_EMMS
        emms
#endif
/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask, %mm1
        movq mask+16, %mm2
 */
        LOAD_MASK(movq,%mm1,%mm2)

        movl 16(%esp), %ebx  /* source pointer */
        movl 24(%esp), %edx  /* number of pixels to copy */
        movl 20(%esp), %ecx  /* destination pointer */

        testl %edx, %edx
        jle .L35             /* Bail if there's nothing to do. */

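        /* Carve out a 16-byte aligned scratch area on the stack; the main
         * loop below spills each movaps fetch there so the data can be
         * reloaded into MMX registers.  %ebp keeps the original stack
         * pointer so it can be restored afterwards.
         */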
        movl %esp, %ebp
        subl $16, %esp
        andl $0xfffffff0, %esp

        movl %ebx, %eax
        movl %edx, %esi

        negl %eax
        andl $15, %eax
        sarl $2, %eax
        cmpl %edx, %eax
        cmovle %eax, %esi

        subl %esi, %edx

        testl $1, %esi
        je .L32

        DO_ONE_PIXEL()
.L32:

        testl $2, %esi
        je .L31

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L31:

        movl %edx, %eax
        shrl $2, %eax
        jmp .L33
.L34:
        movaps (%ebx), %xmm0
        addl $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps %xmm0, (%esp)
        movq (%esp), %mm0
        movq 8(%esp), %mm5

        movq %mm0, %mm3
        movq %mm0, %mm4
        movq %mm5, %mm6
        movq %mm5, %mm7

        pand %mm2, %mm3
        pand %mm2, %mm6

        psllq $16, %mm4
        psllq $16, %mm7

        psrlq $16, %mm3
        psrlq $16, %mm6

        pand %mm2, %mm4
        pand %mm2, %mm7

        pand %mm1, %mm0
        pand %mm1, %mm5

        por %mm4, %mm3
        por %mm7, %mm6

        por %mm3, %mm0
        por %mm6, %mm5

        movq %mm0, (%ecx)
        movq %mm5, 8(%ecx)
        addl $16, %ecx

        subl $1, %eax
.L33:
        jne .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl %ebp, %esp

        /* At this point there are between 0 and 3 pixels remaining to be
         * converted.
         */

        testl $2, %edx
        je .L36

        movq (%ebx), %mm0
        addl $8, %ebx

        movq %mm0, %mm3
        movq %mm0, %mm4

        pand %mm2, %mm3
        psllq $16, %mm4
        psrlq $16, %mm3
        pand %mm2, %mm4

        pand %mm1, %mm0
        por %mm4, %mm3
        por %mm3, %mm0

        movq %mm0, (%ecx)
        addl $8, %ecx
.L36:

        testl $1, %edx
        je .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl %ebp
        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        pushl %esi
        pushl %ebx

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movdqa mask, %xmm1
        movdqa mask+16, %xmm2
 */
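        /* The masks are loaded with movdqu rather than movdqa because %esp
         * is not guaranteed to be 16-byte aligned at this point.
         */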
        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl 12(%esp), %ebx  /* source pointer */
        movl 20(%esp), %edx  /* number of pixels to copy */
        movl 16(%esp), %ecx  /* destination pointer */

        movl %ebx, %eax
        movl %edx, %esi

        testl %edx, %edx
        jle .L46             /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */

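        /* %eax = ((-src) & 15) / 4: the number of leading pixels (0 to 3)
         * that must be handled separately before the source pointer is
         * 16-byte aligned.  %esi is then clamped to the actual pixel count.
         */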
        negl %eax
        andl $15, %eax
        sarl $2, %eax

        cmpl %edx, %eax
        cmovbe %eax, %esi
        subl %esi, %edx

        testl $1, %esi
        je .L41

        DO_ONE_PIXEL()
.L41:
        testl $2, %esi
        je .L40

        movq (%ebx), %xmm0
        addl $8, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
        addl $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl %edx, %eax
        shrl $2, %eax
        jmp .L42
.L43:
        movdqa (%ebx), %xmm0
        addl $16, %ebx

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movdqu %xmm0, (%ecx)
        addl $16, %ecx
        subl $1, %eax
.L42:
        jne .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl $2, %edx
        je .L47

        movq (%ebx), %xmm0

        movdqa %xmm0, %xmm3
        movdqa %xmm0, %xmm4
        andps %xmm1, %xmm0

        andps %xmm2, %xmm3
        pslldq $2, %xmm4
        psrldq $2, %xmm3
        andps %xmm2, %xmm4

        orps %xmm4, %xmm3
        orps %xmm3, %xmm0

        movq %xmm0, (%ecx)
.L47:

        testl $1, %edx
        je .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl %ebx
        popl %esi
        ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2


/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 */
#if 0
        .section .rodata

        .align 16
mask_565:
        .word 0xf800
        .word 0x07e0
        .word 0x001f
        .word 0x0000

/* Setting SCALE_ADJUST to 5 gives a perfect match with the classic C
 * implementation in Mesa.  Setting SCALE_ADJUST to 0 is slightly faster but
 * at a small cost to accuracy.
 */

#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
prescale:
        .word 0x0001
        .word 0x0010
        .word 0x0200
        .word 0x0000

scale:
        .word 0x20e8 /* (0x00ff0000 / 0x000007c0) + 1 */
        .word 0x40c5 /* (0x00ff0000 / 0x000003f0) + 1 */
        .word 0x839d /* (0x00ff0000 / 0x000001f0) + 1 */
        .word 0x0000
#elif SCALE_ADJUST == 0
prescale:
        .word 0x0001
        .word 0x0020
        .word 0x0800
        .word 0x0000

scale:
        .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
        .word 0x0104 /* (0x00ff0000 / 0x0000fc00) + 1 */
        .word 0x0108 /* (0x00ff0000 / 0x0000f800) + 1 */
        .word 0x0000
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif


alpha:  .long 0x00000000
        .long 0x00ff0000
#endif

#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000

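/* In C terms, the conversion below does roughly the following for each
 * RGB565 pixel (an illustrative sketch only, not part of the build; the
 * SCALE_ADJUST == 5 variant is shown, with src a const uint16_t pointer
 * and dst a uint8_t RGBA pointer):
 *
 *     uint16_t p = *src;                          // rrrrrggggggbbbbb
 *     uint32_t r = ((p & 0xf800) * 0x0001) >> 5;  // red   -> [0, 0x07c0]
 *     uint32_t g = ((p & 0x07e0) * 0x0010) >> 5;  // green -> [0, 0x03f0]
 *     uint32_t b = ((p & 0x001f) * 0x0200) >> 5;  // blue  -> [0, 0x01f0]
 *     dst[0] = (r * 0x20e8) >> 16;                // ~ red   * 255 / 31
 *     dst[1] = (g * 0x40c5) >> 16;                // ~ green * 255 / 63
 *     dst[2] = (b * 0x839d) >> 16;                // ~ blue  * 255 / 31
 *     dst[3] = 0xff;
 *
 * The MMX code below does this four words at a time: pmullw/psrlw perform
 * the prescale step and pmulhuw performs the final scale.
 */
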
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

.text
.globl _generic_read_RGBA_span_RGB565_MMX
.hidden _generic_read_RGBA_span_RGB565_MMX
.type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
        emms
#endif

        movl 4(%esp), %eax   /* source pointer */
        movl 8(%esp), %edx   /* destination pointer */
        movl 12(%esp), %ecx  /* number of pixels to copy */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        movq mask_565, %mm5
        movq prescale, %mm6
        movq scale, %mm7
 */
        pushl $MASK_565_H
        pushl $MASK_565_L
        movq (%esp), %mm5
        pushl $PRESCALE_H
        pushl $PRESCALE_L
        movq (%esp), %mm6
        pushl $SCALE_H
        pushl $SCALE_L
        movq (%esp), %mm7
        pushl $ALPHA_H
        pushl $ALPHA_L
        movq (%esp), %mm3
        addl $32, %esp

        sarl $2, %ecx
        jle .L01             /* Bail early if there are fewer than four pixels. */
        jmp .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq (%eax), %mm4
        addl $8, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand %mm5, %mm0
        pand %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be in the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */
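        /* For example, a maximal red component reaches this point as
         * 31 << 6 = 0x07c0, and (0x07c0 * 0x20e8) >> 16 = 0xff.
         */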

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq %mm0, (%edx)
        addl $8, %edx



        pshufw $0xaa, %mm4, %mm0
        pshufw $0xff, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

        subl $1, %ecx
.L02:
        jne .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are 2 or 3 left, process 2.
         */

        movl 12(%esp), %ecx
        testl $0x02, %ecx
        je .L04

        movd (%eax), %mm4
        addl $4, %eax

        pshufw $0x00, %mm4, %mm0
        pshufw $0x55, %mm4, %mm2

        pand %mm5, %mm0
        pand %mm5, %mm2
        pmullw %mm6, %mm0
        pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
        psrlw $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
        por alpha, %mm2
 */
        por %mm3, %mm0
        por %mm3, %mm2

        packuswb %mm2, %mm0

        movq %mm0, (%edx)
        addl $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl $0x01, %ecx
        je .L01

        movzwl (%eax), %ecx
        movd %ecx, %mm4

        pshufw $0x00, %mm4, %mm0

        pand %mm5, %mm0
        pmullw %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
        por alpha, %mm0
 */
        por %mm3, %mm0

        packuswb %mm0, %mm0

        movd %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif