src/mesa/x86/read_rgba_span_x86.S

   1 /*
   2  * (C) Copyright IBM Corporation 2004
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file read_rgba_span_x86.S
  27  * Optimized routines to transfer pixel data from the framebuffer to a
  28  * buffer in main memory.
  29  *
  30  * \author Ian Romanick <idr@us.ibm.com>
  31  */
  32
  33         .file   "read_rgba_span_x86.S"
  34 #if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
  35 /* Kevin F. Quinn 2nd July 2006
  36  * Replaced data segment constants with text-segment instructions.
  37  * LOAD_MASK builds both byte masks on the stack, loads them with `mvins` into m1 and m2, then restores %esp. */
  38 #define LOAD_MASK(mvins,m1,m2) \
  39         pushl   $0xff00ff00 /* four pushes = 16 bytes of the 0xff00ff00 pattern */ ;\
  40         pushl   $0xff00ff00 ;\
  41         pushl   $0xff00ff00 ;\
  42         pushl   $0xff00ff00 ;\
  43         mvins   (%esp), m1      /* m1 = mask keeping bytes 1 and 3 of every dword */ ;\
  44         pushl   $0x00ff0000 /* four pushes = 16 bytes of the 0x00ff0000 pattern */ ;\
  45         pushl   $0x00ff0000 ;\
  46         pushl   $0x00ff0000 ;\
  47         pushl   $0x00ff0000 ;\
  48         mvins   (%esp), m2      /* m2 = mask selecting byte 2 of every dword */ ;\
  49         addl    $32, %esp /* release all eight pushed dwords */
  50
  51 /* I implemented these as macros because they appear in several places,
  52  * and I've tweaked them a number of times.  I got tired of changing every
  53  * place they appear. :)
  54  * Both macros read one pixel from (%ebx), write it to (%ecx) and clobber %eax. */
  55
  56 #define DO_ONE_PIXEL() /* convert one pixel and advance both pointers by 4 */ \
  57         movl    (%ebx), %eax  /* load one 32-bit source pixel */ ; \
  58         addl    $4, %ebx      /* advance the source pointer */ ; \
  59         bswap   %eax          /* ARGB -> BGRA */ ; \
  60         rorl    $8, %eax      /* BGRA -> ABGR */ ; \
  61         movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
  62         addl    $4, %ecx      /* advance the destination pointer */
  63
  64 #define DO_ONE_LAST_PIXEL() /* same conversion as DO_ONE_PIXEL, but %ebx and %ecx are left unchanged */ \
  65         movl    (%ebx), %eax  /* load the final 32-bit source pixel */ ; \
  66         bswap   %eax          /* ARGB -> BGRA */ ; \
  67         rorl    $8, %eax      /* BGRA -> ABGR */ ; \
  68         movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
  69
  70
  71 /**
  72  * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
  73  * Stack arguments, in order: source pointer, destination pointer, pixel count.
  74  * \warning
  75  * This function assumes that the caller will issue the EMMS instruction
  76  * at the correct places.
  77  */
  78
  79 .globl _generic_read_RGBA_span_BGRA8888_REV_MMX
  80 .hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
  81         .type   _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
  82 _generic_read_RGBA_span_BGRA8888_REV_MMX:
  83         pushl   %ebx            /* %ebx becomes the source cursor; restored at .L20 */
  84
  85 #ifdef USE_INNER_EMMS
  86         emms
  87 #endif
  88         LOAD_MASK(movq,%mm1,%mm2)       /* mm1 = 0xff00ff00 per pixel, mm2 = 0x00ff0000 per pixel */
  89
  90         movl    8(%esp), %ebx   /* source pointer */
  91         movl    16(%esp), %edx  /* number of pixels to copy */
  92         movl    12(%esp), %ecx  /* destination pointer */
  93
  94         testl   %edx, %edx
  95         jle     .L20            /* Bail if there's nothing to do. */
  96
  97         movl    %ebx, %eax
  98
  99         negl    %eax
 100         sarl    $2, %eax
 101         andl    $1, %eax        /* eax = bit 2 of -source: 1 when a 4-byte aligned source is not yet 8-byte aligned */
 102         je      .L17            /* no leading pixel needed */
 103
 104         subl    %eax, %edx      /* one pixel is consumed by the leading conversion below */
 105         DO_ONE_PIXEL()
 106 .L17:
 107
 108         /* Would it be faster to unroll this loop once and process 4 pixels
 109          * per pass, instead of just two?
 110          */
 111
 112         movl    %edx, %eax
 113         shrl    %eax            /* eax = number of pixel pairs; ZF from this shift is tested at .L18 */
 114         jmp     .L18            /* enter the loop at its test so zero pairs skips the body */
 115 .L19:
 116         movq    (%ebx), %mm0    /* load two source pixels */
 117         addl    $8, %ebx
 118
 119         /* These 9 instructions do what PSHUFB (if there were such an
 120          * instruction) could do in 1. :(
 121          */
 122
 123         movq    %mm0, %mm3
 124         movq    %mm0, %mm4
 125
 126         pand    %mm2, %mm3      /* mm3 = byte 2 of each pixel */
 127         psllq   $16, %mm4       /* move byte 0 of each pixel up to byte 2 */
 128         psrlq   $16, %mm3       /* move the isolated byte 2 down to byte 0 */
 129         pand    %mm2, %mm4      /* drop everything the shift dragged along except byte 2 */
 130
 131         pand    %mm1, %mm0      /* bytes 1 and 3 stay where they are */
 132         por     %mm4, %mm3
 133         por     %mm3, %mm0      /* result: bytes 0 and 2 of each pixel swapped */
 134
 135         movq    %mm0, (%ecx)    /* store two converted pixels */
 136         addl    $8, %ecx
 137         subl    $1, %eax        /* one pair done; sets ZF for the test below */
 138 .L18:
 139         jne     .L19            /* loop while pairs remain */
 140
 141 #ifdef USE_INNER_EMMS
 142         emms
 143 #endif
 144
 145         /* At this point there are either 1 or 0 pixels remaining to be
 146          * converted.  Convert the last pixel, if needed.
 147          */
 148
 149         testl   $1, %edx        /* odd remaining count means one trailing pixel */
 150         je      .L20
 151
 152         DO_ONE_LAST_PIXEL()
 153
 154 .L20:
 155         popl    %ebx
 156         ret
 157         .size   _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
 158
 159
 160 /**
 161  * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 162  * instructions are only actually used to read data from the framebuffer.
 163  * In practice, the speed-up is pretty small.
 164  * Stack arguments, in order: source pointer, destination pointer, pixel count.
 165  * \todo
 166  * Do some more testing and determine if there's any reason to have this
 167  * function in addition to the MMX version.
 168  *
 169  * \warning
 170  * This function assumes that the caller will issue the EMMS instruction
 171  * at the correct places.
 172  */
 173
 174 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE
 175 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
 176         .type   _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
 177 _generic_read_RGBA_span_BGRA8888_REV_SSE:
 178         pushl   %esi            /* %esi, %ebx and %ebp are saved here and restored at .L35 */
 179         pushl   %ebx
 180         pushl   %ebp
 181
 182 #ifdef USE_INNER_EMMS
 183         emms
 184 #endif
 185
 186         LOAD_MASK(movq,%mm1,%mm2)       /* mm1 = 0xff00ff00 per pixel, mm2 = 0x00ff0000 per pixel */
 187
 188         movl    16(%esp), %ebx  /* source pointer */
 189         movl    24(%esp), %edx  /* number of pixels to copy */
 190         movl    20(%esp), %ecx  /* destination pointer */
 191
 192         testl   %edx, %edx
 193         jle     .L35            /* Bail if there's nothing to do. */
 194
 195         movl    %esp, %ebp      /* remember the real stack pointer */
 196         subl    $16, %esp
 197         andl    $0xfffffff0, %esp       /* (%esp) is now a 16-byte aligned, 16-byte scratch slot */
 198
 199         movl    %ebx, %eax
 200         movl    %edx, %esi
 201
 202         negl    %eax
 203         andl    $15, %eax
 204         sarl    $2, %eax        /* eax = ((-source) & 15) / 4: pixels before a 16-byte boundary for a 4-byte aligned source */
 205         cmpl    %edx, %eax
 206         cmovle  %eax, %esi      /* esi = min(eax, count) = number of leading pixels */
 207
 208         subl    %esi, %edx      /* edx = pixels left after the leading ones */
 209
 210         testl   $1, %esi        /* odd number of leading pixels: do a single one */
 211         je      .L32
 212
 213         DO_ONE_PIXEL()
 214 .L32:
 215
 216         testl   $2, %esi        /* two (more) leading pixels: do a pair with MMX */
 217         je      .L31
 218
 219         movq    (%ebx), %mm0    /* load two source pixels */
 220         addl    $8, %ebx
 221
 222         movq    %mm0, %mm3
 223         movq    %mm0, %mm4
 224
 225         pand    %mm2, %mm3      /* mm3 = byte 2 of each pixel */
 226         psllq   $16, %mm4       /* move byte 0 of each pixel up to byte 2 */
 227         psrlq   $16, %mm3       /* move the isolated byte 2 down to byte 0 */
 228         pand    %mm2, %mm4      /* keep only byte 2 of the shifted copy */
 229
 230         pand    %mm1, %mm0      /* bytes 1 and 3 stay in place */
 231         por     %mm4, %mm3
 232         por     %mm3, %mm0      /* result: bytes 0 and 2 of each pixel swapped */
 233
 234         movq    %mm0, (%ecx)
 235         addl    $8, %ecx
 236 .L31:
 237
 238         movl    %edx, %eax
 239         shrl    $2, %eax        /* eax = number of 4-pixel groups; ZF from this shift is tested at .L33 */
 240         jmp     .L33            /* enter the loop at its test so zero groups skips the body */
 241 .L34:
 242         movaps  (%ebx), %xmm0   /* aligned 16-byte load of four source pixels */
 243         addl    $16, %ebx
 244
 245         /* This would be so much better if we could just move directly from
 246          * an SSE register to an MMX register.  Unfortunately, that
 247          * functionality wasn't introduced until SSE2 with the MOVDQ2Q
 248          * instruction.
 249          */
 250
 251         movaps  %xmm0, (%esp)   /* bounce the four pixels through the aligned scratch slot */
 252         movq    (%esp), %mm0    /* mm0 = pixels 0-1 */
 253         movq    8(%esp), %mm5   /* mm5 = pixels 2-3 */
 254
 255         movq    %mm0, %mm3
 256         movq    %mm0, %mm4
 257         movq    %mm5, %mm6
 258         movq    %mm5, %mm7
 259
 260         pand    %mm2, %mm3      /* mm3/mm6 = byte 2 of each pixel */
 261         pand    %mm2, %mm6
 262
 263         psllq   $16, %mm4       /* mm4/mm7: byte 0 moved up to byte 2 */
 264         psllq   $16, %mm7
 265
 266         psrlq   $16, %mm3       /* mm3/mm6: byte 2 moved down to byte 0 */
 267         psrlq   $16, %mm6
 268
 269         pand    %mm2, %mm4      /* keep only byte 2 of the shifted copies */
 270         pand    %mm2, %mm7
 271
 272         pand    %mm1, %mm0      /* bytes 1 and 3 stay in place */
 273         pand    %mm1, %mm5
 274
 275         por     %mm4, %mm3
 276         por     %mm7, %mm6
 277
 278         por     %mm3, %mm0      /* mm0/mm5 = converted pixels 0-1 / 2-3 */
 279         por     %mm6, %mm5
 280
 281         movq    %mm0, (%ecx)
 282         movq    %mm5, 8(%ecx)
 283         addl    $16, %ecx
 284
 285         subl    $1, %eax        /* one group done; sets ZF for the test below */
 286 .L33:
 287         jne     .L34            /* loop while groups remain */
 288
 289 #ifdef USE_INNER_EMMS
 290         emms
 291 #endif
 292         movl    %ebp, %esp      /* discard the scratch slot; stack is back to its post-push state */
 293
 294         /* At this point there are between 0 and 3 pixels remaining to be
 295          * converted.
 296          */
 297
 298         testl   $2, %edx        /* bit 1 of the remaining count: a trailing pair */
 299         je      .L36
 300
 301         movq    (%ebx), %mm0    /* load two source pixels */
 302         addl    $8, %ebx
 303
 304         movq    %mm0, %mm3
 305         movq    %mm0, %mm4
 306
 307         pand    %mm2, %mm3      /* mm3 = byte 2 of each pixel */
 308         psllq   $16, %mm4       /* move byte 0 of each pixel up to byte 2 */
 309         psrlq   $16, %mm3       /* move the isolated byte 2 down to byte 0 */
 310         pand    %mm2, %mm4      /* keep only byte 2 of the shifted copy */
 311
 312         pand    %mm1, %mm0      /* bytes 1 and 3 stay in place */
 313         por     %mm4, %mm3
 314         por     %mm3, %mm0      /* result: bytes 0 and 2 of each pixel swapped */
 315
 316         movq    %mm0, (%ecx)
 317         addl    $8, %ecx
 318 .L36:
 319
 320         testl   $1, %edx        /* bit 0 of the remaining count: one trailing pixel */
 321         je      .L35
 322
 323         DO_ONE_LAST_PIXEL()
 324 .L35:
 325         popl    %ebp
 326         popl    %ebx
 327         popl    %esi
 328         ret
 329         .size   _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
 330
 331
 332 /**
 333  * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 334  * Stack arguments, in order: source pointer, destination pointer, pixel count. */
 335
 336         .text
 337 .globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
 338 .hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
 339         .type   _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
 340 _generic_read_RGBA_span_BGRA8888_REV_SSE2:
 341         pushl   %esi            /* %esi and %ebx are saved here and restored at .L46 */
 342         pushl   %ebx
 343
 344         LOAD_MASK(movdqu,%xmm1,%xmm2)   /* xmm1 = 0xff00ff00 per pixel, xmm2 = 0x00ff0000 per pixel */
 345
 346         movl    12(%esp), %ebx  /* source pointer */
 347         movl    20(%esp), %edx  /* number of pixels to copy */
 348         movl    16(%esp), %ecx  /* destination pointer */
 349
 350         movl    %ebx, %eax
 351         movl    %edx, %esi
 352
 353         testl   %edx, %edx
 354         jle     .L46            /* Bail if there's nothing to do. */
 355
 356         /* If the source pointer isn't a multiple of 16 we have to process
 357          * a few pixels the "slow" way to get the address aligned for
 358          * the SSE fetch instructions.
 359          */
 360
 361         negl    %eax
 362         andl    $15, %eax
 363         sarl    $2, %eax        /* eax = ((-source) & 15) / 4: pixels before a 16-byte boundary for a 4-byte aligned source */
 364
 365         cmpl    %edx, %eax
 366         cmovbe  %eax, %esi      /* esi = min(eax, count) = number of leading pixels */
 367         subl    %esi, %edx      /* edx = pixels left after the leading ones */
 368
 369         testl   $1, %esi        /* odd number of leading pixels: do a single one */
 370         je      .L41
 371
 372         DO_ONE_PIXEL()
 373 .L41:
 374         testl   $2, %esi        /* two (more) leading pixels: do a pair */
 375         je      .L40
 376
 377         movq    (%ebx), %xmm0   /* load two source pixels into the low half of xmm0 */
 378         addl    $8, %ebx
 379
 380         movdqa  %xmm0, %xmm3
 381         movdqa  %xmm0, %xmm4
 382         andps   %xmm1, %xmm0    /* bytes 1 and 3 of each pixel stay in place */
 383
 384         andps   %xmm2, %xmm3    /* xmm3 = byte 2 of each pixel */
 385         pslldq  $2, %xmm4       /* whole-register shift left by 2 bytes: byte 0 lands in byte 2 */
 386         psrldq  $2, %xmm3       /* whole-register shift right by 2 bytes: byte 2 lands in byte 0 */
 387         andps   %xmm2, %xmm4    /* keep only byte 2 of the shifted copy */
 388
 389         orps    %xmm4, %xmm3
 390         orps    %xmm3, %xmm0    /* result: bytes 0 and 2 of each pixel swapped */
 391
 392         movq    %xmm0, (%ecx)   /* store the two converted pixels */
 393         addl    $8, %ecx
 394 .L40:
 395
 396         /* Would it be worth having a specialized version of this loop for
 397          * the case where the destination is 16-byte aligned?  That version
 398          * would be identical except that it could use movdqa instead of
 399          * movdqu.
 400          */
 401
 402         movl    %edx, %eax
 403         shrl    $2, %eax        /* eax = number of 4-pixel groups; ZF from this shift is tested at .L42 */
 404         jmp     .L42            /* enter the loop at its test so zero groups skips the body */
 405 .L43:
 406         movdqa  (%ebx), %xmm0   /* aligned 16-byte load of four source pixels */
 407         addl    $16, %ebx
 408
 409         movdqa  %xmm0, %xmm3
 410         movdqa  %xmm0, %xmm4
 411         andps   %xmm1, %xmm0    /* bytes 1 and 3 of each pixel stay in place */
 412
 413         andps   %xmm2, %xmm3    /* xmm3 = byte 2 of each pixel */
 414         pslldq  $2, %xmm4       /* byte 0 of each pixel moved up to byte 2 */
 415         psrldq  $2, %xmm3       /* byte 2 of each pixel moved down to byte 0 */
 416         andps   %xmm2, %xmm4    /* keep only byte 2 of the shifted copy */
 417
 418         orps    %xmm4, %xmm3
 419         orps    %xmm3, %xmm0    /* result: bytes 0 and 2 of each pixel swapped */
 420
 421         movdqu  %xmm0, (%ecx)   /* unaligned store: the destination alignment is not checked */
 422         addl    $16, %ecx
 423         subl    $1, %eax        /* one group done; sets ZF for the test below */
 424 .L42:
 425         jne     .L43            /* loop while groups remain */
 426
 427
 428         /* There may be up to 3 pixels remaining to be copied.  Take care
 429          * of them now.  We do the 2 pixel case first because the data
 430          * will be aligned.
 431          */
 432
 433         testl   $2, %edx        /* bit 1 of the remaining count: a trailing pair */
 434         je      .L47
 435
 436         movq    (%ebx), %xmm0   /* load two source pixels into the low half of xmm0 */
 437         addl    $8, %ebx
 438
 439         movdqa  %xmm0, %xmm3
 440         movdqa  %xmm0, %xmm4
 441         andps   %xmm1, %xmm0    /* bytes 1 and 3 of each pixel stay in place */
 442
 443         andps   %xmm2, %xmm3    /* xmm3 = byte 2 of each pixel */
 444         pslldq  $2, %xmm4       /* byte 0 of each pixel moved up to byte 2 */
 445         psrldq  $2, %xmm3       /* byte 2 of each pixel moved down to byte 0 */
 446         andps   %xmm2, %xmm4    /* keep only byte 2 of the shifted copy */
 447
 448         orps    %xmm4, %xmm3
 449         orps    %xmm3, %xmm0    /* result: bytes 0 and 2 of each pixel swapped */
 450
 451         movq    %xmm0, (%ecx)   /* store the two converted pixels */
 452         addl    $8, %ecx
 453 .L47:
 454
 455         testl   $1, %edx        /* bit 0 of the remaining count: one trailing pixel */
 456         je      .L46
 457
 458         DO_ONE_LAST_PIXEL()
 459 .L46:
 460
 461         popl    %ebx
 462         popl    %esi
 463         ret
 464         .size   _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
 465
 466
 467
 468 #define MASK_565_L      0x07e0f800 /* low dword of the 64-bit mask: 16-bit lanes 0xf800 (red bits), 0x07e0 (green bits) */
 469 #define MASK_565_H      0x0000001f /* high dword: lanes 0x001f (blue bits), 0x0000 (alpha lane, cleared) */
 470 /* Setting SCALE_ADJUST to 5 gives a perfect match with the
 471  * classic C implementation in Mesa.  Setting SCALE_ADJUST
 472  * to 0 is slightly faster but at a small cost to accuracy.
 473  */
 474 #define SCALE_ADJUST    5 /* right shift applied to every lane after the PRESCALE multiply */
 475 #if SCALE_ADJUST == 5
 476 #define PRESCALE_L 0x00100001 /* per-lane pmullw factors: red x0x0001, green x0x0010 */
 477 #define PRESCALE_H 0x00000200 /* blue x0x0200, alpha lane x0 */
 478 #define SCALE_L 0x40C620E8 /* per-lane pmulhuw factors: red 0x20E8, green 0x40C6 */
 479 #define SCALE_H 0x0000839d /* blue 0x839d, alpha lane 0 */
 480 #elif SCALE_ADJUST == 0
 481 #define PRESCALE_L 0x00200001 /* per-lane pmullw factors: red x0x0001, green x0x0020 */
 482 #define PRESCALE_H 0x00000800 /* blue x0x0800, alpha lane x0 */
 483 #define SCALE_L 0x01040108 /* per-lane pmulhuw factors: red 0x0108, green 0x0104 */
 484 #define SCALE_H 0x00000108 /* blue 0x0108, alpha lane 0 */
 485 #else
 486 #error SCALE_ADJUST must either be 5 or 0.
 487 #endif
 488 #define ALPHA_L 0x00000000 /* red and green lanes: nothing is OR-ed in */
 489 #define ALPHA_H 0x00ff0000 /* blue lane 0, alpha lane forced to 0x00ff */
 490
 491 /**
 492  * MMX optimized version of the RGB565 to RGBA copy routine.
 493  * Stack arguments, in order: source pointer, destination pointer, pixel count.  Only %eax, %ecx, %edx and MMX registers are modified. */
 494
 495         .text
 496         .globl  _generic_read_RGBA_span_RGB565_MMX
 497         .hidden _generic_read_RGBA_span_RGB565_MMX
 498         .type   _generic_read_RGBA_span_RGB565_MMX, @function
 499
 500 _generic_read_RGBA_span_RGB565_MMX:
 501
 502 #ifdef USE_INNER_EMMS
 503         emms
 504 #endif
 505
 506         movl    4(%esp), %eax   /* source pointer */
 507         movl    8(%esp), %edx   /* destination pointer */
 508         movl    12(%esp), %ecx  /* number of pixels to copy */
 509
 510         pushl   $MASK_565_H     /* each constant is pushed high dword first, then loaded as one qword */
 511         pushl   $MASK_565_L
 512         movq    (%esp), %mm5    /* mm5 = per-lane component mask */
 513         pushl   $PRESCALE_H
 514         pushl   $PRESCALE_L
 515         movq    (%esp), %mm6    /* mm6 = per-lane prescale multipliers */
 516         pushl   $SCALE_H
 517         pushl   $SCALE_L
 518         movq    (%esp), %mm7    /* mm7 = per-lane scale multipliers */
 519         pushl   $ALPHA_H
 520         pushl   $ALPHA_L
 521         movq    (%esp), %mm3    /* mm3 = 0x00ff in the alpha lane */
 522         addl    $32,%esp        /* release all eight pushed dwords */
 523
 524         sarl    $2, %ecx        /* ecx = number of 4-pixel groups; sets SF and ZF */
 525         js      .L01            /* Bail early if the count is negative (SF only: OF is undefined after a multi-bit SAR). */
 526         jmp     .L02            /* enter the loop at its test; ZF still reflects the sarl result */
 527
 528 .L03:
 529         /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
 530          * second pixels into the four words of %mm0 and %mm2.
 531          */
 532
 533         movq    (%eax), %mm4
 534         addl    $8, %eax
 535
 536         pshufw  $0x00, %mm4, %mm0
 537         pshufw  $0x55, %mm4, %mm2
 538
 539
 540         /* Mask the pixels so that each word of each register contains only
 541          * one color component.
 542          */
 543
 544         pand    %mm5, %mm0
 545         pand    %mm5, %mm2
 546
 547
 548         /* Adjust the component values so that they are as small as possible,
 549          * but large enough so that we can multiply them by an unsigned 16-bit
 550          * number and get a value as large as 0x00ff0000.
 551          */
 552
 553         pmullw  %mm6, %mm0
 554         pmullw  %mm6, %mm2
 555 #if SCALE_ADJUST > 0
 556         psrlw   $SCALE_ADJUST, %mm0
 557         psrlw   $SCALE_ADJUST, %mm2
 558 #endif
 559
 560         /* Scale the input component values to be on the range
 561          * [0, 0x00ff0000].  This is the real magic of the whole routine.
 562          */
 563
 564         pmulhuw %mm7, %mm0
 565         pmulhuw %mm7, %mm2
 566
 567
 568         /* Always set the alpha value to 0xff.
 569          */
 570
 571         por %mm3, %mm0
 572         por %mm3, %mm2
 573
 574
 575         /* Pack the 16-bit values to 8-bit values and store the converted
 576          * pixel data.
 577          */
 578
 579         packuswb        %mm2, %mm0
 580         movq    %mm0, (%edx)
 581         addl    $8, %edx
 582
 583         pshufw  $0xaa, %mm4, %mm0       /* same steps for the third and fourth pixels of the group */
 584         pshufw  $0xff, %mm4, %mm2
 585
 586         pand    %mm5, %mm0
 587         pand    %mm5, %mm2
 588         pmullw  %mm6, %mm0
 589         pmullw  %mm6, %mm2
 590 #if SCALE_ADJUST > 0
 591         psrlw   $SCALE_ADJUST, %mm0
 592         psrlw   $SCALE_ADJUST, %mm2
 593 #endif
 594         pmulhuw %mm7, %mm0
 595         pmulhuw %mm7, %mm2
 596
 597         por %mm3, %mm0
 598         por %mm3, %mm2
 599
 600         packuswb        %mm2, %mm0
 601
 602         movq    %mm0, (%edx)
 603         addl    $8, %edx
 604
 605         subl    $1, %ecx        /* one group done; sets ZF for the test below */
 606 .L02:
 607         jne     .L03            /* loop while groups remain */
 608
 609
 610         /* At this point there can be at most 3 pixels left to process.  If
 611          * there is either 2 or 3 left, process 2.
 612          */
 613
 614         movl    12(%esp), %ecx  /* reload the original pixel count */
 615         testl   $0x02, %ecx
 616         je      .L04
 617
 618         movd    (%eax), %mm4    /* 4 bytes = two RGB565 pixels */
 619         addl    $4, %eax
 620
 621         pshufw  $0x00, %mm4, %mm0
 622         pshufw  $0x55, %mm4, %mm2
 623
 624         pand    %mm5, %mm0
 625         pand    %mm5, %mm2
 626         pmullw  %mm6, %mm0
 627         pmullw  %mm6, %mm2
 628 #if SCALE_ADJUST > 0
 629         psrlw   $SCALE_ADJUST, %mm0
 630         psrlw   $SCALE_ADJUST, %mm2
 631 #endif
 632         pmulhuw %mm7, %mm0
 633         pmulhuw %mm7, %mm2
 634
 635         por %mm3, %mm0
 636         por %mm3, %mm2
 637
 638         packuswb        %mm2, %mm0
 639
 640         movq    %mm0, (%edx)    /* store two converted pixels */
 641         addl    $8, %edx
 642
 643 .L04:
 644         /* At this point there can be at most 1 pixel left to process.
 645          * Process it if needed.
 646          */
 647
 648         testl   $0x01, %ecx
 649         je      .L01
 650
 651         movzwl  (%eax), %ecx    /* read exactly one 16-bit pixel; the count in %ecx is no longer needed */
 652         movd    %ecx, %mm4
 653
 654         pshufw  $0x00, %mm4, %mm0
 655
 656         pand    %mm5, %mm0
 657         pmullw  %mm6, %mm0
 658 #if SCALE_ADJUST > 0
 659         psrlw   $SCALE_ADJUST, %mm0
 660 #endif
 661         pmulhuw %mm7, %mm0
 662
 663         por %mm3, %mm0
 664
 665         packuswb        %mm0, %mm0
 666
 667         movd    %mm0, (%edx)    /* store the single converted pixel (4 bytes) */
 668
 669 .L01:
 670 #ifdef USE_INNER_EMMS
 671         emms
 672 #endif
 673         ret
 674 #endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
 675
 676 #if defined (__ELF__) && defined (__linux__)
 677         .section .note.GNU-stack,"",%progbits
 678 #endif