Added a few more stubs so that control reaches DestroyDevice().
[mesa.git] / src / mesa / x86 / read_rgba_span_x86.S
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

/* Control flow enforcement support */
#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

        .file "read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        pushl   $0xff00ff00 ;\
        mvins   (%esp), m1 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        pushl   $0x00ff0000 ;\
        mvins   (%esp), m2 ;\
        addl    $32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
        movl    (%ebx), %eax ; \
        addl    $4, %ebx ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
        addl    $4, %ecx

#define DO_ONE_LAST_PIXEL() \
        movl    (%ebx), %eax ; \
        bswap   %eax          /* ARGB -> BGRA */ ; \
        rorl    $8, %eax      /* BGRA -> ABGR */ ; \
        movl    %eax, (%ecx)  /* ABGR -> R, G, B, A */ ;

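/* For reference, a C sketch of what DO_ONE_PIXEL() computes for a single
 * 32-bit pixel.  This is an illustrative model only (it is not part of the
 * build); __builtin_bswap32 is the GCC builtin equivalent of bswap:
 *
 *     #include <stdint.h>
 *
 *     static uint32_t bgra8888_rev_to_rgba(uint32_t p)
 *     {
 *         p = __builtin_bswap32(p);     // bswap:   ARGB -> BGRA
 *         return (p >> 8) | (p << 24);  // rorl $8: BGRA -> ABGR
 *     }
 *
 * Stored little-endian, the resulting dword lands in memory as the bytes
 * R, G, B, A.
 */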

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
        .type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
        _CET_ENDBR
        pushl   %ebx

#ifdef USE_INNER_EMMS
        emms
#endif
        LOAD_MASK(movq,%mm1,%mm2)

        movl    8(%esp), %ebx   /* source pointer */
        movl    16(%esp), %edx  /* number of pixels to copy */
        movl    12(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L20            /* Bail if there's nothing to do. */

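        /* If the source is 4-byte but not 8-byte aligned, convert one pixel
         * with the scalar macro so that the movq loads in the loop below
         * start on an 8-byte boundary (this assumes a 4-byte aligned
         * source pointer).
         */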
        movl    %ebx, %eax

        negl    %eax
        sarl    $2, %eax
        andl    $1, %eax
        je      .L17

        subl    %eax, %edx
        DO_ONE_PIXEL()
.L17:

        /* Would it be faster to unroll this loop once and process 4 pixels
         * per pass, instead of just two?
         */

        movl    %edx, %eax
        shrl    %eax
        jmp     .L18
.L19:
        movq    (%ebx), %mm0
        addl    $8, %ebx

        /* These 9 instructions do what PSHUFB (if there were such an
         * instruction) could do in 1. :(
         */
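        /* As a sketch (not part of the build), the equivalent of those 9
         * instructions for one 32-bit pixel in C, using the two masks that
         * LOAD_MASK put in m1 (0xff00ff00) and m2 (0x00ff0000):
         *
         *     #include <stdint.h>
         *
         *     static uint32_t swizzle_bgra_to_rgba(uint32_t p)
         *     {
         *         return (p & 0xff00ff00u)           // A and G stay put
         *              | ((p >> 16) & 0x000000ffu)   // R moves to byte 0
         *              | ((p << 16) & 0x00ff0000u);  // B moves to byte 2
         *     }
         *
         * The MMX code below does the same thing on two pixels at a time
         * with pand/psllq/psrlq/por.
         */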

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
        subl    $1, %eax
.L18:
        jne     .L19

#ifdef USE_INNER_EMMS
        emms
#endif

        /* At this point there are either 1 or 0 pixels remaining to be
         * converted.  Convert the last pixel, if needed.
         */

        testl   $1, %edx
        je      .L20

        DO_ONE_LAST_PIXEL()

.L20:
        popl    %ebx
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx
        pushl   %ebp

#ifdef USE_INNER_EMMS
        emms
#endif

        LOAD_MASK(movq,%mm1,%mm2)

        movl    16(%esp), %ebx  /* source pointer */
        movl    24(%esp), %edx  /* number of pixels to copy */
        movl    20(%esp), %ecx  /* destination pointer */

        testl   %edx, %edx
        jle     .L35            /* Bail if there's nothing to do. */

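        /* Carve out a 16-byte-aligned scratch slot below the stack pointer;
         * %ebp keeps the original %esp so it can be restored later.  The
         * main loop stores an SSE register here and reloads it as two MMX
         * registers.
         */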
        movl    %esp, %ebp
        subl    $16, %esp
        andl    $0xfffffff0, %esp

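        /* Count how many leading pixels (0-3) must be handled before the
         * source pointer is 16-byte aligned, roughly (-(uintptr_t)src & 15) / 4
         * in C, clamped to the pixel count.  The one- and two-pixel cases
         * below take care of them; %edx is left with the pixels for the
         * aligned movaps loop.
         */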
        movl    %ebx, %eax
        movl    %edx, %esi

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax
        cmpl    %edx, %eax
        cmovle  %eax, %esi

        subl    %esi, %edx

        testl   $1, %esi
        je      .L32

        DO_ONE_PIXEL()
.L32:

        testl   $2, %esi
        je      .L31

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L31:

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L33
.L34:
        movaps  (%ebx), %xmm0
        addl    $16, %ebx

        /* This would be so much better if we could just move directly from
         * an SSE register to an MMX register.  Unfortunately, that
         * functionality wasn't introduced until SSE2 with the MOVDQ2Q
         * instruction.
         */

        movaps  %xmm0, (%esp)
        movq    (%esp), %mm0
        movq    8(%esp), %mm5

        movq    %mm0, %mm3
        movq    %mm0, %mm4
        movq    %mm5, %mm6
        movq    %mm5, %mm7

        pand    %mm2, %mm3
        pand    %mm2, %mm6

        psllq   $16, %mm4
        psllq   $16, %mm7

        psrlq   $16, %mm3
        psrlq   $16, %mm6

        pand    %mm2, %mm4
        pand    %mm2, %mm7

        pand    %mm1, %mm0
        pand    %mm1, %mm5

        por     %mm4, %mm3
        por     %mm7, %mm6

        por     %mm3, %mm0
        por     %mm6, %mm5

        movq    %mm0, (%ecx)
        movq    %mm5, 8(%ecx)
        addl    $16, %ecx

        subl    $1, %eax
.L33:
        jne     .L34

#ifdef USE_INNER_EMMS
        emms
#endif
        movl    %ebp, %esp

        /* At this point there are either [0, 3] pixels remaining to be
         * converted.
         */

        testl   $2, %edx
        je      .L36

        movq    (%ebx), %mm0
        addl    $8, %ebx

        movq    %mm0, %mm3
        movq    %mm0, %mm4

        pand    %mm2, %mm3
        psllq   $16, %mm4
        psrlq   $16, %mm3
        pand    %mm2, %mm4

        pand    %mm1, %mm0
        por     %mm4, %mm3
        por     %mm3, %mm0

        movq    %mm0, (%ecx)
        addl    $8, %ecx
.L36:

        testl   $1, %edx
        je      .L35

        DO_ONE_LAST_PIXEL()
.L35:
        popl    %ebp
        popl    %ebx
        popl    %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

        .text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
        .type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
        _CET_ENDBR
        pushl   %esi
        pushl   %ebx

        LOAD_MASK(movdqu,%xmm1,%xmm2)

        movl    12(%esp), %ebx  /* source pointer */
        movl    20(%esp), %edx  /* number of pixels to copy */
        movl    16(%esp), %ecx  /* destination pointer */

        movl    %ebx, %eax
        movl    %edx, %esi

        testl   %edx, %edx
        jle     .L46            /* Bail if there's nothing to do. */

        /* If the source pointer isn't a multiple of 16 we have to process
         * a few pixels the "slow" way to get the address aligned for
         * the SSE fetch instructions.
         */
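        /* In C terms, that number of leading "slow" pixels is roughly
         * (-(uintptr_t)src & 15) / 4, clamped to the total pixel count; it
         * ends up in %esi, and %edx is reduced to the pixels handled by the
         * aligned movdqa loop and the tail code.
         */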

        negl    %eax
        andl    $15, %eax
        sarl    $2, %eax

        cmpl    %edx, %eax
        cmovbe  %eax, %esi
        subl    %esi, %edx

        testl   $1, %esi
        je      .L41

        DO_ONE_PIXEL()
.L41:
        testl   $2, %esi
        je      .L40

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L40:

        /* Would it be worth having a specialized version of this loop for
         * the case where the destination is 16-byte aligned?  That version
         * would be identical except that it could use movdqa instead of
         * movdqu.
         */

        movl    %edx, %eax
        shrl    $2, %eax
        jmp     .L42
.L43:
        movdqa  (%ebx), %xmm0
        addl    $16, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movdqu  %xmm0, (%ecx)
        addl    $16, %ecx
        subl    $1, %eax
.L42:
        jne     .L43


        /* There may be up to 3 pixels remaining to be copied.  Take care
         * of them now.  We do the 2 pixel case first because the data
         * will be aligned.
         */

        testl   $2, %edx
        je      .L47

        movq    (%ebx), %xmm0
        addl    $8, %ebx

        movdqa  %xmm0, %xmm3
        movdqa  %xmm0, %xmm4
        andps   %xmm1, %xmm0

        andps   %xmm2, %xmm3
        pslldq  $2, %xmm4
        psrldq  $2, %xmm3
        andps   %xmm2, %xmm4

        orps    %xmm4, %xmm3
        orps    %xmm3, %xmm0

        movq    %xmm0, (%ecx)
        addl    $8, %ecx
.L47:

        testl   $1, %edx
        je      .L46

        DO_ONE_LAST_PIXEL()
.L46:

        popl    %ebx
        popl    %esi
        ret
        .size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L      0x07e0f800
#define MASK_565_H      0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST    5
#if SCALE_ADJUST == 5
#define PRESCALE_L      0x00100001
#define PRESCALE_H      0x00000200
#define SCALE_L         0x40C620E8
#define SCALE_H         0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L      0x00200001
#define PRESCALE_H      0x00000800
#define SCALE_L         0x01040108
#define SCALE_H         0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L         0x00000000
#define ALPHA_H         0x00ff0000
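
/* For reference, a C sketch of the per-component data flow the MMX code
 * below implements with pmullw/psrlw/pmulhuw, shown for the red channel
 * with SCALE_ADJUST == 5.  This is an illustrative model only, not part
 * of the build:
 *
 *     #include <stdint.h>
 *
 *     static uint8_t expand_red_565(uint16_t pixel)
 *     {
 *         uint32_t v = pixel & 0xf800;           // word 0 of the 565 mask
 *         v = (v * 0x0001) & 0xffff;             // pmullw, word 0 of PRESCALE
 *         v >>= 5;                               // psrlw $SCALE_ADJUST
 *         return (uint8_t)((v * 0x20E8) >> 16);  // pmulhuw, word 0 of SCALE
 *     }
 *
 * Green and blue follow the same pattern with their own mask, prescale,
 * and scale words; the alpha word is simply OR'd with 0x00ff.
 */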

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

        .text
        .globl _generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
        .type _generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:
        _CET_ENDBR
#ifdef USE_INNER_EMMS
        emms
#endif

        movl    4(%esp), %eax   /* source pointer */
        movl    8(%esp), %edx   /* destination pointer */
        movl    12(%esp), %ecx  /* number of pixels to copy */

        pushl   $MASK_565_H
        pushl   $MASK_565_L
        movq    (%esp), %mm5
        pushl   $PRESCALE_H
        pushl   $PRESCALE_L
        movq    (%esp), %mm6
        pushl   $SCALE_H
        pushl   $SCALE_L
        movq    (%esp), %mm7
        pushl   $ALPHA_H
        pushl   $ALPHA_L
        movq    (%esp), %mm3
        addl    $32, %esp

        sarl    $2, %ecx
        jl      .L01            /* Bail early if the count is negative. */
        jmp     .L02

.L03:
        /* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
         * second pixels into the four words of %mm0 and %mm2.
         */

        movq    (%eax), %mm4
        addl    $8, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2


        /* Mask the pixels so that each word of each register contains only
         * one color component.
         */

        pand    %mm5, %mm0
        pand    %mm5, %mm2


        /* Adjust the component values so that they are as small as possible,
         * but large enough so that we can multiply them by an unsigned 16-bit
         * number and get a value as large as 0x00ff0000.
         */

        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif

        /* Scale the input component values to be on the range
         * [0, 0x00ff0000].  This is the real magic of the whole routine.
         */
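        /* Worked example for red with SCALE_ADJUST == 5: the prescaled
         * value for R = 0x1f is 0x1f << 6 = 1984, and 1984 * 0x20E8 =
         * 0x00ff0600, so pmulhuw (which keeps the high 16 bits of the
         * product) yields 0x00ff.
         */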

        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2


        /* Always set the alpha value to 0xff.
         */

        por     %mm3, %mm0
        por     %mm3, %mm2


        /* Pack the 16-bit values to 8-bit values and store the converted
         * pixel data.
         */

        packuswb %mm2, %mm0
        movq    %mm0, (%edx)
        addl    $8, %edx

        pshufw  $0xaa, %mm4, %mm0
        pshufw  $0xff, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

        subl    $1, %ecx
.L02:
        jne     .L03


        /* At this point there can be at most 3 pixels left to process.  If
         * there are either 2 or 3 left, process 2.
         */

        movl    12(%esp), %ecx
        testl   $0x02, %ecx
        je      .L04

        movd    (%eax), %mm4
        addl    $4, %eax

        pshufw  $0x00, %mm4, %mm0
        pshufw  $0x55, %mm4, %mm2

        pand    %mm5, %mm0
        pand    %mm5, %mm2
        pmullw  %mm6, %mm0
        pmullw  %mm6, %mm2
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
        psrlw   $SCALE_ADJUST, %mm2
#endif
        pmulhuw %mm7, %mm0
        pmulhuw %mm7, %mm2

        por     %mm3, %mm0
        por     %mm3, %mm2

        packuswb %mm2, %mm0

        movq    %mm0, (%edx)
        addl    $8, %edx

.L04:
        /* At this point there can be at most 1 pixel left to process.
         * Process it if needed.
         */

        testl   $0x01, %ecx
        je      .L01

        movzwl  (%eax), %ecx
        movd    %ecx, %mm4

        pshufw  $0x00, %mm4, %mm0

        pand    %mm5, %mm0
        pmullw  %mm6, %mm0
#if SCALE_ADJUST > 0
        psrlw   $SCALE_ADJUST, %mm0
#endif
        pmulhuw %mm7, %mm0

        por     %mm3, %mm0

        packuswb %mm0, %mm0

        movd    %mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
        emms
#endif
        ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
        .section .note.GNU-stack,"",%progbits
#endif