src/gallium/drivers/nvfx/nv04_2d.c

   1 /**************************************************************************
   2  *
   3  * Copyright 2009 Ben Skeggs
   4  * Copyright 2009 Younes Manton
   5  * Copyright 2010 Luca Barbieri
   6  * All Rights Reserved.
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining
   9  * a copy of this software and associated documentation files (the
  10  * "Software"), to deal in the Software without restriction, including
  11  * without limitation the rights to use, copy, modify, merge, publish,
  12  * distribute, sub license, and/or sell copies of the Software, and to
  13  * permit persons to whom the Software is furnished to do so, subject to
  14  * the following conditions:
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  17  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
  18  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  19  * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
  20  * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  22  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  23  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  24  *
  25  * The above copyright notice and this permission notice (including the
  26  * next paragraph) shall be included in all copies or substantial portions
  27  * of the Software.
  28  *
  29  **************************************************************************/
  30
  31 /* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */
  32
  33 #include <stdlib.h>
  34 #include <stdio.h>
  35 #include <stdint.h>
  36 #include <nouveau/nouveau_device.h>
  37 #include <nouveau/nouveau_pushbuf.h>
  38 #include <nouveau/nouveau_channel.h>
  39 #include <nouveau/nouveau_bo.h>
  40 #include <nouveau/nouveau_notifier.h>
  41 #include <nouveau/nouveau_grobj.h>
  42 #include "nv04_2d.h"
  43
  44 #include "nouveau/nv_object.xml.h"
  45 #include "nouveau/nv_m2mf.xml.h"
  46 #include "nv01_2d.xml.h"
  47
  48 /* avoid depending on Mesa/Gallium */
  49 #ifdef __GNUC__
  50 #define likely(x) __builtin_expect(!!(x), 1)
  51 #define unlikely(x) __builtin_expect(!!(x), 0)
  52 #else
  53 #define likely(x) !!(x)
  54 #define unlikely(x) !!(x)
  55 #endif
  56
  57 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
  58 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
  59
  60 struct nv04_2d_context
  61 {
  62         struct nouveau_notifier *ntfy;
  63         struct nouveau_grobj *surf2d;
  64         struct nouveau_grobj *swzsurf;
  65         struct nouveau_grobj *m2mf;
  66         struct nouveau_grobj *rect;
  67         struct nouveau_grobj *sifm;
  68         struct nouveau_grobj *blit;
  69 };
  70
  71 static inline int
  72 align(int value, int alignment)
  73 {
  74    return (value + alignment - 1) & ~(alignment - 1);
  75 }
  76
  77 static inline int
  78 util_is_pot(unsigned x)
  79 {
  80    return (x & (x - 1)) == 0;
  81 }
  82
  83 /* Integer base-2 logarithm, rounded towards zero. */
  84 static inline unsigned log2i(unsigned i)
  85 {
  86         unsigned r = 0;
  87
  88         if (i & 0xffff0000) {
  89                 i >>= 16;
  90                 r += 16;
  91         }
  92         if (i & 0x0000ff00) {
  93                 i >>= 8;
  94                 r += 8;
  95         }
  96         if (i & 0x000000f0) {
  97                 i >>= 4;
  98                 r += 4;
  99         }
 100         if (i & 0x0000000c) {
 101                 i >>= 2;
 102                 r += 2;
 103         }
 104         if (i & 0x00000002) {
 105                 r += 1;
 106         }
 107         return r;
 108 }
 109
 110 //#define NV04_REGION_DEBUG
 111
 112 // Yes, we really want to inline everything, since all the functions are used only once
 113 #if defined(__GNUC__) && !defined(DEBUG)
 114 #define inline __attribute__((always_inline)) inline
 115 #endif
 116
 117 static inline unsigned
 118 nv04_swizzle_bits_square(unsigned x, unsigned y)
 119 {
 120         unsigned u = (x & 0x001) << 0 |
 121                      (x & 0x002) << 1 |
 122                      (x & 0x004) << 2 |
 123                      (x & 0x008) << 3 |
 124                      (x & 0x010) << 4 |
 125                      (x & 0x020) << 5 |
 126                      (x & 0x040) << 6 |
 127                      (x & 0x080) << 7 |
 128                      (x & 0x100) << 8 |
 129                      (x & 0x200) << 9 |
 130                      (x & 0x400) << 10 |
 131                      (x & 0x800) << 11;
 132
 133         unsigned v = (y & 0x001) << 1 |
 134                      (y & 0x002) << 2 |
 135                      (y & 0x004) << 3 |
 136                      (y & 0x008) << 4 |
 137                      (y & 0x010) << 5 |
 138                      (y & 0x020) << 6 |
 139                      (y & 0x040) << 7 |
 140                      (y & 0x080) << 8 |
 141                      (y & 0x100) << 9 |
 142                      (y & 0x200) << 10 |
 143                      (y & 0x400) << 11 |
 144                      (y & 0x800) << 12;
 145         return v | u;
 146 }
 147
 148 /* rectangular swizzled textures are linear concatenations of swizzled square tiles */
 149 static inline unsigned
 150 nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h)
 151 {
 152         if(h <= 1)
 153                 return x;
 154         else
 155         {
 156                 unsigned s = MIN2(w, h);
 157                 unsigned m = s - 1;
 158                 return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m);
 159         }
 160 }
 161
 162 // general 3D texture case
 163 static inline unsigned
 164 nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d)
 165 {
 166         if(d <= 1)
 167                 return nv04_swizzle_bits_2d(x, y, w, h);
 168         else
 169         {
 170                 // TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call
 171                 unsigned v = 0;
 172                 w >>= 1;
 173                 h >>= 1;
 174                 d >>= 1;
 175                 for(int i = 0;;)
 176                 {
 177                         int oldi = i;
 178                         if(likely(w))
 179                         {
 180                                 v |= (x & 1) << i;
 181                                 x >>= 1;
 182                                 w >>= 1;
 183                                 ++i;
 184                         }
 185
 186                         if(likely(h))
 187                         {
 188                                 v |= (y & 1) << i;
 189                                 y >>= 1;
 190                                 h >>= 1;
 191                                 ++i;
 192                         }
 193
 194                         if(likely(d))
 195                         {
 196                                 v |= (z & 1) << i;
 197                                 z >>= 1;
 198                                 d >>= 1;
 199                                 ++i;
 200                         }
 201
 202                         if(i == oldi)
 203                                 break;
 204                 }
 205                 return v;
 206         }
 207 }
 208
 209 unsigned
 210 nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h)
 211 {
 212         if(rgn->pitch)
 213                 return rgn->pitch * rgn->y + (rgn->x << rgn->bpps);
 214         else
 215                 return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps;
 216 }
 217
 218 unsigned
 219 nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h)
 220 {
 221         if(rgn->pitch)
 222                 return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps);
 223         else
 224                 return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps;
 225 }
 226
 227 // *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations
 228 // returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled
 229 /* *pitch == -1 ret = 0 -> 3D swizzled subrect
 230  * *pitch == 0 ret = 0 -> 2D swizzled subrect
 231  * *pitch > 0 ret = 0 -> linear subrect
 232  * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside
 233  */
 234
 235 static inline void
 236 nv04_region_print(struct nv04_region* rgn)
 237 {
 238         fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset);
 239         if(rgn->pitch)
 240                 fprintf(stderr, "lin %i", rgn->pitch);
 241         else
 242                 fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d);
 243         fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z);
 244 }
 245
 246 static inline void
 247 nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h)
 248 {
 249         unsigned end = rgn->offset + nv04_region_end(rgn, w, h);
 250
 251         assert(rgn->offset <= (int)rgn->bo->size);
 252         assert(end <= rgn->bo->size);
 253         (void) end;
 254         if(!rgn->pitch) {
 255                 assert(util_is_pot(rgn->w));
 256                 assert(util_is_pot(rgn->h));
 257         }
 258 }
 259
 260 /* determine if region can be linearized or fake-linearized */
 261 static inline int
 262 nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h)
 263 {
 264         int surf_min;
 265         int rect_min;
 266
 267         if(rgn->pitch)
 268                 return rgn->pitch == w << rgn->bpps;
 269
 270         // redundant, but this is the fast path for the common case
 271         if(w == rgn->w && h == rgn->h && rgn->d <= 1)
 272                 return 1;
 273
 274         // must be POT
 275         if((w & (w - 1)) || (h & (h - 1)))
 276                 return 0;
 277
 278         // must be aligned
 279         if((rgn->x & (w - 1)) || (rgn->y & (h - 1)))
 280                 return 0;
 281
 282         if(rgn->d > 1)
 283                 return 0;
 284
 285         surf_min = MIN2(rgn->w, rgn->h);
 286         rect_min = MIN2(w, h);
 287
 288         if((rect_min == surf_min) || (w == h) || (w == 2 * h))
 289                 return 1;
 290
 291         return 0;
 292 }
 293
 294 // double the pitch until it is larger than the alignment, or the height becomes odd or 1
 295 static inline void
 296 nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align)
 297 {
 298         while(!(*h & 1) && (*w << rgn->bpps) < (1 << align))
 299         {
 300                 *w <<= 1;
 301                 *h >>= 1;
 302         }
 303
 304         while((*w << rgn->bpps) > 16384 && !(*w & 1))
 305         {
 306                 *w >>= 1;
 307                 *h <<= 1;
 308         }
 309
 310 #ifdef NV04_REGION_DEBUG
 311         fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h);
 312 #endif
 313 }
 314
 315 static inline void
 316 nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h)
 317 {
 318         int pos;
 319         if(rgn->pitch)
 320         {
 321                 rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps);
 322                 rgn->x = 0;
 323                 rgn->y = 0;
 324         }
 325         else
 326         {
 327                 rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps;
 328                 pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
 329                 rgn->x = pos & (w - 1);
 330                 rgn->y = pos / w;
 331         }
 332         rgn->pitch = w << rgn->bpps;
 333
 334 #ifdef NV04_REGION_DEBUG
 335         fprintf(stderr, "\tLINEARIZE ");
 336         nv04_region_print(rgn);
 337         fprintf(stderr, "\n");
 338 #endif
 339 }
 340
 341         /* preserve the offset! */
 342         /*
 343         rgn->pitch = util_format_get_stride(rgn->format, w);
 344         int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d);
 345         rgn->x = pos & (w - 1);
 346         rgn->y = pos & ~(w - 1);
 347         */
 348
 349         /*
 350         rgn->offset +=
 351         rgn->pitch = util_format_get_stride(rgn->format, w);
 352         rgn->x = 0;
 353         rgn->y = 0;
 354         */
 355
 356 /* This code will get used for, and always succeed on:
 357  * - 4x2 1bpp swizzled texture mipmap levels
 358  * - linear regions created by linearization
 359  *
 360  * This code will get used for, and MAY work for:
 361  * - misaligned texture blanket
 362  * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky)
 363  *
 364  * The general case requires splitting the region in 2.
 365  */
 366 static inline int
 367 nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
 368 {
 369         if(rgn->pitch > 0)
 370         {
 371                 assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal!
 372
 373                 if(h <= 1)
 374                 {
 375                         int delta;
 376                         rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps);
 377                         delta = rgn->offset & ((1 << shift) - 1);
 378                         rgn->y = 0;
 379                         rgn->x = delta >> rgn->bpps;
 380                         rgn->offset -= delta;
 381                         rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift);
 382                 }
 383                 else
 384                 {
 385                         int delta = rgn->offset & ((1 << shift) - 1);
 386                         int newxo = (rgn->x << rgn->bpps) + delta;
 387                         int dy = newxo / rgn->pitch;
 388                         newxo -= dy * rgn->pitch;
 389                         if((newxo + (w << rgn->bpps)) > rgn->pitch)
 390                         {
 391                                 // TODO: split the region into two rectangles (!) if *really* necessary, unless the hardware actually supports "wrapping" rectangles
 392                                 // this does not happen if the surface is pitch-aligned, which it should always be
 393                                 assert(0);
 394                                 return -1;
 395                         }
 396                         rgn->x = newxo >> rgn->bpps;
 397                         rgn->y += dy;
 398                 }
 399         }
 400         else
 401         {
 402                 int size;
 403                 int min;
 404                 int v;
 405
 406                 // we don't care about the alignment of 3D surfaces since the 2D engine can't use them
 407                 if(rgn->d < 0)
 408                         return -1;
 409
 410                 min = MIN2(rgn->w, rgn->h);
 411                 size = min * min << rgn->bpps;
 412
 413                 // this is unfixable, and should not be happening
 414                 if(rgn->offset & (size - 1))
 415                         return -1;
 416
 417                 v = (rgn->offset & ((1 << shift) - 1)) / size;
 418                 rgn->offset -= v * size;
 419
 420                 if(rgn->h == min)
 421                 {
 422                         unsigned w;
 423                         rgn->x += rgn->h * v;
 424                         w = rgn->w + rgn->h * v;
 425
 426                         while(rgn->w < w)
 427                                 rgn->w += rgn->w;
 428                 }
 429                 else
 430                 {
 431                         unsigned h;
 432                         rgn->y += rgn->w * v;
 433                         h = rgn->h + rgn->w * v;
 434
 435                         while(rgn->h < h)
 436                                 rgn->h += rgn->h;
 437                 }
 438         }
 439
 440 #ifdef NV04_REGION_DEBUG
 441         fprintf(stderr, "\tALIGNED ");
 442         nv04_region_print(rgn);
 443         fprintf(stderr, "\n");
 444 #endif
 445         return 0;
 446 }
 447
 448 // both pitch and shift
 449 // will leave the region unchanged if it fails
 450 static inline int
 451 nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift)
 452 {
 453         if(rgn->pitch & ((1 << shift) - 1))
 454         {
 455                 if(h == 1)
 456                         goto do_align; /* this will fix pitch too in this case */
 457                 else
 458                         return -1;
 459         }
 460
 461         if(rgn->offset & ((1 << shift) - 1))
 462         {
 463                 do_align:
 464                 if(nv04_region_do_align_offset(rgn, w, h, shift))
 465                         return -1;
 466         }
 467         return 0;
 468 }
 469
 470 /* this contains 22 different copy loops after preprocessing. unfortunately, it's necessary */
 471 void
 472 nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h)
 473 {
 474         uint8_t* mdst;
 475         uint8_t* msrc;
 476         int size;
 477
 478         if(dst->bo != src->bo)
 479         {
 480                 nouveau_bo_map(dst->bo, NOUVEAU_BO_WR);
 481                 nouveau_bo_map(src->bo, NOUVEAU_BO_RD);
 482         }
 483         else
 484                 nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD);
 485
 486         mdst = (uint8_t*)dst->bo->map + dst->offset;
 487         msrc = (uint8_t*)src->bo->map + src->offset;
 488
 489         size = w << dst->bpps;
 490
 491         nv04_region_assert(dst, w, h);
 492         nv04_region_assert(src, w, h);
 493
 494 #ifdef NV04_REGION_DEBUG
 495         fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps);
 496         for(int i = 0; i < 2; ++i)
 497         {
 498                 nv04_region_print(i ? src : dst);
 499                 fprintf(stderr, i ? "\n" : " <- ");
 500         }
 501
 502 //      for(int i = 0; i < 16; ++i)
 503 //              fprintf(stderr, "%02x ", msrc[i]);
 504 //      fprintf(stderr, "\n");
 505 #endif
 506
 507         // TODO: support overlapping copies!
 508         if(src->pitch && dst->pitch)
 509         {
 510                 mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
 511                 msrc += src->y * src->pitch + (src->x << src->bpps);
 512                 if(dst->bo != src->bo)
 513                         goto simple;
 514                 else if(mdst < msrc)
 515                 {
 516                         if(mdst + size <= msrc)
 517                         {
 518 simple:
 519                                 for(int iy = 0; iy < h; ++iy)
 520                                 {
 521                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 522                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 523                                         memcpy(mdst, msrc, size);
 524                                         msrc += src->pitch; mdst += dst->pitch;
 525                                 }
 526                         }
 527                         else
 528                         {
 529                                 for(int iy = 0; iy < h; ++iy)
 530                                 {
 531                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 532                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 533                                         memmove(mdst, msrc, size);
 534                                         msrc += src->pitch; mdst += dst->pitch;
 535                                 }
 536                         }
 537                 }
 538                 else
 539                 {
 540                         /* copy backwards so we don't destroy data we have to read yet */
 541                         if(msrc + size <= mdst)
 542                         {
 543                                 for(int iy = h - 1; iy >= 0; --iy)
 544                                 {
 545                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 546                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 547                                         memcpy(mdst, msrc, size);
 548                                         msrc += src->pitch; mdst += dst->pitch;
 549                                 }
 550                         }
 551                         else
 552                         {
 553                                 for(int iy = h - 1; iy >= 0; --iy)
 554                                 {
 555                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 556                                         assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size);
 557                                         memmove(mdst, msrc, size);
 558                                         msrc += src->pitch; mdst += dst->pitch;
 559                                 }
 560                         }
 561                 }
 562         }
 563         else
 564         {
 565                 int* dswx = NULL;
 566                 int* dswy = NULL;
 567                 int* sswx = NULL;
 568                 int* sswy = NULL;
 569                 int dir;
 570
 571                 if(!dst->pitch)
 572                 {
 573                         dswx = alloca(w * sizeof(int));
 574                         for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both
 575                                 dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d);
 576                         dswy = alloca(h * sizeof(int));
 577                         for(int iy = 0; iy < h; ++iy)
 578                                 dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
 579                 }
 580
 581                 if(!src->pitch)
 582                 {
 583                         sswx = alloca(w * sizeof(int));
 584                         for(int ix = 0; ix < w; ++ix)
 585                                 sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d);
 586                         sswy = alloca(h * sizeof(int));
 587                         for(int iy = 0; iy < h; ++iy)
 588                                 sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d);
 589                 }
 590
 591                 dir = 1;
 592                 /* do backwards copies for overlapping swizzled surfaces */
 593                 if(dst->pitch == src->pitch && dst->offset == src->offset)
 594                 {
 595                         if(dst->y > src->y || (dst->y == src->y && dst->x > src->x))
 596                                 dir = -1;
 597                 }
 598
 599 #define SWIZZLED_COPY_LOOPS
 600                 if(dir == 1)
 601                 {
 602                         int dir = 1;
 603 #define LOOP_Y for(int iy = 0; iy < h; ++iy)
 604 #define LOOP_X for(int ix = 0; ix < w; ++ix)
 605 #include "nv04_2d_loops.h"
 606 #undef LOOP_X
 607 #undef LOOP_Y
 608                 }
 609                 else
 610                 {
 611                         int dir = -1;
 612 #define LOOP_Y for(int iy = h - 1; iy >= 0; --iy)
 613 #define LOOP_X for(int ix = w - 1; ix >= 0; --ix)
 614 #include "nv04_2d_loops.h"
 615 #undef LOOP_X
 616 #undef LOOP_Y
 617                 }
 618 #undef SWIZZLED_COPY_LOOP
 619         }
 620
 621         if(src->bo != dst->bo)
 622                 nouveau_bo_unmap(src->bo);
 623         nouveau_bo_unmap(dst->bo);
 624 }
 625
 626 /* TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail
 627  * the alternative is to read, modify and copy back, which may or may not be faster
 628  * loading 3D textures is a common case that hits this and could probably benefit from the temporary
 629  */
 630 void
 631 nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value)
 632 {
 633         uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), (uint8_t*)dst->bo->map + dst->offset);
 634
 635 #ifdef NV04_REGION_DEBUG
 636         fprintf(stderr, "\tRGN_FILL_CPU ");
 637         nv04_region_print(dst);
 638         fprintf(stderr, "\n");
 639 #endif
 640
 641         nv04_region_assert(dst, w, h);
 642
 643         if(dst->pitch)
 644         {
 645                 unsigned size = w << dst->bpps;
 646
 647 #define FILL(T) do { \
 648                         for(int iy = 0; iy < h; ++iy) \
 649                         { \
 650                                 assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \
 651                                 for(int ix = 0; ix < w; ++ix) \
 652                                         ((T*)mdst)[ix] = (T)value; \
 653                                 mdst += dst->pitch; \
 654                         } \
 655                 } while(0)
 656
 657                 mdst += dst->y * dst->pitch + (dst->x << dst->bpps);
 658
 659                 if(dst->bpps == 0)
 660                 {
 661 ms:
 662                         assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size);
 663                         if(size == dst->pitch)
 664                                 memset(mdst, (uint8_t)value, size * h);
 665                         else
 666                         {
 667                                 for(int iy = 0; iy < h; ++iy)
 668                                 {
 669                                         assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size);
 670                                         memset(mdst, (uint8_t)value, size);
 671                                         mdst += dst->pitch;
 672                                 }
 673                         }
 674                 }
 675                 else if(dst->bpps == 1)
 676                 {
 677                         if(!((uint8_t)value ^ (uint8_t)(value >> 8)))
 678                                 goto ms;
 679
 680                         FILL(uint16_t);
 681                 }
 682                 else if(dst->bpps == 2)
 683                 {
 684                         if(value == (uint8_t)value * 0x1010101)
 685                                 goto ms;
 686                         FILL(uint32_t);
 687                 }
 688                 else
 689                         assert(0);
 690 #undef FILL
 691         }
 692         else
 693         {
 694                 int* dswx;
 695                 int* dswy;
 696
 697                 dswx = alloca(w * sizeof(int));
 698                 for(int ix = 0; ix < w; ++ix)
 699                         dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, dst->z, dst->w, dst->h, dst->d);
 700                 dswy = alloca(h * sizeof(int));
 701                 for(int iy = 0; iy < h; ++iy)
 702                         dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d);
 703
 704 #define FILL(T) do { \
 705                         T tvalue = (T)value; \
 706                         for(int iy = 0; iy < h; ++iy) \
 707                         { \
 708                                 T* pdst = (T*)mdst + dswy[iy]; \
 709                                 for(int ix = 0; ix < w; ++ix) \
 710                                 { \
 711                                         assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map + dst->bo->size); \
 712                                         pdst[dswx[ix]] = tvalue; \
 713                                 } \
 714                         } \
 715                 } while(0)
 716
 717                 if(dst->bpps == 0)
 718                         FILL(uint8_t);
 719                 else if(dst->bpps == 1)
 720                         FILL(uint16_t);
 721                 else if(dst->bpps == 2)
 722                         FILL(uint32_t);
 723                 else
 724                         assert(0 && "unhandled bpp");
 725 #undef FILL
 726         }
 727
 728         nouveau_bo_unmap(dst->bo);
 729 }
 730
 731 static inline int
 732 nv04_region_cs2d_format(struct nv04_region* rgn)
 733 {
 734         switch(rgn->bpps) {
 735         case 0:
 736                 return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
 737         case 1:
 738                 if(rgn->one_bits >= 1)
 739                         return NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_X1R5G5B5;
 740                 else
 741                         return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
 742         case 2:
 743                 if(rgn->one_bits >= 8)
 744                         return NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_X8R8G8B8;
 745                 else
 746                         return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
 747         default:
 748                 return -1;
 749         }
 750 }
 751
 752 static inline int
 753 nv04_region_sifm_format(struct nv04_region* rgn)
 754 {
 755         switch(rgn->bpps) {
 756         case 0:
 757                 return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
 758         case 1:
 759                 if(rgn->one_bits >= 1)
 760                         return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5;
 761                 else
 762                         return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
 763         case 2:
 764                 if(rgn->one_bits >= 8)
 765                         return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
 766                 else
 767                         return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
 768         default:
 769                 return -1;
 770         }
 771 }
 772 static void
 773 nv04_region_copy_swizzle(struct nv04_2d_context *ctx,
 774                           struct nv04_region* dst,
 775                           struct nv04_region* src,
 776                           int w, int h)
 777 {
 778         struct nouveau_channel *chan = ctx->swzsurf->channel;
 779         struct nouveau_grobj *swzsurf = ctx->swzsurf;
 780         struct nouveau_grobj *sifm = ctx->sifm;
 781         int cs2d_format = nv04_region_cs2d_format(dst);
 782         int sifm_format = nv04_region_sifm_format(src);
 783         /* Max width & height may not be the same on all HW, but must be POT */
 784         unsigned max_shift = 10;
 785         unsigned cw = 1 << max_shift;
 786         unsigned ch = 1 << max_shift;
 787         unsigned sx = dst->x >> max_shift;
 788         unsigned sy = dst->y >> max_shift;
 789         unsigned ex = (dst->x + w - 1) >> max_shift;
 790         unsigned ey = (dst->y + h - 1) >> max_shift;
 791         unsigned chunks = (ex - sx + 1) * (ey - sy + 1);
 792         unsigned chunk_size;
 793         if(dst->w < cw)
 794                 cw = dst->w;
 795         if(dst->h < ch)
 796                 ch = dst->h;
 797         chunk_size = cw * ch << dst->bpps;
 798
 799 #ifdef NV04_REGION_DEBUG
 800         fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps);
 801         for(int i = 0; i < 2; ++i)
 802         {
 803                 nv04_region_print(i ? src : dst);
 804                 fprintf(stderr, i ? "\n" : " <- ");
 805         }
 806 #endif
 807
 808         nv04_region_assert(dst, w, h);
 809         nv04_region_assert(src, w, h);
 810
 811         MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2);
 812
 813         BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
 814         OUT_RELOCo(chan, dst->bo,
 815                         NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 816
 817         BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
 818         OUT_RING  (chan, cs2d_format |
 819                          log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U__SHIFT |
 820                          log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V__SHIFT);
 821
 822         BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
 823         OUT_RELOCo(chan, src->bo,
 824                          NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 825         BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
 826         OUT_RING  (chan, swzsurf->handle);
 827
 828         assert(!(dst->offset & 63));
 829
 830         for (int cy = sy; cy <= ey; ++cy) {
 831           int ry = MAX2(0, (int)(dst->y - ch * cy));
 832           int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry;
 833           for (int cx = sx; cx <= ex; ++cx) {
 834             int rx = MAX2(0, (int)(dst->x - cw * cx));
 835             int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx;
 836             unsigned dst_offset;
 837             unsigned src_offset;
 838
 839             BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
 840
 841             dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps);
 842             assert(dst_offset <= dst->bo->size);
 843             assert(dst_offset + chunk_size <= dst->bo->size);
 844             OUT_RELOCl(chan, dst->bo, dst_offset,
 845                             NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 846
 847             BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 848             OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 849             OUT_RING  (chan, sifm_format);
 850             OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
 851             OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y__SHIFT));
 852             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H__SHIFT | rw);
 853             OUT_RING  (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y__SHIFT));
 854             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H__SHIFT | rw);
 855             OUT_RING  (chan, 1 << 20);
 856             OUT_RING  (chan, 1 << 20);
 857
 858             BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
 859             OUT_RING  (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H__SHIFT | align(rw, 8));
 860             OUT_RING  (chan, src->pitch |
 861                              NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
 862                              NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
 863             src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps);
 864             assert(src_offset <= src->bo->size);
 865             assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size);
 866             OUT_RELOCl(chan, src->bo, src_offset,
 867                              NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 868             OUT_RING  (chan, 0);
 869           }
 870         }
 871 }
 872
 873 static inline void
 874 nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands)
 875 {
 876         struct nouveau_channel *chan = ctx->m2mf->channel;
 877         struct nouveau_grobj *m2mf = ctx->m2mf;
 878         MARK_RING (chan, 3 + commands * 9, 2 + commands * 2);
 879         BEGIN_RING(chan, m2mf, NV04_M2MF_DMA_BUFFER_IN, 2);
 880         OUT_RELOCo(chan, srcbo,
 881                    NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
 882         OUT_RELOCo(chan, dstbo,
 883                    NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 884 }
 885
 886 static inline void
 887 nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines)
 888 {
 889         struct nouveau_channel *chan = ctx->m2mf->channel;
 890         struct nouveau_grobj *m2mf = ctx->m2mf;
 891
 892 #ifdef NV04_REGION_DEBUG
 893         fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch);
 894 #endif
 895
 896         BEGIN_RING(chan, m2mf, NV04_M2MF_OFFSET_IN, 8);
 897         OUT_RELOCl(chan, srcbo, *psrcoff,
 898                    NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 899         OUT_RELOCl(chan, dstbo, *pdstoff,
 900                    NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
 901         OUT_RING  (chan, srcpitch);
 902         OUT_RING  (chan, dstpitch);
 903         OUT_RING  (chan, size);
 904         OUT_RING  (chan, lines);
 905         OUT_RING  (chan, 0x0101);
 906         OUT_RING  (chan, 0);
 907
 908         *psrcoff += srcpitch * lines;
 909         *pdstoff += dstpitch * lines;
 910 }
 911
 912 static void
 913 nv04_copy_m2mf(struct nv04_2d_context *ctx,
 914                 struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch,
 915                 struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch,
 916                 unsigned size, unsigned h)
 917 {
 918         unsigned max_pitch = 32767;
 919         unsigned max_lines = 2047;
 920
 921 #ifdef NV04_REGION_DEBUG
 922         fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch);
 923 #endif
 924
 925         if(srcpitch <= max_pitch && dstpitch <= max_pitch)
 926         {
 927                 unsigned full_pages = h / max_lines;
 928                 unsigned leftover_lines = h - full_pages * max_lines;
 929
 930                 nv04_copy_m2mf_begin(ctx, dstbo, srcbo, full_pages + !!leftover_lines);
 931
 932                 for(unsigned i = 0; i < full_pages; ++i)
 933                         nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, max_lines);
 934
 935                 if(leftover_lines)
 936                         nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, leftover_lines);
 937         }
 938         else
 939         {
 940                 unsigned lines = size / max_pitch;
 941                 unsigned leftover = size - lines * max_pitch;
 942                 unsigned full_pages = lines / max_lines;
 943                 unsigned leftover_lines = lines - full_pages * max_lines;
 944                 unsigned srcgap = srcpitch - size;
 945                 unsigned dstgap = dstpitch - size;
 946
 947                 nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * (full_pages + !!leftover_lines + !!leftover));
 948
 949                 for(unsigned i = 0; i < h; ++i)
 950                 {
 951                         for(unsigned j = 0; j < full_pages; ++j)
 952                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, max_lines);
 953
 954                         if(leftover_lines)
 955                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, leftover_lines);
 956
 957                         if(leftover)
 958                                 nv04_copy_m2mf_body(ctx, dstbo, &dstoff, leftover, srcbo, &srcoff, leftover, leftover, 1);
 959
 960                         srcoff += srcgap;
 961                         dstoff += dstgap;
 962                 }
 963         }
 964 }
 965
 966 void
 967 nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size)
 968 {
 969 #ifdef NV04_REGION_DEBUG
 970         fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff);
 971 #endif
 972
 973         nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1);
 974 }
 975
 976 static void
 977 nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h)
 978 {
 979 #ifdef NV04_REGION_DEBUG
 980         fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps);
 981         for(int i = 0; i < 2; ++i)
 982         {
 983                 nv04_region_print(i ? src : dst);
 984                 fprintf(stderr, i ? "\n" : " <- ");
 985         }
 986 #endif
 987
 988         nv04_region_assert(dst, w, h);
 989         nv04_region_assert(src, w, h);
 990         assert(src->pitch);
 991         assert(dst->pitch);
 992
 993         nv04_copy_m2mf(ctx,
 994                         dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch,
 995                         src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch,
 996                         w << src->bpps, h);
 997 }
 998
 999 static inline void
1000 nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h)
1001 {
1002         struct nouveau_channel *chan = ctx->surf2d->channel;
1003         struct nouveau_grobj *surf2d = ctx->surf2d;
1004         struct nouveau_grobj *blit = ctx->blit;
1005         int cs2d_format = nv04_region_cs2d_format(dst);
1006
1007 #ifdef NV04_REGION_DEBUG
1008         fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps);
1009         for(int i = 0; i < 2; ++i)
1010         {
1011                 nv04_region_print(i ? src : dst);
1012                 fprintf(stderr, i ? "\n" : " <- ");
1013         }
1014 #endif
1015
1016         assert(!(src->pitch & 63) && src->pitch);
1017         assert(!(dst->pitch & 63) && dst->pitch);
1018         nv04_region_assert(dst, w, h);
1019         nv04_region_assert(src, w, h);
1020
1021         MARK_RING (chan, 12, 4);
1022         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
1023         OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
1024         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1025         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
1026         OUT_RING  (chan, cs2d_format);
1027         OUT_RING  (chan, (dst->pitch << 16) | src->pitch);
1028         OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
1029         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1030
1031         BEGIN_RING(chan, blit, 0x0300, 3);
1032         OUT_RING  (chan, (src->y << 16) | src->x);
1033         OUT_RING  (chan, (dst->y << 16) | dst->x);
1034         OUT_RING  (chan, ( h << 16) |  w);
1035 }
1036
/* THEOREM: a non-linearizable swizzled destination is always 64-byte aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces
 * HYPOTHESIS:
 * 1. The first mipmap level is 64-byte-aligned
 * PROOF:
 * 1. Thus, all mipmap levels whose parent is 64 bytes or more in size are 64-byte-aligned as well.
 * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or of size <= 8, thus 4x2, 2x2 or 2x4
 * 3. Nx1, 1xN, 2x4 and 2x2 have all subrects linearizable. 4x2 does not.
 * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2
 *
 * However, nv04_region_align handles that.
 */
1048
1049 // 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU
1050 // dst and src may be modified, and the possibly modified version should be passed to nv04_region_cpu if necessary
1051 int
1052 nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src,
1053                 int w, int h, int dst_to_gpu, int src_on_gpu)
1054 {
1055         assert(src->bpps == dst->bpps);
1056
1057 #ifdef NV04_REGION_DEBUG
1058         fprintf(stderr, "RGN_COPY [%i, %i: %i] ", w, h, dst->bpps);
1059         for(int i = 0; i < 2; ++i)
1060         {
1061                 int gpu = i ? src_on_gpu : dst_to_gpu;
1062                 nv04_region_print(i ? src : dst);
1063                 fprintf(stderr, " %s", gpu ? "gpu" : "cpu");
1064                 fprintf(stderr, i ? "\n" : " <- ");
1065         }
1066 #endif
1067
1068         // if they are contiguous and either both swizzled or both linear, reshape
1069         if(!dst->pitch == !src->pitch
1070                 && nv04_region_is_contiguous(dst, w, h)
1071                 && nv04_region_is_contiguous(src, w, h))
1072         {
1073                 nv04_region_contiguous_shape(dst, &w, &h, 6);
1074                 nv04_region_linearize_contiguous(dst, w, h);
1075                 nv04_region_linearize_contiguous(src, w, h);
1076         }
1077
1078 #ifdef NV04_REGION_DEBUG
1079         fprintf(stderr, "\tOPT ");
1080         for(int i = 0; i < 2; ++i)
1081         {
1082                 nv04_region_print(i ? src : dst);
1083                 fprintf(stderr, i ? "\n" : " <- ");
1084         }
1085 #endif
1086
1087         /* if the destination is not for GPU _and_ source is on CPU, use CPU */
1088         /* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */
1089         /* TODO: benchmark to find out in which cases exactly we should prefer the CPU */
1090          if((!dst_to_gpu && !src_on_gpu)
1091                 || (!dst->pitch && dst->d > 1)
1092                 /* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */
1093          )
1094                  return -1;
1095         /* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine
1096          * ask the caller to use the 3D engine
1097          * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so
1098          */
1099          else if(!src->pitch)
1100          {
1101 #ifdef NV04_REGION_DEBUG
1102                 fprintf(stderr, "\tCOPY_ENG3D\n");
1103 #endif
1104                  return 1;
1105          }
1106         /* Setup transfer to swizzle the texture to vram if needed */
1107         else
1108         {
1109                 if (!dst->pitch)
1110                 {
1111                         if(!dst_to_gpu)
1112                         {
1113 #ifdef NV04_REGION_DEBUG
1114                                 fprintf(stderr, "\tCOPY_ENG3D\n");
1115 #endif
1116                                 return 1;
1117                         }
1118                         else
1119                         {
1120                                 assert(!nv04_region_align(dst, w, h, 6));
1121
1122                                 nv04_region_copy_swizzle(ctx, dst, src, w, h);
1123                                 return 0;
1124                         }
1125                 }
1126                 else
1127                 {
1128                         /* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
1129                          * to NV_M2MF in this case.
1130                          * TODO: is this also true for the source? possibly not
1131                          * TODO: should we just always use m2mf?
1132                          * TODO: if not, add support for multiple operations to copy_blit
1133                          */
1134
1135                         if (!dst_to_gpu
1136                                 || w > 2047
1137                                 || h > 2047
1138                                 || (w & 1)
1139                                 || nv04_region_align(src, w, h, 6)
1140                                 || nv04_region_align(dst, w, h, 6)
1141                                 )
1142                                 nv04_region_copy_m2mf(ctx, dst, src, w, h);
1143                         else
1144                                 nv04_region_copy_blit(ctx, dst, src, w, h);
1145
1146                         return 0;
1147                 }
1148         }
1149 }
1150
1151 static inline void
1152 nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value)
1153 {
1154         struct nouveau_channel *chan = ctx->surf2d->channel;
1155         struct nouveau_grobj *surf2d = ctx->surf2d;
1156         struct nouveau_grobj *rect = ctx->rect;
1157         int cs2d_format, gdirect_format;
1158
1159 #ifdef NV04_REGION_DEBUG
1160         fprintf(stderr, "\tFILL_GDIRECT\n");
1161 #endif
1162
1163         assert(!(dst->pitch & 63) && dst->pitch);
1164         nv04_region_assert(dst, w, h);
1165
1166         switch(dst->bpps)
1167         {
1168         case 0:
1169                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
1170                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
1171                 break;
1172         case 1:
1173                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
1174                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16;
1175                 break;
1176         case 2:
1177                 gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
1178                 cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
1179                 break;
1180         default:
1181                 assert(0);
1182                 gdirect_format = 0;
1183                 cs2d_format = 0;
1184                 break;
1185         }
1186
1187         MARK_RING (chan, 15, 4);
1188         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
1189         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1190         OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1191         BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
1192         OUT_RING  (chan, cs2d_format);
1193         OUT_RING  (chan, (dst->pitch << 16) | dst->pitch);
1194         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1195         OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
1196
1197         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
1198         OUT_RING  (chan, gdirect_format);
1199         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
1200         OUT_RING  (chan, value);
1201         BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
1202         OUT_RING  (chan, (dst->x << 16) | dst->y);
1203         OUT_RING  (chan, ( w << 16) |  h);
1204 }
1205
1206 int
1207 nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst,
1208                   int w, int h, unsigned value)
1209 {
1210         if(!w || !h)
1211                 return 0;
1212
1213 #ifdef NV04_REGION_DEBUG
1214         fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps);
1215         nv04_region_print(dst);
1216         fprintf(stderr, " <- 0x%x\n", value);
1217 #endif
1218
1219         if(nv04_region_is_contiguous(dst, w, h))
1220         {
1221                 nv04_region_contiguous_shape(dst, &w, &h, 6);
1222                 nv04_region_linearize_contiguous(dst, w, h);
1223         }
1224
1225         // TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU
1226         /* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */
1227         if(!dst->pitch)
1228         {
1229 #ifdef NV04_REGION_DEBUG
1230                 fprintf(stderr, "\tFILL_ENG3D\n");
1231 #endif
1232                 return 1;
1233         }
1234         else if(!nv04_region_align(dst, w, h, 6))
1235         {
1236                 nv04_region_fill_gdirect(ctx, dst, w, h, value);
1237                 return 0;
1238         }
1239         else
1240                 return -1;
1241 }
1242
1243
1244 void
1245 nv04_2d_context_takedown(struct nv04_2d_context *ctx)
1246 {
1247         nouveau_notifier_free(&ctx->ntfy);
1248         nouveau_grobj_free(&ctx->m2mf);
1249         nouveau_grobj_free(&ctx->surf2d);
1250         nouveau_grobj_free(&ctx->swzsurf);
1251         nouveau_grobj_free(&ctx->rect);
1252         nouveau_grobj_free(&ctx->blit);
1253         nouveau_grobj_free(&ctx->sifm);
1254
1255         free(ctx);
1256 }
1257
1258 struct nv04_2d_context *
1259 nv04_2d_context_init(struct nouveau_channel* chan)
1260 {
1261         struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context));
1262         unsigned handle = 0x88000000, class;
1263         int ret;
1264
1265         if (!ctx)
1266                 return NULL;
1267
1268         ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
1269         if (ret) {
1270                 nv04_2d_context_takedown(ctx);
1271                 return NULL;
1272         }
1273
1274         ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
1275         if (ret) {
1276                 nv04_2d_context_takedown(ctx);
1277                 return NULL;
1278         }
1279
1280         BEGIN_RING(chan, ctx->m2mf, NV04_M2MF_DMA_NOTIFY, 1);
1281         OUT_RING  (chan, ctx->ntfy->handle);
1282
1283         if (chan->device->chipset < 0x10)
1284                 class = NV04_CONTEXT_SURFACES_2D;
1285         else
1286                 class = NV10_CONTEXT_SURFACES_2D;
1287
1288         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
1289         if (ret) {
1290                 nv04_2d_context_takedown(ctx);
1291                 return NULL;
1292         }
1293
1294         BEGIN_RING(chan, ctx->surf2d,
1295                          NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
1296         OUT_RING  (chan, chan->vram->handle);
1297         OUT_RING  (chan, chan->vram->handle);
1298
1299         if (chan->device->chipset < 0x10)
1300                 class = NV04_IMAGE_BLIT;
1301         else
1302                 class = NV11_IMAGE_BLIT;
1303
1304         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
1305         if (ret) {
1306                 nv04_2d_context_takedown(ctx);
1307                 return NULL;
1308         }
1309
1310         BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
1311         OUT_RING  (chan, ctx->ntfy->handle);
1312         BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACES, 1);
1313         OUT_RING  (chan, ctx->surf2d->handle);
1314         BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
1315         OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
1316
1317         ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
1318                                   &ctx->rect);
1319         if (ret) {
1320                 nv04_2d_context_takedown(ctx);
1321                 return NULL;
1322         }
1323
1324         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
1325         OUT_RING  (chan, ctx->ntfy->handle);
1326         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
1327         OUT_RING  (chan, ctx->surf2d->handle);
1328         BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
1329         OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
1330         BEGIN_RING(chan, ctx->rect,
1331                          NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
1332         OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
1333
1334         switch (chan->device->chipset & 0xf0) {
1335         case 0x00:
1336         case 0x10:
1337                 class = NV04_SWIZZLED_SURFACE;
1338                 break;
1339         case 0x20:
1340                 class = NV11_SWIZZLED_SURFACE;
1341                 break;
1342         case 0x30:
1343                 class = NV30_SWIZZLED_SURFACE;
1344                 break;
1345         case 0x40:
1346         case 0x60:
1347                 class = NV40_SWIZZLED_SURFACE;
1348                 break;
1349         default:
1350                 /* Famous last words: this really can't happen.. */
1351                 assert(0);
1352                 break;
1353         }
1354
1355         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
1356         if (ret) {
1357                 nv04_2d_context_takedown(ctx);
1358                 return NULL;
1359         }
1360
1361         /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
1362         if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND)
1363                 nouveau_grobj_autobind(ctx->swzsurf);
1364
1365         switch (chan->device->chipset & 0xf0) {
1366         case 0x10:
1367         case 0x20:
1368                 class = NV10_SCALED_IMAGE_FROM_MEMORY;
1369                 break;
1370         case 0x30:
1371                 class = NV30_SCALED_IMAGE_FROM_MEMORY;
1372                 break;
1373         case 0x40:
1374         case 0x60:
1375                 class = NV40_SCALED_IMAGE_FROM_MEMORY;
1376                 break;
1377         default:
1378                 class = NV04_SCALED_IMAGE_FROM_MEMORY;
1379                 break;
1380         }
1381
1382         ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
1383         if (ret) {
1384                 nv04_2d_context_takedown(ctx);
1385                 return NULL;
1386         }
1387
1388         /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */
1389         if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND)
1390                 nouveau_grobj_autobind(ctx->sifm);
1391
1392         return ctx;
1393 }