src/gallium/drivers/radeonsi/si_dma.c

   1 /*
   2  * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * on the rights to use, copy, modify, merge, publish, distribute, sub
   8  * license, and/or sell copies of the Software, and to permit persons to whom
   9  * the Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  21  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *      Jerome Glisse
  25  */
  26
  27 #include "sid.h"
  28 #include "si_pipe.h"
  29 #include "../radeon/r600_cs.h"
  30
  31 #include "util/u_format.h"
  32
  33 static unsigned si_array_mode(unsigned mode)
  34 {
  35         switch (mode) {
  36         case RADEON_SURF_MODE_LINEAR_ALIGNED:
  37                 return V_009910_ARRAY_LINEAR_ALIGNED;
  38         case RADEON_SURF_MODE_1D:
  39                 return V_009910_ARRAY_1D_TILED_THIN1;
  40         case RADEON_SURF_MODE_2D:
  41                 return V_009910_ARRAY_2D_TILED_THIN1;
  42         default:
  43         case RADEON_SURF_MODE_LINEAR:
  44                 return V_009910_ARRAY_LINEAR_GENERAL;
  45         }
  46 }
  47
  48 static uint32_t si_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
  49 {
  50         if (sscreen->b.info.si_tile_mode_array_valid) {
  51                 uint32_t gb_tile_mode = sscreen->b.info.si_tile_mode_array[tile_mode];
  52
  53                 return G_009910_MICRO_TILE_MODE(gb_tile_mode);
  54         }
  55
  56         /* The kernel cannod return the tile mode array. Guess? */
  57         return V_009910_ADDR_SURF_THIN_MICRO_TILING;
  58 }
  59
  60 static void si_dma_copy_buffer(struct si_context *ctx,
  61                                 struct pipe_resource *dst,
  62                                 struct pipe_resource *src,
  63                                 uint64_t dst_offset,
  64                                 uint64_t src_offset,
  65                                 uint64_t size)
  66 {
  67         struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
  68         unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
  69         struct r600_resource *rdst = (struct r600_resource*)dst;
  70         struct r600_resource *rsrc = (struct r600_resource*)src;
  71
  72         /* Mark the buffer range of destination as valid (initialized),
  73          * so that transfer_map knows it should wait for the GPU when mapping
  74          * that range. */
  75         util_range_add(&rdst->valid_buffer_range, dst_offset,
  76                        dst_offset + size);
  77
  78         dst_offset += rdst->gpu_address;
  79         src_offset += rsrc->gpu_address;
  80
  81         /* see if we use dword or byte copy */
  82         if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
  83                 size >>= 2;
  84                 sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
  85                 shift = 2;
  86                 max_csize = SI_DMA_COPY_MAX_SIZE_DW;
  87         } else {
  88                 sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
  89                 shift = 0;
  90                 max_csize = SI_DMA_COPY_MAX_SIZE;
  91         }
  92         ncopy = (size / max_csize) + !!(size % max_csize);
  93
  94         r600_need_dma_space(&ctx->b, ncopy * 5);
  95
  96         r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
  97                               RADEON_PRIO_MIN);
  98         r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
  99                               RADEON_PRIO_MIN);
 100
 101         for (i = 0; i < ncopy; i++) {
 102                 csize = size < max_csize ? size : max_csize;
 103                 cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize);
 104                 cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
 105                 cs->buf[cs->cdw++] = src_offset & 0xffffffff;
 106                 cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
 107                 cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
 108                 dst_offset += csize << shift;
 109                 src_offset += csize << shift;
 110                 size -= csize;
 111         }
 112 }
 113
 114 static void si_dma_copy_tile(struct si_context *ctx,
 115                              struct pipe_resource *dst,
 116                              unsigned dst_level,
 117                              unsigned dst_x,
 118                              unsigned dst_y,
 119                              unsigned dst_z,
 120                              struct pipe_resource *src,
 121                              unsigned src_level,
 122                              unsigned src_x,
 123                              unsigned src_y,
 124                              unsigned src_z,
 125                              unsigned copy_height,
 126                              unsigned pitch,
 127                              unsigned bpp)
 128 {
 129         struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
 130         struct si_screen *sscreen = ctx->screen;
 131         struct r600_texture *rsrc = (struct r600_texture*)src;
 132         struct r600_texture *rdst = (struct r600_texture*)dst;
 133         unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
 134         unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
 135         unsigned sub_cmd, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
 136         uint64_t base, addr;
 137         unsigned pipe_config, tile_mode_index;
 138
 139         dst_mode = rdst->surface.level[dst_level].mode;
 140         src_mode = rsrc->surface.level[src_level].mode;
 141         /* downcast linear aligned to linear to simplify test */
 142         src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
 143         dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
 144         assert(dst_mode != src_mode);
 145
 146         y = 0;
 147         sub_cmd = SI_DMA_COPY_TILED;
 148         lbpp = util_logbase2(bpp);
 149         pitch_tile_max = ((pitch / bpp) / 8) - 1;
 150
 151         if (dst_mode == RADEON_SURF_MODE_LINEAR) {
 152                 /* T2L */
 153                 array_mode = si_array_mode(src_mode);
 154                 slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) / (8*8);
 155                 slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 156                 /* linear height must be the same as the slice tile max height, it's ok even
 157                  * if the linear destination/source have smaller heigh as the size of the
 158                  * dma packet will be using the copy_height which is always smaller or equal
 159                  * to the linear height
 160                  */
 161                 height = rsrc->surface.level[src_level].npix_y;
 162                 detile = 1;
 163                 x = src_x;
 164                 y = src_y;
 165                 z = src_z;
 166                 base = rsrc->surface.level[src_level].offset;
 167                 addr = rdst->surface.level[dst_level].offset;
 168                 addr += rdst->surface.level[dst_level].slice_size * dst_z;
 169                 addr += dst_y * pitch + dst_x * bpp;
 170                 bank_h = cik_bank_wh(rsrc->surface.bankh);
 171                 bank_w = cik_bank_wh(rsrc->surface.bankw);
 172                 mt_aspect = cik_macro_tile_aspect(rsrc->surface.mtilea);
 173                 tile_split = cik_tile_split(rsrc->surface.tile_split);
 174                 tile_mode_index = si_tile_mode_index(rsrc, src_level,
 175                                                      util_format_has_stencil(util_format_description(src->format)));
 176                 nbanks = si_num_banks(sscreen, rsrc);
 177                 base += rsrc->resource.gpu_address;
 178                 addr += rdst->resource.gpu_address;
 179         } else {
 180                 /* L2T */
 181                 array_mode = si_array_mode(dst_mode);
 182                 slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) / (8*8);
 183                 slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
 184                 /* linear height must be the same as the slice tile max height, it's ok even
 185                  * if the linear destination/source have smaller heigh as the size of the
 186                  * dma packet will be using the copy_height which is always smaller or equal
 187                  * to the linear height
 188                  */
 189                 height = rdst->surface.level[dst_level].npix_y;
 190                 detile = 0;
 191                 x = dst_x;
 192                 y = dst_y;
 193                 z = dst_z;
 194                 base = rdst->surface.level[dst_level].offset;
 195                 addr = rsrc->surface.level[src_level].offset;
 196                 addr += rsrc->surface.level[src_level].slice_size * src_z;
 197                 addr += src_y * pitch + src_x * bpp;
 198                 bank_h = cik_bank_wh(rdst->surface.bankh);
 199                 bank_w = cik_bank_wh(rdst->surface.bankw);
 200                 mt_aspect = cik_macro_tile_aspect(rdst->surface.mtilea);
 201                 tile_split = cik_tile_split(rdst->surface.tile_split);
 202                 tile_mode_index = si_tile_mode_index(rdst, dst_level,
 203                                                      util_format_has_stencil(util_format_description(dst->format)));
 204                 nbanks = si_num_banks(sscreen, rdst);
 205                 base += rdst->resource.gpu_address;
 206                 addr += rsrc->resource.gpu_address;
 207         }
 208
 209         pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
 210         mt = si_micro_tile_mode(sscreen, tile_mode_index);
 211         size = (copy_height * pitch) / 4;
 212         ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
 213         r600_need_dma_space(&ctx->b, ncopy * 9);
 214
 215         r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
 216                               RADEON_USAGE_READ, RADEON_PRIO_MIN);
 217         r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
 218                               RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
 219
 220         for (i = 0; i < ncopy; i++) {
 221                 cheight = copy_height;
 222                 if (((cheight * pitch) / 4) > SI_DMA_COPY_MAX_SIZE_DW) {
 223                         cheight = (SI_DMA_COPY_MAX_SIZE_DW * 4) / pitch;
 224                 }
 225                 size = (cheight * pitch) / 4;
 226                 cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, size);
 227                 cs->buf[cs->cdw++] = base >> 8;
 228                 cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
 229                                         (lbpp << 24) | (bank_h << 21) |
 230                                         (bank_w << 18) | (mt_aspect << 16);
 231                 cs->buf[cs->cdw++] = (pitch_tile_max << 0) | ((height - 1) << 16);
 232                 cs->buf[cs->cdw++] = (slice_tile_max << 0) | (pipe_config << 26);
 233                 cs->buf[cs->cdw++] = (x << 0) | (z << 18);
 234                 cs->buf[cs->cdw++] = (y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27);
 235                 cs->buf[cs->cdw++] = addr & 0xfffffffc;
 236                 cs->buf[cs->cdw++] = (addr >> 32UL) & 0xff;
 237                 copy_height -= cheight;
 238                 addr += cheight * pitch;
 239                 y += cheight;
 240         }
 241 }
 242
 243 void si_dma_copy(struct pipe_context *ctx,
 244                  struct pipe_resource *dst,
 245                  unsigned dst_level,
 246                  unsigned dstx, unsigned dsty, unsigned dstz,
 247                  struct pipe_resource *src,
 248                  unsigned src_level,
 249                  const struct pipe_box *src_box)
 250 {
 251         struct si_context *sctx = (struct si_context *)ctx;
 252         struct r600_texture *rsrc = (struct r600_texture*)src;
 253         struct r600_texture *rdst = (struct r600_texture*)dst;
 254         unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
 255         unsigned src_w, dst_w;
 256         unsigned src_x, src_y;
 257         unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 258
 259         if (sctx->b.rings.dma.cs == NULL) {
 260                 goto fallback;
 261         }
 262
 263         /* TODO: Implement DMA copy for CIK */
 264         if (sctx->b.chip_class >= CIK) {
 265                 goto fallback;
 266         }
 267
 268         if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 269                 si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
 270                 return;
 271         }
 272
 273         if (src->format != dst->format || src_box->depth > 1 ||
 274             rdst->dirty_level_mask != 0) {
 275                 goto fallback;
 276         }
 277
 278         if (rsrc->dirty_level_mask) {
 279                 ctx->flush_resource(ctx, src);
 280         }
 281
 282         src_x = util_format_get_nblocksx(src->format, src_box->x);
 283         dst_x = util_format_get_nblocksx(src->format, dst_x);
 284         src_y = util_format_get_nblocksy(src->format, src_box->y);
 285         dst_y = util_format_get_nblocksy(src->format, dst_y);
 286
 287         bpp = rdst->surface.bpe;
 288         dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
 289         src_pitch = rsrc->surface.level[src_level].pitch_bytes;
 290         src_w = rsrc->surface.level[src_level].npix_x;
 291         dst_w = rdst->surface.level[dst_level].npix_x;
 292         copy_height = src_box->height / rsrc->surface.blk_h;
 293
 294         dst_mode = rdst->surface.level[dst_level].mode;
 295         src_mode = rsrc->surface.level[src_level].mode;
 296         /* downcast linear aligned to linear to simplify test */
 297         src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
 298         dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
 299
 300         if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
 301                 /* FIXME si can do partial blit */
 302                 goto fallback;
 303         }
 304         /* the x test here are currently useless (because we don't support partial blit)
 305          * but keep them around so we don't forget about those
 306          */
 307         if ((src_pitch % 8) || (src_box->x % 8) || (dst_x % 8) || (src_box->y % 8) || (dst_y % 8)) {
 308                 goto fallback;
 309         }
 310
 311         if (src_mode == dst_mode) {
 312                 uint64_t dst_offset, src_offset;
 313                 /* simple dma blit would do NOTE code here assume :
 314                  *   src_box.x/y == 0
 315                  *   dst_x/y == 0
 316                  *   dst_pitch == src_pitch
 317                  */
 318                 src_offset= rsrc->surface.level[src_level].offset;
 319                 src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
 320                 src_offset += src_y * src_pitch + src_x * bpp;
 321                 dst_offset = rdst->surface.level[dst_level].offset;
 322                 dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
 323                 dst_offset += dst_y * dst_pitch + dst_x * bpp;
 324                 si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset,
 325                             src_box->height * src_pitch);
 326         } else {
 327                 si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z,
 328                                  src, src_level, src_x, src_y, src_box->z,
 329                                  copy_height, dst_pitch, bpp);
 330         }
 331         return;
 332
 333 fallback:
 334         ctx->resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
 335                                   src, src_level, src_box);
 336 }