radeonsi: implement TC-compatible HTILE
author: Marek Olšák <marek.olsak@amd.com>
Tue, 11 Oct 2016 21:19:46 +0000 (23:19 +0200)
committer: Marek Olšák <marek.olsak@amd.com>
Thu, 13 Oct 2016 17:00:51 +0000 (19:00 +0200)
so that decompress blits aren't needed and depth texturing needs less
memory bandwidth.

Z16 and Z24 are promoted to Z32_FLOAT by the driver, because TC-compatible
HTILE only supports Z32_FLOAT. This doubles the memory footprint for Z16.
The format promotion is not visible to state trackers.

This is part of TC-compatible renderbuffer compression, which has 3 parts:
DCC, HTILE, FMASK. Only TC-compatible FMASK compression is missing now.

I don't see a measurable increase in performance though.

(I tested Talos Principle and DiRT: Showdown. The latter is improved by
 0.5%, which is almost noise, and it originally used layered Z16,
 so at least we know that Z16 promoted to Z32F isn't slower now.)

Tested-by: Edmondo Tommasina <edmondo.tommasina@gmail.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
src/gallium/drivers/radeon/r600_pipe_common.h
src/gallium/drivers/radeon/r600_texture.c
src/gallium/drivers/radeon/radeon_winsys.h
src/gallium/drivers/radeonsi/si_blit.c
src/gallium/drivers/radeonsi/si_descriptors.c
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_state.c
src/gallium/drivers/radeonsi/si_state_draw.c
src/gallium/winsys/amdgpu/drm/amdgpu_surface.c

index 290b228b73fe011d7833721269af1761a11941af..5cfcad688c4bb694b192dccc0d65d6bfbcbb2da3 100644 (file)
@@ -245,6 +245,7 @@ struct r600_htile_info {
        unsigned height;
        unsigned xalign;
        unsigned yalign;
+       unsigned alignment;
 };
 
 struct r600_texture {
@@ -252,6 +253,7 @@ struct r600_texture {
 
        uint64_t                        size;
        unsigned                        num_level0_transfers;
+       enum pipe_format                db_render_format;
        bool                            is_depth;
        bool                            db_compatible;
        bool                            can_sample_z;
@@ -273,6 +275,7 @@ struct r600_texture {
        /* Depth buffer compression and fast clear. */
        struct r600_htile_info          htile;
        struct r600_resource            *htile_buffer;
+       bool                            tc_compatible_htile;
        bool                            depth_cleared; /* if it was cleared at least once */
        float                           depth_clear_value;
        bool                            stencil_cleared; /* if it was cleared at least once */
index 57cdbcf615d9cfeb0660e21ce03bac3a1305ee97..625d091fe40d3a8b18f0927f26031e34dfd3a642 100644 (file)
@@ -192,7 +192,8 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
                             struct radeon_surf *surface,
                             const struct pipe_resource *ptex,
                             unsigned array_mode,
-                            bool is_flushed_depth)
+                            bool is_flushed_depth,
+                            bool tc_compatible_htile)
 {
        const struct util_format_description *desc =
                util_format_description(ptex->format);
@@ -256,11 +257,22 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
        if (!is_flushed_depth && is_depth) {
                surface->flags |= RADEON_SURF_ZBUFFER;
 
+               if (tc_compatible_htile &&
+                   array_mode == RADEON_SURF_MODE_2D) {
+                       /* TC-compatible HTILE only supports Z32_FLOAT.
+                        * Promote Z16 to Z32. DB->CB copies will convert
+                        * the format for transfers.
+                        */
+                       surface->bpe = 4;
+                       surface->flags |= RADEON_SURF_TC_COMPATIBLE_HTILE;
+               }
+
                if (is_stencil) {
                        surface->flags |= RADEON_SURF_SBUFFER |
                                          RADEON_SURF_HAS_SBUFFER_MIPTREE;
                }
        }
+
        if (rscreen->chip_class >= SI) {
                surface->flags |= RADEON_SURF_HAS_TILE_MODE_INDEX;
        }
@@ -904,6 +916,7 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
        rtex->htile.height = height;
        rtex->htile.xalign = cl_width * 8;
        rtex->htile.yalign = cl_height * 8;
+       rtex->htile.alignment = base_align;
 
        return (util_max_layer(&rtex->resource.b.b, 0) + 1) *
                align(slice_bytes, base_align);
@@ -912,21 +925,34 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 static void r600_texture_allocate_htile(struct r600_common_screen *rscreen,
                                        struct r600_texture *rtex)
 {
-       unsigned htile_size = r600_texture_get_htile_size(rscreen, rtex);
+       uint64_t htile_size, alignment;
+       uint32_t clear_value;
+
+       if (rtex->tc_compatible_htile) {
+               htile_size = rtex->surface.htile_size;
+               alignment = rtex->surface.htile_alignment;
+               clear_value = 0x0000030F;
+       } else {
+               htile_size = r600_texture_get_htile_size(rscreen, rtex);
+               alignment = rtex->htile.alignment;
+               clear_value = 0;
+       }
 
        if (!htile_size)
                return;
 
        rtex->htile_buffer = (struct r600_resource*)
-                            pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
-                                               PIPE_USAGE_DEFAULT, htile_size);
+                            r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
+                                                       PIPE_USAGE_DEFAULT,
+                                                       htile_size, alignment);
        if (rtex->htile_buffer == NULL) {
                /* this is not a fatal error as we can still keep rendering
                 * without htile buffer */
                R600_ERR("Failed to create buffer object for htile buffer.\n");
        } else {
-               r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b, 0,
-                                        htile_size, 0, R600_COHERENCY_NONE);
+               r600_screen_clear_buffer(rscreen, &rtex->htile_buffer->b.b,
+                                        0, htile_size, clear_value,
+                                        R600_COHERENCY_NONE);
        }
 }
 
@@ -967,10 +993,11 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 
        if (rtex->htile_buffer)
                fprintf(f, "  HTile: size=%u, alignment=%u, pitch=%u, height=%u, "
-                       "xalign=%u, yalign=%u\n",
+                       "xalign=%u, yalign=%u, TC_compatible = %u\n",
                        rtex->htile_buffer->b.b.width0,
                        rtex->htile_buffer->buf->alignment, rtex->htile.pitch,
-                       rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign);
+                       rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign,
+                       rtex->tc_compatible_htile);
 
        if (rtex->dcc_offset) {
                fprintf(f, "  DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
@@ -1054,6 +1081,16 @@ r600_texture_create_object(struct pipe_screen *screen,
                return NULL;
        }
 
+       rtex->tc_compatible_htile = rtex->surface.htile_size != 0;
+       assert(!!(rtex->surface.flags & RADEON_SURF_TC_COMPATIBLE_HTILE) ==
+              rtex->tc_compatible_htile);
+
+       /* TC-compatible HTILE only supports Z32_FLOAT. */
+       if (rtex->tc_compatible_htile)
+               rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+       else
+               rtex->db_render_format = base->format;
+
        /* Tiled depth textures utilize the non-displayable tile order.
         * This must be done after r600_setup_surface.
         * Applies to R600-Cayman. */
@@ -1241,11 +1278,20 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 {
        struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
        struct radeon_surf surface = {0};
+       bool is_flushed_depth = templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH;
+       bool tc_compatible_htile =
+               rscreen->chip_class >= VI &&
+               (templ->flags & PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY) &&
+               !(rscreen->debug_flags & DBG_NO_HYPERZ) &&
+               !is_flushed_depth &&
+               templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
+               util_format_is_depth_or_stencil(templ->format);
+
        int r;
 
        r = r600_init_surface(rscreen, &surface, templ,
                              r600_choose_tiling(rscreen, templ),
-                             templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH);
+                             is_flushed_depth, tc_compatible_htile);
        if (r) {
                return NULL;
        }
@@ -1296,7 +1342,8 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
        else
                array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
 
-       r = r600_init_surface(rscreen, &surface, templ, array_mode, false);
+       r = r600_init_surface(rscreen, &surface, templ, array_mode,
+                             false, false);
        if (r) {
                return NULL;
        }
index 7146737c8260514a279cb130af6f59324dc1e9d8..8946209d33791e828749b678f69ff8d6caa4b32a 100644 (file)
@@ -278,6 +278,7 @@ enum radeon_feature_id {
 #define RADEON_SURF_HAS_TILE_MODE_INDEX         (1 << 20)
 #define RADEON_SURF_FMASK                       (1 << 21)
 #define RADEON_SURF_DISABLE_DCC                 (1 << 22)
+#define RADEON_SURF_TC_COMPATIBLE_HTILE         (1 << 23)
 
 #define RADEON_SURF_GET(v, field)   (((v) >> RADEON_SURF_ ## field ## _SHIFT) & RADEON_SURF_ ## field ## _MASK)
 #define RADEON_SURF_SET(v, field)   (((v) & RADEON_SURF_ ## field ## _MASK) << RADEON_SURF_ ## field ## _SHIFT)
@@ -344,6 +345,9 @@ struct radeon_surf {
 
     uint64_t                    dcc_size;
     uint64_t                    dcc_alignment;
+    /* TC-compatible HTILE only. */
+    uint64_t                    htile_size;
+    uint64_t                    htile_alignment;
 };
 
 struct radeon_bo_list_item {
index c143601d55c01a7830f061665dde6be270c8564b..db41f565a9477bed214f0c0deb670f13c21e82b8 100644 (file)
@@ -332,6 +332,8 @@ si_flush_depth_texture(struct si_context *sctx,
                }
        }
 
+       assert(!tex->tc_compatible_htile || levels_z == 0);
+
        /* We may have to allocate the flushed texture here when called from
         * si_decompress_subresource.
         */
@@ -699,7 +701,10 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
            zsbuf->u.tex.level == 0 &&
            zsbuf->u.tex.first_layer == 0 &&
            zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
-               if (buffers & PIPE_CLEAR_DEPTH) {
+               /* TC-compatible HTILE only supports depth clears to 0 or 1. */
+               if (buffers & PIPE_CLEAR_DEPTH &&
+                   (!zstex->tc_compatible_htile ||
+                    depth == 0 || depth == 1)) {
                        /* Need to disable EXPCLEAR temporarily if clearing
                         * to a new value. */
                        if (!zstex->depth_cleared || zstex->depth_clear_value != depth) {
@@ -713,7 +718,9 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
                        si_mark_atom_dirty(sctx, &sctx->db_render_state);
                }
 
-               if (buffers & PIPE_CLEAR_STENCIL) {
+               /* TC-compatible HTILE only supports stencil clears to 0. */
+               if (buffers & PIPE_CLEAR_STENCIL &&
+                   (!zstex->tc_compatible_htile || stencil == 0)) {
                        stencil &= 0xff;
 
                        /* Need to disable EXPCLEAR temporarily if clearing
index 350242aeed51bd921e7236ea9d4356280876f51a..19cae65e75eef90fabbd4780ceb57795c7b709d7 100644 (file)
@@ -399,6 +399,9 @@ void si_set_mutable_tex_desc_fields(struct r600_texture *tex,
                state[7] = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
                            tex->dcc_offset +
                            base_level_info->dcc_offset) >> 8;
+       } else if (tex->tc_compatible_htile) {
+               state[6] |= S_008F28_COMPRESSION_EN(1);
+               state[7] = tex->htile_buffer->gpu_address >> 8;
        }
 }
 
@@ -508,8 +511,10 @@ static void si_set_sampler_views(struct pipe_context *ctx,
                if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) {
                        struct r600_texture *rtex =
                                (struct r600_texture*)views[i]->texture;
+                       struct si_sampler_view *rview = (struct si_sampler_view *)views[i];
 
-                       if (rtex->db_compatible) {
+                       if (rtex->db_compatible &&
+                           (!rtex->tc_compatible_htile || rview->is_stencil_sampler)) {
                                samplers->depth_texture_mask |= 1u << slot;
                        } else {
                                samplers->depth_texture_mask &= ~(1u << slot);
index fc50205633df04c63e45b018b0064725d3998852..b2d76994996f5acbcde4c19a25741f7f5aec3a8f 100644 (file)
@@ -4607,12 +4607,26 @@ static void tex_fetch_args(
 
        /* Pack depth comparison value */
        if (tgsi_is_shadow_target(target) && opcode != TGSI_OPCODE_LODQ) {
+               LLVMValueRef z;
+
                if (target == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
-                       address[count++] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
+                       z = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
                } else {
                        assert(ref_pos >= 0);
-                       address[count++] = coords[ref_pos];
+                       z = coords[ref_pos];
                }
+
+               /* TC-compatible HTILE promotes Z16 and Z24 to Z32_FLOAT,
+                * so the depth comparison value isn't clamped for Z16 and
+                * Z24 anymore. Do it manually here.
+                *
+                * It's unnecessary if the original texture format was
+                * Z32_FLOAT, but we don't know that here.
+                */
+               if (ctx->screen->b.chip_class == VI)
+                       z = radeon_llvm_saturate(bld_base, z);
+
+               address[count++] = z;
        }
 
        /* Pack user derivatives */
index ad65fc22f60c69c8c9d3ff2952526b2e224dd0e7..b23749c6d89ebf05144acb19c7050b66b253d616 100644 (file)
@@ -686,6 +686,9 @@ static void si_update_poly_offset_state(struct si_context *sctx)
        if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf)
                return;
 
+       /* Use the user format, not db_render_format, so that the polygon
+        * offset behaves as expected by applications.
+        */
        switch (sctx->framebuffer.state.zsbuf->texture->format) {
        case PIPE_FORMAT_Z16_UNORM:
                si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
@@ -2140,7 +2143,7 @@ static void si_init_depth_surface(struct si_context *sctx,
        uint64_t z_offs, s_offs;
        uint32_t db_htile_data_base, db_htile_surface;
 
-       format = si_translate_dbformat(rtex->resource.b.b.format);
+       format = si_translate_dbformat(rtex->db_render_format);
 
        if (format == V_028040_Z_INVALID) {
                R600_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
@@ -2151,7 +2154,7 @@ static void si_init_depth_surface(struct si_context *sctx,
        z_offs += rtex->surface.level[level].offset;
        s_offs += rtex->surface.stencil_level[level].offset;
 
-       db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(1);
+       db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
 
        z_info = S_028040_FORMAT(format);
        if (rtex->resource.b.b.nr_samples > 1) {
@@ -2208,13 +2211,37 @@ static void si_init_depth_surface(struct si_context *sctx,
                         */
                        if (rtex->resource.b.b.nr_samples <= 1)
                                s_info |= S_028044_ALLOW_EXPCLEAR(1);
-               } else
-                       /* Use all of the htile_buffer for depth if there's no stencil. */
+               } else if (!rtex->tc_compatible_htile) {
+                       /* Use all of the htile_buffer for depth if there's no stencil.
+                        * This must not be set when TC-compatible HTILE is enabled
+                        * due to a hw bug.
+                        */
                        s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+               }
 
                uint64_t va = rtex->htile_buffer->gpu_address;
                db_htile_data_base = va >> 8;
                db_htile_surface = S_028ABC_FULL_CACHE(1);
+
+               if (rtex->tc_compatible_htile) {
+                       db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
+
+                       switch (rtex->resource.b.b.nr_samples) {
+                       case 0:
+                       case 1:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
+                               break;
+                       case 2:
+                       case 4:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
+                               break;
+                       case 8:
+                               z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
+                               break;
+                       default:
+                               assert(0);
+                       }
+               }
        } else {
                db_htile_data_base = 0;
                db_htile_surface = 0;
@@ -2356,6 +2383,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
        if (state->zsbuf) {
                surf = (struct r600_surface*)state->zsbuf;
+               rtex = (struct r600_texture*)surf->base.texture;
 
                if (!surf->depth_initialized) {
                        si_init_depth_surface(sctx, surf);
@@ -3021,6 +3049,9 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
        surflevel = tmp->surface.level;
 
        if (tmp->db_compatible) {
+               if (!view->is_stencil_sampler)
+                       pipe_format = tmp->db_render_format;
+
                switch (pipe_format) {
                case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
                        pipe_format = PIPE_FORMAT_Z32_FLOAT;
index c14e852bec25501f49c18d6f6070f5a31d372197..d18137b66910c9885e9402a234861114a4e816fe 100644 (file)
@@ -1118,7 +1118,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
                struct r600_texture *rtex = (struct r600_texture *)surf->texture;
 
-               rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+               if (!rtex->tc_compatible_htile)
+                       rtex->dirty_level_mask |= 1 << surf->u.tex.level;
 
                if (rtex->surface.flags & RADEON_SURF_SBUFFER)
                        rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
index 8bfea457e452ebbad5391eb4481251b8a1f3c514..1bf07a7498cd0baee401a7b8a433e085af14f4e3 100644 (file)
@@ -137,6 +137,7 @@ ADDR_HANDLE amdgpu_addr_create(struct amdgpu_winsys *ws)
    createFlags.value = 0;
    createFlags.useTileIndex = 1;
    createFlags.degradeBaseLevel = 1;
+   createFlags.useHtileSliceAlign = 1;
 
    addrCreateInput.chipEngine = CIASICIDGFXENGINE_SOUTHERNISLAND;
    addrCreateInput.chipFamily = ws->family;
@@ -160,7 +161,9 @@ static int compute_level(struct amdgpu_winsys *ws,
                          ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
                          ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
                          ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
-                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
+                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut,
+                         ADDR_COMPUTE_HTILE_INFO_INPUT *AddrHtileIn,
+                         ADDR_COMPUTE_HTILE_INFO_OUTPUT *AddrHtileOut)
 {
    struct radeon_surf_level *surf_level;
    ADDR_E_RETURNCODE ret;
@@ -257,6 +260,32 @@ static int compute_level(struct amdgpu_winsys *ws,
       }
    }
 
+   /* TC-compatible HTILE. */
+   if (!is_stencil &&
+       AddrSurfInfoIn->flags.depth &&
+       AddrSurfInfoIn->flags.tcCompatible &&
+       surf_level->mode == RADEON_SURF_MODE_2D &&
+       level == 0) {
+      AddrHtileIn->flags.tcCompatible = 1;
+      AddrHtileIn->pitch = AddrSurfInfoOut->pitch;
+      AddrHtileIn->height = AddrSurfInfoOut->height;
+      AddrHtileIn->numSlices = AddrSurfInfoOut->depth;
+      AddrHtileIn->blockWidth = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->blockHeight = ADDR_HTILE_BLOCKSIZE_8;
+      AddrHtileIn->pTileInfo = AddrSurfInfoOut->pTileInfo;
+      AddrHtileIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrHtileIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeHtileInfo(ws->addrlib,
+                                 AddrHtileIn,
+                                 AddrHtileOut);
+
+      if (ret == ADDR_OK) {
+         surf->htile_size = AddrHtileOut->htileBytes;
+         surf->htile_alignment = AddrHtileOut->baseAlign;
+      }
+   }
+
    return 0;
 }
 
@@ -284,6 +313,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
    ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
    ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
+   ADDR_COMPUTE_HTILE_INFO_INPUT AddrHtileIn = {0};
+   ADDR_COMPUTE_HTILE_INFO_OUTPUT AddrHtileOut = {0};
    ADDR_TILEINFO AddrTileInfoIn = {0};
    ADDR_TILEINFO AddrTileInfoOut = {0};
    int r;
@@ -296,6 +327,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
    AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
    AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
+   AddrHtileIn.size = sizeof(ADDR_COMPUTE_HTILE_INFO_INPUT);
+   AddrHtileOut.size = sizeof(ADDR_COMPUTE_HTILE_INFO_OUTPUT);
    AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
 
    type = RADEON_SURF_GET(surf->flags, TYPE);
@@ -361,7 +394,12 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    AddrSurfInfoIn.flags.cube = type == RADEON_SURF_TYPE_CUBEMAP;
    AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
    AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
-   AddrSurfInfoIn.flags.degrade4Space = 1;
+   AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
+
+   /* Only degrade the tile mode for space if TC-compatible HTILE hasn't been
+    * requested, because TC-compatible HTILE requires 2D tiling.
+    */
+   AddrSurfInfoIn.flags.degrade4Space = !AddrSurfInfoIn.flags.tcCompatible;
 
    /* DCC notes:
     * - If we add MSAA support, keep in mind that CB can't decompress 8bpp
@@ -443,11 +481,14 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    surf->bo_size = 0;
    surf->dcc_size = 0;
    surf->dcc_alignment = 1;
+   surf->htile_size = 0;
+   surf->htile_alignment = 1;
 
    /* Calculate texture layout information. */
    for (level = 0; level <= surf->last_level; level++) {
       r = compute_level(ws, surf, false, level, type, compressed,
-                        &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                        &AddrSurfInfoIn, &AddrSurfInfoOut,
+                        &AddrDccIn, &AddrDccOut, &AddrHtileIn, &AddrHtileOut);
       if (r)
          return r;
 
@@ -475,12 +516,14 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
       AddrSurfInfoIn.bpp = 8;
       AddrSurfInfoIn.flags.depth = 0;
       AddrSurfInfoIn.flags.stencil = 1;
+      AddrSurfInfoIn.flags.tcCompatible = 0;
       /* This will be ignored if AddrSurfInfoIn.pTileInfo is NULL. */
       AddrTileInfoIn.tileSplitBytes = surf->stencil_tile_split;
 
       for (level = 0; level <= surf->last_level; level++) {
          r = compute_level(ws, surf, true, level, type, compressed,
-                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
+                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut,
+                           NULL, NULL);
          if (r)
             return r;
 
@@ -508,6 +551,12 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
                                ws->info.num_tile_pipes);
    }
 
+   /* Make sure HTILE covers the whole miptree, because the shader reads
+    * TC-compatible HTILE even for levels where it's disabled by DB.
+    */
+   if (surf->htile_size && surf->last_level)
+          surf->htile_size *= 2;
+
    return 0;
 }