r600g: check for PIPE_BIND_BLENDABLE in is_format_supported
[mesa.git] / src / gallium / drivers / r600 / r600_state.c
index 3e3b0ddd9a5d9171b727efa3851040643537dbfa..31d7bd045192318829fff3096eaf1b6d3b23ce87 100644 (file)
@@ -157,6 +157,11 @@ static bool r600_is_zs_format_supported(enum pipe_format format)
        return r600_translate_dbformat(format) != ~0U;
 }
 
+static inline bool r600_is_blending_supported(enum pipe_format format)
+{
+       return !util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format); /* pure-integer and depth/stencil formats are reported as not blendable */
+}
+
 boolean r600_is_format_supported(struct pipe_screen *screen,
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
@@ -235,6 +240,10 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
        if (usage & PIPE_BIND_TRANSFER_WRITE)
                retval |= PIPE_BIND_TRANSFER_WRITE;
 
+       if ((usage & PIPE_BIND_BLENDABLE) &&
+           r600_is_blending_supported(format))
+               retval |= PIPE_BIND_BLENDABLE;
+
        return retval == usage;
 }
 
@@ -460,6 +469,10 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
                S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) |
                S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip) |
                S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);
+       if (rctx->b.chip_class == R700) {
+               rs->pa_cl_clip_cntl |=
+                       S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard);
+       }
        rs->multisample_enable = state->multisample;
 
        /* offset */
@@ -517,19 +530,25 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
                               S_028C08_PIX_CENTER_HALF(state->half_pixel_center) |
                               S_028C08_QUANT_MODE(V_028C08_X_1_256TH));
        r600_store_context_reg(&rs->buffer, R_028DFC_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
-       r600_store_context_reg(&rs->buffer, R_028814_PA_SU_SC_MODE_CNTL,
-                              S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
-                              S_028814_CULL_FRONT(state->cull_face & PIPE_FACE_FRONT ? 1 : 0) |
-                              S_028814_CULL_BACK(state->cull_face & PIPE_FACE_BACK ? 1 : 0) |
-                              S_028814_FACE(!state->front_ccw) |
-                              S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
-                              S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
-                              S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
-                              S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL ||
-                                                 state->fill_back != PIPE_POLYGON_MODE_FILL) |
-                              S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
-                              S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back)));
-       r600_store_context_reg(&rs->buffer, R_028350_SX_MISC, S_028350_MULTIPASS(state->rasterizer_discard));
+
+       rs->pa_su_sc_mode_cntl = S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
+                                S_028814_CULL_FRONT(state->cull_face & PIPE_FACE_FRONT ? 1 : 0) |
+                                S_028814_CULL_BACK(state->cull_face & PIPE_FACE_BACK ? 1 : 0) |
+                                S_028814_FACE(!state->front_ccw) |
+                                S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
+                                S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
+                                S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) |
+                                S_028814_POLY_MODE(state->fill_front != PIPE_POLYGON_MODE_FILL ||
+                                                                        state->fill_back != PIPE_POLYGON_MODE_FILL) |
+                                S_028814_POLYMODE_FRONT_PTYPE(r600_translate_fill(state->fill_front)) |
+                                S_028814_POLYMODE_BACK_PTYPE(r600_translate_fill(state->fill_back));
+       if (rctx->b.chip_class == R700) {
+               r600_store_context_reg(&rs->buffer, R_028814_PA_SU_SC_MODE_CNTL, rs->pa_su_sc_mode_cntl);
+       }
+       if (rctx->b.chip_class == R600) {
+               r600_store_context_reg(&rs->buffer, R_028350_SX_MISC,
+                                      S_028350_MULTIPASS(state->rasterizer_discard));
+       }
        return rs;
 }
 
@@ -1240,13 +1259,6 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
        rctx->framebuffer.atom.dirty = true;
 }
 
-#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
-       (((s0x) & 0xf) | (((s0y) & 0xf) << 4) |            \
-       (((s1x) & 0xf) << 8) | (((s1y) & 0xf) << 12) |     \
-       (((s2x) & 0xf) << 16) | (((s2y) & 0xf) << 20) |    \
-        (((s3x) & 0xf) << 24) | (((s3y) & 0xf) << 28))
-
-
 static uint32_t sample_locs_2x[] = {
        FILL_SREG(-4, 4, 4, -4, -4, 4, 4, -4),
        FILL_SREG(-4, 4, 4, -4, -4, 4, 4, -4),
@@ -1405,7 +1417,10 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
                        reloc = r600_context_bo_reloc(&rctx->b,
                                                      &rctx->b.rings.gfx,
                                                      (struct r600_resource*)cb[i]->base.texture,
-                                                     RADEON_USAGE_READWRITE);
+                                                     RADEON_USAGE_READWRITE,
+                                                     cb[i]->base.texture->nr_samples > 1 ?
+                                                             RADEON_PRIO_COLOR_BUFFER_MSAA :
+                                                             RADEON_PRIO_COLOR_BUFFER);
                        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
                        radeon_emit(cs, reloc);
 
@@ -1415,7 +1430,10 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
                        reloc = r600_context_bo_reloc(&rctx->b,
                                                      &rctx->b.rings.gfx,
                                                      cb[i]->cb_buffer_fmask,
-                                                     RADEON_USAGE_READWRITE);
+                                                     RADEON_USAGE_READWRITE,
+                                                     cb[i]->base.texture->nr_samples > 1 ?
+                                                             RADEON_PRIO_COLOR_BUFFER_MSAA :
+                                                             RADEON_PRIO_COLOR_BUFFER);
                        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
                        radeon_emit(cs, reloc);
 
@@ -1425,7 +1443,10 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
                        reloc = r600_context_bo_reloc(&rctx->b,
                                                      &rctx->b.rings.gfx,
                                                      cb[i]->cb_buffer_cmask,
-                                                     RADEON_USAGE_READWRITE);
+                                                     RADEON_USAGE_READWRITE,
+                                                     cb[i]->base.texture->nr_samples > 1 ?
+                                                             RADEON_PRIO_COLOR_BUFFER_MSAA :
+                                                             RADEON_PRIO_COLOR_BUFFER);
                        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
                        radeon_emit(cs, reloc);
                }
@@ -1461,7 +1482,10 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
                unsigned reloc = r600_context_bo_reloc(&rctx->b,
                                                       &rctx->b.rings.gfx,
                                                       (struct r600_resource*)state->zsbuf->texture,
-                                                      RADEON_USAGE_READWRITE);
+                                                      RADEON_USAGE_READWRITE,
+                                                      surf->base.texture->nr_samples > 1 ?
+                                                              RADEON_PRIO_DEPTH_BUFFER_MSAA :
+                                                              RADEON_PRIO_DEPTH_BUFFER);
 
                r600_write_context_reg(cs, R_028DF8_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
                                       surf->pa_su_poly_offset_db_fmt_cntl);
@@ -1554,7 +1578,8 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom
                r600_write_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value));
                r600_write_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
                r600_write_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-               reloc_idx = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE);
+               reloc_idx = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer,
+                                                 RADEON_USAGE_READWRITE, RADEON_PRIO_DEPTH_META);
                cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
                cs->buf[cs->cdw++] = reloc_idx;
        } else {
@@ -1606,6 +1631,11 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom
                db_render_control |= S_028D0C_DEPTH_CLEAR_ENABLE(1);
        }
 
+       /* RV770 workaround for a hang with 8x MSAA. */
+       if (rctx->b.family == CHIP_RV770 && a->log_samples == 3) {
+               db_render_override |= S_028D10_MAX_TILES_IN_DTT(6);
+       }
+
        r600_write_context_reg_seq(cs, R_028D0C_DB_RENDER_CONTROL, 2);
        radeon_emit(cs, db_render_control); /* R_028D0C_DB_RENDER_CONTROL */
        radeon_emit(cs, db_render_override); /* R_028D10_DB_RENDER_OVERRIDE */
@@ -1652,7 +1682,8 @@ static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom
                radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ));
+               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+                                                     RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO));
        }
 }
 
@@ -1684,7 +1715,8 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
                }
 
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ));
+               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+                                                     RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO));
 
                radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0));
                radeon_emit(cs, (buffer_id_base + buffer_index) * 7);
@@ -1699,7 +1731,8 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
                radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer, RADEON_USAGE_READ));
+               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+                                                     RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO));
 
                dirty_mask &= ~(1 << buffer_index);
        }
@@ -1747,7 +1780,10 @@ static void r600_emit_sampler_views(struct r600_context *rctx,
                radeon_emit_array(cs, rview->tex_resource_words, 7);
 
                reloc = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource,
-                                             RADEON_USAGE_READ);
+                                             RADEON_USAGE_READ,
+                                             rview->tex_resource->b.b.nr_samples > 1 ?
+                                                     RADEON_PRIO_SHADER_TEXTURE_MSAA :
+                                                     RADEON_PRIO_SHADER_TEXTURE_RO);
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
                radeon_emit(cs, reloc);
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
@@ -1874,7 +1910,8 @@ static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600
 
        r600_write_context_reg(cs, R_028894_SQ_PGM_START_FS, shader->offset >> 8);
        radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-       radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, shader->buffer, RADEON_USAGE_READ));
+       radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, shader->buffer,
+                                             RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA));
 }
 
 static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
@@ -1923,7 +1960,9 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
                r600_write_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE,
                                (r600_resource_va(screen, &rbuffer->b.b)) >> 8);
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer, RADEON_USAGE_READWRITE));
+               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+                                                     RADEON_USAGE_READWRITE,
+                                                     RADEON_PRIO_SHADER_RESOURCE_RW));
                r600_write_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE,
                                state->esgs_ring.buffer_size >> 8);
 
@@ -1931,7 +1970,9 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
                r600_write_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE,
                                (r600_resource_va(screen, &rbuffer->b.b)) >> 8);
                radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer, RADEON_USAGE_READWRITE));
+               radeon_emit(cs, r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+                                                     RADEON_USAGE_READWRITE,
+                                                     RADEON_PRIO_SHADER_RESOURCE_RW));
                r600_write_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE,
                                state->gsvs_ring.buffer_size >> 8);
        } else {
@@ -2299,8 +2340,7 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
        r600_store_context_reg(cb, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 0);
        r600_store_context_reg(cb, R_028AA4_VGT_INSTANCE_STEP_RATE_1, 0);
 
-       r600_store_context_reg_seq(cb, R_028AB0_VGT_STRMOUT_EN, 3);
-       r600_store_value(cb, 0); /* R_028AB0_VGT_STRMOUT_EN */
+       r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2);
        r600_store_value(cb, 1); /* R_028AB4_VGT_REUSE_OFF */
        r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */
 
@@ -2373,8 +2413,11 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 
        r600_store_context_reg(cb, R_0288A4_SQ_PGM_RESOURCES_FS, 0);
 
+       if (rctx->b.chip_class == R700)
+               r600_store_context_reg(cb, R_028350_SX_MISC, 0);
        if (rctx->b.chip_class == R700 && rctx->screen->b.has_streamout)
                r600_store_context_reg(cb, R_028354_SX_SURFACE_SYNC, S_028354_SURFACE_SYNC_MASK(0xf));
+
        r600_store_context_reg(cb, R_028800_DB_DEPTH_CONTROL, 0);
        if (rctx->screen->b.has_streamout) {
                r600_store_context_reg(cb, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
@@ -2552,32 +2595,10 @@ void r600_update_vs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
                S_02881C_VS_OUT_CCDIST0_VEC_ENA((rshader->clip_dist_write & 0x0F) != 0) |
                S_02881C_VS_OUT_CCDIST1_VEC_ENA((rshader->clip_dist_write & 0xF0) != 0) |
                S_02881C_VS_OUT_MISC_VEC_ENA(rshader->vs_out_misc_write) |
-               S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport) |
-               S_02881C_USE_VTX_POINT_SIZE(rshader->vs_out_point_size);
-}
-
-static unsigned r600_conv_prim_to_gs_out(unsigned mode)
-{
-       static const int prim_conv[] = {
-               V_028A6C_OUTPRIM_TYPE_POINTLIST,
-               V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-               V_028A6C_OUTPRIM_TYPE_TRISTRIP
-       };
-       assert(mode < Elements(prim_conv));
-
-       return prim_conv[mode];
+               S_02881C_USE_VTX_POINT_SIZE(rshader->vs_out_point_size) |
+               S_02881C_USE_VTX_EDGE_FLAG(rshader->vs_out_edgeflag) |
+               S_02881C_USE_VTX_RENDER_TARGET_INDX(rshader->vs_out_layer) |
+               S_02881C_USE_VTX_VIEWPORT_INDX(rshader->vs_out_viewport);
 }
 
 void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *shader)
@@ -2777,9 +2798,6 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
        unsigned ncopy, height, cheight, detile, i, x, y, z, src_mode, dst_mode;
        uint64_t base, addr;
 
-       /* make sure that the dma ring is only one active */
-       rctx->b.rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC);
-
        dst_mode = rdst->surface.level[dst_level].mode;
        src_mode = rsrc->surface.level[src_level].mode;
        /* downcast linear aligned to linear to simplify test */
@@ -2789,12 +2807,12 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 
        y = 0;
        lbpp = util_logbase2(bpp);
-       pitch_tile_max = ((pitch / bpp) >> 3) - 1;
+       pitch_tile_max = ((pitch / bpp) / 8) - 1;
 
        if (dst_mode == RADEON_SURF_MODE_LINEAR) {
                /* T2L */
                array_mode = r600_array_mode(src_mode);
-               slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) >> 6;
+               slice_tile_max = (rsrc->surface.level[src_level].nblk_x * rsrc->surface.level[src_level].nblk_y) / (8*8);
                slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
                /* linear height must be the same as the slice tile max height, it's ok even
                 * if the linear destination/source have smaller heigh as the size of the
@@ -2813,7 +2831,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
        } else {
                /* L2T */
                array_mode = r600_array_mode(dst_mode);
-               slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) >> 6;
+               slice_tile_max = (rdst->surface.level[dst_level].nblk_x * rdst->surface.level[dst_level].nblk_y) / (8*8);
                slice_tile_max = slice_tile_max ? slice_tile_max - 1 : 0;
                /* linear height must be the same as the slice tile max height, it's ok even
                 * if the linear destination/source have smaller heigh as the size of the
@@ -2831,23 +2849,25 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
                addr += src_y * pitch + src_x * bpp;
        }
        /* check that we are in dw/base alignment constraint */
-       if ((addr & 0x3) || (base & 0xff)) {
+       if (addr % 4 || base % 256) {
                return FALSE;
        }
 
        /* It's a r6xx/r7xx limitation, the blit must be on 8 boundary for number
         * line in the blit. Compute max 8 line we can copy in the size limit
         */
-       cheight = ((0x0000ffff << 2) / pitch) & 0xfffffff8;
+       cheight = ((R600_DMA_COPY_MAX_SIZE_DW * 4) / pitch) & 0xfffffff8;
        ncopy = (copy_height / cheight) + !!(copy_height % cheight);
-       r600_need_dma_space(rctx, ncopy * 7);
+       r600_need_dma_space(&rctx->b, ncopy * 7);
 
        for (i = 0; i < ncopy; i++) {
                cheight = cheight > copy_height ? copy_height : cheight;
-               size = (cheight * pitch) >> 2;
+               size = (cheight * pitch) / 4;
                /* emit reloc before writting cs so that cs is always in consistent state */
-               r600_context_bo_reloc(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ);
-               r600_context_bo_reloc(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE);
+               r600_context_bo_reloc(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ,
+                                     RADEON_PRIO_MIN);
+               r600_context_bo_reloc(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE,
+                                     RADEON_PRIO_MIN);
                cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
                cs->buf[cs->cdw++] = base >> 8;
                cs->buf[cs->cdw++] = (detile << 31) | (array_mode << 27) |
@@ -2864,13 +2884,13 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
        return TRUE;
 }
 
-static boolean r600_dma_blit(struct pipe_context *ctx,
-                            struct pipe_resource *dst,
-                            unsigned dst_level,
-                            unsigned dst_x, unsigned dst_y, unsigned dst_z,
-                            struct pipe_resource *src,
-                            unsigned src_level,
-                            const struct pipe_box *src_box)
+static void r600_dma_copy(struct pipe_context *ctx,
+                         struct pipe_resource *dst,
+                         unsigned dst_level,
+                         unsigned dstx, unsigned dsty, unsigned dstz,
+                         struct pipe_resource *src,
+                         unsigned src_level,
+                         const struct pipe_box *src_box)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct r600_texture *rsrc = (struct r600_texture*)src;
@@ -2878,18 +2898,22 @@ static boolean r600_dma_blit(struct pipe_context *ctx,
        unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode, copy_height;
        unsigned src_w, dst_w;
        unsigned src_x, src_y;
+       unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
        if (rctx->b.rings.dma.cs == NULL) {
-               return FALSE;
+               goto fallback;
        }
 
        if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-               r600_dma_copy(rctx, dst, src, dst_x, src_box->x, src_box->width);
-               return TRUE;
+               if (dst_x % 4 || src_box->x % 4 || src_box->width % 4)
+                       goto fallback;
+
+               r600_dma_copy_buffer(rctx, dst, src, dst_x, src_box->x, src_box->width);
+               return;
        }
 
-       if (src->format != dst->format) {
-               return FALSE;
+       if (src->format != dst->format || src_box->depth > 1) {
+               goto fallback;
        }
 
        src_x = util_format_get_nblocksx(src->format, src_box->x);
@@ -2911,12 +2935,12 @@ static boolean r600_dma_blit(struct pipe_context *ctx,
        dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
 
        if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w) {
-               /* strick requirement on r6xx/r7xx */
-               return FALSE;
+               /* strict requirement on r6xx/r7xx */
+               goto fallback;
        }
        /* lot of constraint on alignment this should capture them all */
-       if ((src_pitch & 0x7) || (src_box->y & 0x7) || (dst_y & 0x7)) {
-               return FALSE;
+       if (src_pitch % 8 || src_box->y % 8 || dst_y % 8) {
+               goto fallback;
        }
 
        if (src_mode == dst_mode) {
@@ -2935,16 +2959,22 @@ static boolean r600_dma_blit(struct pipe_context *ctx,
                dst_offset += dst_y * dst_pitch + dst_x * bpp;
                size = src_box->height * src_pitch;
                /* must be dw aligned */
-               if ((dst_offset & 0x3) || (src_offset & 0x3) || (size & 0x3)) {
-                       return FALSE;
+               if (dst_offset % 4 || src_offset % 4 || size % 4) {
+                       goto fallback;
                }
-               r600_dma_copy(rctx, dst, src, dst_offset, src_offset, size);
+               r600_dma_copy_buffer(rctx, dst, src, dst_offset, src_offset, size);
        } else {
-               return r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
+               if (!r600_dma_copy_tile(rctx, dst, dst_level, dst_x, dst_y, dst_z,
                                        src, src_level, src_x, src_y, src_box->z,
-                                       copy_height, dst_pitch, bpp);
+                                       copy_height, dst_pitch, bpp)) {
+                       goto fallback;
+               }
        }
-       return TRUE;
+       return;
+
+fallback:
+       ctx->resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
+                                 src, src_level, src_box);
 }
 
 void r600_init_state_functions(struct r600_context *rctx)
@@ -3008,6 +3038,7 @@ void r600_init_state_functions(struct r600_context *rctx)
        r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
        r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
        rctx->atoms[id++] = &rctx->b.streamout.begin_atom;
+       rctx->atoms[id++] = &rctx->b.streamout.enable_atom;
        r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
        r600_init_atom(rctx, &rctx->pixel_shader.atom, id++, r600_emit_shader, 0);
        r600_init_atom(rctx, &rctx->geometry_shader.atom, id++, r600_emit_shader, 0);
@@ -3024,6 +3055,6 @@ void r600_init_state_functions(struct r600_context *rctx)
        rctx->b.b.set_polygon_stipple = r600_set_polygon_stipple;
        rctx->b.b.set_scissor_states = r600_set_scissor_states;
        rctx->b.b.get_sample_position = r600_get_sample_position;
-       rctx->b.dma_copy = r600_dma_blit;
+       rctx->b.dma_copy = r600_dma_copy;
 }
 /* this function must be last */