nouveau: Add support for SV_WORK_DIM
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index c3346f29811c432f9ed8883f783a0706555c486a..4b282d0bc9ba378d1fda6713a6431238dee1a4b1 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -121,7 +121,7 @@ static unsigned r600_conv_pipe_prim(unsigned prim)
                [PIPE_PRIM_PATCHES]                     = V_008958_DI_PT_PATCH,
                [R600_PRIM_RECTANGLE_LIST]              = V_008958_DI_PT_RECTLIST
        };
-       assert(prim < Elements(prim_conv));
+       assert(prim < ARRAY_SIZE(prim_conv));
        return prim_conv[prim];
 }
 
@@ -145,7 +145,7 @@ unsigned r600_conv_prim_to_gs_out(unsigned mode)
                [PIPE_PRIM_PATCHES]                     = V_028A6C_OUTPRIM_TYPE_POINTLIST,
                [R600_PRIM_RECTANGLE_LIST]              = V_028A6C_OUTPRIM_TYPE_TRISTRIP
        };
-       assert(mode < Elements(prim_conv));
+       assert(mode < ARRAY_SIZE(prim_conv));
 
        return prim_conv[mode];
 }
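
For reference, the Elements() calls above are replaced with Mesa's generic ARRAY_SIZE() element-count macro. A minimal sketch of what that macro boils down to (the real definition lives in Mesa's util/macros.h and may add extra compile-time checking):

    /* Number of elements in a statically sized array. */
    #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
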
@@ -350,9 +350,11 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 
        if (rs->offset_enable &&
            (rs->offset_units != rctx->poly_offset_state.offset_units ||
-            rs->offset_scale != rctx->poly_offset_state.offset_scale)) {
+            rs->offset_scale != rctx->poly_offset_state.offset_scale ||
+            rs->offset_units_unscaled != rctx->poly_offset_state.offset_units_unscaled)) {
                rctx->poly_offset_state.offset_units = rs->offset_units;
                rctx->poly_offset_state.offset_scale = rs->offset_scale;
+               rctx->poly_offset_state.offset_units_unscaled = rs->offset_units_unscaled;
                r600_mark_atom_dirty(rctx, &rctx->poly_offset_state.atom);
        }
 
@@ -364,14 +366,7 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
                r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
        }
 
-       /* Workaround for a missing scissor enable on r600. */
-       if (rctx->b.chip_class == R600 &&
-           rs->scissor_enable != rctx->scissor.enable) {
-               rctx->scissor.enable = rs->scissor_enable;
-               rctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
-               rctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4;
-               r600_mark_atom_dirty(rctx, &rctx->scissor.atom);
-       }
+       r600_set_scissor_enable(&rctx->b, rs->scissor_enable);
 
        /* Re-emit PA_SC_LINE_STIPPLE. */
        rctx->last_primitive_type = -1;
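
The deleted workaround block is folded into a shared helper whose body is not part of this diff, but the removed lines pin down what it has to do on R600. A hedged sketch, reusing the field names from the deleted code (the real helper lives in the shared radeon code and its internals may differ):

    /* Hedged sketch, assuming r600_common_context carries the same scissor
     * bookkeeping that r600_context did in the deleted lines above. */
    void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable)
    {
            /* Workaround for a missing scissor enable on R600. */
            if (rctx->chip_class == R600 && rctx->scissor.enable != enable) {
                    rctx->scissor.enable = enable;
                    rctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
                    rctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4;
                    rctx->set_atom_dirty(rctx, &rctx->scissor.atom, true); /* assumed hook */
            }
    }
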
@@ -519,7 +514,7 @@ static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
 static void r600_delete_vertex_elements(struct pipe_context *ctx, void *state)
 {
        struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state;
-       pipe_resource_reference((struct pipe_resource**)&shader->buffer, NULL);
+       r600_resource_reference(&shader->buffer, NULL);
        FREE(shader);
 }
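
r600_resource_reference(), used above in place of the open-coded cast, is presumably just a type-safe wrapper around pipe_resource_reference(); a sketch under that assumption:

    /* Sketch: reference-count an r600_resource without the caller casting. */
    static inline void
    r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
    {
            pipe_resource_reference((struct pipe_resource **)ptr,
                                    (struct pipe_resource *)res);
    }
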
 
@@ -540,7 +535,6 @@ static void r600_set_index_buffer(struct pipe_context *ctx,
 void r600_vertex_buffers_dirty(struct r600_context *rctx)
 {
        if (rctx->vertex_buffer_state.dirty_mask) {
-               rctx->b.flags |= R600_CONTEXT_INV_VERTEX_CACHE;
                rctx->vertex_buffer_state.atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 12 : 11) *
                                               util_bitcount(rctx->vertex_buffer_state.dirty_mask);
                r600_mark_atom_dirty(rctx, &rctx->vertex_buffer_state.atom);
@@ -597,7 +591,6 @@ void r600_sampler_views_dirty(struct r600_context *rctx,
                              struct r600_samplerview_state *state)
 {
        if (state->dirty_mask) {
-               rctx->b.flags |= R600_CONTEXT_INV_TEX_CACHE;
                state->atom.num_dw = (rctx->b.chip_class >= EVERGREEN ? 14 : 13) *
                                     util_bitcount(state->dirty_mask);
                r600_mark_atom_dirty(rctx, &state->atom);
@@ -645,21 +638,21 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,
                if (rviews[i]) {
                        struct r600_texture *rtex =
                                (struct r600_texture*)rviews[i]->base.texture;
+                       bool is_buffer = rviews[i]->base.texture->target == PIPE_BUFFER;
 
-                       if (rviews[i]->base.texture->target != PIPE_BUFFER) {
-                               if (rtex->is_depth && !rtex->is_flushing_texture) {
-                                       dst->views.compressed_depthtex_mask |= 1 << i;
-                               } else {
-                                       dst->views.compressed_depthtex_mask &= ~(1 << i);
-                               }
+                       if (!is_buffer && rtex->is_depth && !rtex->is_flushing_texture) {
+                               dst->views.compressed_depthtex_mask |= 1 << i;
+                       } else {
+                               dst->views.compressed_depthtex_mask &= ~(1 << i);
+                       }
 
-                               /* Track compressed colorbuffers. */
-                               if (rtex->cmask.size) {
-                                       dst->views.compressed_colortex_mask |= 1 << i;
-                               } else {
-                                       dst->views.compressed_colortex_mask &= ~(1 << i);
-                               }
+                       /* Track compressed colorbuffers. */
+                       if (!is_buffer && rtex->cmask.size) {
+                               dst->views.compressed_colortex_mask |= 1 << i;
+                       } else {
+                               dst->views.compressed_colortex_mask &= ~(1 << i);
                        }
+
                        /* Changing from array to non-arrays textures and vice versa requires
                         * updating TEX_ARRAY_OVERRIDE in sampler states on R6xx-R7xx. */
                        if (rctx->b.chip_class <= R700 &&
@@ -693,45 +686,24 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader,
        }
 }
 
-static void r600_set_viewport_states(struct pipe_context *ctx,
-                                     unsigned start_slot,
-                                     unsigned num_viewports,
-                                     const struct pipe_viewport_state *state)
+static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *views)
 {
-       struct r600_context *rctx = (struct r600_context *)ctx;
-       struct r600_viewport_state *rstate = &rctx->viewport;
-       int i;
+       uint32_t mask = views->enabled_mask;
 
-       for (i = start_slot; i < start_slot + num_viewports; i++)
-               rstate->state[i] = state[i - start_slot];
-       rstate->dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
-       rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 8;
-       r600_mark_atom_dirty(rctx, &rctx->viewport.atom);
-}
+       while (mask) {
+               unsigned i = u_bit_scan(&mask);
+               struct pipe_resource *res = views->views[i]->base.texture;
 
-void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
-{
-       struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-       struct r600_viewport_state *rstate = &rctx->viewport;
-       struct pipe_viewport_state *state;
-       uint32_t dirty_mask;
-       unsigned i, offset;
-
-       dirty_mask = rstate->dirty_mask;
-       while (dirty_mask != 0) {
-               i = u_bit_scan(&dirty_mask);
-               offset = i * 6 * 4;
-               radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE_0 + offset, 6);
-               state = &rstate->state[i];
-               radeon_emit(cs, fui(state->scale[0]));     /* R_02843C_PA_CL_VPORT_XSCALE_0  */
-               radeon_emit(cs, fui(state->translate[0])); /* R_028440_PA_CL_VPORT_XOFFSET_0 */
-               radeon_emit(cs, fui(state->scale[1]));     /* R_028444_PA_CL_VPORT_YSCALE_0  */
-               radeon_emit(cs, fui(state->translate[1])); /* R_028448_PA_CL_VPORT_YOFFSET_0 */
-               radeon_emit(cs, fui(state->scale[2]));     /* R_02844C_PA_CL_VPORT_ZSCALE_0  */
-               radeon_emit(cs, fui(state->translate[2])); /* R_028450_PA_CL_VPORT_ZOFFSET_0 */
-       }
-       rstate->dirty_mask = 0;
-       rstate->atom.num_dw = 0;
+               if (res && res->target != PIPE_BUFFER) {
+                       struct r600_texture *rtex = (struct r600_texture *)res;
+
+                       if (rtex->cmask.size) {
+                               views->compressed_colortex_mask |= 1 << i;
+                       } else {
+                               views->compressed_colortex_mask &= ~(1 << i);
+                       }
+               }
+       }
 }
 
 /* Compute the key for the hw shader variant */
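
The new r600_update_compressed_colortex_mask() walks enabled_mask with u_bit_scan(), which visits each set bit exactly once. A minimal sketch of that helper for readers unfamiliar with the idiom (Mesa's version lives in util/u_math.h):

    #include <strings.h> /* ffs() */

    /* Sketch: return the index of the lowest set bit and clear it, so
     * `while (mask) { i = u_bit_scan(&mask); ... }` visits every set bit. */
    static inline int u_bit_scan(unsigned *mask)
    {
            int i = ffs(*mask) - 1;
            *mask &= ~(1u << i);
            return i;
    }
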
@@ -941,6 +913,18 @@ static void r600_bind_ps_state(struct pipe_context *ctx, void *state)
        rctx->ps_shader = (struct r600_pipe_shader_selector *)state;
 }
 
+static struct tgsi_shader_info *r600_get_vs_info(struct r600_context *rctx)
+{
+       if (rctx->gs_shader)
+               return &rctx->gs_shader->info;
+       else if (rctx->tes_shader)
+               return &rctx->tes_shader->info;
+       else if (rctx->vs_shader)
+               return &rctx->vs_shader->info;
+       else
+               return NULL;
+}
+
 static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
@@ -949,6 +933,7 @@ static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
                return;
 
        rctx->vs_shader = (struct r600_pipe_shader_selector *)state;
+       r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
        rctx->b.streamout.stride_in_dw = rctx->vs_shader->so.stride;
 }
 
@@ -957,6 +942,7 @@ static void r600_bind_gs_state(struct pipe_context *ctx, void *state)
        struct r600_context *rctx = (struct r600_context *)ctx;
 
        rctx->gs_shader = (struct r600_pipe_shader_selector *)state;
+       r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
 
        if (!state)
                return;
@@ -975,6 +961,7 @@ static void r600_bind_tes_state(struct pipe_context *ctx, void *state)
        struct r600_context *rctx = (struct r600_context *)ctx;
 
        rctx->tes_shader = (struct r600_pipe_shader_selector *)state;
+       r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
 
        if (!state)
                return;
@@ -1061,7 +1048,6 @@ static void r600_delete_tes_state(struct pipe_context *ctx, void *state)
 void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf_state *state)
 {
        if (state->dirty_mask) {
-               rctx->b.flags |= R600_CONTEXT_INV_CONST_CACHE;
                state->atom.num_dw = rctx->b.chip_class >= EVERGREEN ? util_bitcount(state->dirty_mask)*20
                                                                   : util_bitcount(state->dirty_mask)*19;
                r600_mark_atom_dirty(rctx, &state->atom);
@@ -1069,7 +1055,7 @@ void r600_constant_buffers_dirty(struct r600_context *rctx, struct r600_constbuf
 }
 
 static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
-                                    struct pipe_constant_buffer *input)
+                                    const struct pipe_constant_buffer *input)
 {
        struct r600_context *rctx = (struct r600_context *)ctx;
        struct r600_constbuf_state *state = &rctx->constbuf_state[shader];
@@ -1138,7 +1124,7 @@ static void r600_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask
 
 static void r600_update_driver_const_buffers(struct r600_context *rctx)
 {
-       int sh, size;;
+       int sh, size;
        void *ptr;
        struct pipe_constant_buffer cb;
        for (sh = 0; sh < PIPE_SHADER_TYPES; sh++) {
@@ -1296,7 +1282,7 @@ void r600_set_sample_locations_constant_buffer(struct r600_context *rctx)
        struct pipe_context *ctx = &rctx->b.b;
 
        assert(rctx->framebuffer.nr_samples < R600_UCP_SIZE);
-       assert(rctx->framebuffer.nr_samples <= Elements(rctx->sample_positions)/4);
+       assert(rctx->framebuffer.nr_samples <= ARRAY_SIZE(rctx->sample_positions)/4);
 
        memset(rctx->sample_positions, 0, 4 * 4 * 16);
        for (i = 0; i < rctx->framebuffer.nr_samples; i++) {
@@ -1391,7 +1377,7 @@ static void r600_generate_fixed_func_tcs(struct r600_context *rctx)
 {
        struct ureg_src const0, const1;
        struct ureg_dst tessouter, tessinner;
-       struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+       struct ureg_program *ureg = ureg_create(PIPE_SHADER_TESS_CTRL);
 
        if (!ureg)
                return; /* if we get here, we're screwed */
@@ -1457,6 +1443,16 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
        if (!rctx->blitter->running) {
                unsigned i;
+               unsigned counter;
+
+               counter = p_atomic_read(&rctx->screen->b.compressed_colortex_counter);
+               if (counter != rctx->b.last_compressed_colortex_counter) {
+                       rctx->b.last_compressed_colortex_counter = counter;
+
+                       for (i = 0; i < PIPE_SHADER_TYPES; ++i) {
+                               r600_update_compressed_colortex_mask(&rctx->samplers[i].views);
+                       }
+               }
 
                /* Decompress textures if needed. */
                for (i = 0; i < PIPE_SHADER_TYPES; i++) {
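
The counter check added in this hunk is the consumer half of a screen-wide invalidation scheme: each context caches last_compressed_colortex_counter and refreshes its per-sampler masks whenever the screen's counter has moved. A hedged sketch of the producer side; the function name and call site are hypothetical, only p_atomic_inc() from util/u_atomic.h is real:

    #include "util/u_atomic.h"

    /* Hypothetical producer: bump the screen counter whenever a texture's
     * CMASK is allocated or freed, so every context revalidates its
     * compressed_colortex_mask on the next draw. */
    static void r600_texture_notify_cmask_change(struct r600_common_screen *rscreen)
    {
            p_atomic_inc(&rscreen->compressed_colortex_counter);
    }
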
@@ -1672,7 +1668,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
        bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
        uint64_t mask;
-       unsigned num_patches;
+       unsigned num_patches, dirty_fb_counter;
 
        if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) {
                return;
@@ -1684,10 +1680,17 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        }
 
        /* make sure that the gfx ring is only one active */
-       if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
+       if (radeon_emitted(rctx->b.dma.cs, 0)) {
                rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
        }
 
+       /* Re-emit the framebuffer state if needed. */
+       dirty_fb_counter = p_atomic_read(&rctx->b.screen->dirty_fb_counter);
+       if (dirty_fb_counter != rctx->b.last_dirty_fb_counter) {
+               rctx->b.last_dirty_fb_counter = dirty_fb_counter;
+               r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom);
+       }
+
        if (!r600_update_derived_state(rctx)) {
                /* useless to render because current rendering command
                 * can't be achieved
@@ -1804,8 +1807,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                        ia_switch_on_eop = true;
                }
 
-               if (rctx->b.streamout.streamout_enabled ||
-                   rctx->b.streamout.prims_gen_query_enabled)
+               if (r600_get_strmout_en(&rctx->b))
                        partial_vs_wave = true;
 
                radeon_set_context_reg(cs, CM_R_028AA8_IA_MULTI_VGT_PARAM,
@@ -1868,8 +1870,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 
        /* Draw packets. */
        if (!info.indirect) {
-               cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, 0);
-               cs->buf[cs->cdw++] = info.instance_count;
+               radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, 0));
+               radeon_emit(cs, info.instance_count);
        }
 
        if (unlikely(info.indirect)) {
@@ -1880,66 +1882,65 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                rctx->vgt_state.last_draw_was_indirect = true;
                rctx->last_start_instance = -1;
 
-               cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, 0);
-               cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE;
-               cs->buf[cs->cdw++] = va;
-               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+               radeon_emit(cs, PKT3(EG_PKT3_SET_BASE, 2, 0));
+               radeon_emit(cs, EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE);
+               radeon_emit(cs, va);
+               radeon_emit(cs, (va >> 32UL) & 0xFF);
 
-               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-               cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-                                                          (struct r600_resource*)info.indirect,
-                                                          RADEON_USAGE_READ,
-                                                           RADEON_PRIO_DRAW_INDIRECT);
+               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+               radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+                                                         (struct r600_resource*)info.indirect,
+                                                         RADEON_USAGE_READ,
+                                                          RADEON_PRIO_DRAW_INDIRECT));
        }
 
        if (info.indexed) {
-               cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, 0);
-               cs->buf[cs->cdw++] = ib.index_size == 4 ?
-                                       (VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) :
-                                       (VGT_INDEX_16 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_16_BIT : 0));
+               radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
+               radeon_emit(cs, ib.index_size == 4 ?
+                               (VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) :
+                               (VGT_INDEX_16 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_16_BIT : 0)));
 
                if (ib.user_buffer) {
                        unsigned size_bytes = info.count*ib.index_size;
                        unsigned size_dw = align(size_bytes, 4) / 4;
-                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit);
-                       cs->buf[cs->cdw++] = info.count;
-                       cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE;
-                       memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes);
-                       cs->cdw += size_dw;
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit));
+                       radeon_emit(cs, info.count);
+                       radeon_emit(cs, V_0287F0_DI_SRC_SEL_IMMEDIATE);
+                       radeon_emit_array(cs, ib.user_buffer, size_dw);
                } else {
                        uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
 
                        if (likely(!info.indirect)) {
-                               cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit);
-                               cs->buf[cs->cdw++] = va;
-                               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
-                               cs->buf[cs->cdw++] = info.count;
-                               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
-                               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-                               cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-                                                                          (struct r600_resource*)ib.buffer,
-                                                                          RADEON_USAGE_READ,
-                                                                           RADEON_PRIO_INDEX_BUFFER);
+                               radeon_emit(cs, PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit));
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, (va >> 32UL) & 0xFF);
+                               radeon_emit(cs, info.count);
+                               radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
+                               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+                               radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+                                                                         (struct r600_resource*)ib.buffer,
+                                                                         RADEON_USAGE_READ,
+                                                                          RADEON_PRIO_INDEX_BUFFER));
                        }
                        else {
                                uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
 
-                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, 0);
-                               cs->buf[cs->cdw++] = va;
-                               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+                               radeon_emit(cs, PKT3(EG_PKT3_INDEX_BASE, 1, 0));
+                               radeon_emit(cs, va);
+                               radeon_emit(cs, (va >> 32UL) & 0xFF);
 
-                               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-                               cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-                                                                          (struct r600_resource*)ib.buffer,
-                                                                          RADEON_USAGE_READ,
-                                                                           RADEON_PRIO_INDEX_BUFFER);
+                               radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+                               radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+                                                                         (struct r600_resource*)ib.buffer,
+                                                                         RADEON_USAGE_READ,
+                                                                          RADEON_PRIO_INDEX_BUFFER));
 
-                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0);
-                               cs->buf[cs->cdw++] = max_size;
+                               radeon_emit(cs, PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0));
+                               radeon_emit(cs, max_size);
 
-                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit);
-                               cs->buf[cs->cdw++] = info.indirect_offset;
-                               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
+                               radeon_emit(cs, PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit));
+                               radeon_emit(cs, info.indirect_offset);
+                               radeon_emit(cs, V_0287F0_DI_SRC_SEL_DMA);
                        }
                }
        } else {
@@ -1949,29 +1950,29 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 
                        radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, t->stride_in_dw);
 
-                       cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0);
-                       cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
-                       cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL;     /* src address lo */
-                       cs->buf[cs->cdw++] = (va >> 32UL) & 0xFFUL; /* src address hi */
-                       cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
-                       cs->buf[cs->cdw++] = 0; /* unused */
+                       radeon_emit(cs, PKT3(PKT3_COPY_DW, 4, 0));
+                       radeon_emit(cs, COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG);
+                       radeon_emit(cs, va & 0xFFFFFFFFUL);     /* src address lo */
+                       radeon_emit(cs, (va >> 32UL) & 0xFFUL); /* src address hi */
+                       radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); /* dst register */
+                       radeon_emit(cs, 0); /* unused */
 
-                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-                       cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
-                                                                  t->buf_filled_size, RADEON_USAGE_READ,
-                                                                  RADEON_PRIO_SO_FILLED_SIZE);
+                       radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
+                       radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
+                                                                 t->buf_filled_size, RADEON_USAGE_READ,
+                                                                 RADEON_PRIO_SO_FILLED_SIZE));
                }
 
                if (likely(!info.indirect)) {
-                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit);
-                       cs->buf[cs->cdw++] = info.count;
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
+                       radeon_emit(cs, info.count);
                }
                else {
-                       cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit);
-                       cs->buf[cs->cdw++] = info.indirect_offset;
+                       radeon_emit(cs, PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit));
+                       radeon_emit(cs, info.indirect_offset);
                }
-               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX |
-                                       (info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
+               radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
+                               (info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0));
        }
 
        /* SMX returns CONTEXT_DONE too early workaround */
@@ -1981,19 +1982,15 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
            rctx->b.family == CHIP_RV635) {
                /* if we have gs shader or streamout
                   we need to do a wait idle after every draw */
-               if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+               if (rctx->gs_shader || r600_get_strmout_en(&rctx->b)) {
                        radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
                }
        }
 
        /* ES ring rolling over at EOP - workaround */
        if (rctx->b.chip_class == R600) {
-               cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
-               cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
-       }
-
-       if (rctx->screen->b.trace_bo) {
-               r600_trace_emit(rctx);
+               radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+               radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT));
        }
 
        /* Set the depth buffer as dirty. */
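
r600_get_strmout_en() replaces the two-flag test that the removed lines spelled out; a sketch based directly on those deleted conditions:

    /* Sketch: streamout counts as enabled for these VGT workarounds when it
     * is active or when a PRIMITIVES_GENERATED query needs it to appear so. */
    static inline bool r600_get_strmout_en(struct r600_common_context *rctx)
    {
            return rctx->streamout.streamout_enabled ||
                   rctx->streamout.prims_gen_query_enabled;
    }
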
@@ -2090,17 +2087,6 @@ unsigned r600_tex_wrap(unsigned wrap)
        }
 }
 
-unsigned r600_tex_filter(unsigned filter)
-{
-       switch (filter) {
-       default:
-       case PIPE_TEX_FILTER_NEAREST:
-               return V_03C000_SQ_TEX_XY_FILTER_POINT;
-       case PIPE_TEX_FILTER_LINEAR:
-               return V_03C000_SQ_TEX_XY_FILTER_BILINEAR;
-       }
-}
-
 unsigned r600_tex_mipfilter(unsigned filter)
 {
        switch (filter) {
@@ -2203,22 +2189,22 @@ unsigned r600_get_swizzle_combined(const unsigned char *swizzle_format,
        /* Get swizzle. */
        for (i = 0; i < 4; i++) {
                switch (swizzle[i]) {
-               case UTIL_FORMAT_SWIZZLE_Y:
+               case PIPE_SWIZZLE_Y:
                        result |= swizzle_bit[1] << swizzle_shift[i];
                        break;
-               case UTIL_FORMAT_SWIZZLE_Z:
+               case PIPE_SWIZZLE_Z:
                        result |= swizzle_bit[2] << swizzle_shift[i];
                        break;
-               case UTIL_FORMAT_SWIZZLE_W:
+               case PIPE_SWIZZLE_W:
                        result |= swizzle_bit[3] << swizzle_shift[i];
                        break;
-               case UTIL_FORMAT_SWIZZLE_0:
+               case PIPE_SWIZZLE_0:
                        result |= V_038010_SQ_SEL_0 << swizzle_shift[i];
                        break;
-               case UTIL_FORMAT_SWIZZLE_1:
+               case PIPE_SWIZZLE_1:
                        result |= V_038010_SQ_SEL_1 << swizzle_shift[i];
                        break;
-               default: /* UTIL_FORMAT_SWIZZLE_X */
+               default: /* PIPE_SWIZZLE_X */
                        result |= swizzle_bit[0] << swizzle_shift[i];
                }
        }
@@ -2229,16 +2215,19 @@ unsigned r600_get_swizzle_combined(const unsigned char *swizzle_format,
 uint32_t r600_translate_texformat(struct pipe_screen *screen,
                                  enum pipe_format format,
                                  const unsigned char *swizzle_view,
-                                 uint32_t *word4_p, uint32_t *yuv_format_p)
+                                 uint32_t *word4_p, uint32_t *yuv_format_p,
+                                 bool do_endian_swap)
 {
        struct r600_screen *rscreen = (struct r600_screen *)screen;
        uint32_t result = 0, word4 = 0, yuv_format = 0;
        const struct util_format_description *desc;
        boolean uniform = TRUE;
-       bool enable_s3tc = rscreen->b.info.drm_minor >= 9;
        bool is_srgb_valid = FALSE;
        const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
        const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
+       const unsigned char swizzle_xxxy[4] = {0, 0, 0, 1};
+       const unsigned char swizzle_zyx1[4] = {2, 1, 0, 5};
+       const unsigned char swizzle_zyxw[4] = {2, 1, 0, 3};
 
        int i;
        const uint32_t sign_bit[4] = {
@@ -2247,11 +2236,41 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
                S_038010_FORMAT_COMP_Z(V_038010_SQ_FORMAT_COMP_SIGNED),
                S_038010_FORMAT_COMP_W(V_038010_SQ_FORMAT_COMP_SIGNED)
        };
+
+       /* On big-endian hosts, replace the specified texture formats with
+        * their byte-swapped counterparts. The affected formats are those
+        * whose channels are not a whole number of bytes wide.
+        * Mesa's conversion functions don't swap bits for these formats, and
+        * since the data travels to the GPU over a serial bus (PCIe), the
+        * bit-endianness matters.
+        * If an "opposite" format exists, simply use it for the swizzling
+        * information. If no such "opposite" format exists, fixed swizzle
+        * info has to be used instead (see below).
+        */
+       if (format == PIPE_FORMAT_R4A4_UNORM && do_endian_swap)
+               format = PIPE_FORMAT_A4R4_UNORM;
+
        desc = util_format_description(format);
 
        /* Depth and stencil swizzling is handled separately. */
        if (desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) {
-               word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view, FALSE);
+               /* Check for the texture formats that have no "opposite"
+                * format we could use. For those, specify the swizzle
+                * directly: the little-endian swizzle as defined in
+                * u_format.csv.
+                */
+               if (do_endian_swap) {
+                       if (format == PIPE_FORMAT_L4A4_UNORM)
+                               word4 |= r600_get_swizzle_combined(swizzle_xxxy, swizzle_view, FALSE);
+                       else if (format == PIPE_FORMAT_B4G4R4A4_UNORM)
+                               word4 |= r600_get_swizzle_combined(swizzle_zyxw, swizzle_view, FALSE);
+                       else if (format == PIPE_FORMAT_B4G4R4X4_UNORM || format == PIPE_FORMAT_B5G6R5_UNORM)
+                               word4 |= r600_get_swizzle_combined(swizzle_zyx1, swizzle_view, FALSE);
+                       else
+                               word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view, FALSE);
+               } else {
+                       word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view, FALSE);
+               }
        }
 
        /* Colorspace (return non-RGB formats directly). */
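
The fixed swizzle tables introduced above (swizzle_xxxy, swizzle_zyx1, swizzle_zyxw) use the u_format index convention in which 0-3 select the X/Y/Z/W channels and 4/5 select the constants 0/1; that is why swizzle_zyx1 ends in 5. A small self-contained reminder of that encoding:

    /* Reminder of the swizzle encoding used by the tables above
     * (mirrors the PIPE_SWIZZLE_* / u_format.csv ordering). */
    enum swz {
            SWZ_X = 0,  /* source channel X (R) */
            SWZ_Y = 1,
            SWZ_Z = 2,
            SWZ_W = 3,
            SWZ_0 = 4,  /* constant 0 */
            SWZ_1 = 5   /* constant 1 */
    };
    /* swizzle_zyx1 = {2, 1, 0, 5}: read Z,Y,X and force the last component
     * to 1, i.e. reverse the RGB order and drop alpha. */
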
@@ -2330,9 +2349,6 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-               if (!enable_s3tc)
-                       goto out_unknown;
-
                switch (format) {
                case PIPE_FORMAT_RGTC1_SNORM:
                case PIPE_FORMAT_LATC1_SNORM:
@@ -2354,10 +2370,6 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-
-               if (!enable_s3tc)
-                       goto out_unknown;
-
                if (!util_format_s3tc_enabled) {
                        goto out_unknown;
                }
@@ -2386,9 +2398,6 @@ uint32_t r600_translate_texformat(struct pipe_screen *screen,
        }
 
        if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-               if (!enable_s3tc)
-                       goto out_unknown;
-
                if (rscreen->b.chip_class < EVERGREEN)
                        goto out_unknown;
 
@@ -2602,7 +2611,8 @@ out_unknown:
        return ~0;
 }
 
-uint32_t r600_translate_colorformat(enum chip_class chip, enum pipe_format format)
+uint32_t r600_translate_colorformat(enum chip_class chip, enum pipe_format format,
+                                               bool do_endian_swap)
 {
        const struct util_format_description *desc = util_format_description(format);
        int channel = util_format_get_first_non_void_channel(format);
@@ -2660,7 +2670,7 @@ uint32_t r600_translate_colorformat(enum chip_class chip, enum pipe_format forma
                                        return V_0280A0_COLOR_32_32;
                        }
                } else if (HAS_SIZE(8,24,0,0)) {
-                       return V_0280A0_COLOR_24_8;
+                       return (do_endian_swap ? V_0280A0_COLOR_8_24 : V_0280A0_COLOR_24_8);
                } else if (HAS_SIZE(24,8,0,0)) {
                        return V_0280A0_COLOR_8_24;
                }
@@ -2702,7 +2712,7 @@ uint32_t r600_translate_colorformat(enum chip_class chip, enum pipe_format forma
        return ~0U;
 }
 
-uint32_t r600_colorformat_endian_swap(uint32_t colorformat)
+uint32_t r600_colorformat_endian_swap(uint32_t colorformat, bool do_endian_swap)
 {
        if (R600_BIG_ENDIAN) {
                switch(colorformat) {
@@ -2712,22 +2722,38 @@ uint32_t r600_colorformat_endian_swap(uint32_t colorformat)
                        return ENDIAN_NONE;
 
                /* 16-bit buffers. */
+               case V_0280A0_COLOR_8_8:
+                       /*
+                        * No need to do endian swaps on array formats,
+                        * as the mesa<->pipe format conversion already takes
+                        * endianness into account.
+                        */
+                       return ENDIAN_NONE;
+
                case V_0280A0_COLOR_5_6_5:
                case V_0280A0_COLOR_1_5_5_5:
                case V_0280A0_COLOR_4_4_4_4:
                case V_0280A0_COLOR_16:
-               case V_0280A0_COLOR_8_8:
-                       return ENDIAN_8IN16;
+                       return (do_endian_swap ? ENDIAN_8IN16 : ENDIAN_NONE);
 
                /* 32-bit buffers. */
                case V_0280A0_COLOR_8_8_8_8:
+                       /*
+                        * No need to do endian swaps on array formats,
+                        * as the mesa<->pipe format conversion already takes
+                        * endianness into account.
+                        */
+                       return ENDIAN_NONE;
+
                case V_0280A0_COLOR_2_10_10_10:
                case V_0280A0_COLOR_8_24:
                case V_0280A0_COLOR_24_8:
                case V_0280A0_COLOR_32_FLOAT:
+                       return (do_endian_swap ? ENDIAN_8IN32 : ENDIAN_NONE);
+
                case V_0280A0_COLOR_16_16_FLOAT:
                case V_0280A0_COLOR_16_16:
-                       return ENDIAN_8IN32;
+                       return ENDIAN_8IN16;
 
                /* 64-bit buffers. */
                case V_0280A0_COLOR_16_16_16_16:
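
For readers unfamiliar with the ENDIAN_* codes returned above: they tell the colorbuffer hardware the element width within which to swap bytes, so ENDIAN_8IN16 byte-swaps each 16-bit value and ENDIAN_8IN32 each 32-bit value, while ENDIAN_NONE leaves memory untouched. A purely illustrative sketch of the two reorderings (not driver code):

    #include <stdint.h>

    /* What ENDIAN_8IN16 asks for, per 16-bit element. */
    static inline uint16_t swap_8in16(uint16_t v)
    {
            return (uint16_t)((v >> 8) | (v << 8));
    }

    /* What ENDIAN_8IN32 asks for, per 32-bit element. */
    static inline uint32_t swap_8in32(uint32_t v)
    {
            return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
                   ((v << 8) & 0x00ff0000u) | (v << 24);
    }
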
@@ -2760,7 +2786,7 @@ static void r600_invalidate_buffer(struct pipe_context *ctx, struct pipe_resourc
 
        /* Reallocate the buffer in the same pipe_resource. */
        r600_init_resource(&rctx->screen->b, rbuffer, rbuffer->b.b.width0,
-                          alignment, TRUE);
+                          alignment);
 
        /* We changed the buffer, now we need to bind it where the old one was bound. */
        /* Vertex buffers. */
@@ -2774,7 +2800,8 @@ static void r600_invalidate_buffer(struct pipe_context *ctx, struct pipe_resourc
        }
        /* Streamout buffers. */
        for (i = 0; i < rctx->b.streamout.num_targets; i++) {
-               if (rctx->b.streamout.targets[i]->b.buffer == &rbuffer->b.b) {
+               if (rctx->b.streamout.targets[i] &&
+                   rctx->b.streamout.targets[i]->b.buffer == &rbuffer->b.b) {
                        if (rctx->b.streamout.begin_emitted) {
                                r600_emit_streamout_end(&rctx->b);
                        }
@@ -2832,16 +2859,33 @@ static void r600_invalidate_buffer(struct pipe_context *ctx, struct pipe_resourc
        }
 }
 
-static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
+static void r600_set_active_query_state(struct pipe_context *ctx, boolean enable)
 {
        struct r600_context *rctx = (struct r600_context*)ctx;
 
-       if (rctx->db_misc_state.occlusion_query_enabled != enable) {
-               rctx->db_misc_state.occlusion_query_enabled = enable;
+       /* Pipeline stat & streamout queries. */
+       if (enable) {
+               rctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS;
+               rctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS;
+       } else {
+               rctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS;
+               rctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS;
+       }
+
+       /* Occlusion queries. */
+       if (rctx->db_misc_state.occlusion_queries_disabled != !enable) {
+               rctx->db_misc_state.occlusion_queries_disabled = !enable;
                r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
        }
 }
 
+static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
+{
+       struct r600_context *rctx = (struct r600_context*)ctx;
+
+       r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
+}
+
 static void r600_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
                                    bool include_draw_vbo)
 {
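
The START/STOP_PIPELINE_STATS context flags set in r600_set_active_query_state() above are presumably turned into EVENT_WRITE packets by the flush/emit path; a hedged sketch of that consumer (the function name and exact call site are assumptions, the event names are the usual r600 ones):

    /* Hedged sketch: translate the cached flags into PIPELINESTAT start/stop
     * events the next time the context emits its pending flushes. */
    static void r600_emit_pipeline_stat_events(struct r600_context *rctx)
    {
            struct radeon_winsys_cs *cs = rctx->b.gfx.cs;

            if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) {
                    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0));
            } else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
                    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
                    radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) | EVENT_INDEX(0));
            }
            rctx->b.flags &= ~(R600_CONTEXT_START_PIPELINE_STATS |
                               R600_CONTEXT_STOP_PIPELINE_STATS);
    }
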
@@ -2882,34 +2926,15 @@ void r600_init_common_state_functions(struct r600_context *rctx)
        rctx->b.b.set_constant_buffer = r600_set_constant_buffer;
        rctx->b.b.set_sample_mask = r600_set_sample_mask;
        rctx->b.b.set_stencil_ref = r600_set_pipe_stencil_ref;
-       rctx->b.b.set_viewport_states = r600_set_viewport_states;
        rctx->b.b.set_vertex_buffers = r600_set_vertex_buffers;
        rctx->b.b.set_index_buffer = r600_set_index_buffer;
        rctx->b.b.set_sampler_views = r600_set_sampler_views;
        rctx->b.b.sampler_view_destroy = r600_sampler_view_destroy;
        rctx->b.b.texture_barrier = r600_texture_barrier;
        rctx->b.b.set_stream_output_targets = r600_set_streamout_targets;
+       rctx->b.b.set_active_query_state = r600_set_active_query_state;
        rctx->b.b.draw_vbo = r600_draw_vbo;
        rctx->b.invalidate_buffer = r600_invalidate_buffer;
        rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state;
        rctx->b.need_gfx_cs_space = r600_need_gfx_cs_space;
 }
-
-void r600_trace_emit(struct r600_context *rctx)
-{
-       struct r600_screen *rscreen = rctx->screen;
-       struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-       uint64_t va;
-       uint32_t reloc;
-
-       va = rscreen->b.trace_bo->gpu_address;
-       reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo,
-                                     RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
-       radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
-       radeon_emit(cs, va & 0xFFFFFFFFUL);
-       radeon_emit(cs, (va >> 32UL) & 0xFFUL);
-       radeon_emit(cs, cs->cdw);
-       radeon_emit(cs, rscreen->b.cs_count);
-       radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-       radeon_emit(cs, reloc);
-}