r600: add support for ARB_shader_clock.

[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c

index 7768f503c0ccd1077c452db8156b1a4f164a61b6..623e6f7f70473a6ce0e83926b55ea8159a6e07c1 100644 (file)
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -190,6 +190,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
         }
         use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
         use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
+       use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
  
         /* disable SB for shaders using doubles */
         use_sb &= !shader->shader.uses_doubles;
@@ -279,6 +280,9 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
                         r600_update_ps_state(ctx, shader);
                 }
                 break;
+       case PIPE_SHADER_COMPUTE:
+               evergreen_update_ls_state(ctx, shader);
+               break;
         default:
                 r = -EINVAL;
                 goto error;
@@ -342,8 +346,10 @@ struct r600_shader_ctx {
         boolean                 clip_vertex_write;
         unsigned                cv_output;
         unsigned                edgeflag_output;
+       int                                     cs_block_size_reg;
+       int                                     cs_grid_size_reg;
+       bool cs_block_size_loaded, cs_grid_size_loaded;
         int                                     fragcoord_input;
-       int                                     native_integers;
         int                                     next_ring_offset;
         int                                     gs_out_ring_offset;
         int                                     gs_next_vertex;
@@ -971,6 +977,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
         case TGSI_FILE_ADDRESS:
         case TGSI_FILE_BUFFER:
         case TGSI_FILE_IMAGE:
+       case TGSI_FILE_MEMORY:
                 break;
  
         case TGSI_FILE_HW_ATOMIC:
@@ -990,22 +997,6 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
                         d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
                         break; /* Already handled from allocate_system_value_inputs */
                 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
-                       if (!ctx->native_integers) {
-                               struct r600_bytecode_alu alu;
-                               memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-
-                               alu.op = ALU_OP1_INT_TO_FLT;
-                               alu.src[0].sel = 0;
-                               alu.src[0].chan = 3;
-
-                               alu.dst.sel = 0;
-                               alu.dst.chan = 3;
-                               alu.dst.write = 1;
-                               alu.last = 1;
-
-                               if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
-                                       return r;
-                       }
                         break;
                 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
                         break;
@@ -1293,7 +1284,7 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_
         vtx.num_format_all = 2;
         vtx.format_comp_all = 1;
         vtx.use_const_fields = 0;
-       vtx.offset = 1; // first element is size of buffer
+       vtx.offset = 0;
         vtx.endian = r600_endian_swap(32);
         vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
  
@@ -1304,6 +1295,60 @@ static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_
         return t1;
  }
  
+static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
+{
+       struct r600_bytecode_vtx vtx;
+       struct r600_bytecode_alu alu;
+       int r, t1;
+       /* Emits the fetch of the block size (load_block) or the grid size into its reserved GPR, at most
+        * once per kind (cached separately).  Returns that GPR, or the bytecode emitter's error code. */
+       if (load_block && ctx->cs_block_size_loaded)
+               return ctx->cs_block_size_reg;
+       if (!load_block && ctx->cs_grid_size_loaded)
+               return ctx->cs_grid_size_reg;
+
+       t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
+       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+       alu.op = ALU_OP1_MOV;
+       alu.src[0].sel = V_SQ_ALU_SRC_0; /* written to t1 channel 0, which the fetch below uses as its source */
+       alu.dst.sel = t1;
+       alu.dst.write = 1;
+       alu.last = 1;
+       r = r600_bytecode_add_alu(ctx->bc, &alu);
+       if (r)
+               return r; /* NOTE(review): tgsi_src() stores this return value in sel without checking it */
+
+       memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
+       vtx.op = FETCH_OP_VFETCH;
+       vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
+       vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
+       vtx.src_gpr = t1;
+       vtx.src_sel_x = 0;
+       vtx.mega_fetch_count = 16;
+       vtx.dst_gpr = t1;
+       vtx.dst_sel_x = 0;
+       vtx.dst_sel_y = 1;
+       vtx.dst_sel_z = 2;
+       vtx.dst_sel_w = 7;
+       vtx.data_format = FMT_32_32_32_32;
+       vtx.num_format_all = 1;
+       vtx.format_comp_all = 0;
+       vtx.use_const_fields = 0;
+       vtx.offset = load_block ? 0 : 16; /* block size is read at offset 0, grid size at offset 16 */
+       vtx.endian = r600_endian_swap(32);
+       vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
+
+       r = r600_bytecode_add_vtx(ctx->bc, &vtx);
+       if (r)
+               return r;
+
+       if (load_block)
+               ctx->cs_block_size_loaded = true;
+       else
+               ctx->cs_grid_size_loaded = true;
+       return t1;
+}
+
  static void tgsi_src(struct r600_shader_ctx *ctx,
                      const struct tgsi_full_src_register *tgsi_src,
                      struct r600_shader_src *r600_src)
@@ -1361,6 +1406,10 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
                         r600_src->swizzle[2] = 0;
                         r600_src->swizzle[3] = 0;
                         r600_src->sel = 0;
+               } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
+                       r600_src->sel = 0;
+               } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
+                       r600_src->sel = 1;
                 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
                         r600_src->swizzle[0] = 3;
                         r600_src->swizzle[1] = 3;
@@ -1405,6 +1454,10 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
                         r600_src->swizzle[1] = 3;
                         r600_src->swizzle[2] = 3;
                         r600_src->swizzle[3] = 3;
+               } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
+                       r600_src->sel = load_block_grid_size(ctx, false);
+               } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
+                       r600_src->sel = load_block_grid_size(ctx, true);
                 }
         } else {
                 if (tgsi_src->Register.Indirect)
@@ -2798,7 +2851,7 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
  }
  
  static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
-                                int output_idx)
+                                int output_idx, int nc)
  {
         int param;
         unsigned temp_reg = r600_get_temp(ctx);
@@ -2818,7 +2871,7 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
         if (r)
                 return r;
  
-       do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
+       do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
         return 0;
  }
  
@@ -2888,13 +2941,13 @@ static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
                 return -1;
  
         if (tessouter_idx != -1) {
-               r = r600_tess_factor_read(ctx, tessouter_idx);
+               r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
                 if (r)
                         return r;
         }
  
         if (tessinner_idx != -1) {
-               r = r600_tess_factor_read(ctx, tessinner_idx);
+               r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
                 if (r)
                         return r;
         }
@@ -3058,7 +3111,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
  
         ctx.bc = &shader->bc;
         ctx.shader = shader;
-       ctx.native_integers = true;
  
         r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
                            rscreen->has_compressed_msaa_texturing);
@@ -3113,6 +3165,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
                 shader->rat_base = key.ps.nr_cbufs;
                 shader->image_size_const_offset = key.ps.image_size_const_offset;
                 break;
+       case PIPE_SHADER_COMPUTE:
+               shader->rat_base = 0;
+               shader->image_size_const_offset = 0;
+               break;
         default:
                 break;
         }
@@ -3135,6 +3191,11 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
         ctx.clip_vertex_write = 0;
         ctx.thread_id_gpr_loaded = false;
  
+       ctx.cs_block_size_reg = -1;
+       ctx.cs_grid_size_reg = -1;
+       ctx.cs_block_size_loaded = false;
+       ctx.cs_grid_size_loaded = false;
+
         shader->nr_ps_color_exports = 0;
         shader->nr_ps_max_color_exports = 0;
  
@@ -3198,6 +3259,15 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
                 if (add_tess_inout)
                         ctx.file_offset[TGSI_FILE_INPUT]+=2;
         }
+       if (ctx.type == PIPE_SHADER_COMPUTE) {
+               ctx.file_offset[TGSI_FILE_INPUT] = 2;
+               for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
+                       if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
+                               ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
+                       if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
+                               ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
+               }
+       }
  
         ctx.file_offset[TGSI_FILE_OUTPUT] =
                         ctx.file_offset[TGSI_FILE_INPUT] +
@@ -3266,6 +3336,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
  
         ctx.nliterals = 0;
         ctx.literals = NULL;
+       ctx.max_driver_temp_used = 0;
  
         shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
                                ctx.info.colors_written == 1;
@@ -3839,7 +3910,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
                         last = r600_isa_cf(ctx.bc->cf_last->op);
  
                 /* alu clause instructions don't have EOP bit, so add NOP */
-               if (!last || last->flags & CF_ALU)
+               if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
                         r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
  
                 ctx.bc->cf_last->end_of_program = 1;
@@ -6786,6 +6857,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
         int src_gpr, r, i;
         int id = tgsi_tex_get_src_gpr(ctx, 1);
+       int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
  
         src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
         if (src_requires_loading) {
@@ -6817,6 +6889,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l
         vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;          /* SEL_Z */
         vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;          /* SEL_W */
         vtx.use_const_fields = 1;
+       vtx.buffer_index_mode = sampler_index_mode;
  
         if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
                 return r;
@@ -6877,31 +6950,48 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l
  static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-       struct r600_bytecode_alu alu;
         int r;
         int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
+       int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
  
-       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-       alu.op = ALU_OP1_MOV;
-       alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
-       if (ctx->bc->chip_class >= EVERGREEN) {
-               /* channel 0 or 2 of each word */
-               alu.src[0].sel += (id / 2);
-               alu.src[0].chan = (id % 2) * 2;
-       } else {
+       if (ctx->bc->chip_class < EVERGREEN) {
+               struct r600_bytecode_alu alu;
+               memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+               alu.op = ALU_OP1_MOV;
+               alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
                 /* r600 we have them at channel 2 of the second dword */
                 alu.src[0].sel += (id * 2) + 1;
                 alu.src[0].chan = 1;
+               alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
+               tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+               alu.last = 1;
+               r = r600_bytecode_add_alu(ctx->bc, &alu);
+               if (r)
+                       return r;
+               return 0;
+       } else {
+               struct r600_bytecode_vtx vtx;
+               memset(&vtx, 0, sizeof(vtx));
+               vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
+               vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
+               vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
+               vtx.src_gpr = 0;
+               vtx.mega_fetch_count = 16; /* no idea here really... */
+               vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
+               vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;          /* SEL_X */
+               vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7;          /* SEL_Y */
+               vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7;          /* SEL_Z */
+               vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7;          /* SEL_W */
+               vtx.data_format = FMT_32_32_32_32;
+               vtx.buffer_index_mode = sampler_index_mode;
+
+               if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
+                       return r;
+               return 0;
         }
-       alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
-       tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
-       alu.last = 1;
-       r = r600_bytecode_add_alu(ctx->bc, &alu);
-       if (r)
-               return r;
-       return 0;
  }
  
+
  static int tgsi_tex(struct r600_shader_ctx *ctx)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -6955,7 +7045,8 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
  
         if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
                 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
-                       ctx->shader->uses_tex_buffers = true;
+                       if (ctx->bc->chip_class < EVERGREEN)
+                               ctx->shader->uses_tex_buffers = true;
                         return r600_do_buffer_txq(ctx, 1, 0);
                 }
                 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
@@ -7545,9 +7636,9 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
  
                 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
                 if (ctx->bc->chip_class >= EVERGREEN) {
-                       /* channel 1 or 3 of each word */
-                       alu.src[0].sel += (id / 2);
-                       alu.src[0].chan = ((id % 2) * 2) + 1;
+                       /* with eg each dword is number of cubes */
+                       alu.src[0].sel += id / 4;
+                       alu.src[0].chan = id % 4;
                 } else {
                         /* r600 we have them at channel 2 of the second dword */
                         alu.src[0].sel += (id * 2) + 1;
@@ -7814,7 +7905,7 @@ static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
                              int *uav_id_p, int *uav_index_mode_p)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-       int uav_id, uav_index_mode;
+       int uav_id, uav_index_mode = 0;
         int r;
         bool is_cm = (ctx->bc->chip_class == CAYMAN);
  
@@ -8115,6 +8206,30 @@ static int tgsi_load_rat(struct r600_shader_ctx *ctx)
         return 0;
  }
  
+static int tgsi_load_lds(struct r600_shader_ctx *ctx)
+{
+       /* LOAD with a TGSI_FILE_MEMORY source (see tgsi_load): MOV + do_lds_fetch_values(). */
+       struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+       struct r600_bytecode_alu mov;
+       const int addr_reg = r600_get_temp(ctx);
+       int r;
+
+       /* Copy channel 0 of source operand 1 into channel 0 of a fresh temp. */
+       memset(&mov, 0, sizeof(mov));
+       mov.op = ALU_OP1_MOV;
+       r600_bytecode_src(&mov.src[0], &ctx->src[1], 0);
+       mov.dst.sel = addr_reg;
+       mov.dst.write = 1;
+       mov.last = 1;
+       r = r600_bytecode_add_alu(ctx->bc, &mov);
+       if (r)
+               return r;
+
+       /* Hand the temp, the destination GPR and the destination write mask to the LDS fetch helper. */
+       const int dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
+       return do_lds_fetch_values(ctx, addr_reg, dst_gpr, inst->Dst[0].Register.WriteMask);
+}
+
  static int tgsi_load(struct r600_shader_ctx *ctx)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -8124,6 +8239,8 @@ static int tgsi_load(struct r600_shader_ctx *ctx)
                 return tgsi_load_gds(ctx);
         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
                 return tgsi_load_buffer(ctx);
+       if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
+               return tgsi_load_lds(ctx);
         return 0;
  }
  
@@ -8258,11 +8375,82 @@ static int tgsi_store_rat(struct r600_shader_ctx *ctx)
         return 0;
  }
  
+static int tgsi_store_lds(struct r600_shader_ctx *ctx)
+{
+       struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+       struct r600_bytecode_alu alu;
+       int r, i, lasti;
+       int write_mask = inst->Dst[0].Register.WriteMask;
+       int temp_reg = r600_get_temp(ctx);
+       /* STORE to TGSI_FILE_MEMORY (see tgsi_store): emits LDS writes for the channels in write_mask. */
+       /* temp_reg chan 0 = Src[0] chan 0; temp_reg feeds src[0] of every LDS write below */
+       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+       alu.op = ALU_OP1_MOV;
+       r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+       alu.dst.sel = temp_reg;
+       alu.dst.write = 1;
+       alu.last = 1;
+       r = r600_bytecode_add_alu(ctx->bc, &alu);
+       if (r)
+               return r;
+       /* presumably temp_reg[i] = temp_reg[0] + 4 * i for each enabled i >= 1 - confirm single_alu_op2 args */
+       lasti = tgsi_last_instruction(write_mask);
+       for (i = 1; i <= lasti; i++) {
+               if (!(write_mask & (1 << i)))
+                       continue;
+               r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
+                                  temp_reg, i,
+                                  temp_reg, 0,
+                                  V_SQ_ALU_SRC_LITERAL, 4 * i);
+               if (r)
+                       return r;
+       }
+       for (i = 0; i <= lasti; i++) {
+               if (!(write_mask & (1 << i)))
+                       continue;
+               /* channels 0+1 or 2+3 both enabled: one LDS_WRITE_REL takes Src[1] channels i and i + 1 */
+               if ((i == 0 && ((write_mask & 3) == 3)) ||
+                   (i == 2 && ((write_mask & 0xc) == 0xc))) {
+                       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+                       alu.op = LDS_OP3_LDS_WRITE_REL;
+
+                       alu.src[0].sel = temp_reg;
+                       alu.src[0].chan = i;
+                       r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+                       r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
+                       alu.last = 1;
+                       alu.is_lds_idx_op = true;
+                       alu.lds_idx = 1; /* NOTE(review): presumably places the second value one dword after the first - confirm */
+                       r = r600_bytecode_add_alu(ctx->bc, &alu);
+                       if (r)
+                               return r;
+                       i += 1; /* channel i + 1 was passed to this op as well, so skip it */
+                       continue;
+               }
+               memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+               alu.op = LDS_OP2_LDS_WRITE; /* single-channel case */
+
+               alu.src[0].sel = temp_reg;
+               alu.src[0].chan = i;
+               r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
+
+               alu.last = 1;
+               alu.is_lds_idx_op = true;
+
+               r = r600_bytecode_add_alu(ctx->bc, &alu);
+               if (r)
+                       return r;
+       }
+       return 0;
+}
+
  static int tgsi_store(struct r600_shader_ctx *ctx)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
         if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
                 return tgsi_store_buffer_rat(ctx);
+       else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
+               return tgsi_store_lds(ctx);
         else
                 return tgsi_store_rat(ctx);
  }
@@ -8502,6 +8690,71 @@ static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
         return 0;
  }
  
+static int get_lds_op(int opcode)
+{
+       /* TGSI atomic opcode -> matching "_RET" LDS opcode; -1 when there is no mapping. */
+       static const struct {
+               int tgsi_op;
+               int lds_op;
+       } lds_atomic_ops[] = {
+               { TGSI_OPCODE_ATOMUADD, LDS_OP2_LDS_ADD_RET },
+               { TGSI_OPCODE_ATOMAND,  LDS_OP2_LDS_AND_RET },
+               { TGSI_OPCODE_ATOMOR,   LDS_OP2_LDS_OR_RET },
+               { TGSI_OPCODE_ATOMXOR,  LDS_OP2_LDS_XOR_RET },
+               { TGSI_OPCODE_ATOMUMIN, LDS_OP2_LDS_MIN_UINT_RET },
+               { TGSI_OPCODE_ATOMUMAX, LDS_OP2_LDS_MAX_UINT_RET },
+               { TGSI_OPCODE_ATOMIMIN, LDS_OP2_LDS_MIN_INT_RET },
+               { TGSI_OPCODE_ATOMIMAX, LDS_OP2_LDS_MAX_INT_RET },
+               { TGSI_OPCODE_ATOMXCHG, LDS_OP2_LDS_XCHG_RET },
+               { TGSI_OPCODE_ATOMCAS,  LDS_OP3_LDS_CMP_XCHG_RET },
+       };
+       unsigned k;
+
+       for (k = 0; k < sizeof(lds_atomic_ops) / sizeof(lds_atomic_ops[0]); k++) {
+               if (lds_atomic_ops[k].tgsi_op == opcode)
+                       return lds_atomic_ops[k].lds_op;
+       }
+
+       return -1;
+}
+
+static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
+{
+       struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+       int lds_op = get_lds_op(inst->Instruction.Opcode);
+       struct r600_bytecode_alu alu;
+       int r;
+
+       /* Emits the returning LDS atomic mapped from the TGSI opcode, then a MOV of LDS_OQ_A_POP into
+        * Dst[0].  Returns 0, -1 when the opcode has no LDS mapping, or the emitter's error code. */
+       if (lds_op == -1)
+               return -1;
+
+       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+       alu.op = lds_op;
+       alu.is_lds_idx_op = true;
+       alu.last = 1;
+       r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
+       r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
+       if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
+               r600_bytecode_src(&alu.src[2], &ctx->src[3], 0); /* only ATOMCAS maps to this op */
+       else
+               alu.src[2].sel = V_SQ_ALU_SRC_0;
+       r = r600_bytecode_add_alu(ctx->bc, &alu);
+       if (r)
+               return r;
+
+       /* then read from LDS_OQ_A_POP */
+       memset(&alu, 0, sizeof(alu));
+       alu.op = ALU_OP1_MOV;
+       alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
+       alu.src[0].chan = 0;
+       tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+       alu.dst.write = 1;
+       alu.last = 1;
+       return r600_bytecode_add_alu(ctx->bc, &alu);
+}
+
  static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
  {
         struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -8511,6 +8764,8 @@ static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
                 return tgsi_atomic_op_gds(ctx);
         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
                 return tgsi_atomic_op_rat(ctx);
+       if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
+               return tgsi_atomic_op_lds(ctx);
         return 0;
  }
  
@@ -8524,7 +8779,8 @@ static int tgsi_resq(struct r600_shader_ctx *ctx)
  
         if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
             (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
-               ctx->shader->uses_tex_buffers = true;
+               if (ctx->bc->chip_class < EVERGREEN)
+                       ctx->shader->uses_tex_buffers = true;
                 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
         }
  
@@ -8548,9 +8804,9 @@ static int tgsi_resq(struct r600_shader_ctx *ctx)
                 alu.op = ALU_OP1_MOV;
  
                 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
-               /* channel 1 or 3 of each word */
-               alu.src[0].sel += (id / 2);
-               alu.src[0].chan = ((id % 2) * 2) + 1;
+               /* with eg each dword is number of cubes */
+               alu.src[0].sel += id / 4;
+               alu.src[0].chan = id % 4;
                 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
                 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
                 alu.last = 1;
@@ -9934,6 +10190,29 @@ static int tgsi_bfe(struct r600_shader_ctx *ctx)
         return 0;
  }
  
+static int tgsi_clock(struct r600_shader_ctx *ctx)
+{
+       struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+       struct r600_bytecode_alu alu;
+       int r;
+
+       /* CLOCK: Dst[0] channel 0 <- TIME_LO.  Returns 0 or the emitter's error code. */
+       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+       alu.op = ALU_OP1_MOV;
+       tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
+       alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
+       r = r600_bytecode_add_alu(ctx->bc, &alu);
+       if (r)
+               return r;
+       /* Dst[0] channel 1 <- TIME_HI; last = 1 closes the group so later ALU ops are not merged into it. */
+       memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+       alu.op = ALU_OP1_MOV;
+       tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
+       alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
+       alu.last = 1;
+       return r600_bytecode_add_alu(ctx->bc, &alu);
+}
+
  static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
         [TGSI_OPCODE_ARL]       = { ALU_OP0_NOP, tgsi_r600_arl},
         [TGSI_OPCODE_MOV]       = { ALU_OP1_MOV, tgsi_op2},
@@ -9970,7 +10249,7 @@ static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[]
         [TGSI_OPCODE_POW]       = { ALU_OP0_NOP, tgsi_pow},
         [31]    = { ALU_OP0_NOP, tgsi_unsupported},
         [32]                    = { ALU_OP0_NOP, tgsi_unsupported},
-       [33]                    = { ALU_OP0_NOP, tgsi_unsupported},
+       [TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_unsupported},
         [34]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [35]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [TGSI_OPCODE_COS]       = { ALU_OP1_COS, tgsi_trig},
@@ -10168,7 +10447,7 @@ static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] =
         [TGSI_OPCODE_POW]       = { ALU_OP0_NOP, tgsi_pow},
         [31]    = { ALU_OP0_NOP, tgsi_unsupported},
         [32]                    = { ALU_OP0_NOP, tgsi_unsupported},
-       [33]                    = { ALU_OP0_NOP, tgsi_unsupported},
+       [TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
         [34]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [35]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [TGSI_OPCODE_COS]       = { ALU_OP1_COS, tgsi_trig},
@@ -10390,7 +10669,7 @@ static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] =
         [TGSI_OPCODE_POW]       = { ALU_OP0_NOP, cayman_pow},
         [31]    = { ALU_OP0_NOP, tgsi_unsupported},
         [32]                    = { ALU_OP0_NOP, tgsi_unsupported},
-       [33]                    = { ALU_OP0_NOP, tgsi_unsupported},
+       [TGSI_OPCODE_CLOCK]     = { ALU_OP0_NOP, tgsi_clock},
         [34]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [35]                    = { ALU_OP0_NOP, tgsi_unsupported},
         [TGSI_OPCODE_COS]       = { ALU_OP1_COS, cayman_trig},