src/mesa/drivers/dri/r600/r700_render.c

   1 /*
   2  * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included
  12  * in all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  18  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  19  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  20  */
  21
  22 /*
  23  * Authors:
  24  *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
  25  *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
  26  */
  27
  28 #include "main/glheader.h"
  29 #include "main/state.h"
  30 #include "main/imports.h"
  31 #include "main/enums.h"
  32 #include "main/macros.h"
  33 #include "main/context.h"
  34 #include "main/dd.h"
  35 #include "main/simple_list.h"
  36 #include "main/api_arrayelt.h"
  37 #include "swrast/swrast.h"
  38 #include "swrast_setup/swrast_setup.h"
  39 #include "vbo/vbo.h"
  40
  41 #include "tnl/tnl.h"
  42 #include "tnl/t_vp_build.h"
  43 #include "tnl/t_context.h"
  44 #include "tnl/t_vertex.h"
  45 #include "tnl/t_pipeline.h"
  46 #include "vbo/vbo_context.h"
  47
  48 #include "r600_context.h"
  49 #include "r600_cmdbuf.h"
  50
  51 #include "r600_tex.h"
  52
  53 #include "r700_vertprog.h"
  54 #include "r700_fragprog.h"
  55 #include "r700_state.h"
  56
  57 #include "radeon_buffer_objects.h"
  58 #include "radeon_common_context.h"
  59
  60 void r700WaitForIdle(context_t *context);
  61 void r700WaitForIdleClean(context_t *context);
  62 GLboolean r700SendTextureState(context_t *context);
  63 static unsigned int r700PrimitiveType(int prim);
  64 void r600UpdateTextureState(GLcontext * ctx);
  65 GLboolean r700SyncSurf(context_t *context,
  66                        struct radeon_bo *pbo,
  67                        uint32_t read_domain,
  68                        uint32_t write_domain,
  69                        uint32_t sync_type);
  70
  71 void r700WaitForIdle(context_t *context)
  72 {
  73     BATCH_LOCALS(&context->radeon);
  74     radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
  75     BEGIN_BATCH_NO_AUTOSTATE(3);
  76
  77     R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
  78     R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
  79     R600_OUT_BATCH(WAIT_3D_IDLE_bit);
  80
  81     END_BATCH();
  82     COMMIT_BATCH();
  83 }
  84
  85 void r700WaitForIdleClean(context_t *context)
  86 {
  87     BATCH_LOCALS(&context->radeon);
  88     radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
  89     BEGIN_BATCH_NO_AUTOSTATE(5);
  90
  91     R600_OUT_BATCH(CP_PACKET3(R600_IT_EVENT_WRITE, 0));
  92     R600_OUT_BATCH(CACHE_FLUSH_AND_INV_EVENT);
  93
  94     R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
  95     R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
  96     R600_OUT_BATCH(WAIT_3D_IDLE_bit | WAIT_3D_IDLECLEAN_bit);
  97
  98     END_BATCH();
  99     COMMIT_BATCH();
 100 }
 101
 102 void r700Start3D(context_t *context)
 103 {
 104     BATCH_LOCALS(&context->radeon);
 105     radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
 106     if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
 107     {
 108         BEGIN_BATCH_NO_AUTOSTATE(2);
 109         R600_OUT_BATCH(CP_PACKET3(R600_IT_START_3D_CMDBUF, 0));
 110         R600_OUT_BATCH(0);
 111         END_BATCH();
 112     }
 113
 114     BEGIN_BATCH_NO_AUTOSTATE(3);
 115     R600_OUT_BATCH(CP_PACKET3(R600_IT_CONTEXT_CONTROL, 1));
 116     R600_OUT_BATCH(0x80000000);
 117     R600_OUT_BATCH(0x80000000);
 118     END_BATCH();
 119
 120     COMMIT_BATCH();
 121
 122     r700WaitForIdleClean(context);
 123 }
 124
 125 GLboolean r700SyncSurf(context_t *context,
 126                        struct radeon_bo *pbo,
 127                        uint32_t read_domain,
 128                        uint32_t write_domain,
 129                        uint32_t sync_type)
 130 {
 131     BATCH_LOCALS(&context->radeon);
 132     radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
 133     uint32_t cp_coher_size;
 134
 135     if (!pbo)
 136             return GL_FALSE;
 137
 138     if (pbo->size == 0xffffffff)
 139             cp_coher_size = 0xffffffff;
 140     else
 141             cp_coher_size = ((pbo->size + 255) >> 8);
 142
 143     BEGIN_BATCH_NO_AUTOSTATE(5 + 2);
 144     R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_SYNC, 3));
 145     R600_OUT_BATCH(sync_type);
 146     R600_OUT_BATCH(cp_coher_size);
 147     R600_OUT_BATCH(0);
 148     R600_OUT_BATCH(10);
 149     R600_OUT_BATCH_RELOC(0,
 150                          pbo,
 151                          0,
 152                          read_domain, write_domain, 0);
 153     END_BATCH();
 154     COMMIT_BATCH();
 155
 156     return GL_TRUE;
 157 }
 158
 159 static unsigned int r700PrimitiveType(int prim)
 160 {
 161     switch (prim & PRIM_MODE_MASK)
 162     {
 163     case GL_POINTS:
 164         return DI_PT_POINTLIST;
 165         break;
 166     case GL_LINES:
 167         return DI_PT_LINELIST;
 168         break;
 169     case GL_LINE_STRIP:
 170         return DI_PT_LINESTRIP;
 171         break;
 172     case GL_LINE_LOOP:
 173         return DI_PT_LINELOOP;
 174         break;
 175     case GL_TRIANGLES:
 176         return DI_PT_TRILIST;
 177         break;
 178     case GL_TRIANGLE_STRIP:
 179         return DI_PT_TRISTRIP;
 180         break;
 181     case GL_TRIANGLE_FAN:
 182         return DI_PT_TRIFAN;
 183         break;
 184     case GL_QUADS:
 185         return DI_PT_QUADLIST;
 186         break;
 187     case GL_QUAD_STRIP:
 188         return DI_PT_QUADSTRIP;
 189         break;
 190     case GL_POLYGON:
 191         return DI_PT_POLYGON;
 192         break;
 193     default:
 194         assert(0);
 195         return -1;
 196         break;
 197     }
 198 }
 199
 200 static int r700NumVerts(int num_verts, int prim)
 201 {
 202         int verts_off = 0;
 203
 204         switch (prim & PRIM_MODE_MASK) {
 205         case GL_POINTS:
 206                 verts_off = 0;
 207                 break;
 208         case GL_LINES:
 209                 verts_off = num_verts % 2;
 210                 break;
 211         case GL_LINE_STRIP:
 212                 if (num_verts < 2)
 213                         verts_off = num_verts;
 214                 break;
 215         case GL_LINE_LOOP:
 216                 if (num_verts < 2)
 217                         verts_off = num_verts;
 218                 break;
 219         case GL_TRIANGLES:
 220                 verts_off = num_verts % 3;
 221                 break;
 222         case GL_TRIANGLE_STRIP:
 223                 if (num_verts < 3)
 224                         verts_off = num_verts;
 225                 break;
 226         case GL_TRIANGLE_FAN:
 227                 if (num_verts < 3)
 228                         verts_off = num_verts;
 229                 break;
 230         case GL_QUADS:
 231                 verts_off = num_verts % 4;
 232                 break;
 233         case GL_QUAD_STRIP:
 234                 if (num_verts < 4)
 235                         verts_off = num_verts;
 236                 else
 237                         verts_off = num_verts % 2;
 238                 break;
 239         case GL_POLYGON:
 240                 if (num_verts < 3)
 241                         verts_off = num_verts;
 242                 break;
 243         default:
 244                 assert(0);
 245                 return -1;
 246                 break;
 247         }
 248
 249         return num_verts - verts_off;
 250 }
 251
 252 static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 253 {
 254     context_t *context = R700_CONTEXT(ctx);
 255     BATCH_LOCALS(&context->radeon);
 256     int type, i, total_emit;
 257     int num_indices;
 258     uint32_t vgt_draw_initiator = 0;
 259     uint32_t vgt_index_type     = 0;
 260     uint32_t vgt_primitive_type = 0;
 261     uint32_t vgt_num_indices    = 0;
 262     TNLcontext *tnl = TNL_CONTEXT(ctx);
 263     struct vertex_buffer *vb = &tnl->vb;
 264     GLboolean bUseDrawIndex;
 265
 266     if(NULL != context->ind_buf.bo)
 267     {
 268         bUseDrawIndex = GL_TRUE;
 269     }
 270     else
 271     {
 272         bUseDrawIndex = GL_FALSE;
 273     }
 274
 275     type = r700PrimitiveType(prim);
 276     num_indices = r700NumVerts(end - start, prim);
 277
 278     radeon_print(RADEON_RENDER, RADEON_TRACE,
 279                  "%s type %x num_indices %d\n",
 280                  __func__, type, num_indices);
 281
 282     if (type < 0 || num_indices <= 0)
 283             return;
 284
 285     if(GL_TRUE == bUseDrawIndex)
 286     {
 287         total_emit =   3  /* VGT_PRIMITIVE_TYPE */
 288                      + 2  /* VGT_INDEX_TYPE */
 289                      + 2  /* NUM_INSTANCES */
 290                      + 5 + 2; /* DRAW_INDEX */
 291     }
 292     else
 293     {
 294         total_emit =   3 /* VGT_PRIMITIVE_TYPE */
 295                      + 2 /* VGT_INDEX_TYPE */
 296                      + 2 /* NUM_INSTANCES */
 297                      + num_indices + 3; /* DRAW_INDEX_IMMD */
 298     }
 299
 300     BEGIN_BATCH_NO_AUTOSTATE(total_emit);
 301     // prim
 302     SETfield(vgt_primitive_type, type,
 303              VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift, VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
 304     R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
 305     R600_OUT_BATCH(mmVGT_PRIMITIVE_TYPE - ASIC_CONFIG_BASE_INDEX);
 306     R600_OUT_BATCH(vgt_primitive_type);
 307
 308         // index type
 309     SETfield(vgt_index_type, DI_INDEX_SIZE_32_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
 310
 311     if(GL_TRUE == bUseDrawIndex)
 312     {
 313         if(GL_TRUE != context->ind_buf.is_32bit)
 314         {
 315             SETfield(vgt_index_type, DI_INDEX_SIZE_16_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
 316         }
 317     }
 318
 319     R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
 320     R600_OUT_BATCH(vgt_index_type);
 321
 322     // num instances
 323     R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
 324     R600_OUT_BATCH(1);
 325
 326     // draw packet
 327     vgt_num_indices = num_indices;
 328
 329     if(GL_TRUE == bUseDrawIndex)
 330     {
 331         SETfield(vgt_draw_initiator, DI_SRC_SEL_DMA, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
 332     }
 333     else
 334     {
 335         SETfield(vgt_draw_initiator, DI_SRC_SEL_IMMEDIATE, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
 336     }
 337
 338         SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift, MAJOR_MODE_mask);
 339
 340     if(GL_TRUE == bUseDrawIndex)
 341     {
 342         R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX, 3));
 343         R600_OUT_BATCH(context->ind_buf.bo_offset);
 344         R600_OUT_BATCH(0);
 345         R600_OUT_BATCH(vgt_num_indices);
 346         R600_OUT_BATCH(vgt_draw_initiator);
 347         R600_OUT_BATCH_RELOC(context->ind_buf.bo_offset,
 348                              context->ind_buf.bo,
 349                              context->ind_buf.bo_offset,
 350                              RADEON_GEM_DOMAIN_GTT, 0, 0);
 351     }
 352     else
 353     {
 354         R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (num_indices + 1)));
 355         R600_OUT_BATCH(vgt_num_indices);
 356         R600_OUT_BATCH(vgt_draw_initiator);
 357
 358         for (i = start; i < (start + num_indices); i++)
 359         {
 360             if(vb->Elts)
 361             {
 362                 R600_OUT_BATCH(vb->Elts[i]);
 363             }
 364             else
 365             {
 366                 R600_OUT_BATCH(i);
 367             }
 368         }
 369     }
 370
 371     END_BATCH();
 372     COMMIT_BATCH();
 373 }
 374
 375 /* start 3d, idle, cb/db flush */
 376 #define PRE_EMIT_STATE_BUFSZ 10 + 5 + 14
 377
 378 static GLuint r700PredictRenderSize(GLcontext* ctx, GLuint nr_prims)
 379 {
 380     context_t *context = R700_CONTEXT(ctx);
 381     struct r700_vertex_program *vp = context->selected_vp;
 382     GLboolean flushed;
 383     GLuint dwords, i;
 384     GLuint state_size;
 385     /* pre calculate aos count so state prediction works */
 386     context->radeon.tcl.aos_count = _mesa_bitcount(vp->mesa_program->Base.InputsRead);
 387
 388     dwords = PRE_EMIT_STATE_BUFSZ;
 389     if (nr_prims)
 390             dwords += nr_prims * 14;
 391     else {
 392             TNLcontext *tnl = TNL_CONTEXT(ctx);
 393             struct vertex_buffer *vb = &tnl->vb;
 394
 395             for (i = 0; i < vb->PrimitiveCount; i++)
 396                     dwords += vb->Primitive[i].count + 10;
 397     }
 398     state_size = radeonCountStateEmitSize(&context->radeon);
 399     flushed = rcommonEnsureCmdBufSpace(&context->radeon,
 400             dwords + state_size, __FUNCTION__);
 401
 402     if (flushed)
 403         dwords += radeonCountStateEmitSize(&context->radeon);
 404     else
 405         dwords += state_size;
 406
 407     radeon_print(RADEON_RENDER, RADEON_VERBOSE,
 408         "%s: total prediction size is %d.\n", __FUNCTION__, dwords);
 409     return dwords;
 410 }
 411
 412 static GLboolean r700RunRender(GLcontext * ctx,
 413                                struct tnl_pipeline_stage *stage)
 414 {
 415     context_t *context = R700_CONTEXT(ctx);
 416     radeonContextPtr radeon = &context->radeon;
 417     unsigned int i, id = 0;
 418     TNLcontext *tnl = TNL_CONTEXT(ctx);
 419     struct vertex_buffer *vb = &tnl->vb;
 420     struct radeon_renderbuffer *rrb;
 421
 422     radeon_print(RADEON_RENDER, RADEON_NORMAL, "%s: cs begin at %d\n",
 423                 __func__, context->radeon.cmdbuf.cs->cdw);
 424
 425     /* always emit CB base to prevent
 426      * lock ups on some chips.
 427      */
 428     R600_STATECHANGE(context, cb_target);
 429     /* mark vtx as dirty since it changes per-draw */
 430     R600_STATECHANGE(context, vtx);
 431
 432     r700SetScissor(context);
 433     r700SetupVertexProgram(ctx);
 434     r700SetupFragmentProgram(ctx);
 435     r600UpdateTextureState(ctx);
 436
 437     GLuint emit_end = r700PredictRenderSize(ctx, 0)
 438         + context->radeon.cmdbuf.cs->cdw;
 439     r700SetupStreams(ctx);
 440
 441     radeonEmitState(radeon);
 442
 443     radeon_debug_add_indent();
 444     /* richard test code */
 445     for (i = 0; i < vb->PrimitiveCount; i++) {
 446         GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
 447         GLuint start = vb->Primitive[i].start;
 448         GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
 449         r700RunRenderPrimitive(ctx, start, end, prim);
 450     }
 451     radeon_debug_remove_indent();
 452
 453     /* Flush render op cached for last several quads. */
 454     r700WaitForIdleClean(context);
 455
 456     rrb = radeon_get_colorbuffer(&context->radeon);
 457     if (rrb && rrb->bo)
 458             r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
 459                          CB_ACTION_ENA_bit | (1 << (id + 6)));
 460
 461     rrb = radeon_get_depthbuffer(&context->radeon);
 462     if (rrb && rrb->bo)
 463             r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
 464                          DB_ACTION_ENA_bit | DB_DEST_BASE_ENA_bit);
 465
 466     radeonReleaseArrays(ctx, ~0);
 467
 468     radeon_print(RADEON_RENDER, RADEON_TRACE, "%s: cs end at %d\n",
 469                 __func__, context->radeon.cmdbuf.cs->cdw);
 470
 471     if ( emit_end < context->radeon.cmdbuf.cs->cdw )
 472        WARN_ONCE("Rendering was %d commands larger than predicted size."
 473                " We might overflow  command buffer.\n", context->radeon.cmdbuf.cs->cdw - emit_end);
 474
 475     return GL_FALSE;
 476 }
 477
 478 static GLboolean r700RunNonTCLRender(GLcontext * ctx,
 479                                      struct tnl_pipeline_stage *stage) /* -------------------- */
 480 {
 481         GLboolean bRet = GL_TRUE;
 482
 483         return bRet;
 484 }
 485
 486 static GLboolean r700RunTCLRender(GLcontext * ctx,  /*----------------------*/
 487                                   struct tnl_pipeline_stage *stage)
 488 {
 489         GLboolean bRet = GL_FALSE;
 490
 491     /* TODO : sw fallback */
 492
 493     /* Need shader bo's setup before bo check */
 494     r700UpdateShaders(ctx);
 495     /**
 496
 497     * Ensure all enabled and complete textures are uploaded along with any buffers being used.
 498     */
 499     if(!r600ValidateBuffers(ctx))
 500     {
 501         return GL_TRUE;
 502     }
 503
 504     bRet = r700RunRender(ctx, stage);
 505
 506     return bRet;
 507         //GL_FALSE will stop to do other pipe stage in _tnl_run_pipeline
 508     //The render here DOES finish the whole pipe, so GL_FALSE should be returned for success.
 509 }
 510
 511 const struct tnl_pipeline_stage _r700_render_stage = {
 512         "r700 Hardware Rasterization",
 513         NULL,
 514         NULL,
 515         NULL,
 516         NULL,
 517         r700RunNonTCLRender
 518 };
 519
 520 const struct tnl_pipeline_stage _r700_tcl_stage = {
 521         "r700 Hardware Transform, Clipping and Lighting",
 522         NULL,
 523         NULL,
 524         NULL,
 525         NULL,
 526         r700RunTCLRender
 527 };
 528
 529 const struct tnl_pipeline_stage *r700_pipeline[] =
 530 {
 531     &_r700_tcl_stage,
 532     &_tnl_vertex_transform_stage,
 533         &_tnl_normal_transform_stage,
 534         &_tnl_lighting_stage,
 535         &_tnl_fog_coordinate_stage,
 536         &_tnl_texgen_stage,
 537         &_tnl_texture_transform_stage,
 538         &_tnl_vertex_program_stage,
 539
 540     &_r700_render_stage,
 541     &_tnl_render_stage,
 542     0,
 543 };
 544
 545 #define CONVERT( TYPE, MACRO ) do {             \
 546         GLuint i, j, sz;                                \
 547         sz = input->Size;                               \
 548         if (input->Normalized) {                        \
 549                 for (i = 0; i < count; i++) {           \
 550                         const TYPE *in = (TYPE *)src_ptr;               \
 551                         for (j = 0; j < sz; j++) {              \
 552                                 *dst_ptr++ = MACRO(*in);                \
 553                                 in++;                           \
 554                         }                                       \
 555                         src_ptr += stride;                      \
 556                 }                                               \
 557         } else {                                        \
 558                 for (i = 0; i < count; i++) {           \
 559                         const TYPE *in = (TYPE *)src_ptr;               \
 560                         for (j = 0; j < sz; j++) {              \
 561                                 *dst_ptr++ = (GLfloat)(*in);            \
 562                                 in++;                           \
 563                         }                                       \
 564                         src_ptr += stride;                      \
 565                 }                                               \
 566         }                                               \
 567 } while (0)
 568
 569 /**
 570  * Convert attribute data type to float
 571  * If the attribute uses named buffer object replace the bo with newly allocated bo
 572  */
 573 static void r700ConvertAttrib(GLcontext *ctx, int count,
 574                               const struct gl_client_array *input,
 575                               struct StreamDesc *attr)
 576 {
 577     context_t *context = R700_CONTEXT(ctx);
 578     const GLvoid *src_ptr;
 579     GLboolean mapped_named_bo = GL_FALSE;
 580     GLfloat *dst_ptr;
 581     GLuint stride;
 582
 583     stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
 584
 585     /* Convert value for first element only */
 586     if (input->StrideB == 0)
 587     {
 588         count = 1;
 589     }
 590
 591     if (input->BufferObj->Name)
 592     {
 593         if (!input->BufferObj->Pointer)
 594         {
 595             ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
 596             mapped_named_bo = GL_TRUE;
 597         }
 598
 599         src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
 600     }
 601     else
 602     {
 603         src_ptr = input->Ptr;
 604     }
 605
 606     radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset,
 607                          sizeof(GLfloat) * input->Size * count, 32);
 608     dst_ptr = (GLfloat *)ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
 609
 610     assert(src_ptr != NULL);
 611
 612     switch (input->Type)
 613     {
 614         case GL_DOUBLE:
 615             CONVERT(GLdouble, (GLfloat));
 616             break;
 617         case GL_UNSIGNED_INT:
 618             CONVERT(GLuint, UINT_TO_FLOAT);
 619             break;
 620         case GL_INT:
 621             CONVERT(GLint, INT_TO_FLOAT);
 622             break;
 623         case GL_UNSIGNED_SHORT:
 624             CONVERT(GLushort, USHORT_TO_FLOAT);
 625             break;
 626         case GL_SHORT:
 627             CONVERT(GLshort, SHORT_TO_FLOAT);
 628             break;
 629         case GL_UNSIGNED_BYTE:
 630             assert(input->Format != GL_BGRA);
 631             CONVERT(GLubyte, UBYTE_TO_FLOAT);
 632             break;
 633         case GL_BYTE:
 634             CONVERT(GLbyte, BYTE_TO_FLOAT);
 635             break;
 636         default:
 637             assert(0);
 638             break;
 639     }
 640
 641     if (mapped_named_bo)
 642     {
 643         ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
 644     }
 645 }
 646
 647 static void r700AlignDataToDword(GLcontext *ctx,
 648                                  const struct gl_client_array *input,
 649                                  int count,
 650                                  struct StreamDesc *attr)
 651 {
 652     context_t *context = R700_CONTEXT(ctx);
 653     const int dst_stride = (input->StrideB + 3) & ~3;
 654     const int size = getTypeSize(input->Type) * input->Size * count;
 655     GLboolean mapped_named_bo = GL_FALSE;
 656
 657     radeonAllocDmaRegion(&context->radeon, &attr->bo, &attr->bo_offset, size, 32);
 658
 659     if (!input->BufferObj->Pointer)
 660     {
 661         ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
 662         mapped_named_bo = GL_TRUE;
 663     }
 664
 665     {
 666         GLvoid *src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
 667         GLvoid *dst_ptr = ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
 668         int i;
 669
 670         for (i = 0; i < count; ++i)
 671         {
 672             _mesa_memcpy(dst_ptr, src_ptr, input->StrideB);
 673             src_ptr += input->StrideB;
 674             dst_ptr += dst_stride;
 675         }
 676     }
 677
 678     if (mapped_named_bo)
 679     {
 680         ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
 681     }
 682
 683     attr->stride = dst_stride;
 684 }
 685
 686 static void r700SetupStreams2(GLcontext *ctx, const struct gl_client_array *input[], int count)
 687 {
 688         context_t *context = R700_CONTEXT(ctx);
 689     GLuint stride;
 690     int ret;
 691     int i, index;
 692
 693     R600_STATECHANGE(context, vtx);
 694
 695     for(index = 0; index < context->nNumActiveAos; index++)
 696     {
 697         struct radeon_aos *aos = &context->radeon.tcl.aos[index];
 698         i = context->stream_desc[index].element;
 699
 700         stride = (input[i]->StrideB == 0) ? getTypeSize(input[i]->Type) * input[i]->Size : input[i]->StrideB;
 701
 702         if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT ||
 703 #if MESA_BIG_ENDIAN
 704             getTypeSize(input[i]->Type) != 4 ||
 705 #endif
 706             stride < 4)
 707         {
 708             r700ConvertAttrib(ctx, count, input[i], &context->stream_desc[index]);
 709         }
 710         else
 711         {
 712             if (input[i]->BufferObj->Name)
 713             {
 714                 if (stride % 4 != 0)
 715                 {
 716                     assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0);
 717                     r700AlignDataToDword(ctx, input[i], count, &context->stream_desc[index]);
 718                     context->stream_desc[index].is_named_bo = GL_FALSE;
 719                 }
 720                 else
 721                 {
 722                     context->stream_desc[index].stride = input[i]->StrideB;
 723                     context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr;
 724                     context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo;
 725                     context->stream_desc[index].is_named_bo = GL_TRUE;
 726                 }
 727             }
 728             else
 729             {
 730                 int size;
 731                 int local_count = count;
 732                 uint32_t *dst;
 733
 734                 if (input[i]->StrideB == 0)
 735                 {
 736                     size = getTypeSize(input[i]->Type) * input[i]->Size;
 737                     local_count = 1;
 738                 }
 739                 else
 740                 {
 741                     size = getTypeSize(input[i]->Type) * input[i]->Size * local_count;
 742                 }
 743
 744                 radeonAllocDmaRegion(&context->radeon, &context->stream_desc[index].bo,
 745                                      &context->stream_desc[index].bo_offset, size, 32);
 746                 assert(context->stream_desc[index].bo->ptr != NULL);
 747                 dst = (uint32_t *)ADD_POINTERS(context->stream_desc[index].bo->ptr,
 748                                                context->stream_desc[index].bo_offset);
 749
 750                 switch (context->stream_desc[index].dwords)
 751                 {
 752                 case 1:
 753                     radeonEmitVec4(dst, input[i]->Ptr, input[i]->StrideB, local_count);
 754                                         context->stream_desc[index].stride = 4;
 755                     break;
 756                 case 2:
 757                     radeonEmitVec8(dst, input[i]->Ptr, input[i]->StrideB, local_count);
 758                                         context->stream_desc[index].stride = 8;
 759                     break;
 760                 case 3:
 761                     radeonEmitVec12(dst, input[i]->Ptr, input[i]->StrideB, local_count);
 762                                         context->stream_desc[index].stride = 12;
 763                     break;
 764                 case 4:
 765                     radeonEmitVec16(dst, input[i]->Ptr, input[i]->StrideB, local_count);
 766                                         context->stream_desc[index].stride = 16;
 767                     break;
 768                 default:
 769                     assert(0);
 770                     break;
 771                 }
 772             }
 773         }
 774
 775         aos->count = context->stream_desc[index].stride == 0 ? 1 : count;
 776         aos->stride = context->stream_desc[index].stride / sizeof(float);
 777         aos->components = context->stream_desc[index].dwords;
 778         aos->bo = context->stream_desc[index].bo;
 779         aos->offset = context->stream_desc[index].bo_offset;
 780
 781         if(context->stream_desc[index].is_named_bo)
 782         {
 783             radeon_cs_space_add_persistent_bo(context->radeon.cmdbuf.cs,
 784                                               context->stream_desc[index].bo,
 785                                               RADEON_GEM_DOMAIN_GTT, 0);
 786         }
 787     }
 788
 789     context->radeon.tcl.aos_count = context->nNumActiveAos;
 790     ret = radeon_cs_space_check_with_bo(context->radeon.cmdbuf.cs,
 791                                         first_elem(&context->radeon.dma.reserved)->bo,
 792                                         RADEON_GEM_DOMAIN_GTT, 0);
 793 }
 794
 795 static void r700FreeData(GLcontext *ctx)
 796 {
 797     /* Need to zero tcl.aos[n].bo and tcl.elt_dma_bo
 798      * to prevent double unref in radeonReleaseArrays
 799      * called during context destroy
 800      */
 801     context_t *context = R700_CONTEXT(ctx);
 802
 803     int i;
 804
 805     for (i = 0; i < context->nNumActiveAos; i++)
 806     {
 807         if (!context->stream_desc[i].is_named_bo)
 808         {
 809                 radeon_bo_unref(context->stream_desc[i].bo);
 810         }
 811         context->radeon.tcl.aos[i].bo = NULL;
 812     }
 813
 814     if (context->ind_buf.bo != NULL)
 815     {
 816             radeon_bo_unref(context->ind_buf.bo);
 817     }
 818 }
 819
 820 static void r700FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
 821 {
 822     context_t *context = R700_CONTEXT(ctx);
 823     GLvoid *src_ptr;
 824     GLuint *out;
 825     int i;
 826     GLboolean mapped_named_bo = GL_FALSE;
 827
 828     if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
 829     {
 830         ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
 831         mapped_named_bo = GL_TRUE;
 832         assert(mesa_ind_buf->obj->Pointer != NULL);
 833     }
 834     src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
 835
 836     if (mesa_ind_buf->type == GL_UNSIGNED_BYTE)
 837     {
 838         GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
 839         GLubyte *in = (GLubyte *)src_ptr;
 840
 841         radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 842                              &context->ind_buf.bo_offset, size, 4);
 843
 844         assert(context->ind_buf.bo->ptr != NULL);
 845         out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 846
 847         for (i = 0; i + 1 < mesa_ind_buf->count; i += 2)
 848         {
 849             *out++ = in[i] | in[i + 1] << 16;
 850         }
 851
 852         if (i < mesa_ind_buf->count)
 853         {
 854             *out++ = in[i];
 855         }
 856
 857 #if MESA_BIG_ENDIAN
 858     }
 859     else
 860     { /* if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) */
 861         GLushort *in = (GLushort *)src_ptr;
 862         GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
 863
 864         radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 865                              &context->ind_buf.bo_offset, size, 4);
 866
 867         assert(context->ind_buf.bo->ptr != NULL);
 868         out = (GLuint *)ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 869
 870         for (i = 0; i + 1 < mesa_ind_buf->count; i += 2)
 871         {
 872             *out++ = in[i] | in[i + 1] << 16;
 873         }
 874
 875         if (i < mesa_ind_buf->count)
 876         {
 877             *out++ = in[i];
 878         }
 879 #endif
 880     }
 881
 882     context->ind_buf.is_32bit = GL_FALSE;
 883     context->ind_buf.count = mesa_ind_buf->count;
 884
 885     if (mapped_named_bo)
 886     {
 887         ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
 888     }
 889 }
 890
 891 static void r700SetupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
 892 {
 893     context_t *context = R700_CONTEXT(ctx);
 894
 895     if (!mesa_ind_buf) {
 896         context->ind_buf.bo = NULL;
 897         return;
 898     }
 899
 900 #if MESA_BIG_ENDIAN
 901     if (mesa_ind_buf->type == GL_UNSIGNED_INT)
 902     {
 903 #else
 904     if (mesa_ind_buf->type != GL_UNSIGNED_BYTE)
 905     {
 906 #endif
 907         const GLvoid *src_ptr;
 908         GLvoid *dst_ptr;
 909         GLboolean mapped_named_bo = GL_FALSE;
 910
 911         if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer)
 912         {
 913                 ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
 914                 assert(mesa_ind_buf->obj->Pointer != NULL);
 915                 mapped_named_bo = GL_TRUE;
 916         }
 917
 918         src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
 919
 920         const GLuint size = mesa_ind_buf->count * getTypeSize(mesa_ind_buf->type);
 921
 922         radeonAllocDmaRegion(&context->radeon, &context->ind_buf.bo,
 923                              &context->ind_buf.bo_offset, size, 4);
 924         assert(context->ind_buf.bo->ptr != NULL);
 925         dst_ptr = ADD_POINTERS(context->ind_buf.bo->ptr, context->ind_buf.bo_offset);
 926
 927         _mesa_memcpy(dst_ptr, src_ptr, size);
 928
 929         context->ind_buf.is_32bit = (mesa_ind_buf->type == GL_UNSIGNED_INT);
 930         context->ind_buf.count = mesa_ind_buf->count;
 931
 932         if (mapped_named_bo)
 933         {
 934                 ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
 935         }
 936     }
 937     else
 938     {
 939             r700FixupIndexBuffer(ctx, mesa_ind_buf);
 940     }
 941 }
 942
 943 static GLboolean r700TryDrawPrims(GLcontext *ctx,
 944                                          const struct gl_client_array *arrays[],
 945                                          const struct _mesa_prim *prim,
 946                                          GLuint nr_prims,
 947                                          const struct _mesa_index_buffer *ib,
 948                                          GLuint min_index,
 949                                          GLuint max_index )
 950 {
 951     context_t *context = R700_CONTEXT(ctx);
 952     radeonContextPtr radeon = &context->radeon;
 953     GLuint i, id = 0;
 954     struct radeon_renderbuffer *rrb;
 955
 956     if (ctx->NewState)
 957     {
 958         _mesa_update_state( ctx );
 959     }
 960
 961     _tnl_UpdateFixedFunctionProgram(ctx);
 962     r700SetVertexFormat(ctx, arrays, max_index + 1);
 963     /* shaders need to be updated before buffers are validated */
 964     r700UpdateShaders2(ctx);
 965     if (!r600ValidateBuffers(ctx))
 966             return GL_FALSE;
 967
 968     /* always emit CB base to prevent
 969      * lock ups on some chips.
 970      */
 971     R600_STATECHANGE(context, cb_target);
 972     /* mark vtx as dirty since it changes per-draw */
 973     R600_STATECHANGE(context, vtx);
 974
 975     r700SetScissor(context);
 976     r700SetupVertexProgram(ctx);
 977     r700SetupFragmentProgram(ctx);
 978     r600UpdateTextureState(ctx);
 979
 980     GLuint emit_end = r700PredictRenderSize(ctx, nr_prims)
 981                     + context->radeon.cmdbuf.cs->cdw;
 982
 983     r700SetupIndexBuffer(ctx, ib);
 984     r700SetupStreams2(ctx, arrays, max_index + 1);
 985
 986     radeonEmitState(radeon);
 987
 988     radeon_debug_add_indent();
 989     for (i = 0; i < nr_prims; ++i)
 990     {
 991             r700RunRenderPrimitive(ctx,
 992                                prim[i].start,
 993                                prim[i].start + prim[i].count,
 994                                prim[i].mode);
 995     }
 996     radeon_debug_remove_indent();
 997
 998     /* Flush render op cached for last several quads. */
 999     r700WaitForIdleClean(context);
1000
1001     rrb = radeon_get_colorbuffer(&context->radeon);
1002     if (rrb && rrb->bo)
1003             r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
1004                          CB_ACTION_ENA_bit | (1 << (id + 6)));
1005
1006     rrb = radeon_get_depthbuffer(&context->radeon);
1007     if (rrb && rrb->bo)
1008             r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
1009                          DB_ACTION_ENA_bit | DB_DEST_BASE_ENA_bit);
1010
1011     r700FreeData(ctx);
1012
1013     if (emit_end < context->radeon.cmdbuf.cs->cdw)
1014     {
1015         WARN_ONCE("Rendering was %d commands larger than predicted size."
1016             " We might overflow  command buffer.\n", context->radeon.cmdbuf.cs->cdw - emit_end);
1017     }
1018
1019     return GL_TRUE;
1020 }
1021
1022 static void r700DrawPrimsRe(GLcontext *ctx,
1023                          const struct gl_client_array *arrays[],
1024                          const struct _mesa_prim *prim,
1025                          GLuint nr_prims,
1026                          const struct _mesa_index_buffer *ib,
1027                          GLboolean index_bounds_valid,
1028                          GLuint min_index,
1029                          GLuint max_index)
1030 {
1031     GLboolean retval = GL_FALSE;
1032
1033     /* This check should get folded into just the places that
1034          * min/max index are really needed.
1035          */
1036         if (!index_bounds_valid) {
1037                 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
1038         }
1039
1040         if (min_index) {
1041                 vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r700DrawPrimsRe );
1042                 return;
1043         }
1044
1045         /* Make an attempt at drawing */
1046         retval = r700TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
1047
1048         /* If failed run tnl pipeline - it should take care of fallbacks */
1049         if (!retval)
1050                 _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
1051 }
1052
1053 static void r700DrawPrims(GLcontext *ctx,
1054                          const struct gl_client_array *arrays[],
1055                          const struct _mesa_prim *prim,
1056                          GLuint nr_prims,
1057                          const struct _mesa_index_buffer *ib,
1058                          GLboolean index_bounds_valid,
1059                          GLuint min_index,
1060                          GLuint max_index)
1061 {
1062     context_t *context = R700_CONTEXT(ctx);
1063
1064     /* For non indexed drawing, using tnl pipe. */
1065     if(!ib)
1066     {
1067         context->ind_buf.bo = NULL;
1068
1069         _tnl_vbo_draw_prims(ctx, arrays, prim, nr_prims, ib,
1070                             index_bounds_valid, min_index, max_index);
1071         return;
1072     }
1073
1074         r700DrawPrimsRe(ctx, arrays, prim, nr_prims, ib, index_bounds_valid, min_index, max_index);
1075 }
1076
1077 void r700InitDraw(GLcontext *ctx)
1078 {
1079         struct vbo_context *vbo = vbo_context(ctx);
1080
1081         /* to be enabled */
1082         vbo->draw_prims = r700DrawPrims;
1083 }
1084
1085