From: Rob Clark <robdclark@gmail.com>
Date: Sun, 21 Oct 2018 14:22:11 +0000 (-0400)
Subject: freedreno: import libdrm_freedreno + redesign submit
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=f3cc0d2747568a186dba433ac94af607c38fa024;p=mesa.git

freedreno: import libdrm_freedreno + redesign submit

In the pursuit of lowering driver overhead, it became clear that some
amount of redesign of how libdrm_freedreno constructs the submit ioctl
would be needed.  In particular, as the gallium driver is starting to
make heavier use of CP_SET_DRAW_STATE state groups/objects, the over-
head of tracking cmd buffers and relocs becomes too much.  And for
"streaming" state, which isn't ever reused (like uniform uploads) the
overhead of allocating/freeing ringbuffer[1] objects is too high.

This redesign makes two main changes:

 1) Introduces a fd_submit object for tracking bos and cmds table
    for the submit ioctl, making ringbuffer objects more light-
    weight.  This was previously done in the ringbuffer.  But we
    have many ringbuffer instances involved in a submit (gmem +
    draw + potentially 1000's of state-group rbs), and only need
    a single bos and cmds table.  (Reloc table is still per-rb)

    The submit is also a convenient place for a slab allocator for
    ringbuffer objects.  Other options would have required locking
    because, while we can guarantee allocations will only happen on
    a single thread, frees could happen either on the application
    thread or the flush_queue thread.  With the slab allocator in
    the submit object, any frees that happen on the flush_queue
    thread happen after we know that the application thread is done
    with the submit.

 2) Introduce a new "softpin" msm_ringbuffer_sp implementation that
    does not use relocs and only has cmds table entries for IB1 (ie.
    the cmdstream buffers that kernel needs to CP_INDIRECT_BUFFER
    to from the RB).  To do this properly will require some updates
    on the kernel side, so whether you get the softpin or legacy
    submit/ringbuffer implementation at runtime depends on your
    kernel version.

To make all these changes in libdrm would basically require adding a
libdrm_freedreno2, so this is a good point to just pull the libdrm code
into mesa.  Plus it allows for using mesa's hashtable, slab allocator,
etc.  And it lets us have asserts enabled for debug mesa builds but
omitted for release builds.  And it makes life easier if further API
changes become necessary.

At this point I haven't tried to pull in the kgsl backend.  Although
I left the level of vfunc indirection which would make it possible
to have other backends.  (And this was convenient to keep to allow
for the "softpin" ringbuffer to coexist.)

NOTE: if bisecting a build error takes you here, try a clean build.
There are a bunch of ways things can go wrong if you still have
libdrm_freedreno cflags.

[1] "ringbuffer" is probably a bad name, the only level of cmdstream
    buffer that is actually a ring is RB managed by kernel.  User-
    space cmdstream is all IB1/IB2 and state-groups.

Reviewed-by: Kristian H. Kristensen <hoegsberg@chromium.org>
Reviewed-by: Eric Engestrom <eric.engestrom@intel.com>
Signed-off-by: Rob Clark <robdclark@gmail.com>
---

diff --git a/configure.ac b/configure.ac
index 634245807f3..156fb15fd3d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -78,7 +78,6 @@ LIBDRM_AMDGPU_REQUIRED=2.4.95
 LIBDRM_INTEL_REQUIRED=2.4.75
 LIBDRM_NVVIEUX_REQUIRED=2.4.66
 LIBDRM_NOUVEAU_REQUIRED=2.4.66
-LIBDRM_FREEDRENO_REQUIRED=2.4.96
 LIBDRM_ETNAVIV_REQUIRED=2.4.89
 LIBDRM_VC4_REQUIRED=2.4.89
 
@@ -2722,7 +2721,6 @@ if test -n "$with_gallium_drivers"; then
             ;;
         xfreedreno)
             HAVE_GALLIUM_FREEDRENO=yes
-            PKG_CHECK_MODULES([FREEDRENO], [libdrm >= $LIBDRM_FREEDRENO_REQUIRED libdrm_freedreno >= $LIBDRM_FREEDRENO_REQUIRED])
             require_libdrm "freedreno"
             ;;
         xetnaviv)
diff --git a/meson.build b/meson.build
index 690e7d3d8aa..18667988bac 100644
--- a/meson.build
+++ b/meson.build
@@ -1099,14 +1099,12 @@ dep_libdrm_amdgpu = null_dep
 dep_libdrm_radeon = null_dep
 dep_libdrm_nouveau = null_dep
 dep_libdrm_etnaviv = null_dep
-dep_libdrm_freedreno = null_dep
 dep_libdrm_intel = null_dep
 
 _drm_amdgpu_ver = '2.4.95'
 _drm_radeon_ver = '2.4.71'
 _drm_nouveau_ver = '2.4.66'
 _drm_etnaviv_ver = '2.4.89'
-_drm_freedreno_ver = '2.4.96'
 _drm_intel_ver = '2.4.75'
 _drm_ver = '2.4.75'
 
@@ -1117,7 +1115,6 @@ _libdrm_checks = [
               with_gallium_r300 or with_gallium_r600)],
   ['nouveau', (with_gallium_nouveau or with_dri_nouveau)],
   ['etnaviv', with_gallium_etnaviv],
-  ['freedreno', with_gallium_freedreno],
 ]
 
 # VC4 only needs core libdrm support of this version, not a libdrm_vc4
diff --git a/src/gallium/drivers/freedreno/Android.mk b/src/gallium/drivers/freedreno/Android.mk
index 9cc727aa80d..9c9d0707ba9 100644
--- a/src/gallium/drivers/freedreno/Android.mk
+++ b/src/gallium/drivers/freedreno/Android.mk
@@ -27,6 +27,7 @@ include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
+	$(drm_SOURCES) \
 	$(a2xx_SOURCES) \
 	$(a3xx_SOURCES)	\
 	$(a4xx_SOURCES) \
@@ -42,7 +43,7 @@ LOCAL_C_INCLUDES := \
 
 LOCAL_GENERATED_SOURCES := $(MESA_GEN_NIR_H)
 
-LOCAL_SHARED_LIBRARIES := libdrm_freedreno
+LOCAL_SHARED_LIBRARIES := libdrm
 LOCAL_STATIC_LIBRARIES := libmesa_glsl libmesa_nir
 LOCAL_MODULE := libmesa_pipe_freedreno
 
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index c19b776892c..2024a2da232 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -7,7 +7,8 @@ AM_CFLAGS = \
 	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_srcdir)/src/compiler/nir \
 	$(GALLIUM_DRIVER_CFLAGS) \
-	$(FREEDRENO_CFLAGS)
+	$(LIBDRM_CFLAGS) \
+	$(VALGRIND_CFLAGS)
 
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 ir3/ir3_nir_trig.c: ir3/ir3_nir_trig.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
@@ -18,6 +19,7 @@ noinst_LTLIBRARIES = libfreedreno.la
 
 libfreedreno_la_SOURCES = \
 	$(C_SOURCES) \
+	$(drm_SOURCES) \
 	$(a2xx_SOURCES) \
 	$(a3xx_SOURCES) \
 	$(a4xx_SOURCES) \
@@ -45,6 +47,7 @@ ir3_compiler_LDADD = \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/mesa/libmesagallium.la \
 	$(GALLIUM_COMMON_LIB_DEPS) \
-	$(FREEDRENO_LIBS)
+	$(LIBDRM_LIBS) \
+	$(VALGRIND_LIBS)
 
 EXTRA_DIST += meson.build
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index c76c53278a9..8b4d61c9884 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -40,6 +40,23 @@ C_SOURCES := \
 	freedreno_util.c \
 	freedreno_util.h
 
+drm_SOURCES := \
+	drm/freedreno_bo.c \
+	drm/freedreno_bo_cache.c \
+	drm/freedreno_device.c \
+	drm/freedreno_drmif.h \
+	drm/freedreno_pipe.c \
+	drm/freedreno_priv.h \
+	drm/freedreno_ringbuffer.c \
+	drm/freedreno_ringbuffer.h \
+	drm/msm_bo.c \
+	drm/msm_device.c \
+	drm/msm_drm.h \
+	drm/msm_pipe.c \
+	drm/msm_priv.h \
+	drm/msm_ringbuffer.c \
+	drm/msm_ringbuffer_sp.c
+
 a2xx_SOURCES := \
 	a2xx/a2xx.xml.h \
 	a2xx/disasm-a2xx.c \
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index cebd5b30aaa..4596aeee025 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -29,8 +29,6 @@
 
 #include "util/u_upload_mgr.h"
 
-#include "freedreno_drmif.h"
-
 #include "freedreno_context.h"
 
 #include "ir3_shader.h"
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 3ed1b201b08..a4b84d400ef 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -29,8 +29,6 @@
 
 #include "util/u_upload_mgr.h"
 
-#include "freedreno_drmif.h"
-
 #include "freedreno_context.h"
 
 #include "ir3_shader.h"
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.h b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
index 37573460c21..0cd252167b7 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_context.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.h
@@ -29,8 +29,6 @@
 
 #include "util/u_upload_mgr.h"
 
-#include "freedreno_drmif.h"
-
 #include "freedreno_context.h"
 
 #include "ir3_shader.h"
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
index bbb12897b85..25f297cf572 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
@@ -197,8 +197,7 @@ fd5_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth)
 	// draw
 
 	if (!batch->lrz_clear) {
-		batch->lrz_clear = fd_ringbuffer_new(batch->ctx->pipe, 0x1000);
-		fd_ringbuffer_set_parent(batch->lrz_clear, batch->gmem);
+		batch->lrz_clear = fd_submit_new_ringbuffer(batch->submit, 0x1000, 0);
 	}
 
 	ring = batch->lrz_clear;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
index 43a1b1837c4..f3cdd44dec4 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h
@@ -30,8 +30,6 @@
 
 #include "util/u_upload_mgr.h"
 
-#include "freedreno_drmif.h"
-
 #include "freedreno_context.h"
 
 #include "ir3_shader.h"
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
index c0670d3a11c..9ccb03990f7 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c
@@ -297,8 +297,7 @@ fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth)
 	// draw
 
 	if (!batch->lrz_clear) {
-		batch->lrz_clear = fd_ringbuffer_new(batch->ctx->pipe, 0x1000);
-		fd_ringbuffer_set_parent(batch->lrz_clear, batch->gmem);
+		batch->lrz_clear = fd_submit_new_ringbuffer(batch->submit, 0x1000, 0);
 	}
 
 	ring = batch->lrz_clear;
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
index 8c3336d5ea6..001d69bf1c9 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c
@@ -359,8 +359,7 @@ fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
 
 	if (tex->num_samplers > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_flags(pipe, tex->num_samplers * 4 * 4,
-					FD_RINGBUFFER_OBJECT);
+			fd_ringbuffer_new_object(pipe, tex->num_samplers * 4 * 4);
 		for (unsigned i = 0; i < tex->num_samplers; i++) {
 			static const struct fd6_sampler_stateobj dummy_sampler = {};
 			const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
@@ -390,8 +389,7 @@ fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
 
 	if (tex->num_textures > 0) {
 		struct fd_ringbuffer *state =
-			fd_ringbuffer_new_flags(pipe, tex->num_textures * 16 * 4,
-					FD_RINGBUFFER_OBJECT);
+			fd_ringbuffer_new_object(pipe, tex->num_textures * 16 * 4);
 		for (unsigned i = 0; i < tex->num_textures; i++) {
 			static const struct fd6_pipe_sampler_view dummy_view = {};
 			const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
@@ -534,9 +532,8 @@ fd6_build_vbo_state(struct fd6_emit *emit, const struct ir3_shader_variant *vp)
 	const struct fd_vertex_state *vtx = emit->vtx;
 	int32_t i, j;
 
-	struct fd_ringbuffer *ring =
-		fd_ringbuffer_new_flags(emit->ctx->pipe, 4 * (10 * vp->inputs_count + 2),
-				FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+	struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit,
+			4 * (10 * vp->inputs_count + 2), FD_RINGBUFFER_STREAMING);
 
 	for (i = 0, j = 0; i <= vp->inputs_count; i++) {
 		if (vp->inputs[i].sysval)
@@ -597,9 +594,8 @@ build_zsa(struct fd6_emit *emit, bool binning_pass)
 	uint32_t gras_lrz_cntl = zsa->gras_lrz_cntl;
 	uint32_t rb_lrz_cntl = zsa->rb_lrz_cntl;
 
-	struct fd_ringbuffer *ring =
-		fd_ringbuffer_new_flags(emit->ctx->pipe, 16,
-				FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+	struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit,
+			16, FD_RINGBUFFER_STREAMING);
 
 	if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) {
 		gras_lrz_cntl = 0;
@@ -786,9 +782,8 @@ fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
 					 FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)
 
 	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
-		struct fd_ringbuffer *vsconstobj =
-			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
-					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+		struct fd_ringbuffer *vsconstobj = fd_submit_new_ringbuffer(
+				ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
 
 		ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
 		fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
@@ -796,9 +791,8 @@ fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
 	}
 
 	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) {
-		struct fd_ringbuffer *fsconstobj =
-			fd_ringbuffer_new_flags(ctx->pipe, 0x1000,
-					FD_RINGBUFFER_OBJECT | FD_RINGBUFFER_STREAMING);
+		struct fd_ringbuffer *fsconstobj = fd_submit_new_ringbuffer(
+				ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);
 
 		ir3_emit_fs_consts(fp, fsconstobj, ctx);
 		fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x6);
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_bo.c b/src/gallium/drivers/freedreno/drm/freedreno_bo.c
new file mode 100644
index 00000000000..ec46e16e9e9
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_bo.c
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "os/os_mman.h"
+
+#include "freedreno_drmif.h"
+#include "freedreno_priv.h"
+
+pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
+void bo_del(struct fd_bo *bo);
+
+/* set buffer name, and add to table, call w/ table_lock held: */
+static void set_name(struct fd_bo *bo, uint32_t name)
+{
+	bo->name = name;
+	/* add ourself into the handle table: */
+	_mesa_hash_table_insert(bo->dev->name_table, &bo->name, bo);
+}
+
+/* lookup a buffer, call w/ table_lock held: */
+static struct fd_bo * lookup_bo(struct hash_table *tbl, uint32_t key)
+{
+	struct fd_bo *bo = NULL;
+	struct hash_entry *entry = _mesa_hash_table_search(tbl, &key);
+	if (entry) {
+		/* found, incr refcnt and return: */
+		bo = fd_bo_ref(entry->data);
+
+		/* don't break the bucket if this bo was found in one */
+		list_delinit(&bo->list);
+	}
+	return bo;
+}
+
+/* allocate a new buffer object, call w/ table_lock held */
+static struct fd_bo * bo_from_handle(struct fd_device *dev,
+		uint32_t size, uint32_t handle)
+{
+	struct fd_bo *bo;
+
+	bo = dev->funcs->bo_from_handle(dev, size, handle);
+	if (!bo) {
+		struct drm_gem_close req = {
+				.handle = handle,
+		};
+		drmIoctl(dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
+		return NULL;
+	}
+	bo->dev = fd_device_ref(dev);
+	bo->size = size;
+	bo->handle = handle;
+	p_atomic_set(&bo->refcnt, 1);
+	list_inithead(&bo->list);
+	/* add ourself into the handle table: */
+	_mesa_hash_table_insert(dev->handle_table, &bo->handle, bo);
+	return bo;
+}
+
+static struct fd_bo *
+bo_new(struct fd_device *dev, uint32_t size, uint32_t flags,
+		struct fd_bo_cache *cache)
+{
+	struct fd_bo *bo = NULL;
+	uint32_t handle;
+	int ret;
+
+	bo = fd_bo_cache_alloc(cache, &size, flags);
+	if (bo)
+		return bo;
+
+	ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
+	if (ret)
+		return NULL;
+
+	pthread_mutex_lock(&table_lock);
+	bo = bo_from_handle(dev, size, handle);
+	pthread_mutex_unlock(&table_lock);
+
+	VG_BO_ALLOC(bo);
+
+	return bo;
+}
+
+struct fd_bo *
+fd_bo_new(struct fd_device *dev, uint32_t size, uint32_t flags)
+{
+	struct fd_bo *bo = bo_new(dev, size, flags, &dev->bo_cache);
+	if (bo)
+		bo->bo_reuse = BO_CACHE;
+	return bo;
+}
+
+/* internal function to allocate bo's that use the ringbuffer cache
+ * instead of the normal bo_cache.  The purpose is, because cmdstream
+ * bo's get vmap'd on the kernel side, and that is expensive, we want
+ * to re-use cmdstream bo's for cmdstream and not unrelated purposes.
+ */
+struct fd_bo *
+fd_bo_new_ring(struct fd_device *dev, uint32_t size, uint32_t flags)
+{
+	struct fd_bo *bo = bo_new(dev, size, flags, &dev->ring_cache);
+	if (bo)
+		bo->bo_reuse = RING_CACHE;
+	return bo;
+}
+
+struct fd_bo *
+fd_bo_from_handle(struct fd_device *dev, uint32_t handle, uint32_t size)
+{
+	struct fd_bo *bo = NULL;
+
+	pthread_mutex_lock(&table_lock);
+
+	bo = lookup_bo(dev->handle_table, handle);
+	if (bo)
+		goto out_unlock;
+
+	bo = bo_from_handle(dev, size, handle);
+
+	VG_BO_ALLOC(bo);
+
+out_unlock:
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+struct fd_bo *
+fd_bo_from_dmabuf(struct fd_device *dev, int fd)
+{
+	int ret, size;
+	uint32_t handle;
+	struct fd_bo *bo;
+
+	pthread_mutex_lock(&table_lock);
+	ret = drmPrimeFDToHandle(dev->fd, fd, &handle);
+	if (ret) {
+		pthread_mutex_unlock(&table_lock);
+		return NULL;
+	}
+
+	bo = lookup_bo(dev->handle_table, handle);
+	if (bo)		/* already imported, lookup_bo() took a new reference */
+		goto out_unlock;
+
+	/* lseek() to get bo size, then rewind (SEEK_CUR w/ 0 was a no-op): */
+	size = lseek(fd, 0, SEEK_END);
+	lseek(fd, 0, SEEK_SET);
+
+	bo = bo_from_handle(dev, size, handle);
+
+	VG_BO_ALLOC(bo);
+
+out_unlock:
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+struct fd_bo * fd_bo_from_name(struct fd_device *dev, uint32_t name)
+{
+	struct drm_gem_open req = {
+			.name = name,
+	};
+	struct fd_bo *bo;
+
+	pthread_mutex_lock(&table_lock);
+
+	/* check name table first, to see if bo is already open: */
+	bo = lookup_bo(dev->name_table, name);
+	if (bo)
+		goto out_unlock;
+
+	if (drmIoctl(dev->fd, DRM_IOCTL_GEM_OPEN, &req)) {
+		ERROR_MSG("gem-open failed: %s", strerror(errno));
+		goto out_unlock;
+	}
+
+	bo = lookup_bo(dev->handle_table, req.handle);
+	if (bo)
+		goto out_unlock;
+
+	bo = bo_from_handle(dev, req.size, req.handle);
+	if (bo) {
+		set_name(bo, name);
+		VG_BO_ALLOC(bo);
+	}
+
+out_unlock:
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+uint64_t fd_bo_get_iova(struct fd_bo *bo)
+{
+	if (!bo->iova)
+		bo->iova = bo->funcs->iova(bo);
+	return bo->iova;
+}
+
+void fd_bo_put_iova(struct fd_bo *bo)
+{
+	/* currently a no-op */
+}
+
+struct fd_bo * fd_bo_ref(struct fd_bo *bo)
+{
+	p_atomic_inc(&bo->refcnt);
+	return bo;
+}
+
+void fd_bo_del(struct fd_bo *bo)
+{
+	struct fd_device *dev = bo->dev;
+
+	if (!atomic_dec_and_test(&bo->refcnt))
+		return;
+
+	pthread_mutex_lock(&table_lock);
+
+	if ((bo->bo_reuse == BO_CACHE) && (fd_bo_cache_free(&dev->bo_cache, bo) == 0))
+		goto out;
+	if ((bo->bo_reuse == RING_CACHE) && (fd_bo_cache_free(&dev->ring_cache, bo) == 0))
+		goto out;
+
+	bo_del(bo);
+	fd_device_del_locked(dev);
+out:
+	pthread_mutex_unlock(&table_lock);
+}
+
+/* Called under table_lock */
+void bo_del(struct fd_bo *bo)
+{
+	VG_BO_FREE(bo);
+
+	if (bo->map)
+		os_munmap(bo->map, bo->size);
+
+	/* TODO probably bo's in bucket list get removed from
+	 * handle table??
+	 */
+
+	if (bo->handle) {
+		struct drm_gem_close req = {
+				.handle = bo->handle,
+		};
+		_mesa_hash_table_remove_key(bo->dev->handle_table, &bo->handle);
+		if (bo->name)
+			_mesa_hash_table_remove_key(bo->dev->name_table, &bo->name);
+		drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
+	}
+
+	bo->funcs->destroy(bo);
+}
+
+int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
+{
+	if (!bo->name) {
+		struct drm_gem_flink req = {
+				.handle = bo->handle,
+		};
+		int ret;
+
+		ret = drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_FLINK, &req);
+		if (ret) {
+			return ret;
+		}
+
+		pthread_mutex_lock(&table_lock);
+		set_name(bo, req.name);
+		pthread_mutex_unlock(&table_lock);
+		bo->bo_reuse = NO_CACHE;
+	}
+
+	*name = bo->name;
+
+	return 0;
+}
+
+uint32_t fd_bo_handle(struct fd_bo *bo)
+{
+	return bo->handle;
+}
+
+int fd_bo_dmabuf(struct fd_bo *bo)
+{
+	int ret, prime_fd;
+
+	ret = drmPrimeHandleToFD(bo->dev->fd, bo->handle, DRM_CLOEXEC,
+			&prime_fd);
+	if (ret) {
+		ERROR_MSG("failed to get dmabuf fd: %d", ret);
+		return ret;
+	}
+
+	bo->bo_reuse = NO_CACHE;
+
+	return prime_fd;
+}
+
+uint32_t fd_bo_size(struct fd_bo *bo)
+{
+	return bo->size;
+}
+
+void * fd_bo_map(struct fd_bo *bo)
+{
+	if (!bo->map) {
+		uint64_t offset;
+		int ret;
+
+		ret = bo->funcs->offset(bo, &offset);
+		if (ret) {
+			return NULL;
+		}
+
+		bo->map = os_mmap(0, bo->size, PROT_READ | PROT_WRITE, MAP_SHARED,
+				bo->dev->fd, offset);
+		if (bo->map == MAP_FAILED) {
+			ERROR_MSG("mmap failed: %s", strerror(errno));
+			bo->map = NULL;
+		}
+	}
+	return bo->map;
+}
+
+/* a bit odd to take the pipe as an arg, but it's a, umm, quirk of kgsl.. */
+int fd_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
+{
+	return bo->funcs->cpu_prep(bo, pipe, op);
+}
+
+void fd_bo_cpu_fini(struct fd_bo *bo)
+{
+	bo->funcs->cpu_fini(bo);
+}
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_bo_cache.c b/src/gallium/drivers/freedreno/drm/freedreno_bo_cache.c
new file mode 100644
index 00000000000..e8193caa721
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_bo_cache.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "freedreno_drmif.h"
+#include "freedreno_priv.h"
+
+void bo_del(struct fd_bo *bo);
+extern pthread_mutex_t table_lock;
+
+static void
+add_bucket(struct fd_bo_cache *cache, int size)
+{
+	unsigned int i = cache->num_buckets;
+
+	assert(i < ARRAY_SIZE(cache->cache_bucket));
+
+	list_inithead(&cache->cache_bucket[i].list);
+	cache->cache_bucket[i].size = size;
+	cache->num_buckets++;
+}
+
+/**
+ * @coarse: if true, only power-of-two bucket sizes, otherwise
+ *    fill in for a bit smoother size curve..
+ */
+void
+fd_bo_cache_init(struct fd_bo_cache *cache, int coarse)
+{
+	unsigned long size, cache_max_size = 64 * 1024 * 1024;
+
+	/* OK, so power of two buckets was too wasteful of memory.
+	 * Give 3 other sizes between each power of two, to hopefully
+	 * cover things accurately enough.  (The alternative is
+	 * probably to just go for exact matching of sizes, and assume
+	 * that for things like composited window resize the tiled
+	 * width/height alignment and rounding of sizes to pages will
+	 * get us useful cache hit rates anyway)
+	 */
+	add_bucket(cache, 4096);
+	add_bucket(cache, 4096 * 2);
+	if (!coarse)
+		add_bucket(cache, 4096 * 3);
+
+	/* Initialize the linked lists for BO reuse cache. */
+	for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
+		add_bucket(cache, size);
+		if (!coarse) {
+			add_bucket(cache, size + size * 1 / 4);
+			add_bucket(cache, size + size * 2 / 4);
+			add_bucket(cache, size + size * 3 / 4);
+		}
+	}
+}
+
+/* Frees older cached buffers.  Called under table_lock */
+void
+fd_bo_cache_cleanup(struct fd_bo_cache *cache, time_t time)
+{
+	int i;
+
+	if (cache->time == time)
+		return;
+
+	for (i = 0; i < cache->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &cache->cache_bucket[i];
+		struct fd_bo *bo;
+
+		while (!LIST_IS_EMPTY(&bucket->list)) {
+			bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+
+			/* keep things in cache for at least 1 second: */
+			if (time && ((time - bo->free_time) <= 1))
+				break;
+
+			VG_BO_OBTAIN(bo);
+			list_del(&bo->list);
+			bo_del(bo);
+		}
+	}
+
+	cache->time = time;
+}
+
+static struct fd_bo_bucket * get_bucket(struct fd_bo_cache *cache, uint32_t size)
+{
+	int i;
+
+	/* hmm, this is what intel does, but I suppose we could calculate our
+	 * way to the correct bucket size rather than looping..
+	 */
+	for (i = 0; i < cache->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &cache->cache_bucket[i];
+		if (bucket->size >= size) {
+			return bucket;
+		}
+	}
+
+	return NULL;
+}
+
+static int is_idle(struct fd_bo *bo)
+{
+	return fd_bo_cpu_prep(bo, NULL,
+			DRM_FREEDRENO_PREP_READ |
+			DRM_FREEDRENO_PREP_WRITE |
+			DRM_FREEDRENO_PREP_NOSYNC) == 0;
+}
+
+static struct fd_bo *find_in_bucket(struct fd_bo_bucket *bucket, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+
+	/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
+	 * skip the busy check.. if it is only going to be a render target
+	 * then we probably don't need to stall..
+	 *
+	 * NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
+	 * (MRU, since likely to be in GPU cache), rather than head (LRU)..
+	 */
+	pthread_mutex_lock(&table_lock);
+	if (!LIST_IS_EMPTY(&bucket->list)) {
+		bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+		/* TODO check for compatible flags? */
+		if (is_idle(bo)) {
+			list_del(&bo->list);
+		} else {
+			bo = NULL;
+		}
+	}
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+/* NOTE: size is potentially rounded up to bucket size: */
+struct fd_bo *
+fd_bo_cache_alloc(struct fd_bo_cache *cache, uint32_t *size, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+	struct fd_bo_bucket *bucket;
+
+	*size = align(*size, 4096);
+	bucket = get_bucket(cache, *size);
+
+	/* see if we can be green and recycle: */
+retry:
+	if (bucket) {
+		*size = bucket->size;
+		bo = find_in_bucket(bucket, flags);
+		if (bo) {
+			VG_BO_OBTAIN(bo);
+			if (bo->funcs->madvise(bo, TRUE) <= 0) {
+				/* we've lost the backing pages, delete and try again: */
+				pthread_mutex_lock(&table_lock);
+				bo_del(bo);
+				pthread_mutex_unlock(&table_lock);
+				goto retry;
+			}
+			p_atomic_set(&bo->refcnt, 1);
+			fd_device_ref(bo->dev);
+			return bo;
+		}
+	}
+
+	return NULL;
+}
+
+int
+fd_bo_cache_free(struct fd_bo_cache *cache, struct fd_bo *bo)
+{
+	struct fd_bo_bucket *bucket = get_bucket(cache, bo->size);
+
+	/* see if we can be green and recycle: */
+	if (bucket) {
+		struct timespec time;
+
+		bo->funcs->madvise(bo, FALSE);
+
+		clock_gettime(CLOCK_MONOTONIC, &time);
+
+		bo->free_time = time.tv_sec;
+		VG_BO_RELEASE(bo);
+		list_addtail(&bo->list, &bucket->list);
+		fd_bo_cache_cleanup(cache, time.tv_sec);
+
+		/* bo's in the bucket cache don't have a ref and
+		 * don't hold a ref to the dev:
+		 */
+		fd_device_del_locked(bo->dev);
+
+		return 0;
+	}
+
+	return -1;
+}
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_device.c b/src/gallium/drivers/freedreno/drm/freedreno_device.c
new file mode 100644
index 00000000000..b2f6c981963
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_device.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "freedreno_drmif.h"
+#include "freedreno_priv.h"
+
+extern pthread_mutex_t table_lock;	/* shared lock, defined in freedreno_bo.c */
+
+static uint32_t
+u32_hash(const void *key)
+{
+	return _mesa_hash_data(key, sizeof(uint32_t));
+}
+
+static bool
+u32_equals(const void *key1, const void *key2)
+{
+	return *(const uint32_t *)key1 == *(const uint32_t *)key2;
+}
+
+
+struct fd_device * kgsl_device_new(int fd);
+struct fd_device * msm_device_new(int fd);
+
+struct fd_device * fd_device_new(int fd)
+{
+	struct fd_device *dev = NULL;
+	drmVersionPtr version;
+
+	/* figure out if we are kgsl or msm drm driver: */
+	version = drmGetVersion(fd);
+	if (!version) {
+		ERROR_MSG("cannot get version: %s", strerror(errno));
+		return NULL;
+	}
+
+	if (!strcmp(version->name, "msm")) {
+		DEBUG_MSG("msm DRM device");
+		if (version->version_major != 1) {
+			ERROR_MSG("unsupported version: %u.%u.%u", version->version_major,
+				version->version_minor, version->version_patchlevel);
+			goto out;
+		}
+
+		dev = msm_device_new(fd);
+		/* in case msm_device_new() returned NULL, don't deref it: */
+		if (dev)
+			dev->version = version->version_minor;
+#if HAVE_FREEDRENO_KGSL
+	} else if (!strcmp(version->name, "kgsl")) {
+		DEBUG_MSG("kgsl DRM device");
+		dev = kgsl_device_new(fd);
+#endif
+	} else {
+		ERROR_MSG("unknown device: %s", version->name);
+	}
+
+out:
+	drmFreeVersion(version);
+
+	if (!dev)
+		return NULL;
+
+	p_atomic_set(&dev->refcnt, 1);
+	dev->fd = fd;
+	dev->handle_table = _mesa_hash_table_create(NULL, u32_hash, u32_equals);
+	dev->name_table = _mesa_hash_table_create(NULL, u32_hash, u32_equals);
+	fd_bo_cache_init(&dev->bo_cache, FALSE);
+	fd_bo_cache_init(&dev->ring_cache, TRUE);
+
+	return dev;
+}
+
+/* like fd_device_new() but creates its own private dup() of the fd
+ * which is close()d when the device is finalized.
+ */
+struct fd_device * fd_device_new_dup(int fd)
+{
+	int dup_fd = dup(fd);
+	struct fd_device *dev = fd_device_new(dup_fd);
+	if (dev)
+		dev->closefd = 1;
+	else
+		close(dup_fd);
+	return dev;
+}
+
+struct fd_device * fd_device_ref(struct fd_device *dev)
+{
+	p_atomic_inc(&dev->refcnt);
+	return dev;
+}
+
+static void fd_device_del_impl(struct fd_device *dev)
+{
+	int close_fd = dev->closefd ? dev->fd : -1;	/* read before destroy() */
+	fd_bo_cache_cleanup(&dev->bo_cache, 0);
+	fd_bo_cache_cleanup(&dev->ring_cache, 0);	/* ring bo's have their own cache */
+	_mesa_hash_table_destroy(dev->handle_table, NULL);
+	_mesa_hash_table_destroy(dev->name_table, NULL);
+	dev->funcs->destroy(dev);
+	if (close_fd >= 0)
+		close(close_fd);
+}
+
+void fd_device_del_locked(struct fd_device *dev)
+{
+	if (atomic_dec_and_test(&dev->refcnt))	/* dropped the last reference */
+		fd_device_del_impl(dev);
+}
+
+void fd_device_del(struct fd_device *dev)
+{
+	if (!atomic_dec_and_test(&dev->refcnt))
+		return;
+	pthread_mutex_lock(&table_lock);
+	fd_device_del_impl(dev);
+	pthread_mutex_unlock(&table_lock);
+}
+
+int fd_device_fd(struct fd_device *dev)
+{
+	return dev->fd;
+}
+
+enum fd_version fd_device_version(struct fd_device *dev)
+{
+	return dev->version;
+}
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_drmif.h b/src/gallium/drivers/freedreno/drm/freedreno_drmif.h
new file mode 100644
index 00000000000..6468eac4a07
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_drmif.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_DRMIF_H_
+#define FREEDRENO_DRMIF_H_
+
+#include <stdint.h>
+
+struct fd_bo;
+struct fd_pipe;
+struct fd_device;
+
+enum fd_pipe_id {
+	FD_PIPE_3D = 1,
+	FD_PIPE_2D = 2,
+	/* some devices have two 2d blocks.. not really sure how to
+	 * use that yet, so just ignoring the 2nd 2d pipe for now
+	 */
+	FD_PIPE_MAX
+};
+
+enum fd_param_id {
+	FD_DEVICE_ID,
+	FD_GMEM_SIZE,
+	FD_GPU_ID,
+	FD_CHIP_ID,
+	FD_MAX_FREQ,
+	FD_TIMESTAMP,
+	FD_NR_RINGS,      /* # of rings == # of distinct priority levels */
+};
+
+/* bo flags: */
+#define DRM_FREEDRENO_GEM_TYPE_SMI        0x00000001
+#define DRM_FREEDRENO_GEM_TYPE_KMEM       0x00000002
+#define DRM_FREEDRENO_GEM_TYPE_MEM_MASK   0x0000000f
+#define DRM_FREEDRENO_GEM_CACHE_NONE      0x00000000
+#define DRM_FREEDRENO_GEM_CACHE_WCOMBINE  0x00100000
+#define DRM_FREEDRENO_GEM_CACHE_WTHROUGH  0x00200000
+#define DRM_FREEDRENO_GEM_CACHE_WBACK     0x00400000
+#define DRM_FREEDRENO_GEM_CACHE_WBACKWA   0x00800000
+#define DRM_FREEDRENO_GEM_CACHE_MASK      0x00f00000
+#define DRM_FREEDRENO_GEM_GPUREADONLY     0x01000000
+
+/* bo access flags: (keep aligned to MSM_PREP_x) */
+#define DRM_FREEDRENO_PREP_READ           0x01
+#define DRM_FREEDRENO_PREP_WRITE          0x02
+#define DRM_FREEDRENO_PREP_NOSYNC         0x04
+
+/* device functions:
+ */
+
+struct fd_device * fd_device_new(int fd);
+struct fd_device * fd_device_new_dup(int fd);
+struct fd_device * fd_device_ref(struct fd_device *dev);
+void fd_device_del(struct fd_device *dev);
+int fd_device_fd(struct fd_device *dev);
+
+enum fd_version {
+	FD_VERSION_MADVISE = 1,            /* kernel supports madvise */
+	FD_VERSION_UNLIMITED_CMDS = 1,     /* submits w/ >4 cmd buffers (growable ringbuffer) */
+	FD_VERSION_FENCE_FD = 2,           /* submit command supports in/out fences */
+	FD_VERSION_SUBMIT_QUEUES = 3,      /* submit queues and multiple priority levels */
+	FD_VERSION_BO_IOVA = 3,            /* supports fd_bo_get/put_iova() */
+};
+enum fd_version fd_device_version(struct fd_device *dev);
+
+/* pipe functions:
+ */
+
+struct fd_pipe * fd_pipe_new(struct fd_device *dev, enum fd_pipe_id id);
+struct fd_pipe * fd_pipe_new2(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio);
+struct fd_pipe * fd_pipe_ref(struct fd_pipe *pipe);
+void fd_pipe_del(struct fd_pipe *pipe);
+int fd_pipe_get_param(struct fd_pipe *pipe, enum fd_param_id param,
+		uint64_t *value);
+int fd_pipe_wait(struct fd_pipe *pipe, uint32_t timestamp);
+/* timeout in nanosec */
+int fd_pipe_wait_timeout(struct fd_pipe *pipe, uint32_t timestamp,
+		uint64_t timeout);
+
+
+/* buffer-object functions:
+ */
+
+struct fd_bo * fd_bo_new(struct fd_device *dev,
+		uint32_t size, uint32_t flags);
+struct fd_bo *fd_bo_from_handle(struct fd_device *dev,
+		uint32_t handle, uint32_t size);
+struct fd_bo * fd_bo_from_name(struct fd_device *dev, uint32_t name);
+struct fd_bo * fd_bo_from_dmabuf(struct fd_device *dev, int fd);
+uint64_t fd_bo_get_iova(struct fd_bo *bo);
+void fd_bo_put_iova(struct fd_bo *bo);
+struct fd_bo * fd_bo_ref(struct fd_bo *bo);
+void fd_bo_del(struct fd_bo *bo);
+int fd_bo_get_name(struct fd_bo *bo, uint32_t *name);
+uint32_t fd_bo_handle(struct fd_bo *bo);
+int fd_bo_dmabuf(struct fd_bo *bo);
+uint32_t fd_bo_size(struct fd_bo *bo);
+void * fd_bo_map(struct fd_bo *bo);
+int fd_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op);
+void fd_bo_cpu_fini(struct fd_bo *bo);
+
+#endif /* FREEDRENO_DRMIF_H_ */
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_pipe.c b/src/gallium/drivers/freedreno/drm/freedreno_pipe.c
new file mode 100644
index 00000000000..a4fd856bea6
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_pipe.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "freedreno_drmif.h"
+#include "freedreno_priv.h"
+
+/**
+ * @id must be below the FD_PIPE_MAX sentinel.  @prio of zero is highest
+ * priority, higher numeric values are lower.  Returns NULL on failure.
+ */
+struct fd_pipe *
+fd_pipe_new2(struct fd_device *dev, enum fd_pipe_id id, uint32_t prio)
+{
+	struct fd_pipe *pipe;
+	uint64_t val = 0;   /* so gpu_id is not garbage if the FD_GPU_ID query fails */
+
+	if (id >= FD_PIPE_MAX) {   /* FD_PIPE_MAX itself is not a pipe */
+		ERROR_MSG("invalid pipe id: %d", id);
+		return NULL;
+	}
+
+	if ((prio != 1) && (fd_device_version(dev) < FD_VERSION_SUBMIT_QUEUES)) {
+		ERROR_MSG("invalid priority!");
+		return NULL;
+	}
+
+	pipe = dev->funcs->pipe_new(dev, id, prio);
+	if (!pipe) {
+		ERROR_MSG("allocation failed");
+		return NULL;
+	}
+
+	pipe->dev = dev;
+	pipe->id = id;
+	p_atomic_set(&pipe->refcnt, 1);
+
+	fd_pipe_get_param(pipe, FD_GPU_ID, &val);
+	pipe->gpu_id = val;
+
+	return pipe;
+}
+
+struct fd_pipe *
+fd_pipe_new(struct fd_device *dev, enum fd_pipe_id id)
+{
+	return fd_pipe_new2(dev, id, 1);
+}
+
+struct fd_pipe * fd_pipe_ref(struct fd_pipe *pipe)
+{
+	p_atomic_inc(&pipe->refcnt);
+	return pipe;
+}
+
+void fd_pipe_del(struct fd_pipe *pipe)
+{
+	if (!atomic_dec_and_test(&pipe->refcnt))
+		return;
+	pipe->funcs->destroy(pipe);
+}
+
+int fd_pipe_get_param(struct fd_pipe *pipe,
+				 enum fd_param_id param, uint64_t *value)
+{
+	return pipe->funcs->get_param(pipe, param, value);
+}
+
+int fd_pipe_wait(struct fd_pipe *pipe, uint32_t timestamp)
+{
+	return fd_pipe_wait_timeout(pipe, timestamp, ~0);
+}
+
+int fd_pipe_wait_timeout(struct fd_pipe *pipe, uint32_t timestamp,
+		uint64_t timeout)
+{
+	return pipe->funcs->wait(pipe, timestamp, timeout);
+}
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_priv.h b/src/gallium/drivers/freedreno/drm/freedreno_priv.h
new file mode 100644
index 00000000000..45a5d6ccba1
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_priv.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_PRIV_H_
+#define FREEDRENO_PRIV_H_
+
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <stdio.h>
+
+#include <xf86drm.h>
+
+#include "util/hash_table.h"
+#include "util/list.h"
+#include "util/u_debug.h"
+#include "util/u_atomic.h"
+#include "util/u_math.h"
+#include "util/u_debug.h"
+
+#include "freedreno_drmif.h"
+#include "freedreno_ringbuffer.h"
+
+#define atomic_dec_and_test(x) (__sync_add_and_fetch (x, -1) == 0)
+
+struct fd_device_funcs {
+	int (*bo_new_handle)(struct fd_device *dev, uint32_t size,
+			uint32_t flags, uint32_t *handle);
+	struct fd_bo * (*bo_from_handle)(struct fd_device *dev,
+			uint32_t size, uint32_t handle);
+	struct fd_pipe * (*pipe_new)(struct fd_device *dev, enum fd_pipe_id id,
+			unsigned prio);
+	void (*destroy)(struct fd_device *dev);
+};
+
+struct fd_bo_bucket {
+	uint32_t size;
+	struct list_head list;
+};
+
+struct fd_bo_cache {
+	struct fd_bo_bucket cache_bucket[14 * 4];
+	int num_buckets;
+	time_t time;
+};
+
+struct fd_device {
+	int fd;
+	enum fd_version version;
+	int32_t refcnt;
+
+	/* tables to keep track of bo's, to avoid "evil-twin" fd_bo objects:
+	 *
+	 *   handle_table: maps handle to fd_bo
+	 *   name_table: maps flink name to fd_bo
+	 *
+	 * We end up needing two tables, because DRM_IOCTL_GEM_OPEN always
+	 * returns a new handle.  So we need to figure out if the bo is already
+	 * open in the process first, before calling gem-open.
+	 */
+	struct hash_table *handle_table, *name_table;
+
+	const struct fd_device_funcs *funcs;
+
+	struct fd_bo_cache bo_cache;
+	struct fd_bo_cache ring_cache;
+
+	int closefd;        /* call close(fd) upon destruction */
+
+	/* just for valgrind: */
+	int bo_size;
+};
+
+void fd_bo_cache_init(struct fd_bo_cache *cache, int coarse);
+void fd_bo_cache_cleanup(struct fd_bo_cache *cache, time_t time);
+struct fd_bo * fd_bo_cache_alloc(struct fd_bo_cache *cache,
+		uint32_t *size, uint32_t flags);
+int fd_bo_cache_free(struct fd_bo_cache *cache, struct fd_bo *bo);
+
+/* for where @table_lock is already held: */
+void fd_device_del_locked(struct fd_device *dev);
+
+struct fd_pipe_funcs {
+	struct fd_ringbuffer * (*ringbuffer_new_object)(struct fd_pipe *pipe, uint32_t size);
+	struct fd_submit * (*submit_new)(struct fd_pipe *pipe);
+	int (*get_param)(struct fd_pipe *pipe, enum fd_param_id param, uint64_t *value);
+	int (*wait)(struct fd_pipe *pipe, uint32_t timestamp, uint64_t timeout);
+	void (*destroy)(struct fd_pipe *pipe);
+};
+
+struct fd_pipe {
+	struct fd_device *dev;
+	enum fd_pipe_id id;
+	uint32_t gpu_id;
+	int32_t refcnt;
+	const struct fd_pipe_funcs *funcs;
+};
+
+struct fd_submit_funcs {
+	struct fd_ringbuffer * (*new_ringbuffer)(struct fd_submit *submit,
+			uint32_t size, enum fd_ringbuffer_flags flags);
+	int (*flush)(struct fd_submit *submit, int in_fence_fd,
+			int *out_fence_fd, uint32_t *out_fence);
+	void (*destroy)(struct fd_submit *submit);
+};
+
+struct fd_submit {
+	struct fd_pipe *pipe;
+	const struct fd_submit_funcs *funcs;
+};
+
+struct fd_ringbuffer_funcs {
+	void (*grow)(struct fd_ringbuffer *ring, uint32_t size);
+	void (*emit_reloc)(struct fd_ringbuffer *ring,
+			const struct fd_reloc *reloc);
+	uint32_t (*emit_reloc_ring)(struct fd_ringbuffer *ring,
+			struct fd_ringbuffer *target, uint32_t cmd_idx);
+	uint32_t (*cmd_count)(struct fd_ringbuffer *ring);
+	void (*destroy)(struct fd_ringbuffer *ring);
+};
+
+struct fd_bo_funcs {
+	int (*offset)(struct fd_bo *bo, uint64_t *offset);
+	int (*cpu_prep)(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op);
+	void (*cpu_fini)(struct fd_bo *bo);
+	int (*madvise)(struct fd_bo *bo, int willneed);
+	uint64_t (*iova)(struct fd_bo *bo);
+	void (*destroy)(struct fd_bo *bo);
+};
+
+struct fd_bo {
+	struct fd_device *dev;
+	uint32_t size;
+	uint32_t handle;
+	uint32_t name;
+	int32_t refcnt;
+	uint64_t iova;
+	void *map;
+	const struct fd_bo_funcs *funcs;
+
+	enum {
+		NO_CACHE = 0,
+		BO_CACHE = 1,
+		RING_CACHE = 2,
+	} bo_reuse;
+
+	struct list_head list;   /* bucket-list entry */
+	time_t free_time;        /* time when added to bucket-list */
+};
+
+struct fd_bo *fd_bo_new_ring(struct fd_device *dev,
+		uint32_t size, uint32_t flags);
+
+#define enable_debug 0  /* TODO make dynamic */
+
+#define INFO_MSG(fmt, ...) \
+		do { debug_printf("[I] "fmt " (%s:%d)\n", \
+				##__VA_ARGS__, __FUNCTION__, __LINE__); } while (0)
+#define DEBUG_MSG(fmt, ...) \
+		do if (enable_debug) { debug_printf("[D] "fmt " (%s:%d)\n", \
+				##__VA_ARGS__, __FUNCTION__, __LINE__); } while (0)
+#define WARN_MSG(fmt, ...) \
+		do { debug_printf("[W] "fmt " (%s:%d)\n", \
+				##__VA_ARGS__, __FUNCTION__, __LINE__); } while (0)
+#define ERROR_MSG(fmt, ...) \
+		do { debug_printf("[E] " fmt " (%s:%d)\n", \
+				##__VA_ARGS__, __FUNCTION__, __LINE__); } while (0)
+
+#define U642VOID(x) ((void *)(unsigned long)(x))
+#define VOID2U64(x) ((uint64_t)(unsigned long)(x))
+
+#if HAVE_VALGRIND
+#  include <memcheck.h>
+
+/*
+ * For tracking the backing memory (if valgrind enabled, we force a mmap
+ * for the purposes of tracking)
+ */
+static inline void VG_BO_ALLOC(struct fd_bo *bo)
+{
+	if (bo && RUNNING_ON_VALGRIND) {
+		VALGRIND_MALLOCLIKE_BLOCK(fd_bo_map(bo), bo->size, 0, 1);
+	}
+}
+
+static inline void VG_BO_FREE(struct fd_bo *bo)
+{
+	VALGRIND_FREELIKE_BLOCK(bo->map, 0);
+}
+
+/*
+ * For tracking bo structs that are in the buffer-cache, so that valgrind
+ * doesn't attribute ownership to the first one to allocate the recycled
+ * bo.
+ *
+ * Note that the list_head in fd_bo is used to track the buffers in cache
+ * so disable error reporting on the range while they are in cache so
+ * valgrind doesn't squawk about list traversal.
+ *
+ */
+static inline void VG_BO_RELEASE(struct fd_bo *bo)
+{
+	if (RUNNING_ON_VALGRIND) {
+		VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(bo, bo->dev->bo_size);
+		VALGRIND_MAKE_MEM_NOACCESS(bo, bo->dev->bo_size);
+		VALGRIND_FREELIKE_BLOCK(bo->map, 0);
+	}
+}
+static inline void VG_BO_OBTAIN(struct fd_bo *bo)
+{
+	if (RUNNING_ON_VALGRIND) {
+		VALGRIND_MAKE_MEM_DEFINED(bo, bo->dev->bo_size);
+		VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(bo, bo->dev->bo_size);
+		VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1);
+	}
+}
+#else
+static inline void VG_BO_ALLOC(struct fd_bo *bo)   {}
+static inline void VG_BO_FREE(struct fd_bo *bo)    {}
+static inline void VG_BO_RELEASE(struct fd_bo *bo) {}
+static inline void VG_BO_OBTAIN(struct fd_bo *bo)  {}
+#endif
+
+#define FD_DEFINE_CAST(parent, child) \
+static inline struct child * to_ ## child (struct parent *x) \
+{ return (struct child *)x; }
+
+
+#endif /* FREEDRENO_PRIV_H_ */
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.c b/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.c
new file mode 100644
index 00000000000..671cbb11f68
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <assert.h>
+
+#include "freedreno_drmif.h"
+#include "freedreno_ringbuffer.h"
+#include "freedreno_priv.h"
+
+struct fd_submit *
+fd_submit_new(struct fd_pipe *pipe)
+{
+	return pipe->funcs->submit_new(pipe);
+}
+
+void
+fd_submit_del(struct fd_submit *submit)
+{
+	submit->funcs->destroy(submit);   /* dispatch to the backend's destroy hook */
+}
+
+int
+fd_submit_flush(struct fd_submit *submit, int in_fence_fd, int *out_fence_fd,
+		uint32_t *out_fence)
+{
+	return submit->funcs->flush(submit, in_fence_fd, out_fence_fd, out_fence);
+}
+
+struct fd_ringbuffer *
+fd_submit_new_ringbuffer(struct fd_submit *submit, uint32_t size,
+		enum fd_ringbuffer_flags flags)
+{
+	debug_assert(!(flags & _FD_RINGBUFFER_OBJECT));
+	if (flags & FD_RINGBUFFER_STREAMING) {
+		debug_assert(!(flags & FD_RINGBUFFER_GROWABLE));
+		debug_assert(!(flags & FD_RINGBUFFER_PRIMARY));
+	}
+	return submit->funcs->new_ringbuffer(submit, size, flags);
+}
+
+struct fd_ringbuffer *
+fd_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size)
+{
+	return pipe->funcs->ringbuffer_new_object(pipe, size);
+}
+
+void fd_ringbuffer_del(struct fd_ringbuffer *ring)
+{
+	if (!atomic_dec_and_test(&ring->refcnt))
+		return;
+
+	ring->funcs->destroy(ring);
+}
+
+struct fd_ringbuffer *
+fd_ringbuffer_ref(struct fd_ringbuffer *ring)
+{
+	p_atomic_inc(&ring->refcnt);
+	return ring;
+}
+
+void fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords)
+{
+	assert(ring->funcs->grow);     /* unsupported on kgsl */
+
+	/* ndwords is unused; size doubles while under the IB size upper bound (appears to be 0x100000) */
+	if (ring->size < 0x100000)
+		ring->size *= 2;
+
+	ring->funcs->grow(ring, ring->size);
+}
+
+void fd_ringbuffer_reloc(struct fd_ringbuffer *ring,
+				     const struct fd_reloc *reloc)
+{
+	ring->funcs->emit_reloc(ring, reloc);
+}
+
+uint32_t fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
+{
+	if (!ring->funcs->cmd_count)
+		return 1;
+	return ring->funcs->cmd_count(ring);
+}
+
+uint32_t
+fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
+		struct fd_ringbuffer *target, uint32_t cmd_idx)
+{
+	return ring->funcs->emit_reloc_ring(ring, target, cmd_idx);
+}
diff --git a/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.h b/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.h
new file mode 100644
index 00000000000..4292c8f65d6
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/freedreno_ringbuffer.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_RINGBUFFER_H_
+#define FREEDRENO_RINGBUFFER_H_
+
+#include "util/u_debug.h"
+
+#include "freedreno_drmif.h"
+
+struct fd_submit;
+struct fd_ringbuffer;
+
+enum fd_ringbuffer_flags {
+
+	/* Primary ringbuffer for a submit, ie. an IB1 level rb for
+	 * which the kernel must set up RB->IB1 CP_INDIRECT_BUFFER
+	 * packets.
+	 */
+	FD_RINGBUFFER_PRIMARY = 0x1,
+
+	/* Hint that the stateobj will be used for streaming state
+	 * that is used once or a few times and then discarded.
+	 *
+	 * For sub-allocation, non streaming stateobj's should be
+	 * sub-allocated from a page size buffer, so one long lived
+	 * state obj doesn't prevent other pages from being freed.
+	 * (Ie. it would be no worse than allocating a page sized
+	 * bo for each small non-streaming stateobj).
+	 *
+	 * But streaming stateobj's could be sub-allocated from a
+	 * larger buffer to reduce the alloc/del overhead.
+	 */
+	FD_RINGBUFFER_STREAMING = 0x2,
+
+	/* Indicates that "growable" cmdstream can be used,
+	 * consisting of multiple physical cmdstream buffers
+	 */
+	FD_RINGBUFFER_GROWABLE = 0x4,
+
+	/* Internal use only: */
+	_FD_RINGBUFFER_OBJECT = 0x8,
+};
+
+/* A submit object manages/tracks all the state buildup for a "submit"
+ * ioctl to the kernel.  Additionally, with the exception of long-lived
+ * non-STREAMING stateobj rb's, rb's are allocated from the submit.
+ */
+struct fd_submit * fd_submit_new(struct fd_pipe *pipe);
+
+/* NOTE: all ringbuffers created from the submit should be unref'd
+ * before destroying the submit.
+ */
+void fd_submit_del(struct fd_submit *submit);
+
+/* Allocate a new rb from the submit. */
+struct fd_ringbuffer * fd_submit_new_ringbuffer(struct fd_submit *submit,
+		uint32_t size, enum fd_ringbuffer_flags flags);
+
+/* in_fence_fd: -1 for no in-fence, else fence fd
+ * out_fence_fd: NULL for no output-fence requested, else ptr to return out-fence
+ */
+int fd_submit_flush(struct fd_submit *submit,
+		int in_fence_fd, int *out_fence_fd,
+		uint32_t *out_fence);
+
+struct fd_ringbuffer_funcs;
+
+/* the ringbuffer object is not opaque so that OUT_RING() type stuff
+ * can be inlined.  Note that users should not make assumptions about
+ * the size of this struct.
+ */
+struct fd_ringbuffer {
+	uint32_t *cur, *end, *start;
+	const struct fd_ringbuffer_funcs *funcs;
+
+// size or end could probably go away
+	int size;
+	int32_t refcnt;
+	enum fd_ringbuffer_flags flags;
+};
+
+/* Allocate a new long-lived state object, not associated with
+ * a submit:
+ */
+struct fd_ringbuffer * fd_ringbuffer_new_object(struct fd_pipe *pipe,
+		uint32_t size);
+
+struct fd_ringbuffer *fd_ringbuffer_ref(struct fd_ringbuffer *ring);
+void fd_ringbuffer_del(struct fd_ringbuffer *ring);
+
+void fd_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t ndwords);
+
+static inline void fd_ringbuffer_emit(struct fd_ringbuffer *ring,
+		uint32_t data)
+{
+	(*ring->cur++) = data;
+}
+
+struct fd_reloc {
+	struct fd_bo *bo;
+#define FD_RELOC_READ             0x0001
+#define FD_RELOC_WRITE            0x0002
+	uint32_t flags;
+	uint32_t offset;
+	uint32_t or;
+	int32_t  shift;
+	uint32_t orhi;      /* used for a5xx+ */
+};
+
+/* NOTE: relocs are 2 dwords on a5xx+ */
+
+void fd_ringbuffer_reloc(struct fd_ringbuffer *ring, const struct fd_reloc *reloc);
+uint32_t fd_ringbuffer_cmd_count(struct fd_ringbuffer *ring);
+uint32_t fd_ringbuffer_emit_reloc_ring_full(struct fd_ringbuffer *ring,
+		struct fd_ringbuffer *target, uint32_t cmd_idx);
+
+static inline uint32_t
+offset_bytes(void *end, void *start)
+{
+	return ((char *)end) - ((char *)start);
+}
+
+static inline uint32_t
+fd_ringbuffer_size(struct fd_ringbuffer *ring)
+{
+	/* only really needed for stateobj ringbuffers, and won't really
+	 * do what you expect for growable rb's.. so lets just restrict
+	 * this to stateobj's for now:
+	 */
+	debug_assert(!(ring->flags & FD_RINGBUFFER_GROWABLE));
+	return offset_bytes(ring->cur, ring->start);
+}
+
+
+#endif /* FREEDRENO_RINGBUFFER_H_ */
diff --git a/src/gallium/drivers/freedreno/drm/msm_bo.c b/src/gallium/drivers/freedreno/drm/msm_bo.c
new file mode 100644
index 00000000000..da3315c9ab6
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_bo.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "msm_priv.h"
+
+static int bo_allocate(struct msm_bo *msm_bo)
+{
+	struct fd_bo *bo = &msm_bo->base;
+	if (!msm_bo->offset) {
+		struct drm_msm_gem_info req = {
+				.handle = bo->handle,
+		};
+		int ret;
+
+		/* if the buffer is already backed by pages then this
+		 * doesn't actually do anything (other than giving us
+		 * the offset)
+		 */
+		ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_INFO,
+				&req, sizeof(req));
+		if (ret) {
+			ERROR_MSG("alloc failed: %s", strerror(errno));
+			return ret;
+		}
+
+		msm_bo->offset = req.offset;
+	}
+
+	return 0;
+}
+
+static int msm_bo_offset(struct fd_bo *bo, uint64_t *offset)
+{
+	struct msm_bo *msm_bo = to_msm_bo(bo);
+	int ret = bo_allocate(msm_bo);
+	if (ret)
+		return ret;
+	*offset = msm_bo->offset;
+	return 0;
+}
+
+static int msm_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
+{
+	struct drm_msm_gem_cpu_prep req = {
+			.handle = bo->handle,
+			.op = op,
+	};
+
+	get_abs_timeout(&req.timeout, 5000000000);
+
+	return drmCommandWrite(bo->dev->fd, DRM_MSM_GEM_CPU_PREP, &req, sizeof(req));
+}
+
+static void msm_bo_cpu_fini(struct fd_bo *bo)
+{
+	struct drm_msm_gem_cpu_fini req = {
+			.handle = bo->handle,
+	};
+
+	drmCommandWrite(bo->dev->fd, DRM_MSM_GEM_CPU_FINI, &req, sizeof(req));
+}
+
+static int msm_bo_madvise(struct fd_bo *bo, int willneed)
+{
+	struct drm_msm_gem_madvise req = {
+			.handle = bo->handle,
+			.madv = willneed ? MSM_MADV_WILLNEED : MSM_MADV_DONTNEED,
+	};
+	int ret;
+
+	/* older kernels do not support this: */
+	if (bo->dev->version < FD_VERSION_MADVISE)
+		return willneed;
+
+	ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_MADVISE, &req, sizeof(req));
+	if (ret)
+		return ret;
+
+	return req.retained;
+}
+
+static uint64_t msm_bo_iova(struct fd_bo *bo)
+{
+	struct drm_msm_gem_info req = {
+			.handle = bo->handle,
+			.flags = MSM_INFO_IOVA,
+	};
+	int ret;
+
+	ret = drmCommandWriteRead(bo->dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
+	debug_assert(ret == 0);
+
+	return req.offset;
+}
+
+static void msm_bo_destroy(struct fd_bo *bo)
+{
+	struct msm_bo *msm_bo = to_msm_bo(bo);
+	free(msm_bo);
+
+}
+
+static const struct fd_bo_funcs funcs = {
+		.offset = msm_bo_offset,
+		.cpu_prep = msm_bo_cpu_prep,
+		.cpu_fini = msm_bo_cpu_fini,
+		.madvise = msm_bo_madvise,
+		.iova = msm_bo_iova,
+		.destroy = msm_bo_destroy,
+};
+
+/* allocate a buffer handle: */
+int msm_bo_new_handle(struct fd_device *dev,
+		uint32_t size, uint32_t flags, uint32_t *handle)
+{
+	struct drm_msm_gem_new req = {
+			.size = size,
+			.flags = MSM_BO_WC,  // TODO figure out proper flags..
+	};
+	int ret;
+
+	ret = drmCommandWriteRead(dev->fd, DRM_MSM_GEM_NEW,
+			&req, sizeof(req));
+	if (ret)
+		return ret;
+
+	*handle = req.handle;
+
+	return 0;
+}
+
+/* allocate a new buffer object */
+struct fd_bo * msm_bo_from_handle(struct fd_device *dev,
+		uint32_t size, uint32_t handle)
+{
+	struct msm_bo *msm_bo;
+	struct fd_bo *bo;
+
+	msm_bo = calloc(1, sizeof(*msm_bo));
+	if (!msm_bo)
+		return NULL;
+
+	bo = &msm_bo->base;
+	bo->funcs = &funcs;
+
+	return bo;
+}
diff --git a/src/gallium/drivers/freedreno/drm/msm_device.c b/src/gallium/drivers/freedreno/drm/msm_device.c
new file mode 100644
index 00000000000..d391ef01307
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_device.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "msm_priv.h"
+
+static void msm_device_destroy(struct fd_device *dev)
+{
+	struct msm_device *msm_dev = to_msm_device(dev);
+	free(msm_dev);
+}
+
+static const struct fd_device_funcs funcs = {
+		.bo_new_handle = msm_bo_new_handle,
+		.bo_from_handle = msm_bo_from_handle,
+		.pipe_new = msm_pipe_new,
+		.destroy = msm_device_destroy,
+};
+
+struct fd_device * msm_device_new(int fd)
+{
+	struct msm_device *msm_dev;
+	struct fd_device *dev;
+
+	msm_dev = calloc(1, sizeof(*msm_dev));
+	if (!msm_dev)
+		return NULL;
+
+	dev = &msm_dev->base;
+	dev->funcs = &funcs;
+
+	dev->bo_size = sizeof(struct msm_bo);
+
+	return dev;
+}
diff --git a/src/gallium/drivers/freedreno/drm/msm_drm.h b/src/gallium/drivers/freedreno/drm/msm_drm.h
new file mode 100644
index 00000000000..c06d0a5bdd8
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_drm.h
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2013 Red Hat
+ * Author: Rob Clark <robdclark@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MSM_DRM_H__
+#define __MSM_DRM_H__
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints:
+ *  1) Do not use pointers, use __u64 instead for 32 bit / 64 bit
+ *     user/kernel compatibility
+ *  2) Keep fields aligned to their size
+ *  3) Because of how drm_ioctl() works, we can add new fields at
+ *     the end of an ioctl if some care is taken: drm_ioctl() will
+ *     zero out the new fields at the tail of the ioctl, so a zero
+ *     value should have a backwards compatible meaning.  And for
+ *     output params, userspace won't see the newly added output
+ *     fields.. so that has to be somehow ok.
+ */
+
+#define MSM_PIPE_NONE        0x00
+#define MSM_PIPE_2D0         0x01
+#define MSM_PIPE_2D1         0x02
+#define MSM_PIPE_3D0         0x10
+
+/* The pipe-id just uses the lower bits, so can be OR'd with flags in
+ * the upper 16 bits (which could be extended further, if needed, maybe
+ * we extend/overload the pipe-id some day to deal with multiple rings,
+ * but even then I don't think we need the full lower 16 bits).
+ */
+#define MSM_PIPE_ID_MASK     0xffff
+#define MSM_PIPE_ID(x)       ((x) & MSM_PIPE_ID_MASK)
+#define MSM_PIPE_FLAGS(x)    ((x) & ~MSM_PIPE_ID_MASK)
+
+/* timeouts are specified in clock-monotonic absolute times (to simplify
+ * restarting interrupted ioctls).  The following struct is logically the
+ * same as 'struct timespec' but 32/64b ABI safe.
+ */
+struct drm_msm_timespec {
+	__s64 tv_sec;          /* seconds */
+	__s64 tv_nsec;         /* nanoseconds */
+};
+
+#define MSM_PARAM_GPU_ID     0x01
+#define MSM_PARAM_GMEM_SIZE  0x02
+#define MSM_PARAM_CHIP_ID    0x03
+#define MSM_PARAM_MAX_FREQ   0x04
+#define MSM_PARAM_TIMESTAMP  0x05
+#define MSM_PARAM_GMEM_BASE  0x06
+#define MSM_PARAM_NR_RINGS   0x07
+
+/* Argument of DRM_IOCTL_MSM_GET_PARAM: names a pipe and one of the
+ * MSM_PARAM_x ids, and carries the parameter value.
+ */
+struct drm_msm_param {
+	__u32 pipe;           /* in, MSM_PIPE_x */
+	__u32 param;          /* in, MSM_PARAM_x */
+	__u64 value;          /* out (get_param) or in (set_param) */
+};
+
+/*
+ * GEM buffers:
+ */
+
+#define MSM_BO_SCANOUT       0x00000001     /* scanout capable */
+#define MSM_BO_GPU_READONLY  0x00000002
+#define MSM_BO_CACHE_MASK    0x000f0000
+/* cache modes */
+#define MSM_BO_CACHED        0x00010000
+#define MSM_BO_WC            0x00020000
+#define MSM_BO_UNCACHED      0x00040000
+
+#define MSM_BO_FLAGS         (MSM_BO_SCANOUT | \
+                              MSM_BO_GPU_READONLY | \
+                              MSM_BO_CACHED | \
+                              MSM_BO_WC | \
+                              MSM_BO_UNCACHED)
+
+/* Argument of DRM_IOCTL_MSM_GEM_NEW: requested size and MSM_BO_x flags
+ * go in, the GEM handle of the new buffer comes back.
+ */
+struct drm_msm_gem_new {
+	__u64 size;           /* in */
+	__u32 flags;          /* in, mask of MSM_BO_x */
+	__u32 handle;         /* out */
+};
+
+#define MSM_INFO_IOVA	0x01
+
+#define MSM_INFO_FLAGS (MSM_INFO_IOVA)
+
+/* Argument of DRM_IOCTL_MSM_GEM_INFO: queries either the mmap() offset
+ * or the iova of a buffer; per the field comments, 'flags' (MSM_INFO_*)
+ * selects what 'offset' carries.
+ */
+struct drm_msm_gem_info {
+	__u32 handle;         /* in */
+	__u32 flags;	      /* in - combination of MSM_INFO_* flags */
+	__u64 offset;         /* out, mmap() offset or iova */
+};
+
+#define MSM_PREP_READ        0x01
+#define MSM_PREP_WRITE       0x02
+#define MSM_PREP_NOSYNC      0x04
+
+#define MSM_PREP_FLAGS       (MSM_PREP_READ | MSM_PREP_WRITE | MSM_PREP_NOSYNC)
+
+/* Argument of DRM_IOCTL_MSM_GEM_CPU_PREP: the buffer to prepare for CPU
+ * access, the kind of access (MSM_PREP_x), and an absolute timeout.
+ */
+struct drm_msm_gem_cpu_prep {
+	__u32 handle;         /* in */
+	__u32 op;             /* in, mask of MSM_PREP_x */
+	struct drm_msm_timespec timeout;   /* in */
+};
+
+/* Argument of DRM_IOCTL_MSM_GEM_CPU_FINI: identifies the buffer by its
+ * GEM handle.
+ */
+struct drm_msm_gem_cpu_fini {
+	__u32 handle;         /* in */
+};
+
+/*
+ * Cmdstream Submission:
+ */
+
+/* The value written into the cmdstream is logically:
+ *
+ *   ((relocbuf->gpuaddr + reloc_offset) << shift) | or
+ *
+ * When we have GPU's w/ >32bit ptrs, it should be possible to deal
+ * with this by emit'ing two reloc entries with appropriate shift
+ * values.  Or a new MSM_SUBMIT_CMD_x type would also be an option.
+ *
+ * NOTE that reloc's must be sorted by order of increasing submit_offset,
+ * otherwise EINVAL.
+ */
+struct drm_msm_gem_submit_reloc {
+	__u32 submit_offset;  /* in, offset from submit_bo */
+	__u32 or;             /* in, value OR'd with result */
+	__s32 shift;          /* in, amount of left shift (can be negative) */
+	__u32 reloc_idx;      /* in, index of reloc_bo buffer */
+	__u64 reloc_offset;   /* in, offset from start of reloc_bo */
+};
+
+/* submit-types:
+ *   BUF - this cmd buffer is executed normally.
+ *   IB_TARGET_BUF - this cmd buffer is an IB target.  Reloc's are
+ *      processed normally, but the kernel does not setup an IB to
+ *      this buffer in the first-level ringbuffer
+ *   CTX_RESTORE_BUF - only executed if there has been a GPU context
+ *      switch since the last SUBMIT ioctl
+ */
+#define MSM_SUBMIT_CMD_BUF             0x0001
+#define MSM_SUBMIT_CMD_IB_TARGET_BUF   0x0002
+#define MSM_SUBMIT_CMD_CTX_RESTORE_BUF 0x0003
+/* One entry of the cmds[] table referenced by drm_msm_gem_submit::cmds.
+ * It locates a cmdstream range inside one of the submit's buffers and
+ * points at the reloc entries that apply to that range.
+ */
+struct drm_msm_gem_submit_cmd {
+	__u32 type;           /* in, one of MSM_SUBMIT_CMD_x */
+	__u32 submit_idx;     /* in, index of submit_bo cmdstream buffer */
+	__u32 submit_offset;  /* in, offset into submit_bo */
+	__u32 size;           /* in, cmdstream size */
+	__u32 pad;            /* presumably reserved/alignment padding -- TODO confirm */
+	__u32 nr_relocs;      /* in, number of submit_reloc's */
+	__u64 relocs;         /* in, ptr to array of submit_reloc's */
+};
+
+/* Each buffer referenced elsewhere in the cmdstream submit (ie. the
+ * cmdstream buffer(s) themselves or reloc entries) has one (and only
+ * one) entry in the submit->bos[] table.
+ *
+ * As an optimization, the current buffer (gpu virtual address) can be
+ * passed back through the 'presumed' field.  If on a subsequent reloc,
+ * userspace passes back a 'presumed' address that is still valid,
+ * then patching the cmdstream for this entry is skipped.  This can
+ * avoid kernel needing to map/access the cmdstream bo in the common
+ * case.
+ */
+#define MSM_SUBMIT_BO_READ             0x0001
+#define MSM_SUBMIT_BO_WRITE            0x0002
+
+#define MSM_SUBMIT_BO_FLAGS            (MSM_SUBMIT_BO_READ | MSM_SUBMIT_BO_WRITE)
+
+struct drm_msm_gem_submit_bo {
+	__u32 flags;          /* in, mask of MSM_SUBMIT_BO_x */
+	__u32 handle;         /* in, GEM handle */
+	__u64 presumed;       /* in/out, presumed buffer address */
+};
+
+/* Valid submit ioctl flags: */
+#define MSM_SUBMIT_NO_IMPLICIT   0x80000000 /* disable implicit sync */
+#define MSM_SUBMIT_FENCE_FD_IN   0x40000000 /* enable input fence_fd */
+#define MSM_SUBMIT_FENCE_FD_OUT  0x20000000 /* enable output fence_fd */
+#define MSM_SUBMIT_SUDO          0x10000000 /* run submitted cmds from RB */
+#define MSM_SUBMIT_FLAGS                ( \
+		MSM_SUBMIT_NO_IMPLICIT   | \
+		MSM_SUBMIT_FENCE_FD_IN   | \
+		MSM_SUBMIT_FENCE_FD_OUT  | \
+		MSM_SUBMIT_SUDO          | \
+		0)
+
+/* Each cmdstream submit consists of a table of buffers involved, and
+ * one or more cmdstream buffers.  This allows for conditional execution
+ * (context-restore), and IB buffers needed for per tile/bin draw cmds.
+ */
+struct drm_msm_gem_submit {
+	__u32 flags;          /* MSM_PIPE_x | MSM_SUBMIT_x */
+	__u32 fence;          /* out */
+	__u32 nr_bos;         /* in, number of submit_bo's */
+	__u32 nr_cmds;        /* in, number of submit_cmd's */
+	__u64 bos;            /* in, ptr to array of submit_bo's */
+	__u64 cmds;           /* in, ptr to array of submit_cmd's */
+	__s32 fence_fd;       /* in/out fence fd (see MSM_SUBMIT_FENCE_FD_IN/OUT) */
+	__u32 queueid;         /* in, submitqueue id */
+};
+
+/* The normal way to synchronize with the GPU is just to CPU_PREP on
+ * a buffer if you need to access it from the CPU (other cmdstream
+ * submission from same or other contexts, PAGE_FLIP ioctl, etc, all
+ * handle the required synchronization under the hood).  This ioctl
+ * mainly just exists as a way to implement the gallium pipe_fence
+ * APIs without requiring a dummy bo to synchronize on.
+ */
+struct drm_msm_wait_fence {
+	__u32 fence;          /* in */
+	__u32 pad;
+	struct drm_msm_timespec timeout;   /* in */
+	__u32 queueid;         /* in, submitqueue id */
+};
+
+/* madvise provides a way to tell the kernel when a buffer's contents
+ * can be discarded under memory pressure, which is useful for a userspace
+ * bo cache where we want to optimistically hold on to the buffer allocation
+ * and potential mmap, but allow the pages to be discarded under memory
+ * pressure.
+ *
+ * Typical usage would involve madvise(DONTNEED) when buffer enters BO
+ * cache, and madvise(WILLNEED) if trying to recycle buffer from BO cache.
+ * In the WILLNEED case, 'retained' indicates to userspace whether the
+ * backing pages still exist.
+ */
+#define MSM_MADV_WILLNEED 0       /* backing pages are needed, status returned in 'retained' */
+#define MSM_MADV_DONTNEED 1       /* backing pages not needed */
+#define __MSM_MADV_PURGED 2       /* internal state */
+
+struct drm_msm_gem_madvise {
+	__u32 handle;         /* in, GEM handle */
+	__u32 madv;           /* in, MSM_MADV_x */
+	__u32 retained;       /* out, whether backing store still exists */
+};
+
+/*
+ * Draw queues allow the user to set specific submission parameters. Command
+ * submissions specify a specific submitqueue to use.  ID 0 is reserved for
+ * backwards compatibility as a "default" submitqueue.
+ */
+
+#define MSM_SUBMITQUEUE_FLAGS (0)
+
+struct drm_msm_submitqueue {
+	__u32 flags;   /* in, MSM_SUBMITQUEUE_x */
+	__u32 prio;    /* in, Priority level */
+	__u32 id;      /* out, identifier */
+};
+
+#define DRM_MSM_GET_PARAM              0x00
+/* placeholder:
+#define DRM_MSM_SET_PARAM              0x01
+ */
+#define DRM_MSM_GEM_NEW                0x02
+#define DRM_MSM_GEM_INFO               0x03
+#define DRM_MSM_GEM_CPU_PREP           0x04
+#define DRM_MSM_GEM_CPU_FINI           0x05
+#define DRM_MSM_GEM_SUBMIT             0x06
+#define DRM_MSM_WAIT_FENCE             0x07
+#define DRM_MSM_GEM_MADVISE            0x08
+/* placeholder:
+#define DRM_MSM_GEM_SVM_NEW            0x09
+ */
+#define DRM_MSM_SUBMITQUEUE_NEW        0x0A
+#define DRM_MSM_SUBMITQUEUE_CLOSE      0x0B
+
+#define DRM_IOCTL_MSM_GET_PARAM        DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param)
+#define DRM_IOCTL_MSM_GEM_NEW          DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new)
+#define DRM_IOCTL_MSM_GEM_INFO         DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_INFO, struct drm_msm_gem_info)
+#define DRM_IOCTL_MSM_GEM_CPU_PREP     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_CPU_PREP, struct drm_msm_gem_cpu_prep)
+#define DRM_IOCTL_MSM_GEM_CPU_FINI     DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_CPU_FINI, struct drm_msm_gem_cpu_fini)
+#define DRM_IOCTL_MSM_GEM_SUBMIT       DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_SUBMIT, struct drm_msm_gem_submit)
+#define DRM_IOCTL_MSM_WAIT_FENCE       DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_WAIT_FENCE, struct drm_msm_wait_fence)
+#define DRM_IOCTL_MSM_GEM_MADVISE      DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_MADVISE, struct drm_msm_gem_madvise)
+#define DRM_IOCTL_MSM_SUBMITQUEUE_NEW    DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue)
+#define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE  DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* __MSM_DRM_H__ */
diff --git a/src/gallium/drivers/freedreno/drm/msm_pipe.c b/src/gallium/drivers/freedreno/drm/msm_pipe.c
new file mode 100644
index 00000000000..b7996e5528a
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_pipe.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include "util/slab.h"
+
+#include "freedreno_util.h"
+#include "msm_priv.h"
+
+/* Ask the kernel for MSM_PARAM_x 'param' on this pipe.
+ *
+ * Returns 0 and stores the result in *value on success; otherwise
+ * returns the non-zero result of drmCommandWriteRead() and leaves
+ * *value untouched.
+ */
+static int query_param(struct fd_pipe *pipe, uint32_t param,
+		uint64_t *value)
+{
+	struct drm_msm_param req = {
+			.pipe = to_msm_pipe(pipe)->pipe,
+			.param = param,
+	};
+	int err = drmCommandWriteRead(pipe->dev->fd, DRM_MSM_GET_PARAM,
+			&req, sizeof(req));
+
+	if (err == 0)
+		*value = req.value;
+
+	return err;
+}
+
+/* fd_pipe_funcs::get_param hook.
+ *
+ * GPU id, GMEM size and chip id are answered from the values cached in
+ * the msm_pipe; max-freq, timestamp and ring count are forwarded to the
+ * kernel through query_param().  Unknown ids are logged and yield -1.
+ */
+static int msm_pipe_get_param(struct fd_pipe *pipe,
+		enum fd_param_id param, uint64_t *value)
+{
+	struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
+
+	switch(param) {
+	/* values that have to be fetched from the kernel each time: */
+	case FD_MAX_FREQ:
+		return query_param(pipe, MSM_PARAM_MAX_FREQ, value);
+	case FD_TIMESTAMP:
+		return query_param(pipe, MSM_PARAM_TIMESTAMP, value);
+	case FD_NR_RINGS:
+		return query_param(pipe, MSM_PARAM_NR_RINGS, value);
+
+	/* values cached in the pipe: */
+	case FD_DEVICE_ID: /* XXX candidate for removal; aliases FD_GPU_ID */
+	case FD_GPU_ID:
+		*value = msm_pipe->gpu_id;
+		break;
+	case FD_GMEM_SIZE:
+		*value = msm_pipe->gmem;
+		break;
+	case FD_CHIP_ID:
+		*value = msm_pipe->chip_id;
+		break;
+
+	default:
+		ERROR_MSG("invalid param id: %d", param);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* fd_pipe_funcs::wait hook: wait for fence 'timestamp' on this pipe's
+ * submitqueue, for at most 'timeout' nanoseconds from now (converted to
+ * an absolute deadline by get_abs_timeout()).
+ *
+ * Returns 0 on success, otherwise the drmCommandWrite() result after
+ * logging it.
+ */
+static int msm_pipe_wait(struct fd_pipe *pipe, uint32_t timestamp,
+		uint64_t timeout)
+{
+	struct drm_msm_wait_fence req = {
+			.fence = timestamp,
+			.queueid = to_msm_pipe(pipe)->queue_id,
+	};
+	int ret;
+
+	get_abs_timeout(&req.timeout, timeout);
+
+	ret = drmCommandWrite(pipe->dev->fd, DRM_MSM_WAIT_FENCE,
+			&req, sizeof(req));
+	if (ret != 0) {
+		ERROR_MSG("wait-fence failed! %d (%s)", ret, strerror(errno));
+	}
+
+	return ret;
+}
+
+/* Create a kernel submitqueue for 'pipe' and record its id in the
+ * msm_pipe.
+ *
+ * On devices older than FD_VERSION_SUBMIT_QUEUES no ioctl is issued and
+ * queue id 0 is used.  Otherwise the requested priority is clamped to
+ * [0, nr_rings - 1], where nr_rings stays at 1 if the FD_NR_RINGS query
+ * does not overwrite it.
+ *
+ * Returns 0 on success or the failing ioctl's result.
+ */
+static int open_submitqueue(struct fd_pipe *pipe, uint32_t prio)
+{
+	struct msm_pipe *msm_pipe = to_msm_pipe(pipe);
+	struct drm_msm_submitqueue req = {
+		.flags = 0,
+		.prio = prio,
+	};
+	uint64_t nr_rings = 1;
+	int ret;
+
+	if (fd_device_version(pipe->dev) < FD_VERSION_SUBMIT_QUEUES) {
+		msm_pipe->queue_id = 0;
+		return 0;
+	}
+
+	/* result intentionally ignored: nr_rings keeps its default of 1 */
+	msm_pipe_get_param(pipe, FD_NR_RINGS, &nr_rings);
+
+	req.prio = MIN2(req.prio, MAX2(nr_rings, 1) - 1);
+
+	ret = drmCommandWriteRead(pipe->dev->fd, DRM_MSM_SUBMITQUEUE_NEW,
+			&req, sizeof(req));
+	if (ret != 0) {
+		ERROR_MSG("could not create submitqueue! %d (%s)", ret, strerror(errno));
+		return ret;
+	}
+
+	msm_pipe->queue_id = req.id;
+
+	return 0;
+}
+
+/* Close submitqueue 'queue_id'.  Nothing is done on devices older than
+ * FD_VERSION_SUBMIT_QUEUES; the ioctl result is not checked.
+ */
+static void close_submitqueue(struct fd_pipe *pipe, uint32_t queue_id)
+{
+	if (fd_device_version(pipe->dev) >= FD_VERSION_SUBMIT_QUEUES) {
+		drmCommandWrite(pipe->dev->fd, DRM_MSM_SUBMITQUEUE_CLOSE,
+				&queue_id, sizeof(queue_id));
+	}
+}
+
+/* fd_pipe_funcs::destroy hook: close the pipe's submitqueue, then free
+ * the msm_pipe that embeds 'pipe'.
+ */
+static void msm_pipe_destroy(struct fd_pipe *pipe)
+{
+	struct msm_pipe *self = to_msm_pipe(pipe);
+
+	close_submitqueue(pipe, self->queue_id);
+	free(self);
+}
+
+/* Pipe vtable using the msm_*_sp_* ringbuffer/submit entry points;
+ * msm_pipe_new() selects it when FD_DBG_SOFTPIN is set in fd_mesa_debug.
+ */
+static const struct fd_pipe_funcs sp_funcs = {
+		.ringbuffer_new_object = msm_ringbuffer_sp_new_object,
+		.submit_new = msm_submit_sp_new,
+		.get_param = msm_pipe_get_param,
+		.wait = msm_pipe_wait,
+		.destroy = msm_pipe_destroy,
+};
+
+/* Pipe vtable using the non-"sp" ringbuffer/submit entry points;
+ * msm_pipe_new() selects it when FD_DBG_SOFTPIN is not set.
+ */
+static const struct fd_pipe_funcs legacy_funcs = {
+		.ringbuffer_new_object = msm_ringbuffer_new_object,
+		.submit_new = msm_submit_new,
+		.get_param = msm_pipe_get_param,
+		.wait = msm_pipe_wait,
+		.destroy = msm_pipe_destroy,
+};
+
+/* Convenience wrapper around query_param() that returns the value
+ * directly.  A failed query is logged and reported as 0.
+ */
+static uint64_t get_param(struct fd_pipe *pipe, uint32_t param)
+{
+	uint64_t value;
+	int ret = query_param(pipe, param, &value);
+
+	if (ret == 0)
+		return value;
+
+	ERROR_MSG("get-param failed! %d (%s)", ret, strerror(errno));
+	return 0;
+}
+
+/* Create the msm implementation of an fd_pipe.
+ *
+ * dev:  device the pipe belongs to
+ * id:   FD_PIPE_3D or FD_PIPE_2D; used directly as an index into the
+ *       pipe_id[] table below (no range check in this function)
+ * prio: requested submitqueue priority, clamped by open_submitqueue()
+ *
+ * Returns the embedded fd_pipe, or NULL if allocation fails, the kernel
+ * reports a GPU id of 0, or the submitqueue cannot be created.
+ */
+struct fd_pipe * msm_pipe_new(struct fd_device *dev,
+		enum fd_pipe_id id, uint32_t prio)
+{
+	/* maps fd_pipe_id to the kernel's MSM_PIPE_x id: */
+	static const uint32_t pipe_id[] = {
+			[FD_PIPE_3D] = MSM_PIPE_3D0,
+			[FD_PIPE_2D] = MSM_PIPE_2D0,
+	};
+	struct msm_pipe *msm_pipe = NULL;
+	struct fd_pipe *pipe = NULL;
+
+	msm_pipe = calloc(1, sizeof(*msm_pipe));
+	if (!msm_pipe) {
+		ERROR_MSG("allocation failed");
+		goto fail;
+	}
+
+	pipe = &msm_pipe->base;
+
+	// TODO once kernel changes are in place, this switch will be
+	// based on kernel version:
+	if (fd_mesa_debug & FD_DBG_SOFTPIN) {
+		pipe->funcs = &sp_funcs;
+	} else {
+		pipe->funcs = &legacy_funcs;
+	}
+
+	/* initialize before get_param(): */
+	pipe->dev = dev;
+	msm_pipe->pipe = pipe_id[id];
+
+	/* these params should be supported since the first version of drm/msm: */
+	msm_pipe->gpu_id = get_param(pipe, MSM_PARAM_GPU_ID);
+	msm_pipe->gmem   = get_param(pipe, MSM_PARAM_GMEM_SIZE);
+	msm_pipe->chip_id = get_param(pipe, MSM_PARAM_CHIP_ID);
+
+	/* get_param() returns 0 on failure, so a zero GPU id means the
+	 * query did not succeed (or the kernel reported 0):
+	 */
+	if (! msm_pipe->gpu_id)
+		goto fail;
+
+	INFO_MSG("Pipe Info:");
+	INFO_MSG(" GPU-id:          %d", msm_pipe->gpu_id);
+	INFO_MSG(" Chip-id:         0x%08x", msm_pipe->chip_id);
+	INFO_MSG(" GMEM size:       0x%08x", msm_pipe->gmem);
+
+	if (open_submitqueue(pipe, prio))
+		goto fail;
+
+	return pipe;
+fail:
+	/* NOTE(review): nothing in this function sets the pipe's refcount
+	 * (calloc leaves it zero) and queue_id may not have been opened yet;
+	 * confirm that fd_pipe_del() actually releases msm_pipe on this path.
+	 */
+	if (pipe)
+		fd_pipe_del(pipe);
+	return NULL;
+}
diff --git a/src/gallium/drivers/freedreno/drm/msm_priv.h b/src/gallium/drivers/freedreno/drm/msm_priv.h
new file mode 100644
index 00000000000..9cb60bc1db5
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_priv.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef MSM_PRIV_H_
+#define MSM_PRIV_H_
+
+#include "freedreno_priv.h"
+
+#ifndef __user
+#  define __user
+#endif
+
+#include "msm_drm.h"
+
+/* msm backend wrapper around the generic fd_device. */
+struct msm_device {
+	struct fd_device base;          /* generic device; msm_device_new() returns &base */
+	struct fd_bo_cache ring_cache;  /* presumably a bo cache for ring buffers -- not referenced in the code visible here */
+};
+/* presumably provides the to_msm_device() downcast used by the msm code -- TODO confirm */
+FD_DEFINE_CAST(fd_device, msm_device);
+
+struct fd_device * msm_device_new(int fd);
+
+/* msm backend wrapper around the generic fd_pipe. */
+struct msm_pipe {
+	struct fd_pipe base;   /* generic pipe; msm_pipe_new() returns &base */
+	uint32_t pipe;         /* kernel pipe id (MSM_PIPE_x) passed in ioctls */
+	uint32_t gpu_id;       /* MSM_PARAM_GPU_ID, queried once at creation */
+	uint32_t gmem;         /* MSM_PARAM_GMEM_SIZE, queried once at creation */
+	uint32_t chip_id;      /* MSM_PARAM_CHIP_ID, queried once at creation */
+	uint32_t queue_id;     /* submitqueue id; 0 when submitqueues are unsupported */
+};
+/* presumably provides the to_msm_pipe() downcast used by the msm code -- TODO confirm */
+FD_DEFINE_CAST(fd_pipe, msm_pipe);
+
+struct fd_pipe * msm_pipe_new(struct fd_device *dev,
+		enum fd_pipe_id id, uint32_t prio);
+
+struct fd_ringbuffer * msm_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size);
+struct fd_ringbuffer * msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size);
+
+struct fd_submit * msm_submit_new(struct fd_pipe *pipe);
+struct fd_submit * msm_submit_sp_new(struct fd_pipe *pipe);
+
+/* msm backend wrapper around the generic fd_bo. */
+struct msm_bo {
+	struct fd_bo base;
+	uint64_t offset;       /* not referenced in the code visible here; presumably the mmap offset -- TODO confirm */
+	/* to avoid excess hashtable lookups, cache the submit this bo was
+	 * last added to (identified by its seqno) together with the bo's
+	 * index in that submit's bos table, since that will probably also
+	 * be the next submit it is added to
+	 */
+	unsigned current_submit_seqno;
+	uint32_t idx;
+};
+/* presumably provides the to_msm_bo() downcast used by the msm code -- TODO confirm */
+FD_DEFINE_CAST(fd_bo, msm_bo);
+
+int msm_bo_new_handle(struct fd_device *dev,
+		uint32_t size, uint32_t flags, uint32_t *handle);
+struct fd_bo * msm_bo_from_handle(struct fd_device *dev,
+		uint32_t size, uint32_t handle);
+
+/* Debug helper: log every bos[] entry, every cmds[] entry and each cmd's
+ * reloc entries of a submit ioctl request through ERROR_MSG().
+ *
+ * The request is only read.  Indices are unsigned and therefore printed
+ * with %u, and the 64-bit reloc offset is cast to uint64_t so that it
+ * matches PRIu64 regardless of how __u64 is defined.
+ */
+static inline void
+msm_dump_submit(struct drm_msm_gem_submit *req)
+{
+	/* the table pointers do not change while dumping, so decode them once */
+	struct drm_msm_gem_submit_bo *bos = U642VOID(req->bos);
+	struct drm_msm_gem_submit_cmd *cmds = U642VOID(req->cmds);
+
+	for (unsigned i = 0; i < req->nr_bos; i++) {
+		struct drm_msm_gem_submit_bo *bo = &bos[i];
+		ERROR_MSG("  bos[%u]: handle=%u, flags=%x", i, bo->handle, bo->flags);
+	}
+	for (unsigned i = 0; i < req->nr_cmds; i++) {
+		struct drm_msm_gem_submit_cmd *cmd = &cmds[i];
+		struct drm_msm_gem_submit_reloc *relocs = U642VOID(cmd->relocs);
+		ERROR_MSG("  cmd[%u]: type=%u, submit_idx=%u, submit_offset=%u, size=%u",
+				i, cmd->type, cmd->submit_idx, cmd->submit_offset, cmd->size);
+		for (unsigned j = 0; j < cmd->nr_relocs; j++) {
+			struct drm_msm_gem_submit_reloc *r = &relocs[j];
+			ERROR_MSG("    reloc[%u]: submit_offset=%u, or=%08x, shift=%d, reloc_idx=%u"
+					", reloc_offset=%"PRIu64, j, r->submit_offset, r->or, r->shift,
+					r->reloc_idx, (uint64_t)r->reloc_offset);
+		}
+	}
+}
+
+/* Convert a relative timeout of 'ns' nanoseconds into an absolute
+ * CLOCK_MONOTONIC deadline stored in *tv.
+ *
+ * The split into seconds and nanoseconds is done entirely in 64-bit
+ * arithmetic: a 32-bit seconds count would truncate very large timeouts,
+ * and multiplying it back by 1000000000 in 32 bits wraps for any timeout
+ * of five seconds or more, which would produce a bogus tv_nsec.
+ *
+ * The result is normalized so that 0 <= tv_nsec < 1000000000.
+ */
+static inline void get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
+{
+	const int64_t nsec_per_sec = 1000000000;
+	struct timespec t;
+
+	clock_gettime(CLOCK_MONOTONIC, &t);
+
+	tv->tv_sec = t.tv_sec + (int64_t)(ns / (uint64_t)nsec_per_sec);
+	tv->tv_nsec = t.tv_nsec + (int64_t)(ns % (uint64_t)nsec_per_sec);
+
+	/* both addends are below one second, so at most one carry is needed */
+	if (tv->tv_nsec >= nsec_per_sec) {
+		tv->tv_nsec -= nsec_per_sec;
+		tv->tv_sec++;
+	}
+}
+
+/*
+ * Stupid/simple growable array implementation:
+ */
+
+/* Make sure the array at 'ptr' has room for at least nr + 1 elements of
+ * 'sz' bytes each.
+ *
+ * ptr: current storage (may be NULL)
+ * nr:  number of elements currently in use
+ * max: in/out, allocated capacity in elements
+ * sz:  element size in bytes
+ *
+ * When the array is full the capacity is doubled, or set to nr + 5 if
+ * doubling would still be too small.  The new capacity is computed in 32
+ * bits and clamped to UINT16_MAX so it cannot wrap around to a small
+ * value (which would shrink or free the live buffer), and the byte count
+ * is computed in size_t so the multiplication cannot overflow int.
+ *
+ * Returns the (possibly moved) storage.  As before, the realloc() result
+ * is returned unchecked, so this is NULL if the allocation fails.
+ * NOTE(review): capacity is capped at UINT16_MAX elements by the
+ * uint16_t interface; callers must not append beyond that.
+ */
+static inline void *
+grow(void *ptr, uint16_t nr, uint16_t *max, uint16_t sz)
+{
+	if ((nr + 1) > *max) {
+		uint32_t new_max = (uint32_t)*max * 2;
+
+		if (new_max < ((uint32_t)nr + 1))
+			new_max = (uint32_t)nr + 5;
+		if (new_max > UINT16_MAX)
+			new_max = UINT16_MAX;
+
+		*max = (uint16_t)new_max;
+		ptr = realloc(ptr, (size_t)new_max * sz);
+	}
+	return ptr;
+}
+
+/* Declare the three members of a growable array called 'name': the
+ * element count nr_<name>, the capacity max_<name>, and the storage
+ * pointer <name>.
+ */
+#define DECLARE_ARRAY(type, name) \
+	unsigned short nr_ ## name, max_ ## name; \
+	type * name;
+
+/* Reserve one more slot at the end of the array 'name' of object 'x',
+ * growing the storage through grow() if needed.  The expression
+ * evaluates to the index of the new slot (the value of nr_<name> before
+ * the increment); the slot's contents are left for the caller to fill.
+ * Note that 'x' is evaluated several times.
+ */
+#define APPEND(x, name) ({ \
+	(x)->name = grow((x)->name, (x)->nr_ ## name, &(x)->max_ ## name, sizeof((x)->name[0])); \
+	(x)->nr_ ## name ++; \
+})
+
+#endif /* MSM_PRIV_H_ */
diff --git a/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c b/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c
new file mode 100644
index 00000000000..f1e96740231
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_ringbuffer.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (C) 2012-2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+#include "util/hash_table.h"
+#include "util/set.h"
+#include "util/slab.h"
+
+#include "drm/freedreno_ringbuffer.h"
+#include "msm_priv.h"
+
+/* The legacy implementation of submit/ringbuffer, which still does the
+ * traditional reloc and cmd tracking
+ */
+
+
+#define INIT_SIZE 0x1000
+
+static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+/* Legacy (reloc-based) submit: collects the buffer table and the rings
+ * that make up one submit ioctl.
+ */
+struct msm_submit {
+	struct fd_submit base;
+
+	/* parallel arrays, indexed by the same bo index: the ioctl's bo
+	 * table entries and the fd_bo each entry refers to (a reference is
+	 * held on every bos[] entry):
+	 */
+	DECLARE_ARRAY(struct drm_msm_gem_submit_bo, submit_bos);
+	DECLARE_ARRAY(struct fd_bo *, bos);
+
+	/* compared against msm_bo::current_submit_seqno to decide whether a
+	 * bo's cached index belongs to this submit:
+	 */
+	unsigned seqno;
+
+	/* maps fd_bo to idx in bos table: */
+	struct hash_table *bo_table;
+
+	/* slab allocator backing the msm_ringbuffer objects of this submit: */
+	struct slab_mempool ring_pool;
+
+	/* hash-set of associated rings: */
+	struct set *ring_set;
+
+	/* the ring created with FD_RINGBUFFER_PRIMARY (at most one): */
+	struct fd_ringbuffer *primary;
+
+	/* Allow for sub-allocation of stateobj ring buffers (ie. sharing
+	 * the same underlying bo)..
+	 *
+	 * We also rely on previous stateobj having been fully constructed
+	 * so we can reclaim extra space at its end.
+	 */
+	struct fd_ringbuffer *suballoc_ring;
+};
+FD_DEFINE_CAST(fd_submit, msm_submit);
+
+/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers
+ * and sizes.  Ie. a finalized buffer can have no more commands appended to
+ * it.
+ */
+struct msm_cmd {
+	struct fd_bo *ring_bo;   /* cmdstream buffer; a reference is taken in cmd_new() */
+	unsigned size;           /* cmdstream size; starts at 0 in cmd_new() */
+	/* reloc entries that apply to this cmdstream buffer: */
+	DECLARE_ARRAY(struct drm_msm_gem_submit_reloc, relocs);
+};
+
+/* Allocate an empty msm_cmd for 'ring_bo': size 0, no relocs.  The cmd
+ * takes its own reference on the bo.
+ *
+ * NOTE(review): the malloc() result is used without a NULL check.
+ */
+static struct msm_cmd *
+cmd_new(struct fd_bo *ring_bo)
+{
+	struct msm_cmd *cmd = malloc(sizeof(*cmd));
+
+	/* every member not named here (size, relocs array) starts out zero */
+	*cmd = (struct msm_cmd){
+		.ring_bo = fd_bo_ref(ring_bo),
+	};
+
+	return cmd;
+}
+
+/* Destroy a cmd created by cmd_new(): drop its reference on the ring
+ * bo, release the reloc array that APPEND()/grow() may have allocated
+ * for it, and free the cmd itself.
+ */
+static void
+cmd_free(struct msm_cmd *cmd)
+{
+	fd_bo_del(cmd->ring_bo);
+	/* relocs is heap storage owned by the cmd (NULL if never grown) */
+	free(cmd->relocs);
+	free(cmd);
+}
+
+/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to
+ * later copy into the submit when the stateobj rb is later referenced by
+ * a regular rb:
+ */
+struct msm_reloc_bo {
+	struct fd_bo *bo;      /* buffer a reloc in the stateobj refers to */
+	unsigned flags;        /* MSM_SUBMIT_BO_READ / MSM_SUBMIT_BO_WRITE bits */
+};
+
+/* Legacy ringbuffer implementation.  Which half of the union 'u' is
+ * valid depends on whether the ring is a _FD_RINGBUFFER_OBJECT (stateobj)
+ * or a ring created from a submit.
+ */
+struct msm_ringbuffer {
+	struct fd_ringbuffer base;
+
+	/* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated: byte
+	 * offset of this ring inside ring_bo (0 for other rings)
+	 */
+	unsigned offset;
+
+	union {
+		/* for _FD_RINGBUFFER_OBJECT case: */
+		struct {
+			struct fd_pipe *pipe;
+			/* bos referenced by this stateobj's relocs; the
+			 * reloc_idx of the stateobj's relocs indexes this table:
+			 */
+			DECLARE_ARRAY(struct msm_reloc_bo, reloc_bos);
+			struct set *ring_set;
+		};
+		/* for other cases: */
+		struct {
+			struct fd_submit *submit;   /* submit this ring was created from */
+			DECLARE_ARRAY(struct msm_cmd *, cmds);
+		};
+	} u;
+
+	struct msm_cmd *cmd;          /* current cmd */
+	struct fd_bo *ring_bo;        /* buffer backing this ring's cmdstream */
+};
+FD_DEFINE_CAST(fd_ringbuffer, msm_ringbuffer);
+
+static void finalize_current_cmd(struct fd_ringbuffer *ring);
+static struct fd_ringbuffer * msm_ringbuffer_init(
+		struct msm_ringbuffer *msm_ring,
+		uint32_t size, enum fd_ringbuffer_flags flags);
+
+/* add (if needed) bo to submit and return index:
+ *
+ * submit: submit whose bo table is updated
+ * bo:     buffer to look up or add
+ * flags:  FD_RELOC_READ / FD_RELOC_WRITE; translated to MSM_SUBMIT_BO_x
+ *         and OR'd into the table entry's flags
+ *
+ * The lookup and insertion happen with idx_lock held.  Lookup order:
+ *  1) the index cached in the bo, valid if the bo's cached seqno matches
+ *     this submit's seqno
+ *  2) the submit's bo_table hash table
+ *  3) otherwise a new entry is appended, which takes a reference on bo
+ */
+static uint32_t
+append_bo(struct msm_submit *submit, struct fd_bo *bo, uint32_t flags)
+{
+	struct msm_bo *msm_bo = to_msm_bo(bo);
+	uint32_t idx;
+	pthread_mutex_lock(&idx_lock);
+	if (likely(msm_bo->current_submit_seqno == submit->seqno)) {
+		idx = msm_bo->idx;
+	} else {
+		uint32_t hash = _mesa_hash_pointer(bo);
+		struct hash_entry *entry;
+
+		entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo);
+		if (entry) {
+			/* found */
+			idx = (uint32_t)(uintptr_t)entry->data;
+		} else {
+			/* both arrays grow by one slot each; the second index
+			 * is the one kept (presumably the arrays stay in
+			 * lockstep so both indices are equal -- TODO confirm)
+			 */
+			idx = APPEND(submit, submit_bos);
+			idx = APPEND(submit, bos);
+
+			submit->submit_bos[idx].flags = 0;
+			submit->submit_bos[idx].handle = bo->handle;
+			submit->submit_bos[idx].presumed = 0;
+
+			submit->bos[idx] = fd_bo_ref(bo);
+
+			/* the index itself is stored as the table's data pointer */
+			_mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo,
+					(void *)(uintptr_t)idx);
+		}
+		/* refresh the per-bo cache for the next lookup */
+		msm_bo->current_submit_seqno = submit->seqno;
+		msm_bo->idx = idx;
+	}
+	pthread_mutex_unlock(&idx_lock);
+	/* the table entry's flags are updated after the lock is dropped */
+	if (flags & FD_RELOC_READ)
+		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_READ;
+	if (flags & FD_RELOC_WRITE)
+		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_WRITE;
+	return idx;
+}
+
+/* Add 'ring' to 'set' unless it is already a member.  A reference is
+ * taken on the ring only when it is newly inserted.
+ */
+static void
+append_ring(struct set *set, struct fd_ringbuffer *ring)
+{
+	const uint32_t hash = _mesa_hash_pointer(ring);
+
+	/* already tracked: nothing to do, and no extra reference */
+	if (_mesa_set_search_pre_hashed(set, hash, ring))
+		return;
+
+	fd_ringbuffer_ref(ring);
+	_mesa_set_add_pre_hashed(set, hash, ring);
+}
+
+/* Pick the backing bo and offset for a sub-allocated ring of 'size'
+ * bytes.
+ *
+ * If the submit already has a suballoc ring and 'size' bytes still fit
+ * behind it in the same bo (starting at the next 0x10-aligned offset),
+ * the new ring shares that bo.  Otherwise a fresh 0x8000 byte ring bo is
+ * allocated and the ring starts at offset 0.
+ *
+ * Afterwards msm_ring becomes the submit's current suballoc ring: the
+ * submit takes a reference on it and drops its reference on the previous
+ * one.
+ */
+static void
+msm_submit_suballoc_ring_bo(struct fd_submit *submit,
+		struct msm_ringbuffer *msm_ring, uint32_t size)
+{
+	struct msm_submit *msm_submit = to_msm_submit(submit);
+	unsigned suballoc_offset = 0;
+	struct fd_bo *suballoc_bo = NULL;
+
+	if (msm_submit->suballoc_ring) {
+		struct msm_ringbuffer *suballoc_ring =
+				to_msm_ringbuffer(msm_submit->suballoc_ring);
+
+		/* candidate placement: directly after the previous ring */
+		suballoc_bo = suballoc_ring->ring_bo;
+		suballoc_offset = fd_ringbuffer_size(msm_submit->suballoc_ring) +
+				suballoc_ring->offset;
+
+		suballoc_offset = align(suballoc_offset, 0x10);
+
+		/* does not fit in the remaining space: fall back to a new bo */
+		if ((size + suballoc_offset) > suballoc_bo->size) {
+			suballoc_bo = NULL;
+		}
+	}
+
+	if (!suballoc_bo) {
+		// TODO possibly larger size for streaming bo?
+		msm_ring->ring_bo = fd_bo_new_ring(
+				submit->pipe->dev, 0x8000, 0);
+		msm_ring->offset = 0;
+	} else {
+		/* shared bo: this ring holds its own reference on it */
+		msm_ring->ring_bo = fd_bo_ref(suballoc_bo);
+		msm_ring->offset = suballoc_offset;
+	}
+
+	struct fd_ringbuffer *old_suballoc_ring = msm_submit->suballoc_ring;
+
+	/* take the new reference before dropping the old one */
+	msm_submit->suballoc_ring = fd_ringbuffer_ref(&msm_ring->base);
+
+	if (old_suballoc_ring)
+		fd_ringbuffer_del(old_suballoc_ring);
+}
+
+/* Create a new ringbuffer that belongs to 'submit'.
+ *
+ * The ring object comes from the submit's slab pool.  Its backing bo is
+ * either sub-allocated (FD_RINGBUFFER_STREAMING) or a dedicated ring bo;
+ * for FD_RINGBUFFER_GROWABLE rings the requested size is replaced by
+ * INIT_SIZE.  A ring created with FD_RINGBUFFER_PRIMARY is additionally
+ * recorded, with an extra reference, as the submit's primary ring.
+ *
+ * Returns the new ring, or NULL if msm_ringbuffer_init() fails.
+ * NOTE(review): on that failure path this function does not release
+ * msm_ring or its ring_bo -- confirm whether msm_ringbuffer_init() can
+ * fail and who cleans up.
+ */
+static struct fd_ringbuffer *
+msm_submit_new_ringbuffer(struct fd_submit *submit, uint32_t size,
+		enum fd_ringbuffer_flags flags)
+{
+	struct msm_submit *msm_submit = to_msm_submit(submit);
+	struct msm_ringbuffer *msm_ring;
+
+	msm_ring = slab_alloc_st(&msm_submit->ring_pool);
+
+	msm_ring->u.submit = submit;
+
+	/* NOTE: needs to be before _suballoc_ring_bo() since it could
+	 * increment the refcnt of the current ring
+	 */
+	msm_ring->base.refcnt = 1;
+
+	if (flags & FD_RINGBUFFER_STREAMING) {
+		msm_submit_suballoc_ring_bo(submit, msm_ring, size);
+	} else {
+		if (flags & FD_RINGBUFFER_GROWABLE)
+			size = INIT_SIZE;
+
+		msm_ring->offset = 0;
+		msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size, 0);
+	}
+
+	if (!msm_ringbuffer_init(msm_ring, size, flags))
+		return NULL;
+
+	if (flags & FD_RINGBUFFER_PRIMARY) {
+		/* only one primary ring per submit */
+		debug_assert(!msm_submit->primary);
+		msm_submit->primary = fd_ringbuffer_ref(&msm_ring->base);
+	}
+
+	return &msm_ring->base;
+}
+
+/* Build a copy of a stateobj ring's reloc table whose reloc_idx fields
+ * refer to 'submit's bo table rather than to the stateobj's private
+ * reloc_bos table.
+ *
+ * Every bo referenced by the stateobj is added to the submit (if not
+ * already present) with read/write flags matching those recorded in the
+ * stateobj.
+ *
+ * Returns a malloc()ed array of ring->cmd->nr_relocs entries.
+ * NOTE(review): the malloc() result is not checked.
+ */
+static struct drm_msm_gem_submit_reloc *
+handle_stateobj_relocs(struct msm_submit *submit, struct msm_ringbuffer *ring)
+{
+	struct msm_cmd *cmd = ring->cmd;
+	struct drm_msm_gem_submit_reloc *out =
+			malloc(cmd->nr_relocs * sizeof(*out));
+
+	for (unsigned i = 0; i < cmd->nr_relocs; i++) {
+		struct msm_reloc_bo *src =
+				&ring->u.reloc_bos[cmd->relocs[i].reloc_idx];
+		unsigned flags =
+				((src->flags & MSM_SUBMIT_BO_READ) ? FD_RELOC_READ : 0) |
+				((src->flags & MSM_SUBMIT_BO_WRITE) ? FD_RELOC_WRITE : 0);
+
+		/* copy the entry, then retarget it at the submit's bo table */
+		out[i] = cmd->relocs[i];
+		out[i].reloc_idx = append_bo(submit, src->bo, flags);
+	}
+
+	return out;
+}
+
+/* Build the cmds table from every ring in ring_set and issue the submit
+ * ioctl.  On success the fence/out-fence fd are returned via the out params. */
+static int
+msm_submit_flush(struct fd_submit *submit, int in_fence_fd,
+		int *out_fence_fd, uint32_t *out_fence)
+{
+	struct msm_submit *msm_submit = to_msm_submit(submit);
+	struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
+	struct drm_msm_gem_submit req = {
+			.flags = msm_pipe->pipe,
+			.queueid = msm_pipe->queue_id,
+	};
+	int ret;
+
+	debug_assert(msm_submit->primary);
+
+	finalize_current_cmd(msm_submit->primary);
+	append_ring(msm_submit->ring_set, msm_submit->primary);
+
+	struct set_entry *entry;
+	unsigned nr_cmds = 0;
+	unsigned nr_objs = 0;
+
+	/* first pass: finalize the rings and count the cmds table entries */
+	set_foreach(msm_submit->ring_set, entry) {
+		struct fd_ringbuffer *ring = (void *)entry->key;
+		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+			nr_cmds += 1;
+			nr_objs += 1;
+		} else {
+			if (ring != msm_submit->primary)
+				finalize_current_cmd(ring);
+			nr_cmds += to_msm_ringbuffer(ring)->u.nr_cmds;
+		}
+	}
+
+	void *obj_relocs[nr_objs];
+	struct drm_msm_gem_submit_cmd cmds[nr_cmds];
+	unsigned i = 0, o = 0;
+
+	set_foreach(msm_submit->ring_set, entry) {
+		struct fd_ringbuffer *ring = (void *)entry->key;
+		struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+
+		debug_assert(i < nr_cmds);
+
+		// TODO handle relocs:
+		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+
+			debug_assert(o < nr_objs);
+
+			void *relocs = handle_stateobj_relocs(msm_submit, msm_ring);
+			obj_relocs[o++] = relocs;
+
+			cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
+			cmds[i].submit_idx =
+				append_bo(msm_submit, msm_ring->ring_bo, FD_RELOC_READ);
+			cmds[i].submit_offset = msm_ring->offset;
+			cmds[i].size = offset_bytes(ring->cur, ring->start);
+			cmds[i].pad = 0;
+			cmds[i].nr_relocs = msm_ring->cmd->nr_relocs;
+			cmds[i].relocs = VOID2U64(relocs);
+
+			i++;
+		} else {
+			for (unsigned j = 0; j < msm_ring->u.nr_cmds; j++) {
+				if (ring->flags & FD_RINGBUFFER_PRIMARY)
+					cmds[i].type = MSM_SUBMIT_CMD_BUF;
+				else
+					cmds[i].type = MSM_SUBMIT_CMD_IB_TARGET_BUF;
+				cmds[i].submit_idx = append_bo(msm_submit,
+						msm_ring->u.cmds[j]->ring_bo, FD_RELOC_READ);
+				cmds[i].submit_offset = msm_ring->offset;
+				cmds[i].size = msm_ring->u.cmds[j]->size;
+				cmds[i].pad = 0;
+				cmds[i].nr_relocs = msm_ring->u.cmds[j]->nr_relocs;
+				cmds[i].relocs = VOID2U64(msm_ring->u.cmds[j]->relocs);
+
+				i++;
+			}
+		}
+	}
+
+	if (in_fence_fd != -1) {
+		req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
+		req.fence_fd = in_fence_fd;
+	}
+
+	if (out_fence_fd)
+		req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
+
+	/* needs to be after the append_bo() calls above, which add to the bos table: */
+	req.bos = VOID2U64(msm_submit->submit_bos);
+	req.nr_bos = msm_submit->nr_submit_bos;
+	req.cmds = VOID2U64(cmds);
+	req.nr_cmds = nr_cmds;
+
+	DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
+
+	ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT,
+			&req, sizeof(req));
+	if (ret) {
+		ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
+		msm_dump_submit(&req);
+	} else {
+		if (out_fence)
+			*out_fence = req.fence;
+
+		if (out_fence_fd)
+			*out_fence_fd = req.fence_fd;
+	}
+
+	/* the per-stateobj reloc tables were only needed for the ioctl */
+	for (o = 0; o < nr_objs; o++)
+		free(obj_relocs[o]);
+
+	return ret;
+}
+
+static void
+unref_rings(struct set_entry *entry)
+{
+	struct fd_ringbuffer *ring = (void *)entry->key;
+	fd_ringbuffer_del(ring);
+}
+
+static void
+msm_submit_destroy(struct fd_submit *submit)
+{
+	struct msm_submit *msm_submit = to_msm_submit(submit);
+
+	if (msm_submit->primary)
+		fd_ringbuffer_del(msm_submit->primary);
+	if (msm_submit->suballoc_ring)
+		fd_ringbuffer_del(msm_submit->suballoc_ring);
+
+	_mesa_hash_table_destroy(msm_submit->bo_table, NULL);
+	_mesa_set_destroy(msm_submit->ring_set, unref_rings);
+
+	// TODO it would be nice to have a way to debug_assert() if all
+	// rb's haven't been free'd back to the slab, because that is
+	// an indication that we are leaking bo's
+	slab_destroy(&msm_submit->ring_pool);
+
+	for (unsigned i = 0; i < msm_submit->nr_bos; i++)
+		fd_bo_del(msm_submit->bos[i]);
+
+	free(msm_submit->submit_bos);
+	free(msm_submit->bos);
+	free(msm_submit);
+}
+
+static const struct fd_submit_funcs submit_funcs = {
+		.new_ringbuffer = msm_submit_new_ringbuffer,
+		.flush = msm_submit_flush,
+		.destroy = msm_submit_destroy,
+};
+
+struct fd_submit *
+msm_submit_new(struct fd_pipe *pipe)
+{
+	struct msm_submit *msm_submit = calloc(1, sizeof(*msm_submit));
+	struct fd_submit *submit;
+	static unsigned submit_cnt = 0;
+
+	msm_submit->seqno = ++submit_cnt;
+	msm_submit->bo_table = _mesa_hash_table_create(NULL,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	msm_submit->ring_set = _mesa_set_create(NULL,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	// TODO tune size:
+	slab_create(&msm_submit->ring_pool, sizeof(struct msm_ringbuffer), 16);
+
+	submit = &msm_submit->base;
+	submit->pipe = pipe;
+	submit->funcs = &submit_funcs;
+
+	return submit;
+}
+
+
+static void
+finalize_current_cmd(struct fd_ringbuffer *ring)
+{
+	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+
+	debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
+
+	if (!msm_ring->cmd)
+		return;
+
+	debug_assert(msm_ring->cmd->ring_bo == msm_ring->ring_bo);
+
+	unsigned idx = APPEND(&msm_ring->u, cmds);
+
+	msm_ring->u.cmds[idx] = msm_ring->cmd;
+	msm_ring->cmd = NULL;
+
+	msm_ring->u.cmds[idx]->size = offset_bytes(ring->cur, ring->start);
+}
+
+static void
+msm_ringbuffer_grow(struct fd_ringbuffer *ring, uint32_t size)
+{
+	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+	struct fd_pipe *pipe = msm_ring->u.submit->pipe;
+
+	debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE);
+
+	finalize_current_cmd(ring);
+
+	fd_bo_del(msm_ring->ring_bo);
+	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
+	msm_ring->cmd = cmd_new(msm_ring->ring_bo);
+
+	ring->start = fd_bo_map(msm_ring->ring_bo);
+	ring->end = &(ring->start[size/4]);
+	ring->cur = ring->start;
+	ring->size = size;
+}
+
+static void
+msm_ringbuffer_emit_reloc(struct fd_ringbuffer *ring,
+		const struct fd_reloc *reloc)
+{
+	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+	struct fd_pipe *pipe;
+	unsigned reloc_idx;
+
+	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+		unsigned idx = APPEND(&msm_ring->u, reloc_bos);
+
+		msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo);
+		msm_ring->u.reloc_bos[idx].flags = reloc->flags;
+
+		/* this gets fixed up at submit->flush() time, since this state-
+		 * object rb can be used with many different submits
+		 */
+		reloc_idx = idx;
+
+		pipe = msm_ring->u.pipe;
+	} else {
+		struct msm_submit *msm_submit =
+				to_msm_submit(msm_ring->u.submit);
+
+		reloc_idx = append_bo(msm_submit, reloc->bo, reloc->flags);
+
+		pipe = msm_ring->u.submit->pipe;
+	}
+
+	struct drm_msm_gem_submit_reloc *r;
+	unsigned idx = APPEND(msm_ring->cmd, relocs);
+
+	r = &msm_ring->cmd->relocs[idx];
+
+	r->reloc_idx = reloc_idx;
+	r->reloc_offset = reloc->offset;
+	r->or = reloc->or;
+	r->shift = reloc->shift;
+	r->submit_offset = offset_bytes(ring->cur, ring->start) +
+			msm_ring->offset;
+
+	ring->cur++;
+
+	if (pipe->gpu_id >= 500) {
+		idx = APPEND(msm_ring->cmd, relocs);
+		r = &msm_ring->cmd->relocs[idx];
+
+		r->reloc_idx = reloc_idx;
+		r->reloc_offset = reloc->offset;
+		r->or = reloc->orhi;
+		r->shift = reloc->shift - 32;
+		r->submit_offset = offset_bytes(ring->cur, ring->start) +
+				msm_ring->offset;
+
+		ring->cur++;
+	}
+}
+
+static void
+append_stateobj_rings(struct msm_submit *submit, struct fd_ringbuffer *target)
+{
+	struct msm_ringbuffer *msm_target = to_msm_ringbuffer(target);
+
+	debug_assert(target->flags & _FD_RINGBUFFER_OBJECT);
+
+	struct set_entry *entry;
+	set_foreach(msm_target->u.ring_set, entry) {
+		struct fd_ringbuffer *ring = (void *)entry->key;
+
+		append_ring(submit->ring_set, ring);
+
+		if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+			append_stateobj_rings(submit, ring);
+		}
+	}
+}
+
+static uint32_t
+msm_ringbuffer_emit_reloc_ring(struct fd_ringbuffer *ring,
+		struct fd_ringbuffer *target, uint32_t cmd_idx)
+{
+	struct msm_ringbuffer *msm_target = to_msm_ringbuffer(target);
+	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+	struct fd_bo *bo;
+	uint32_t size;
+
+	if ((target->flags & FD_RINGBUFFER_GROWABLE) &&
+			(cmd_idx < msm_target->u.nr_cmds)) {
+		bo   = msm_target->u.cmds[cmd_idx]->ring_bo;
+		size = msm_target->u.cmds[cmd_idx]->size;
+	} else {
+		bo   = msm_target->ring_bo;
+		size = offset_bytes(target->cur, target->start);
+	}
+
+	msm_ringbuffer_emit_reloc(ring, &(struct fd_reloc){
+		.bo     = bo,
+		.flags  = FD_RELOC_READ,
+		.offset = msm_target->offset,
+	});
+
+	if ((target->flags & _FD_RINGBUFFER_OBJECT) &&
+			!(ring->flags & _FD_RINGBUFFER_OBJECT)) {
+		struct msm_submit *msm_submit = to_msm_submit(msm_ring->u.submit);
+
+		append_stateobj_rings(msm_submit, target);
+	}
+
+	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+		append_ring(msm_ring->u.ring_set, target);
+	} else {
+		struct msm_submit *msm_submit = to_msm_submit(msm_ring->u.submit);
+		append_ring(msm_submit->ring_set, target);
+	}
+
+	return size;
+}
+
+static uint32_t
+msm_ringbuffer_cmd_count(struct fd_ringbuffer *ring)
+{
+	if (ring->flags & FD_RINGBUFFER_GROWABLE)
+		return to_msm_ringbuffer(ring)->u.nr_cmds + 1;
+	return 1;
+}
+
+/* Release everything the ring owns, including its APPEND()ed arrays. */
+static void
+msm_ringbuffer_destroy(struct fd_ringbuffer *ring)
+{
+	struct msm_ringbuffer *msm_ring = to_msm_ringbuffer(ring);
+
+	fd_bo_del(msm_ring->ring_bo);
+	if (msm_ring->cmd)
+		cmd_free(msm_ring->cmd);
+
+	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+		for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++)
+			fd_bo_del(msm_ring->u.reloc_bos[i].bo);
+
+		_mesa_set_destroy(msm_ring->u.ring_set, unref_rings);
+		/* the array itself was leaked before, only its bo refs were dropped */
+		free(msm_ring->u.reloc_bos);
+		free(msm_ring);
+	} else {
+		struct fd_submit *submit = msm_ring->u.submit;
+
+		for (unsigned i = 0; i < msm_ring->u.nr_cmds; i++)
+			cmd_free(msm_ring->u.cmds[i]);
+
+		free(msm_ring->u.cmds);
+		slab_free_st(&to_msm_submit(submit)->ring_pool, msm_ring);
+	}
+}
+
+static const struct fd_ringbuffer_funcs ring_funcs = {
+		.grow = msm_ringbuffer_grow,
+		.emit_reloc = msm_ringbuffer_emit_reloc,
+		.emit_reloc_ring = msm_ringbuffer_emit_reloc_ring,
+		.cmd_count = msm_ringbuffer_cmd_count,
+		.destroy = msm_ringbuffer_destroy,
+};
+
+static inline struct fd_ringbuffer *
+msm_ringbuffer_init(struct msm_ringbuffer *msm_ring, uint32_t size,
+		enum fd_ringbuffer_flags flags)
+{
+	struct fd_ringbuffer *ring = &msm_ring->base;
+
+	debug_assert(msm_ring->ring_bo);
+
+	uint8_t *base = fd_bo_map(msm_ring->ring_bo);
+	ring->start = (void *)(base + msm_ring->offset);
+	ring->end = &(ring->start[size/4]);
+	ring->cur = ring->start;
+
+	ring->size = size;
+	ring->flags = flags;
+
+	ring->funcs = &ring_funcs;
+
+	msm_ring->u.cmds = NULL;
+	msm_ring->u.nr_cmds = msm_ring->u.max_cmds = 0;
+
+	msm_ring->cmd = cmd_new(msm_ring->ring_bo);
+
+	return ring;
+}
+
+struct fd_ringbuffer *
+msm_ringbuffer_new_object(struct fd_pipe *pipe, uint32_t size)
+{
+	struct msm_ringbuffer *msm_ring = malloc(sizeof(*msm_ring));
+
+	msm_ring->u.pipe = pipe;
+	msm_ring->offset = 0;
+	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
+	msm_ring->base.refcnt = 1;
+
+	msm_ring->u.reloc_bos = NULL;
+	msm_ring->u.nr_reloc_bos = msm_ring->u.max_reloc_bos = 0;
+
+	msm_ring->u.ring_set = _mesa_set_create(NULL,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+
+	return msm_ringbuffer_init(msm_ring, size, _FD_RINGBUFFER_OBJECT);
+}
diff --git a/src/gallium/drivers/freedreno/drm/msm_ringbuffer_sp.c b/src/gallium/drivers/freedreno/drm/msm_ringbuffer_sp.c
new file mode 100644
index 00000000000..997ff147659
--- /dev/null
+++ b/src/gallium/drivers/freedreno/drm/msm_ringbuffer_sp.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+
+#include "util/hash_table.h"
+#include "util/slab.h"
+
+#include "drm/freedreno_ringbuffer.h"
+#include "msm_priv.h"
+
+/* A "softpin" implementation of submit/ringbuffer, which lowers CPU overhead
+ * by avoiding the additional tracking necessary to build cmds/relocs tables
+ * (but still builds a bos table)
+ */
+
+
+#define INIT_SIZE 0x1000
+
+static pthread_mutex_t idx_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+struct msm_submit_sp {
+	struct fd_submit base;
+
+	DECLARE_ARRAY(struct drm_msm_gem_submit_bo, submit_bos);
+	DECLARE_ARRAY(struct fd_bo *, bos);
+
+	unsigned seqno;
+
+	/* maps fd_bo to idx in bos table: */
+	struct hash_table *bo_table;
+
+	struct slab_mempool ring_pool;
+
+	struct fd_ringbuffer *primary;
+
+	/* Allow for sub-allocation of stateobj ring buffers (ie. sharing
+	 * the same underlying bo)..
+	 *
+	 * We also rely on previous stateobj having been fully constructed
+	 * so we can reclaim extra space at its end.
+	 */
+	struct fd_ringbuffer *suballoc_ring;
+};
+FD_DEFINE_CAST(fd_submit, msm_submit_sp);
+
+/* for FD_RINGBUFFER_GROWABLE rb's, tracks the 'finalized' cmdstream buffers
+ * and sizes.  Ie. a finalized buffer can have no more commands appended to
+ * it.
+ */
+struct msm_cmd_sp {
+	struct fd_bo *ring_bo;
+	unsigned size;
+};
+
+/* for _FD_RINGBUFFER_OBJECT rb's we need to track the bo's and flags to
+ * later copy into the submit when the stateobj rb is later referenced by
+ * a regular rb:
+ */
+struct msm_reloc_bo_sp {
+	struct fd_bo *bo;
+	unsigned flags;
+};
+
+struct msm_ringbuffer_sp {
+	struct fd_ringbuffer base;
+
+	/* for FD_RINGBUFFER_STREAMING rb's which are sub-allocated */
+	unsigned offset;
+
+// TODO check disasm.. hopefully compilers CSE can realize that
+// reloc_bos and cmds are at the same offsets and optimize some
+// divergent cases into single case
+	union {
+		/* for _FD_RINGBUFFER_OBJECT case: */
+		struct {
+			struct fd_pipe *pipe;
+			DECLARE_ARRAY(struct msm_reloc_bo_sp, reloc_bos);
+		};
+		/* for other cases: */
+		struct {
+			struct fd_submit *submit;
+			DECLARE_ARRAY(struct msm_cmd_sp, cmds);
+		};
+	} u;
+
+	struct fd_bo *ring_bo;
+};
+FD_DEFINE_CAST(fd_ringbuffer, msm_ringbuffer_sp);
+
+static void finalize_current_cmd(struct fd_ringbuffer *ring);
+static struct fd_ringbuffer * msm_ringbuffer_sp_init(
+		struct msm_ringbuffer_sp *msm_ring,
+		uint32_t size, enum fd_ringbuffer_flags flags);
+
+/* add (if needed) bo to submit and return index: */
+static uint32_t
+append_bo(struct msm_submit_sp *submit, struct fd_bo *bo, uint32_t flags)
+{
+	struct msm_bo *msm_bo = to_msm_bo(bo);
+	uint32_t idx;
+	pthread_mutex_lock(&idx_lock);
+	if (likely(msm_bo->current_submit_seqno == submit->seqno)) {
+		idx = msm_bo->idx;
+	} else {
+		uint32_t hash = _mesa_hash_pointer(bo);
+		struct hash_entry *entry;
+
+		entry = _mesa_hash_table_search_pre_hashed(submit->bo_table, hash, bo);
+		if (entry) {
+			/* found */
+			idx = (uint32_t)(uintptr_t)entry->data;
+		} else {
+			idx = APPEND(submit, submit_bos);
+			idx = APPEND(submit, bos);
+
+			submit->submit_bos[idx].flags = 0;
+			submit->submit_bos[idx].handle = bo->handle;
+			submit->submit_bos[idx].presumed = 0;
+
+			submit->bos[idx] = fd_bo_ref(bo);
+
+			_mesa_hash_table_insert_pre_hashed(submit->bo_table, hash, bo,
+					(void *)(uintptr_t)idx);
+		}
+		msm_bo->current_submit_seqno = submit->seqno;
+		msm_bo->idx = idx;
+	}
+	pthread_mutex_unlock(&idx_lock);
+	if (flags & FD_RELOC_READ)
+		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_READ;
+	if (flags & FD_RELOC_WRITE)
+		submit->submit_bos[idx].flags |= MSM_SUBMIT_BO_WRITE;
+	return idx;
+}
+
+static void
+msm_submit_suballoc_ring_bo(struct fd_submit *submit,
+		struct msm_ringbuffer_sp *msm_ring, uint32_t size)
+{
+	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+	unsigned suballoc_offset = 0;
+	struct fd_bo *suballoc_bo = NULL;
+
+	if (msm_submit->suballoc_ring) {
+		struct msm_ringbuffer_sp *suballoc_ring =
+				to_msm_ringbuffer_sp(msm_submit->suballoc_ring);
+
+		suballoc_bo = suballoc_ring->ring_bo;
+		suballoc_offset = fd_ringbuffer_size(msm_submit->suballoc_ring) +
+				suballoc_ring->offset;
+
+		suballoc_offset = align(suballoc_offset, 0x10);
+
+		if ((size + suballoc_offset) > suballoc_bo->size) {
+			suballoc_bo = NULL;
+		}
+	}
+
+	if (!suballoc_bo) {
+		// TODO possibly larger size for streaming bo?
+		msm_ring->ring_bo = fd_bo_new_ring(
+				submit->pipe->dev, 0x8000, 0);
+		msm_ring->offset = 0;
+	} else {
+		msm_ring->ring_bo = fd_bo_ref(suballoc_bo);
+		msm_ring->offset = suballoc_offset;
+	}
+
+	struct fd_ringbuffer *old_suballoc_ring = msm_submit->suballoc_ring;
+
+	msm_submit->suballoc_ring = fd_ringbuffer_ref(&msm_ring->base);
+
+	if (old_suballoc_ring)
+		fd_ringbuffer_del(old_suballoc_ring);
+}
+
+static struct fd_ringbuffer *
+msm_submit_sp_new_ringbuffer(struct fd_submit *submit, uint32_t size,
+		enum fd_ringbuffer_flags flags)
+{
+	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+	struct msm_ringbuffer_sp *msm_ring;
+
+	msm_ring = slab_alloc_st(&msm_submit->ring_pool);
+
+	msm_ring->u.submit = submit;
+
+	/* NOTE: needs to be before _suballoc_ring_bo() since it could
+	 * increment the refcnt of the current ring
+	 */
+	msm_ring->base.refcnt = 1;
+
+	if (flags & FD_RINGBUFFER_STREAMING) {
+		msm_submit_suballoc_ring_bo(submit, msm_ring, size);
+	} else {
+		if (flags & FD_RINGBUFFER_GROWABLE)
+			size = INIT_SIZE;
+
+		msm_ring->offset = 0;
+		msm_ring->ring_bo = fd_bo_new_ring(submit->pipe->dev, size, 0);
+	}
+
+	if (!msm_ringbuffer_sp_init(msm_ring, size, flags))
+		return NULL;
+
+	if (flags & FD_RINGBUFFER_PRIMARY) {
+		debug_assert(!msm_submit->primary);
+		msm_submit->primary = fd_ringbuffer_ref(&msm_ring->base);
+	}
+
+	return &msm_ring->base;
+}
+
+/* Submit the primary ring's finalized cmdstream buffers.  On success the
+ * fence (and out-fence fd, if requested) are returned via the out params. */
+static int
+msm_submit_sp_flush(struct fd_submit *submit, int in_fence_fd,
+		int *out_fence_fd, uint32_t *out_fence)
+{
+	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+	struct msm_pipe *msm_pipe = to_msm_pipe(submit->pipe);
+	struct drm_msm_gem_submit req = {
+			.flags = msm_pipe->pipe,
+			.queueid = msm_pipe->queue_id,
+	};
+	int ret;
+
+	debug_assert(msm_submit->primary);
+	finalize_current_cmd(msm_submit->primary);
+
+	struct msm_ringbuffer_sp *primary = to_msm_ringbuffer_sp(msm_submit->primary);
+	struct drm_msm_gem_submit_cmd cmds[primary->u.nr_cmds];
+
+	for (unsigned i = 0; i < primary->u.nr_cmds; i++) {
+		cmds[i].type = MSM_SUBMIT_CMD_BUF;
+		cmds[i].submit_idx =
+			append_bo(msm_submit, primary->u.cmds[i].ring_bo, FD_RELOC_READ);
+		cmds[i].submit_offset = primary->offset;
+		cmds[i].size = primary->u.cmds[i].size;
+		cmds[i].pad = 0;
+		cmds[i].nr_relocs = 0;
+		cmds[i].relocs = 0;	/* no reloc table; don't pass stack garbage */
+	}
+
+	if (in_fence_fd != -1) {
+		req.flags |= MSM_SUBMIT_FENCE_FD_IN | MSM_SUBMIT_NO_IMPLICIT;
+		req.fence_fd = in_fence_fd;
+	}
+
+	if (out_fence_fd)
+		req.flags |= MSM_SUBMIT_FENCE_FD_OUT;
+
+	/* needs to be after the append_bo() calls above, which add to the bos table: */
+	req.bos = VOID2U64(msm_submit->submit_bos);
+	req.nr_bos = msm_submit->nr_submit_bos;
+	req.cmds = VOID2U64(cmds);
+	req.nr_cmds = primary->u.nr_cmds;
+
+	DEBUG_MSG("nr_cmds=%u, nr_bos=%u", req.nr_cmds, req.nr_bos);
+	ret = drmCommandWriteRead(submit->pipe->dev->fd, DRM_MSM_GEM_SUBMIT,
+			&req, sizeof(req));
+	if (ret) {
+		ERROR_MSG("submit failed: %d (%s)", ret, strerror(errno));
+		msm_dump_submit(&req);
+	} else {
+		if (out_fence)
+			*out_fence = req.fence;
+		if (out_fence_fd)
+			*out_fence_fd = req.fence_fd;
+	}
+
+	return ret;
+}
+
+static void
+msm_submit_sp_destroy(struct fd_submit *submit)
+{
+	struct msm_submit_sp *msm_submit = to_msm_submit_sp(submit);
+
+	if (msm_submit->primary)
+		fd_ringbuffer_del(msm_submit->primary);
+	if (msm_submit->suballoc_ring)
+		fd_ringbuffer_del(msm_submit->suballoc_ring);
+
+	_mesa_hash_table_destroy(msm_submit->bo_table, NULL);
+
+	// TODO it would be nice to have a way to debug_assert() if all
+	// rb's haven't been free'd back to the slab, because that is
+	// an indication that we are leaking bo's
+	slab_destroy(&msm_submit->ring_pool);
+
+	for (unsigned i = 0; i < msm_submit->nr_bos; i++)
+		fd_bo_del(msm_submit->bos[i]);
+
+	free(msm_submit->submit_bos);
+	free(msm_submit->bos);
+	free(msm_submit);
+}
+
+static const struct fd_submit_funcs submit_funcs = {
+		.new_ringbuffer = msm_submit_sp_new_ringbuffer,
+		.flush = msm_submit_sp_flush,
+		.destroy = msm_submit_sp_destroy,
+};
+
+struct fd_submit *
+msm_submit_sp_new(struct fd_pipe *pipe)
+{
+	struct msm_submit_sp *msm_submit = calloc(1, sizeof(*msm_submit));
+	struct fd_submit *submit;
+	static unsigned submit_cnt = 0;
+
+	msm_submit->seqno = ++submit_cnt;
+	msm_submit->bo_table = _mesa_hash_table_create(NULL,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	// TODO tune size:
+	slab_create(&msm_submit->ring_pool, sizeof(struct msm_ringbuffer_sp), 16);
+
+	submit = &msm_submit->base;
+	submit->pipe = pipe;
+	submit->funcs = &submit_funcs;
+
+	return submit;
+}
+
+
+static void
+finalize_current_cmd(struct fd_ringbuffer *ring)
+{
+	debug_assert(!(ring->flags & _FD_RINGBUFFER_OBJECT));
+
+	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
+	unsigned idx = APPEND(&msm_ring->u, cmds);
+
+	msm_ring->u.cmds[idx].ring_bo = fd_bo_ref(msm_ring->ring_bo);
+	msm_ring->u.cmds[idx].size = offset_bytes(ring->cur, ring->start);
+}
+
+static void
+msm_ringbuffer_sp_grow(struct fd_ringbuffer *ring, uint32_t size)
+{
+	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
+	struct fd_pipe *pipe = msm_ring->u.submit->pipe;
+
+	debug_assert(ring->flags & FD_RINGBUFFER_GROWABLE);
+
+	finalize_current_cmd(ring);
+
+	fd_bo_del(msm_ring->ring_bo);
+	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
+
+	ring->start = fd_bo_map(msm_ring->ring_bo);
+	ring->end = &(ring->start[size/4]);
+	ring->cur = ring->start;
+	ring->size = size;
+}
+
+/* Softpin reloc: rather than recording a reloc for the kernel to patch,
+ * track the bo and write its (shifted) iova directly into the cmdstream.
+ */
+static void
+msm_ringbuffer_sp_emit_reloc(struct fd_ringbuffer *ring,
+		const struct fd_reloc *reloc)
+{
+	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
+	struct fd_pipe *pipe;
+
+	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+		unsigned idx = APPEND(&msm_ring->u, reloc_bos);
+
+		msm_ring->u.reloc_bos[idx].bo = fd_bo_ref(reloc->bo);
+		msm_ring->u.reloc_bos[idx].flags = reloc->flags;
+
+		pipe = msm_ring->u.pipe;
+	} else {
+		struct msm_submit_sp *msm_submit =
+				to_msm_submit_sp(msm_ring->u.submit);
+
+		append_bo(msm_submit, reloc->bo, reloc->flags);
+
+		pipe = msm_ring->u.submit->pipe;
+	}
+
+	uint64_t iova = fd_bo_get_iova(reloc->bo) + reloc->offset;
+	int shift = reloc->shift;
+
+	/* Shift the full 64b address once and then split it.  Shifting the
+	 * two 32b halves separately loses the bits that cross the dword
+	 * boundary, and with shift == 0 the high half ended up being
+	 * shifted right by 32, which is undefined for a 32b value.
+	 */
+	if (shift < 0)
+		iova >>= -shift;
+	else
+		iova <<= shift;
+
+	(*ring->cur++) = (uint32_t)iova | reloc->or;
+
+	/* gpu_id >= 500 gets a second dword with the upper 32 bits: */
+	if (pipe->gpu_id >= 500) {
+		(*ring->cur++) = (uint32_t)(iova >> 32) | reloc->orhi;
+	}
+}
+
+static uint32_t
+msm_ringbuffer_sp_emit_reloc_ring(struct fd_ringbuffer *ring,
+		struct fd_ringbuffer *target, uint32_t cmd_idx)
+{
+	struct msm_ringbuffer_sp *msm_target = to_msm_ringbuffer_sp(target);
+	struct fd_bo *bo;
+	uint32_t size;
+
+	if ((target->flags & FD_RINGBUFFER_GROWABLE) &&
+			(cmd_idx < msm_target->u.nr_cmds)) {
+		bo   = msm_target->u.cmds[cmd_idx].ring_bo;
+		size = msm_target->u.cmds[cmd_idx].size;
+	} else {
+		bo   = msm_target->ring_bo;
+		size = offset_bytes(target->cur, target->start);
+	}
+
+	msm_ringbuffer_sp_emit_reloc(ring, &(struct fd_reloc){
+		.bo     = bo,
+		.flags  = FD_RELOC_READ,
+		.offset = msm_target->offset,
+	});
+
+	if ((target->flags & _FD_RINGBUFFER_OBJECT) &&
+			!(ring->flags & _FD_RINGBUFFER_OBJECT)) {
+		// TODO it would be nice to know whether we have already
+		// seen this target before.  But hopefully we hit the
+		// append_bo() fast path enough for this to not matter:
+		struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
+		struct msm_submit_sp *msm_submit = to_msm_submit_sp(msm_ring->u.submit);
+
+		for (unsigned i = 0; i < msm_target->u.nr_reloc_bos; i++) {
+			append_bo(msm_submit, msm_target->u.reloc_bos[i].bo,
+					msm_target->u.reloc_bos[i].flags);
+		}
+	}
+
+	return size;
+}
+
+static uint32_t
+msm_ringbuffer_sp_cmd_count(struct fd_ringbuffer *ring)
+{
+	if (ring->flags & FD_RINGBUFFER_GROWABLE)
+		return to_msm_ringbuffer_sp(ring)->u.nr_cmds + 1;
+	return 1;
+}
+
+/* Drop the ring's bo references and free its APPEND()ed tracking array. */
+static void
+msm_ringbuffer_sp_destroy(struct fd_ringbuffer *ring)
+{
+	struct msm_ringbuffer_sp *msm_ring = to_msm_ringbuffer_sp(ring);
+
+	fd_bo_del(msm_ring->ring_bo);
+	if (ring->flags & _FD_RINGBUFFER_OBJECT) {
+		for (unsigned i = 0; i < msm_ring->u.nr_reloc_bos; i++)
+			fd_bo_del(msm_ring->u.reloc_bos[i].bo);
+
+		free(msm_ring->u.reloc_bos);
+		free(msm_ring);
+	} else {
+		struct fd_submit *submit = msm_ring->u.submit;
+
+		for (unsigned i = 0; i < msm_ring->u.nr_cmds; i++)
+			fd_bo_del(msm_ring->u.cmds[i].ring_bo);
+
+		free(msm_ring->u.cmds);
+		slab_free_st(&to_msm_submit_sp(submit)->ring_pool, msm_ring);
+	}
+}
+
+static const struct fd_ringbuffer_funcs ring_funcs = {
+		.grow = msm_ringbuffer_sp_grow,
+		.emit_reloc = msm_ringbuffer_sp_emit_reloc,
+		.emit_reloc_ring = msm_ringbuffer_sp_emit_reloc_ring,
+		.cmd_count = msm_ringbuffer_sp_cmd_count,
+		.destroy = msm_ringbuffer_sp_destroy,
+};
+
+static inline struct fd_ringbuffer *
+msm_ringbuffer_sp_init(struct msm_ringbuffer_sp *msm_ring, uint32_t size,
+		enum fd_ringbuffer_flags flags)
+{
+	struct fd_ringbuffer *ring = &msm_ring->base;
+
+	debug_assert(msm_ring->ring_bo);
+
+	uint8_t *base = fd_bo_map(msm_ring->ring_bo);
+	ring->start = (void *)(base + msm_ring->offset);
+	ring->end = &(ring->start[size/4]);
+	ring->cur = ring->start;
+
+	ring->size = size;
+	ring->flags = flags;
+
+	ring->funcs = &ring_funcs;
+
+	// TODO initializing these could probably be conditional on flags
+	// since unneeded for FD_RINGBUFFER_STREAMING case..
+	msm_ring->u.cmds = NULL;
+	msm_ring->u.nr_cmds = msm_ring->u.max_cmds = 0;
+
+	msm_ring->u.reloc_bos = NULL;
+	msm_ring->u.nr_reloc_bos = msm_ring->u.max_reloc_bos = 0;
+
+	return ring;
+}
+
+struct fd_ringbuffer *
+msm_ringbuffer_sp_new_object(struct fd_pipe *pipe, uint32_t size)
+{
+	struct msm_ringbuffer_sp *msm_ring = malloc(sizeof(*msm_ring));
+
+	msm_ring->u.pipe = pipe;
+	msm_ring->offset = 0;
+	msm_ring->ring_bo = fd_bo_new_ring(pipe->dev, size, 0);
+	msm_ring->base.refcnt = 1;
+
+	return msm_ringbuffer_sp_init(msm_ring, size, _FD_RINGBUFFER_OBJECT);
+}
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index a46fbecf2d7..91d7ce73535 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -48,25 +48,28 @@ batch_init(struct fd_batch *batch)
 	 * have no option but to allocate large worst-case sizes so that
 	 * we don't need to grow the ringbuffer.  Performance is likely to
 	 * suffer, but there is no good alternative.
+	 *
+	 * XXX I think we can just require new enough kernel for this?
 	 */
 	if ((fd_device_version(ctx->screen->dev) < FD_VERSION_UNLIMITED_CMDS) ||
 			(fd_mesa_debug & FD_DBG_NOGROW)){
 		size = 0x100000;
 	}
 
-	batch->draw    = fd_ringbuffer_new(ctx->pipe, size);
-	if (!batch->nondraw) {
-		batch->gmem    = fd_ringbuffer_new(ctx->pipe, size);
-
-		fd_ringbuffer_set_parent(batch->gmem, NULL);
-		fd_ringbuffer_set_parent(batch->draw, batch->gmem);
+	batch->submit = fd_submit_new(ctx->pipe);
+	if (batch->nondraw) {
+		batch->draw = fd_submit_new_ringbuffer(batch->submit, size,
+				FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
+	} else {
+		batch->gmem = fd_submit_new_ringbuffer(batch->submit, size,
+				FD_RINGBUFFER_PRIMARY | FD_RINGBUFFER_GROWABLE);
+		batch->draw = fd_submit_new_ringbuffer(batch->submit, size,
+				FD_RINGBUFFER_GROWABLE);
 
 		if (ctx->screen->gpu_id < 600) {
-			batch->binning = fd_ringbuffer_new(ctx->pipe, size);
-			fd_ringbuffer_set_parent(batch->binning, batch->gmem);
+			batch->binning = fd_submit_new_ringbuffer(batch->submit,
+					size, FD_RINGBUFFER_GROWABLE);
 		}
-	} else {
-		fd_ringbuffer_set_parent(batch->draw, NULL);
 	}
 
 	batch->in_fence_fd = -1;
@@ -146,6 +149,8 @@ batch_fini(struct fd_batch *batch)
 		batch->lrz_clear = NULL;
 	}
 
+	fd_submit_del(batch->submit);
+
 	util_dynarray_fini(&batch->draw_patches);
 
 	if (is_a3xx(batch->ctx->screen))
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index 6ff4014ddcf..e0902f66914 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -151,6 +151,8 @@ struct fd_batch {
 
 	struct pipe_framebuffer_state framebuffer;
 
+	struct fd_submit *submit;
+
 	/** draw pass cmdstream: */
 	struct fd_ringbuffer *draw;
 	/** binning pass cmdstream: */
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 29f835879b4..bb15f0a3e16 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -373,15 +373,13 @@ render_sysmem(struct fd_batch *batch)
 static void
 flush_ring(struct fd_batch *batch)
 {
-	/* for compute/blit batch, there is no batch->gmem, only batch->draw: */
-	struct fd_ringbuffer *ring = batch->nondraw ? batch->draw : batch->gmem;
 	uint32_t timestamp;
 	int out_fence_fd = -1;
 
-	fd_ringbuffer_flush2(ring, batch->in_fence_fd,
-			batch->needs_out_fence_fd ? &out_fence_fd : NULL);
+	fd_submit_flush(batch->submit, batch->in_fence_fd,
+			batch->needs_out_fence_fd ? &out_fence_fd : NULL,
+			&timestamp);
 
-	timestamp = fd_ringbuffer_timestamp(ring);
 	fd_fence_populate(batch->fence, timestamp, out_fence_fd);
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 3fedd6195e7..88d91a91234 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -86,6 +86,7 @@ static const struct debug_named_value debug_options[] = {
 		{"hiprio",    FD_DBG_HIPRIO, "Force high-priority context"},
 		{"ttile",     FD_DBG_TTILE,  "Enable texture tiling (a5xx)"},
 		{"perfcntrs", FD_DBG_PERFC,  "Expose performance counters"},
+		{"softpin",   FD_DBG_SOFTPIN,"Enable softpin command submission (experimental)"},
 		DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index b0b0e4f0d74..fedb8ffc906 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -27,8 +27,8 @@
 #ifndef FREEDRENO_SCREEN_H_
 #define FREEDRENO_SCREEN_H_
 
-#include <freedreno_drmif.h>
-#include <freedreno_ringbuffer.h>
+#include "drm/freedreno_drmif.h"
+#include "drm/freedreno_ringbuffer.h"
 
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 82bcb9b33f0..81622506f1e 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -27,8 +27,8 @@
 #ifndef FREEDRENO_UTIL_H_
 #define FREEDRENO_UTIL_H_
 
-#include <freedreno_drmif.h>
-#include <freedreno_ringbuffer.h>
+#include "drm/freedreno_drmif.h"
+#include "drm/freedreno_ringbuffer.h"
 
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
@@ -84,6 +84,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_HIPRIO 0x100000
 #define FD_DBG_TTILE  0x200000
 #define FD_DBG_PERFC  0x400000
+#define FD_DBG_SOFTPIN 0x800000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
@@ -202,7 +203,7 @@ OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
 {
 	if (LOG_DWORDS) {
 		DBG("ring[%p]: OUT_RING   %04x:  %08x", ring,
-				(uint32_t)(ring->cur - ring->last_start), data);
+				(uint32_t)(ring->cur - ring->start), data);
 	}
 	fd_ringbuffer_emit(ring, data);
 }
@@ -214,7 +215,7 @@ OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data,
 {
 	if (LOG_DWORDS) {
 		DBG("ring[%p]: OUT_RINGP  %04x:  %08x", ring,
-				(uint32_t)(ring->cur - ring->last_start), data);
+				(uint32_t)(ring->cur - ring->start), data);
 	}
 	util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){
 		.cs  = ring->cur++,
@@ -232,10 +233,10 @@ OUT_RELOC(struct fd_ringbuffer *ring, struct fd_bo *bo,
 {
 	if (LOG_DWORDS) {
 		DBG("ring[%p]: OUT_RELOC   %04x:  %p+%u << %d", ring,
-				(uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
+				(uint32_t)(ring->cur - ring->start), bo, offset, shift);
 	}
 	debug_assert(offset < fd_bo_size(bo));
-	fd_ringbuffer_reloc2(ring, &(struct fd_reloc){
+	fd_ringbuffer_reloc(ring, &(struct fd_reloc){
 		.bo = bo,
 		.flags = FD_RELOC_READ,
 		.offset = offset,
@@ -251,10 +252,10 @@ OUT_RELOCW(struct fd_ringbuffer *ring, struct fd_bo *bo,
 {
 	if (LOG_DWORDS) {
 		DBG("ring[%p]: OUT_RELOCW  %04x:  %p+%u << %d", ring,
-				(uint32_t)(ring->cur - ring->last_start), bo, offset, shift);
+				(uint32_t)(ring->cur - ring->start), bo, offset, shift);
 	}
 	debug_assert(offset < fd_bo_size(bo));
-	fd_ringbuffer_reloc2(ring, &(struct fd_reloc){
+	fd_ringbuffer_reloc(ring, &(struct fd_reloc){
 		.bo = bo,
 		.flags = FD_RELOC_READ | FD_RELOC_WRITE,
 		.offset = offset,
@@ -276,18 +277,9 @@ static inline void BEGIN_RING(struct fd_ringbuffer *ring, uint32_t ndwords)
 		fd_ringbuffer_grow(ring, ndwords);
 }
 
-static inline uint32_t
-__gpu_id(struct fd_ringbuffer *ring)
-{
-	uint64_t val;
-	fd_pipe_get_param(ring->pipe, FD_GPU_ID, &val);
-	return val;
-}
-
 static inline void
 OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
 {
-	debug_assert(__gpu_id(ring) < 500);
 	BEGIN_RING(ring, cnt+1);
 	OUT_RING(ring, CP_TYPE0_PKT | ((cnt-1) << 16) | (regindx & 0x7FFF));
 }
@@ -295,7 +287,6 @@ OUT_PKT0(struct fd_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
 static inline void
 OUT_PKT2(struct fd_ringbuffer *ring)
 {
-	debug_assert(__gpu_id(ring) < 500);
 	BEGIN_RING(ring, 1);
 	OUT_RING(ring, CP_TYPE2_PKT);
 }
@@ -303,7 +294,6 @@ OUT_PKT2(struct fd_ringbuffer *ring)
 static inline void
 OUT_PKT3(struct fd_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
 {
-	debug_assert(__gpu_id(ring) < 500);
 	BEGIN_RING(ring, cnt+1);
 	OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8));
 }
@@ -366,8 +356,6 @@ __OUT_IB(struct fd_ringbuffer *ring, bool prefetch, struct fd_ringbuffer *target
 
 	unsigned count = fd_ringbuffer_cmd_count(target);
 
-	debug_assert(__gpu_id(ring) < 500);
-
 	/* for debug after a lock up, write a unique counter value
 	 * to scratch6 for each IB, to make it easier to match up
 	 * register dumps to cmdstream.  The combination of IB and
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index b3127ff8c38..d00323b3bf7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -569,13 +569,20 @@ ir3_shader_outputs(const struct ir3_shader *so)
 
 #include "freedreno_resource.h"
 
+static inline bool
+is_stateobj(struct fd_ringbuffer *ring)
+{
+	/* XXX ugly: a ring with FD_RINGBUFFER_STREAMING set is treated as a stateobj */
+	return !!(ring->flags & FD_RINGBUFFER_STREAMING);
+}
+
 static inline void
 ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
 {
 	/* when we emit const state via ring (IB2) we need a WFI, but when
 	 * it is emit'd via stateobj, we don't
 	 */
-	if (ring->flags & FD_RINGBUFFER_OBJECT)
+	if (is_stateobj(ring))
 		return;
 
 	fd_wfi(batch, ring);
@@ -836,7 +843,7 @@ emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin
 	 * Possibly if we split up different parts of the const state to
 	 * different state-objects we could avoid this.
 	 */
-	if (dirty && (ring->flags & FD_RINGBUFFER_OBJECT))
+	if (dirty && is_stateobj(ring))
 		dirty = ~0;
 
 	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build
index 9272602e547..4024d2fa99f 100644
--- a/src/gallium/drivers/freedreno/meson.build
+++ b/src/gallium/drivers/freedreno/meson.build
@@ -71,6 +71,21 @@ files_libfreedreno = files(
   'freedreno_texture.h',
   'freedreno_util.c',
   'freedreno_util.h',
+  'drm/freedreno_bo.c',
+  'drm/freedreno_bo_cache.c',
+  'drm/freedreno_device.c',
+  'drm/freedreno_drmif.h',
+  'drm/freedreno_pipe.c',
+  'drm/freedreno_priv.h',
+  'drm/freedreno_ringbuffer.c',
+  'drm/freedreno_ringbuffer.h',
+  'drm/msm_bo.c',
+  'drm/msm_device.c',
+  'drm/msm_drm.h',
+  'drm/msm_pipe.c',
+  'drm/msm_priv.h',
+  'drm/msm_ringbuffer.c',
+  'drm/msm_ringbuffer_sp.c',
   'a2xx/a2xx.xml.h',
   'a2xx/disasm-a2xx.c',
   'a2xx/fd2_blend.c',
@@ -259,7 +274,11 @@ libfreedreno = static_library(
   include_directories : freedreno_includes,
   c_args : [freedreno_c_args, c_vis_args],
   cpp_args : [freedreno_cpp_args, cpp_vis_args],
-  dependencies : [dep_libdrm, dep_libdrm_freedreno, idep_nir_headers],
+  dependencies : [
+    dep_libdrm,
+    dep_valgrind,
+    idep_nir_headers
+  ],
 )
 
 driver_freedreno = declare_dependency(
@@ -274,7 +293,7 @@ ir3_compiler = executable(
   include_directories : freedreno_includes,
   dependencies : [
     dep_libdrm,
-    dep_libdrm_freedreno,
+    dep_valgrind,
     dep_thread,
     idep_nir,
   ],
diff --git a/src/gallium/winsys/freedreno/drm/meson.build b/src/gallium/winsys/freedreno/drm/meson.build
index 34aff635dde..0fc02897ddd 100644
--- a/src/gallium/winsys/freedreno/drm/meson.build
+++ b/src/gallium/winsys/freedreno/drm/meson.build
@@ -25,5 +25,5 @@ libfreedrenowinsys = static_library(
     inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_gallium_drivers,
   ],
   c_args : [c_vis_args],
-  dependencies : [dep_libdrm_freedreno],
+  dependencies : [dep_libdrm],
 )