17 files changed, 761 insertions, 625 deletions
diff --git a/drivers/video/tegra/host/3dctx_common.c b/drivers/video/tegra/host/3dctx_common.c
index a194340dc88d..a3f38d9127cb 100644
--- a/drivers/video/tegra/host/3dctx_common.c
+++ b/drivers/video/tegra/host/3dctx_common.c
@@ -151,6 +151,7 @@ int nvhost_3dctx_prepare_power_off(struct nvhost_module *mod)
 	struct nvhost_channel *ch =
 			container_of(mod, struct nvhost_channel, mod);
 	struct nvhost_hwctx *hwctx_to_save;
+	struct nvhost_job *job;
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	u32 syncpt_incrs, syncpt_val;
 	int err = 0;
@@ -163,7 +164,6 @@ int nvhost_3dctx_prepare_power_off(struct nvhost_module *mod)
 		err = -ENOMEM;
 		goto done;
 	}
-
 	if (mod->desc->busy)
 		mod->desc->busy(mod);
 
@@ -174,6 +174,15 @@ int nvhost_3dctx_prepare_power_off(struct nvhost_module *mod)
 		goto done;
 	}
 
+	job = nvhost_job_alloc(ch, hwctx_to_save, NULL,
+			ch->dev->nvmap, 0, hwctx_to_save->timeout);
+	if (IS_ERR_OR_NULL(job)) {
+		/* PTR_ERR(NULL) is 0: never report success without a job */
+		err = job ? PTR_ERR(job) : -ENOMEM;
+		mutex_unlock(&ch->submitlock);
+		goto done;
+	}
+
 	err = nvhost_cdma_begin(&ch->cdma, hwctx_to_save->timeout);
 	if (err) {
 		mutex_unlock(&ch->submitlock);
@@ -188,9 +197,14 @@ int nvhost_3dctx_prepare_power_off(struct nvhost_module *mod)
 	syncpt_val = nvhost_syncpt_incr_max(&ch->dev->syncpt,
 					NVSYNCPT_3D, syncpt_incrs);
 
+	job->syncpt_id = NVSYNCPT_3D;
+	job->syncpt_incrs = syncpt_incrs;
+	job->syncpt_end = syncpt_val;
+
 	ch->ctxhandler.save_push(&ch->cdma, hwctx_to_save);
-	nvhost_cdma_end(&ch->cdma, ch->dev->nvmap, NVSYNCPT_3D, syncpt_val,
-			NULL, 0, hwctx_to_save->timeout);
+	nvhost_cdma_end(&ch->cdma, job);
+	nvhost_job_put(job);
+	job = NULL;
 
 	err = nvhost_intr_add_action(&ch->dev->intr, NVSYNCPT_3D,
 			syncpt_val - syncpt_incrs + hwctx_to_save->save_thresh,
diff --git a/drivers/video/tegra/host/Makefile b/drivers/video/tegra/host/Makefile
index d0b1aec247eb..a72b35a2d7f9 100644
--- a/drivers/video/tegra/host/Makefile
+++ b/drivers/video/tegra/host/Makefile
@@ -6,6 +6,7 @@ nvhost-objs = \
 	nvhost_cpuaccess.o \
 	nvhost_intr.o \
 	nvhost_channel.o \
+	nvhost_job.o \
 	dev.o \
 	bus.o \
 	3dctx_common.o \
diff --git a/drivers/video/tegra/host/chip_support.h b/drivers/video/tegra/host/chip_support.h
index 190ae1feb479..d909d34bbe73 100644
--- a/drivers/video/tegra/host/chip_support.h
+++ b/drivers/video/tegra/host/chip_support.h
@@ -29,7 +29,6 @@ struct nvhost_userctx_timeout;
 struct nvhost_master;
 struct nvhost_channel;
 struct nvmap_handle;
-struct nvhost_waitchk;
 struct nvmap_client;
 struct nvhost_hwctx;
 struct nvhost_cdma;
@@ -40,27 +39,14 @@ struct nvhost_cpuaccess;
 struct nvhost_module;
 struct nvhost_master;
 struct dentry;
+struct nvhost_job;
 
 struct nvhost_chip_support {
 	struct {
 		int (*init)(struct nvhost_channel *,
 			    struct nvhost_master *,
 			    int chid);
-		int (*submit)(struct nvhost_channel *,
-			      struct nvhost_hwctx *,
-			      struct nvmap_client *,
-			      struct nvhost_channel_gather *gathers,
-			      int num_gathers,
-			      struct nvhost_waitchk *waitchk,
-			      struct nvhost_waitchk *waitchk_end,
-			      u32 waitchk_mask,
-			      struct nvmap_handle **unpins,
-			      int nr_unpins,
-			      u32 syncpt_id,
-			      u32 syncpt_incrs,
-			      struct nvhost_userctx_timeout *timeout,
-			      u32 *syncpt_value,
-			      bool null_kickoff);
+		int (*submit)(struct nvhost_job *job);
 		int (*read3dreg)(struct nvhost_channel *channel,
 				struct nvhost_hwctx *hwctx,
 				struct nvhost_userctx_timeout *timeout,
@@ -127,7 +113,7 @@ struct nvhost_chip_support {
 				  struct nvmap_client *nvmap,
 				  u32 waitchk_mask,
 				  struct nvhost_waitchk *wait,
-				  struct nvhost_waitchk *waitend);
+				  int num_waitchk);
 		void (*debug)(struct nvhost_syncpt *);
 		const char * (*name)(struct nvhost_syncpt *, u32 id);
 	} syncpt;
diff --git a/drivers/video/tegra/host/dev.c b/drivers/video/tegra/host/dev.c
index 75c116b4f081..47d6daa4a482 100644
--- a/drivers/video/tegra/host/dev.c
+++ b/drivers/video/tegra/host/dev.c
@@ -44,6 +44,7 @@
 #include <mach/hardware.h>
 
 #include "debug.h"
+#include "nvhost_job.h"
 
 #define DRIVER_NAME "tegra_grhost"
 #define IFACE_NAME "nvhost"
@@ -58,15 +59,8 @@ struct nvhost_channel_userctx {
 	struct nvhost_hwctx *hwctx;
 	struct nvhost_submit_hdr_ext hdr;
 	int num_relocshifts;
-	struct nvmap_handle_ref *gather_mem;
-	struct nvhost_channel_gather *gathers;
-	int num_gathers;
-	int pinarray_size;
-	struct nvmap_pinarray_elem pinarray[NVHOST_MAX_HANDLES];
-	struct nvmap_handle *unpinarray[NVHOST_MAX_HANDLES];
+	struct nvhost_job *job;
 	struct nvmap_client *nvmap;
-	struct nvhost_waitchk waitchks[NVHOST_MAX_WAIT_CHECKS];
-	struct nvhost_waitchk *cur_waitchk;
 	struct nvhost_userctx_timeout timeout;
 	u32 priority;
 };
@@ -80,14 +74,14 @@ struct nvhost_ctrl_userctx {
  * Write cmdbuf to ftrace output. Checks if cmdbuf contents should be output
  * and mmaps the cmdbuf contents if required.
  */
-static void trace_write_cmdbufs(struct nvhost_channel_userctx *ctx)
+static void trace_write_cmdbufs(struct nvhost_job *job)
 {
 	struct nvmap_handle_ref handle;
 	void *mem = NULL;
 	int i = 0;
 
-	for (i = 0; i < ctx->num_gathers; i++) {
-		struct nvhost_channel_gather *gather = &ctx->gathers[i];
+	for (i = 0; i < job->num_gathers; i++) {
+		struct nvhost_channel_gather *gather = &job->gathers[i];
 		if (nvhost_debug_trace_cmdbuf) {
 			handle.handle = nvmap_id_to_handle(gather->mem_id);
 			mem = nvmap_mmap(&handle);
@@ -103,7 +97,7 @@ static void trace_write_cmdbufs(struct nvhost_channel_userctx *ctx)
 			 */
 			for (i = 0; i < gather->words; i += TRACE_MAX_LENGTH) {
 				trace_nvhost_channel_write_cmdbuf_data(
-					ctx->ch->desc->name,
+					job->ch->desc->name,
 					gather->mem_id,
 					min(gather->words - i,
 					    TRACE_MAX_LENGTH),
@@ -129,11 +123,8 @@ static int nvhost_channelrelease(struct inode *inode, struct file *filp)
 	if (priv->hwctx)
 		priv->ch->ctxhandler.put(priv->hwctx);
 
-	if (priv->gathers)
-		nvmap_munmap(priv->gather_mem, priv->gathers);
-
-	if (!IS_ERR_OR_NULL(priv->gather_mem))
-		nvmap_free(priv->ch->dev->nvmap, priv->gather_mem);
+	if (priv->job)
+		nvhost_job_put(priv->job);
 
 	nvmap_client_put(priv->nvmap);
 	kfree(priv);
@@ -145,7 +136,6 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
 	struct nvhost_channel_userctx *priv;
 	struct nvhost_channel *ch;
 
-
 	ch = container_of(inode->i_cdev, struct nvhost_channel, cdev);
 	ch = nvhost_getchannel(ch);
 	if (!ch)
@@ -160,12 +150,6 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
 	filp->private_data = priv;
 	priv->ch = ch;
 	nvhost_module_add_client(ch->dev, &ch->mod, priv);
-	priv->gather_mem = nvmap_alloc(ch->dev->nvmap,
-				sizeof(struct nvhost_channel_gather)
-					* NVHOST_MAX_GATHERS, 32,
-				NVMAP_HANDLE_CACHEABLE);
-	if (IS_ERR(priv->gather_mem))
-		goto fail;
 
 	if (ch->ctxhandler.alloc) {
 		priv->hwctx = ch->ctxhandler.alloc(ch);
@@ -176,8 +160,9 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
 	}
 	priv->priority = NVHOST_PRIORITY_MEDIUM;
 
-	priv->gathers = nvmap_mmap(priv->gather_mem);
-	if (!priv->gathers)
+	priv->job = nvhost_job_alloc(ch, priv->hwctx, &priv->hdr,
+			NULL, priv->priority, &priv->timeout);
+	if (!priv->job)
 		goto fail;
 
 	return 0;
@@ -186,49 +171,25 @@ fail:
 	return -ENOMEM;
 }
 
-static void add_gather(struct nvhost_channel_userctx *ctx,
-		u32 mem_id, u32 words, u32 offset)
-{
-	struct nvmap_pinarray_elem *pin;
-	struct nvhost_channel_gather *cur_gather =
-			&ctx->gathers[ctx->num_gathers];
-
-	pin = &ctx->pinarray[ctx->pinarray_size++];
-	pin->patch_mem = (u32)nvmap_ref_to_handle(ctx->gather_mem);
-	pin->patch_offset = (void *)&(cur_gather->mem) - (void *)ctx->gathers;
-	pin->pin_mem = mem_id;
-	pin->pin_offset = offset;
-	cur_gather->words = words;
-	cur_gather->mem_id = mem_id;
-	cur_gather->offset = offset;
-	ctx->num_gathers += 1;
-}
-
 static int set_submit(struct nvhost_channel_userctx *ctx)
 {
+	struct device *device = &ctx->ch->dev->pdev->dev;
+
 	/* submit should have at least 1 cmdbuf */
 	if (!ctx->hdr.num_cmdbufs)
 		return -EIO;
 
-	/* check submit doesn't exceed static structs */
-	if ((ctx->hdr.num_cmdbufs + ctx->hdr.num_relocs) > NVHOST_MAX_HANDLES) {
-		dev_err(&ctx->ch->dev->pdev->dev,
-			"channel submit exceeded max handles (%d > %d)\n",
-			ctx->hdr.num_cmdbufs + ctx->hdr.num_relocs,
-			NVHOST_MAX_HANDLES);
-		return -EIO;
-	}
-	if (ctx->hdr.num_waitchks > NVHOST_MAX_WAIT_CHECKS) {
-		dev_err(&ctx->ch->dev->pdev->dev,
-			"channel submit exceeded max waitchks (%d > %d)\n",
-			ctx->hdr.num_waitchks,
-			NVHOST_MAX_WAIT_CHECKS);
-		return -EIO;
+	if (!ctx->nvmap) {
+		dev_err(device, "no nvmap context set\n");
+		return -EFAULT;
 	}
 
-	ctx->num_gathers = 0;
-	ctx->cur_waitchk = ctx->waitchks;
-	ctx->pinarray_size = 0;
+	ctx->job = nvhost_job_realloc(ctx->job,
+			&ctx->hdr,
+			ctx->nvmap,
+			ctx->priority);
+	if (!ctx->job)
+		return -ENOMEM;
 
 	if (ctx->hdr.submit_version >= NVHOST_SUBMIT_VERSION_V2)
 		ctx->num_relocshifts = ctx->hdr.num_relocs;
@@ -250,28 +211,33 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 	struct nvhost_channel_userctx *priv = filp->private_data;
 	size_t remaining = count;
 	int err = 0;
+	struct nvhost_job *job = priv->job;
+	struct nvhost_submit_hdr_ext *hdr = &priv->hdr;
+	const char *chname = priv->ch->desc->name;
 
 	while (remaining) {
 		size_t consumed;
-		if (!priv->hdr.num_relocs &&
+		if (!hdr->num_relocs &&
 		    !priv->num_relocshifts &&
-		    !priv->hdr.num_cmdbufs &&
-		    !priv->hdr.num_waitchks) {
+		    !hdr->num_cmdbufs &&
+		    !hdr->num_waitchks) {
 			consumed = sizeof(struct nvhost_submit_hdr);
 			if (remaining < consumed)
 				break;
-			if (copy_from_user(&priv->hdr, buf, consumed)) {
+			if (copy_from_user(hdr, buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			priv->hdr.submit_version = NVHOST_SUBMIT_VERSION_V0;
+			hdr->submit_version = NVHOST_SUBMIT_VERSION_V0;
 			err = set_submit(priv);
 			if (err)
 				break;
-			trace_nvhost_channel_write_submit(priv->ch->desc->name,
-			  count, priv->hdr.num_cmdbufs, priv->hdr.num_relocs,
-			  priv->hdr.syncpt_id, priv->hdr.syncpt_incrs);
-		} else if (priv->hdr.num_cmdbufs) {
+			/* set_submit() reassigns priv->job; refresh our copy */
+			job = priv->job;
+			trace_nvhost_channel_write_submit(chname,
+			  count, hdr->num_cmdbufs, hdr->num_relocs,
+			  hdr->syncpt_id, hdr->syncpt_incrs);
+		} else if (hdr->num_cmdbufs) {
 			struct nvhost_cmdbuf cmdbuf;
 			consumed = sizeof(cmdbuf);
 			if (remaining < consumed)
@@ -280,47 +244,49 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 				err = -EFAULT;
 				break;
 			}
-			trace_nvhost_channel_write_cmdbuf(priv->ch->desc->name,
+			trace_nvhost_channel_write_cmdbuf(chname,
 				cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
-			add_gather(priv,
+			nvhost_job_add_gather(job,
 				cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
-			priv->hdr.num_cmdbufs--;
-		} else if (priv->hdr.num_relocs) {
+			hdr->num_cmdbufs--;
+		} else if (hdr->num_relocs) {
 			consumed = sizeof(struct nvhost_reloc);
 			if (remaining < consumed)
 				break;
-			if (copy_from_user(&priv->pinarray[priv->pinarray_size],
-						buf, consumed)) {
+			if (copy_from_user(&job->pinarray[job->num_pins],
+					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			trace_nvhost_channel_write_reloc(priv->ch->desc->name);
-			priv->pinarray_size++;
-			priv->hdr.num_relocs--;
-		} else if (priv->hdr.num_waitchks) {
+			trace_nvhost_channel_write_reloc(chname);
+			job->num_pins++;
+			hdr->num_relocs--;
+		} else if (hdr->num_waitchks) {
 			int numwaitchks =
 				(remaining / sizeof(struct nvhost_waitchk));
 			if (!numwaitchks)
 				break;
 			numwaitchks = min_t(int,
-				numwaitchks, priv->hdr.num_waitchks);
+				numwaitchks, hdr->num_waitchks);
 			consumed = numwaitchks * sizeof(struct nvhost_waitchk);
-			if (copy_from_user(priv->cur_waitchk, buf, consumed)) {
+			if (copy_from_user(&job->waitchk[job->num_waitchk],
+					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
 			trace_nvhost_channel_write_waitchks(
-			  priv->ch->desc->name, numwaitchks,
-			  priv->hdr.waitchk_mask);
-			priv->cur_waitchk += numwaitchks;
-			priv->hdr.num_waitchks -= numwaitchks;
+			  chname, numwaitchks,
+			  hdr->waitchk_mask);
+			job->num_waitchk += numwaitchks;
+			hdr->num_waitchks -= numwaitchks;
 		} else if (priv->num_relocshifts) {
 			int next_shift =
-				priv->pinarray_size - priv->num_relocshifts;
+				job->num_pins - priv->num_relocshifts;
 			consumed = sizeof(struct nvhost_reloc_shift);
 			if (remaining < consumed)
 				break;
-			if (copy_from_user(&priv->pinarray[next_shift].reloc_shift,
+			if (copy_from_user(
+					&job->pinarray[next_shift].reloc_shift,
 					buf, consumed)) {
 				err = -EFAULT;
 				break;
@@ -349,37 +315,28 @@ static int nvhost_ioctl_channel_flush(
 	int null_kickoff)
 {
 	struct device *device = &ctx->ch->dev->pdev->dev;
-	int num_unpin;
 	int err;
 
 	trace_nvhost_ioctl_channel_flush(ctx->ch->desc->name);
 
-	if (ctx->hdr.num_relocs ||
+	if (!ctx->job ||
+	    ctx->hdr.num_relocs ||
 	    ctx->hdr.num_cmdbufs ||
 	    ctx->hdr.num_waitchks) {
 		reset_submit(ctx);
 		dev_err(device, "channel submit out of sync\n");
 		return -EFAULT;
 	}
-	if (!ctx->nvmap) {
-		dev_err(device, "no nvmap context set\n");
-		return -EFAULT;
-	}
-	if (ctx->num_gathers == 0)
-		return 0;
-
-	/* pin mem handles and patch physical addresses */
-	num_unpin = nvmap_pin_array(ctx->nvmap,
-				    nvmap_ref_to_handle(ctx->gather_mem),
-				    ctx->pinarray, ctx->pinarray_size,
-				    ctx->unpinarray);
-	if (num_unpin < 0) {
-		dev_warn(device, "nvmap_pin_array failed: %d\n", num_unpin);
-		return num_unpin;
+
+	err = nvhost_job_pin(ctx->job);
+	if (err) {
+		dev_warn(device, "nvhost_job_pin failed: %d\n", err);
+		return err;
 	}
 
 	if (nvhost_debug_null_kickoff_pid == current->tgid)
 		null_kickoff = 1;
+	ctx->job->null_kickoff = null_kickoff;
 
 	if ((nvhost_debug_force_timeout_pid == current->tgid) &&
 	    (nvhost_debug_force_timeout_channel == ctx->ch->chid)) {
@@ -387,21 +344,13 @@ static int nvhost_ioctl_channel_flush(
 	}
 	ctx->timeout.syncpt_id = ctx->hdr.syncpt_id;
 
-	trace_write_cmdbufs(ctx);
+	trace_write_cmdbufs(ctx->job);
 
 	/* context switch if needed, and submit user's gathers to the channel */
-	err = nvhost_channel_submit(ctx->ch, ctx->hwctx, ctx->nvmap,
-				ctx->gathers, ctx->num_gathers,
-				ctx->waitchks, ctx->cur_waitchk,
-				ctx->hdr.waitchk_mask,
-				ctx->unpinarray, num_unpin,
-				ctx->hdr.syncpt_id, ctx->hdr.syncpt_incrs,
-				&ctx->timeout,
-				ctx->priority,
-				&args->value,
-				null_kickoff);
+	err = nvhost_channel_submit(ctx->job);
+	args->value = ctx->job->syncpt_end;
 	if (err)
-		nvmap_unpin_handles(ctx->nvmap, ctx->unpinarray, num_unpin);
+		nvhost_job_unpin(ctx->job);
 
 	return err;
 }
diff --git a/drivers/video/tegra/host/nvhost_cdma.c b/drivers/video/tegra/host/nvhost_cdma.c
index da9308c99f1f..efa6a1104ac9 100644
--- a/drivers/video/tegra/host/nvhost_cdma.c
+++ b/drivers/video/tegra/host/nvhost_cdma.c
@@ -25,6 +25,7 @@
 #include <asm/cacheflush.h>
 
 #include <linux/slab.h>
+#include <linux/kfifo.h>
 #include <trace/events/nvhost.h>
 #include <linux/interrupt.h>
 
@@ -36,228 +37,45 @@
  *     - some channels hardly need any, some channels (3d) could use more
  */
 
-/* Sync Queue
- *
- * The sync queue is a circular buffer of u32s interpreted as:
- *   0: SyncPointID
- *   1: SyncPointValue
- *   2: FirstDMAGet (start of submit in pushbuffer)
- *   3: Timeout (time to live for this submit)
- *   4: TimeoutContext (userctx that submitted buffer)
- *   5: NumSlots (how many pushbuffer slots to free)
- *   6: NumHandles
- *   7: nvmap client which pinned the handles
- *   8..: NumHandles * nvmemhandle to unpin
- *
- * There's always one word unused, so (accounting for wrap):
- *   - Write == Read => queue empty
- *   - Write + 1 == Read => queue full
- * The queue must not be left with less than SYNC_QUEUE_MIN_ENTRY words
- * of space at the end of the array.
- *
- * We want to pass contiguous arrays of handles to nvmap_unpin_handles,
- * so arrays that would wrap at the end of the buffer will be split into
- * two (or more) entries.
- */
-
-/* Number of words needed to store an entry containing one handle */
-#define SYNC_QUEUE_MIN_ENTRY (SQ_IDX_HANDLES + (sizeof(void *)/4))
-
-/* Magic to use to fill freed handle slots */
-#define BAD_MAGIC 0xdeadbeef
-
 /**
- * Reset to empty queue.
+ * kfifo_save - save current out pointer
+ * @fifo: address of the fifo to be used
  */
-static void reset_sync_queue(struct sync_queue *queue)
-{
-	queue->read = 0;
-	queue->write = 0;
-}
+#define	kfifo_save(fifo) \
+__kfifo_uint_must_check_helper( \
+({ \
+	typeof((fifo) + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__kfifo->out; \
+}) \
+)
 
 /**
- *  Find the number of handles that can be stashed in the sync queue without
- *  waiting.
- *  0 -> queue is full, must update to wait for some entries to be freed.
+ * kfifo_restore - restore previously saved pointer
+ * @fifo: address of the fifo to be used
+ * @restore: out pointer value previously returned by kfifo_save()
  */
-static unsigned int sync_queue_space(struct sync_queue *queue)
-{
-	struct nvhost_cdma *cdma;
-	struct nvhost_master *host;
-
-	unsigned int read = queue->read;
-	unsigned int write = queue->write;
-	u32 size;
-
-	cdma = container_of(queue, struct nvhost_cdma, sync_queue);
-	host = cdma_to_dev(cdma);
-
-	BUG_ON(read  > (host->sync_queue_size - SYNC_QUEUE_MIN_ENTRY));
-	BUG_ON(write > (host->sync_queue_size - SYNC_QUEUE_MIN_ENTRY));
-
-	/*
-	 * We can use all of the space up to the end of the buffer, unless the
-	 * read position is within that space (the read position may advance
-	 * asynchronously, but that can't take space away once we've seen it).
-	 */
-	if (read > write) {
-		size = (read - 1) - write;
-	} else {
-		size = host->sync_queue_size - write;
-
-		/*
-		 * If the read position is zero, it gets complicated. We can't
-		 * use the last word in the buffer, because that would leave
-		 * the queue empty.
-		 * But also if we use too much we would not leave enough space
-		 * for a single handle packet, and would have to wrap in
-		 * add_to_sync_queue - also leaving write == read == 0,
-		 * an empty queue.
-		 */
-		if (read == 0)
-			size -= SYNC_QUEUE_MIN_ENTRY;
-	}
-
-	/*
-	 * There must be room for an entry header and at least one handle,
-	 * otherwise we report a full queue.
-	 */
-	if (size < SYNC_QUEUE_MIN_ENTRY)
-		return 0;
-	/* Minimum entry stores one handle */
-	return (size - SYNC_QUEUE_MIN_ENTRY) + 1;
-}
-
-/**
- * Debug routine used to dump sync_queue entries
- */
-static void dump_sync_queue_entry(struct nvhost_cdma *cdma, u32 *entry)
-{
-	struct nvhost_master *dev = cdma_to_dev(cdma);
-
-	dev_dbg(&dev->pdev->dev, "sync_queue index 0x%x\n",
-		(entry - cdma->sync_queue.buffer));
-	dev_dbg(&dev->pdev->dev, "    SYNCPT_ID   %d\n",
-		entry[SQ_IDX_SYNCPT_ID]);
-	dev_dbg(&dev->pdev->dev, "    SYNCPT_VAL  %d\n",
-		entry[SQ_IDX_SYNCPT_VAL]);
-	dev_dbg(&dev->pdev->dev, "    FIRST_GET   0x%x\n",
-		entry[SQ_IDX_FIRST_GET]);
-	dev_dbg(&dev->pdev->dev, "    TIMEOUT     %d\n",
-		entry[SQ_IDX_TIMEOUT]);
-	dev_dbg(&dev->pdev->dev, "    TIMEOUT_CTX 0x%x\n",
-		entry[SQ_IDX_TIMEOUT_CTX]);
-	dev_dbg(&dev->pdev->dev, "    NUM_SLOTS   %d\n",
-		entry[SQ_IDX_NUM_SLOTS]);
-	dev_dbg(&dev->pdev->dev, "    NUM_HANDLES %d\n",
-		entry[SQ_IDX_NUM_HANDLES]);
-}
+#define	kfifo_restore(fifo, restore) \
+(void)({ \
+	typeof((fifo) + 1) __tmp = (fifo); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	__kfifo->out = (restore); \
+})
 
 /**
  * Add an entry to the sync queue.
  */
-#define entry_size(_cnt)	((_cnt)*sizeof(void *)/sizeof(u32))
-
-static void add_to_sync_queue(struct sync_queue *queue,
-			      u32 sync_point_id, u32 sync_point_value,
-			      u32 nr_slots, struct nvmap_client *user_nvmap,
-			      struct nvmap_handle **handles, u32 nr_handles,
-			      u32 first_get,
-			      struct nvhost_userctx_timeout *timeout)
-{
-	struct nvhost_cdma *cdma;
-	struct nvhost_master *host;
-	u32 size, write = queue->write;
-	u32 *p = queue->buffer + write;
-
-	cdma = container_of(queue, struct nvhost_cdma, sync_queue);
-	host = cdma_to_dev(cdma);
-
-	BUG_ON(sync_point_id == NVSYNCPT_INVALID);
-	BUG_ON(sync_queue_space(queue) < nr_handles);
-
-	size  = SQ_IDX_HANDLES;
-	size += entry_size(nr_handles);
-
-	write += size;
-	BUG_ON(write > host->sync_queue_size);
-
-	p[SQ_IDX_SYNCPT_ID] = sync_point_id;
-	p[SQ_IDX_SYNCPT_VAL] = sync_point_value;
-	p[SQ_IDX_FIRST_GET] = first_get;
-	p[SQ_IDX_TIMEOUT] = timeout->timeout;
-	p[SQ_IDX_NUM_SLOTS] = nr_slots;
-	p[SQ_IDX_NUM_HANDLES] = nr_handles;
-
-	*(void **)(&p[SQ_IDX_TIMEOUT_CTX]) = timeout;
-
-	BUG_ON(!user_nvmap);
-	*(struct nvmap_client **)(&p[SQ_IDX_NVMAP_CTX]) =
-		nvmap_client_get(user_nvmap);
-
-	if (nr_handles) {
-		memcpy(&p[SQ_IDX_HANDLES], handles,
-			(nr_handles * sizeof(struct nvmap_handle *)));
-	}
-
-	/* If there's not enough room for another entry, wrap to the start. */
-	if ((write + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size) {
-		/*
-		 * It's an error for the read position to be zero, as that
-		 * would mean we emptied the queue while adding something.
-		 */
-		BUG_ON(queue->read == 0);
-		write = 0;
-	}
-	queue->write = write;
-}
-
-/**
- * Get a pointer to the next entry in the queue, or NULL if the queue is empty.
- * Doesn't consume the entry.
- */
-static u32 *sync_queue_head(struct sync_queue *queue)
-{
-	struct nvhost_cdma *cdma = container_of(queue,
-						struct nvhost_cdma,
-						sync_queue);
-	struct nvhost_master *host = cdma_to_dev(cdma);
-	u32 read = queue->read;
-	u32 write = queue->write;
-
-	BUG_ON(read  > (host->sync_queue_size - SYNC_QUEUE_MIN_ENTRY));
-	BUG_ON(write > (host->sync_queue_size - SYNC_QUEUE_MIN_ENTRY));
-
-	if (read == write)
-		return NULL;
-	return queue->buffer + read;
-}
-
-/**
- * Advances to the next queue entry, if you want to consume it.
- */
-static void
-dequeue_sync_queue_head(struct sync_queue *queue)
+static void add_to_sync_queue(struct nvhost_cdma *cdma,
+			      struct nvhost_job *job,
+			      u32 nr_slots,
+			      u32 first_get)
 {
-	struct nvhost_cdma *cdma = container_of(queue,
-						struct nvhost_cdma,
-						sync_queue);
-	struct nvhost_master *host = cdma_to_dev(cdma);
-	u32 read = queue->read;
-	u32 size;
-
-	BUG_ON(read == queue->write);
-
-	size  = SQ_IDX_HANDLES;
-	size += entry_size(queue->buffer[read + SQ_IDX_NUM_HANDLES]);
+	BUG_ON(job->syncpt_id == NVSYNCPT_INVALID);
 
-	read += size;
-	BUG_ON(read > host->sync_queue_size);
-
-	/* If there's not enough room for another entry, wrap to the start. */
-	if ((read + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size)
-		read = 0;
-	queue->read = read;
+	job->first_get = first_get;
+	job->num_slots = nr_slots;
+	nvhost_job_get(job);
+	kfifo_in(&cdma->sync_queue, (void *)&job, 1);
 }
 
 /**
@@ -272,9 +90,9 @@ static unsigned int cdma_status_locked(struct nvhost_cdma *cdma,
 {
 	switch (event) {
 	case CDMA_EVENT_SYNC_QUEUE_EMPTY:
-		return sync_queue_head(&cdma->sync_queue) ? 0 : 1;
+		return kfifo_len(&cdma->sync_queue) == 0 ? 1 : 0;
 	case CDMA_EVENT_SYNC_QUEUE_SPACE:
-		return sync_queue_space(&cdma->sync_queue);
+		return kfifo_avail(&cdma->sync_queue);
 	case CDMA_EVENT_PUSH_BUFFER_SPACE: {
 		struct push_buffer *pb = &cdma->push_buffer;
 		BUG_ON(!cdma_pb_op(cdma).space);
@@ -370,36 +188,26 @@ static void update_cdma_locked(struct nvhost_cdma *cdma)
 	 * to consume as many sync queue entries as possible without blocking
 	 */
 	for (;;) {
-		u32 syncpt_id, syncpt_val;
-		u32 timeout;
-		struct nvhost_userctx_timeout *timeout_ref = NULL;
-		unsigned int nr_slots, nr_handles;
 		struct nvhost_syncpt *sp = &dev->syncpt;
-		struct nvmap_handle **handles;
-		struct nvmap_client *nvmap;
-		u32 *sync;
+		struct nvhost_job *job;
+		int result;
 
-		sync = sync_queue_head(&cdma->sync_queue);
-		if (!sync) {
+		result = kfifo_peek(&cdma->sync_queue, &job);
+		if (!result) {
 			if (cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
 				signal = true;
 			break;
 		}
 
-		syncpt_id = sync[SQ_IDX_SYNCPT_ID];
-		syncpt_val = sync[SQ_IDX_SYNCPT_VAL];
-		timeout = sync[SQ_IDX_TIMEOUT];
-		timeout_ref = (struct nvhost_userctx_timeout *)
-				sync[SQ_IDX_TIMEOUT_CTX];
-
-		BUG_ON(syncpt_id == NVSYNCPT_INVALID);
+		BUG_ON(job->syncpt_id == NVSYNCPT_INVALID);
 
 		/* Check whether this syncpt has completed, and bail if not */
-		if (!nvhost_syncpt_min_cmp(sp, syncpt_id, syncpt_val)) {
+		if (!nvhost_syncpt_min_cmp(sp,
+				job->syncpt_id, job->syncpt_end)) {
 			/* Start timer on next pending syncpt */
-			if (timeout) {
-				cdma_start_timer_locked(cdma, syncpt_id,
-					syncpt_val, timeout_ref);
+			if (job->timeout->timeout) {
+				cdma_start_timer_locked(cdma, job->syncpt_id,
+					job->syncpt_end, job->timeout);
 			}
 			break;
 		}
@@ -408,29 +216,20 @@ static void update_cdma_locked(struct nvhost_cdma *cdma)
 		if (cdma->timeout.ctx_timeout)
 			stop_cdma_timer_locked(cdma);
 
-		nr_slots = sync[SQ_IDX_NUM_SLOTS];
-		nr_handles = sync[SQ_IDX_NUM_HANDLES];
-		nvmap = (struct nvmap_client *)sync[SQ_IDX_NVMAP_CTX];
-		handles = (struct nvmap_handle **)&sync[SQ_IDX_HANDLES];
-
-		BUG_ON(!nvmap);
-
 		/* Unpin the memory */
-		nvmap_unpin_handles(nvmap, handles, nr_handles);
-		memset(handles, BAD_MAGIC, nr_handles * sizeof(*handles));
-		nvmap_client_put(nvmap);
-		sync[SQ_IDX_NVMAP_CTX] = 0;
+		nvhost_job_unpin(job);
 
 		/* Pop push buffer slots */
-		if (nr_slots) {
+		if (job->num_slots) {
 			struct push_buffer *pb = &cdma->push_buffer;
 			BUG_ON(!cdma_pb_op(cdma).pop_from);
-			cdma_pb_op(cdma).pop_from(pb, nr_slots);
+			cdma_pb_op(cdma).pop_from(pb, job->num_slots);
 			if (cdma->event == CDMA_EVENT_PUSH_BUFFER_SPACE)
 				signal = true;
 		}
 
-		dequeue_sync_queue_head(&cdma->sync_queue);
+		nvhost_job_put(job);
+		kfifo_skip(&cdma->sync_queue);
 		if (cdma->event == CDMA_EVENT_SYNC_QUEUE_SPACE)
 			signal = true;
 	}
@@ -442,38 +241,24 @@ static void update_cdma_locked(struct nvhost_cdma *cdma)
 	}
 }
 
-static u32 *advance_next_entry(struct nvhost_cdma *cdma, u32 *read)
-{
-	struct nvhost_master *host;
-	u32 ridx;
-
-	host = cdma_to_dev(cdma);
-
-	/* move sync_queue read ptr to next entry */
-	ridx = (read - cdma->sync_queue.buffer);
-	ridx += (SQ_IDX_HANDLES + entry_size(read[SQ_IDX_NUM_HANDLES]));
-	if ((ridx + SYNC_QUEUE_MIN_ENTRY) > host->sync_queue_size)
-		ridx = 0;
-
-	/* return sync_queue entry */
-	return cdma->sync_queue.buffer + ridx;
-}
-
 void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
 		struct nvhost_syncpt *syncpt, struct device *dev)
 {
-	u32 first_get, get_restart;
-	u32 syncpt_incrs, nr_slots;
+	u32 get_restart;
+	u32 syncpt_incrs;
 	bool exec_ctxsave;
-	struct sync_queue *queue = &cdma->sync_queue;
-	u32 *sync = sync_queue_head(queue);
-	u32 syncpt_val = nvhost_syncpt_update_min(syncpt,
-			cdma->timeout.syncpt_id);
+	unsigned int queue_restore;
+	struct nvhost_job *job = NULL;
+	int result;
+	u32 syncpt_val;
+
+	syncpt_val = nvhost_syncpt_update_min(syncpt, cdma->timeout.syncpt_id);
+	queue_restore = kfifo_save(&cdma->sync_queue);
 
 	dev_dbg(dev,
-		"%s: starting cleanup (thresh %d, queue rd 0x%x wr 0x%x)\n",
+		"%s: starting cleanup (thresh %d, queue length %d)\n",
 		__func__,
-		syncpt_val, queue->read, queue->write);
+		syncpt_val, kfifo_len(&cdma->sync_queue));
 
 	/*
 	 * Move the sync_queue read pointer to the first entry that hasn't
@@ -486,13 +271,11 @@ void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
 		"%s: skip completed buffers still in sync_queue\n",
 		__func__);
 
-	while (sync != (queue->buffer + queue->write)) {
-		/* move read ptr to first blocked entry */
-		if (syncpt_val < sync[SQ_IDX_SYNCPT_VAL])
-			break;	/* not completed */
-
-		dump_sync_queue_entry(cdma, sync);
-		sync = advance_next_entry(cdma, sync);
+	result = kfifo_peek(&cdma->sync_queue, &job);
+	while (result && syncpt_val >= job->syncpt_end) {
+		nvhost_job_dump(dev, job);
+		kfifo_skip(&cdma->sync_queue);
+		result = kfifo_peek(&cdma->sync_queue, &job);
 	}
 
 	/*
@@ -515,34 +298,33 @@ void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
 		__func__);
 
 	get_restart = cdma->last_put;
-	if (sync != (queue->buffer + queue->write))
-		get_restart = sync[SQ_IDX_FIRST_GET];
-
-	/* do CPU increments */
-	while (sync != (queue->buffer + queue->write)) {
+	if (kfifo_len(&cdma->sync_queue) > 0)
+		get_restart = job->first_get;
 
+	/* do CPU increments as long as this context continues */
+	while (result && job->timeout == cdma->timeout.ctx_timeout) {
 		/* different context, gets us out of this loop */
-		if ((void *)sync[SQ_IDX_TIMEOUT_CTX] !=
-				cdma->timeout.ctx_timeout)
+		if (job->timeout != cdma->timeout.ctx_timeout)
 			break;
 
-		syncpt_incrs = (sync[SQ_IDX_SYNCPT_VAL] - syncpt_val);
-		first_get = sync[SQ_IDX_FIRST_GET];
-		nr_slots = sync[SQ_IDX_NUM_SLOTS];
-
 		/* won't need a timeout when replayed */
-		sync[SQ_IDX_TIMEOUT] = 0;
+		job->timeout->timeout = 0;
 
+		syncpt_incrs = job->syncpt_end - syncpt_val;
 		dev_dbg(dev,
 			"%s: CPU incr (%d)\n", __func__, syncpt_incrs);
 
-		dump_sync_queue_entry(cdma, sync);
+		nvhost_job_dump(dev, job);
 
 		/* safe to use CPU to incr syncpts */
-		cdma_op(cdma).timeout_cpu_incr(cdma, first_get,
-			syncpt_incrs, sync[SQ_IDX_SYNCPT_VAL], nr_slots);
-		syncpt_val += syncpt_incrs;
-		sync = advance_next_entry(cdma, sync);
+		cdma_op(cdma).timeout_cpu_incr(cdma, job->first_get,
+				syncpt_incrs, job->syncpt_end,
+				job->num_slots);
+		/* the next job must only replay increments not yet done */
+		syncpt_val += syncpt_incrs;
+
+		kfifo_skip(&cdma->sync_queue);
+		result = kfifo_peek(&cdma->sync_queue, &job);
 	}
 
 	dev_dbg(dev,
@@ -552,26 +334,22 @@ void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
 	exec_ctxsave = false;
 
 	/* setup GPU increments */
-	while (sync != (queue->buffer + queue->write)) {
-
-		syncpt_incrs = (sync[SQ_IDX_SYNCPT_VAL] - syncpt_val);
-		first_get = sync[SQ_IDX_FIRST_GET];
-		nr_slots = sync[SQ_IDX_NUM_SLOTS];
-
+	while (result) {
 		/* same context, increment in the pushbuffer */
-		if ((void *)sync[SQ_IDX_TIMEOUT_CTX] ==
-				cdma->timeout.ctx_timeout) {
-
+		if (job->timeout == cdma->timeout.ctx_timeout) {
 			/* won't need a timeout when replayed */
-			sync[SQ_IDX_TIMEOUT] = 0;
+			job->timeout->timeout = 0;
 
 			/* update buffer's syncpts in the pushbuffer */
-			cdma_op(cdma).timeout_pb_incr(cdma, first_get,
-				syncpt_incrs, nr_slots, exec_ctxsave);
+			cdma_op(cdma).timeout_pb_incr(cdma,
+					job->first_get,
+					job->syncpt_incrs,
+					job->num_slots,
+					exec_ctxsave);
 
 			exec_ctxsave = false;
 		} else {
-			dev_dbg(dev,
+			dev_warn(dev,
 				"%s: switch to a different userctx\n",
 				__func__);
 			/*
@@ -581,20 +359,21 @@ void nvhost_cdma_update_sync_queue(struct nvhost_cdma *cdma,
 			exec_ctxsave = true;
 		}
 
-		dump_sync_queue_entry(cdma, sync);
+		nvhost_job_dump(dev, job);
 
-		syncpt_val = sync[SQ_IDX_SYNCPT_VAL];
-		sync = advance_next_entry(cdma, sync);
+		kfifo_skip(&cdma->sync_queue);
+		result = kfifo_peek(&cdma->sync_queue, &job);
 	}
 
 	dev_dbg(dev,
 		"%s: finished sync_queue modification\n", __func__);
 
+	kfifo_restore(&cdma->sync_queue, queue_restore);
+
 	/* roll back DMAGET and start up channel again */
 	cdma_op(cdma).timeout_teardown_end(cdma, get_restart);
 
 	cdma->timeout.ctx_timeout->has_timedout = true;
-	mutex_unlock(&cdma->lock);
 }
 
 /**
@@ -607,20 +386,21 @@ int nvhost_cdma_init(struct nvhost_cdma *cdma)
 	BUG_ON(!cdma_pb_op(cdma).init);
 	mutex_init(&cdma->lock);
 	sema_init(&cdma->sem, 0);
+
+	err = kfifo_alloc(&cdma->sync_queue,
+			cdma_to_dev(cdma)->sync_queue_size
+				* sizeof(struct nvhost_job *),
+			GFP_KERNEL);
+	if (err)
+		return err;
+
 	cdma->event = CDMA_EVENT_NONE;
 	cdma->running = false;
 	cdma->torndown = false;
 
-	/* allocate sync queue memory */
-	cdma->sync_queue.buffer = kzalloc(cdma_to_dev(cdma)->sync_queue_size
-					  * sizeof(u32), GFP_KERNEL);
-	if (!cdma->sync_queue.buffer)
-		return -ENOMEM;
-
 	err = cdma_pb_op(cdma).init(pb);
 	if (err)
 		return err;
-	reset_sync_queue(&cdma->sync_queue);
 	return 0;
 }
 
@@ -633,8 +413,7 @@ void nvhost_cdma_deinit(struct nvhost_cdma *cdma)
 
 	BUG_ON(!cdma_pb_op(cdma).destroy);
 	BUG_ON(cdma->running);
-	kfree(cdma->sync_queue.buffer);
-	cdma->sync_queue.buffer = NULL;
+	kfifo_free(&cdma->sync_queue);
 	cdma_pb_op(cdma).destroy(pb);
 	cdma_op(cdma).timeout_destroy(cdma);
 }
@@ -725,44 +504,26 @@ void nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
  * can be unpinned in smaller chunks.
  */
 void nvhost_cdma_end(struct nvhost_cdma *cdma,
-		struct nvmap_client *user_nvmap,
-		u32 sync_point_id, u32 sync_point_value,
-		struct nvmap_handle **handles, unsigned int nr_handles,
-		struct nvhost_userctx_timeout *timeout)
+		struct nvhost_job *job)
 {
-	bool was_idle = (cdma->sync_queue.read == cdma->sync_queue.write);
+	bool was_idle = kfifo_len(&cdma->sync_queue) == 0;
 
 	BUG_ON(!cdma_op(cdma).kick);
 	cdma_op(cdma).kick(cdma);
 
-	while (nr_handles || cdma->slots_used) {
-		unsigned int count;
-		/*
-		 * Wait until there's enough room in the
-		 * sync queue to write something.
-		 */
-		count = nvhost_cdma_wait_locked(cdma,
-				CDMA_EVENT_SYNC_QUEUE_SPACE);
-
-		/* Add reloc entries to sync queue (as many as will fit) */
-		if (count > nr_handles)
-			count = nr_handles;
-
-		add_to_sync_queue(&cdma->sync_queue, sync_point_id,
-				  sync_point_value, cdma->slots_used,
-				  user_nvmap, handles, count, cdma->first_get,
-				  timeout);
-
-		/* NumSlots only goes in the first packet */
-		cdma->slots_used = 0;
-		handles += count;
-		nr_handles -= count;
-	}
+	BUG_ON(job->syncpt_id == NVSYNCPT_INVALID);
+
+	nvhost_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_SPACE);
+	add_to_sync_queue(cdma,
+			job,
+			cdma->slots_used,
+			cdma->first_get);
 
 	/* start timer on idle -> active transitions */
-	if (timeout->timeout && was_idle) {
-		cdma_start_timer_locked(cdma, sync_point_id, sync_point_value,
-			timeout);
+	if (job->timeout->timeout && was_idle) {
+		cdma_start_timer_locked(cdma,
+				job->syncpt_id, job->syncpt_end,
+			job->timeout);
 	}
 
 	mutex_unlock(&cdma->lock);
diff --git a/drivers/video/tegra/host/nvhost_cdma.h b/drivers/video/tegra/host/nvhost_cdma.h
index ae87d13f137f..c8c9e168b833 100644
--- a/drivers/video/tegra/host/nvhost_cdma.h
+++ b/drivers/video/tegra/host/nvhost_cdma.h
@@ -28,11 +28,13 @@
 
 #include <linux/nvhost.h>
 #include <mach/nvmap.h>
+#include <linux/kfifo.h>
 
 #include "nvhost_acm.h"
 
 struct nvhost_syncpt;
 struct nvhost_userctx_timeout;
+struct nvhost_job;
 
 /*
  * cdma
@@ -71,24 +73,6 @@ struct syncpt_buffer {
 	u32 words_per_incr;	/* # of DWORDS in buffer to incr a syncpt */
 };
 
-enum sync_queue_idx {
-	SQ_IDX_SYNCPT_ID   = 0,
-	SQ_IDX_SYNCPT_VAL  = 1,
-	SQ_IDX_FIRST_GET   = 2,
-	SQ_IDX_TIMEOUT     = 3,
-	SQ_IDX_TIMEOUT_CTX = 4,
-	SQ_IDX_NUM_SLOTS   = (SQ_IDX_TIMEOUT_CTX + sizeof(void *)/4),
-	SQ_IDX_NUM_HANDLES = (SQ_IDX_NUM_SLOTS + 1),
-	SQ_IDX_NVMAP_CTX   = (SQ_IDX_NUM_HANDLES + 1),
-	SQ_IDX_HANDLES     = (SQ_IDX_NVMAP_CTX + sizeof(void *)/4),
-};
-
-struct sync_queue {
-	unsigned int read;		    /* read position within buffer */
-	unsigned int write;		    /* write position within buffer */
-	u32 *buffer;                        /* queue data */
-};
-
 struct buffer_timeout {
 	struct delayed_work wq;		/* work queue */
 	bool initialized;		/* timer one-time setup flag */
@@ -116,7 +100,7 @@ struct nvhost_cdma {
 	unsigned int last_put;		/* last value written to DMAPUT */
 	struct push_buffer push_buffer;	/* channel's push buffer */
 	struct syncpt_buffer syncpt_buffer; /* syncpt incr buffer */
-	struct sync_queue sync_queue;	/* channel's sync queue */
+	DECLARE_KFIFO_PTR(sync_queue, struct nvhost_job *); /* job queue */
 	struct buffer_timeout timeout;	/* channel's timeout state/wq */
 	bool running;
 	bool torndown;
@@ -140,10 +124,7 @@ void	nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
 		struct nvmap_client *client,
 		struct nvmap_handle *handle, u32 op1, u32 op2);
 void	nvhost_cdma_end(struct nvhost_cdma *cdma,
-		struct nvmap_client *user_nvmap,
-		u32 sync_point_id, u32 sync_point_value,
-		struct nvmap_handle **handles, unsigned int nr_handles,
-		struct nvhost_userctx_timeout *timeout);
+		struct nvhost_job *job);
 void	nvhost_cdma_update(struct nvhost_cdma *cdma);
 int	nvhost_cdma_flush(struct nvhost_cdma *cdma, int timeout);
 void	nvhost_cdma_peek(struct nvhost_cdma *cdma,
diff --git a/drivers/video/tegra/host/nvhost_channel.c b/drivers/video/tegra/host/nvhost_channel.c
index 6c4212d776c7..85256016ad70 100644
--- a/drivers/video/tegra/host/nvhost_channel.c
+++ b/drivers/video/tegra/host/nvhost_channel.c
@@ -23,50 +23,25 @@
 #include "nvhost_channel.h"
 #include "dev.h"
 #include "nvhost_hwctx.h"
+#include "nvhost_job.h"
 #include <trace/events/nvhost.h>
 #include <linux/nvhost_ioctl.h>
+#include <linux/slab.h>
 
 #include <linux/platform_device.h>
 
 #define NVHOST_CHANNEL_LOW_PRIO_MAX_WAIT 50
 
-int nvhost_channel_submit(
-	struct nvhost_channel *channel,
-	struct nvhost_hwctx *hwctx,
-	struct nvmap_client *user_nvmap,
-	struct nvhost_channel_gather *gathers,
-	int num_gathers,
-	struct nvhost_waitchk *waitchk,
-	struct nvhost_waitchk *waitchk_end,
-	u32 waitchk_mask,
-	struct nvmap_handle **unpins,
-	int nr_unpins,
-	u32 syncpt_id,
-	u32 syncpt_incrs,
-	struct nvhost_userctx_timeout *timeout_ctx,
-	u32 priority,
-	u32 *syncpt_value,
-	bool null_kickoff)
+int nvhost_channel_submit(struct nvhost_job *job)
 {
-	BUG_ON(!channel_op(channel).submit);
-
 	/* Low priority submits wait until sync queue is empty. Ignores result
 	 * from nvhost_cdma_flush, as we submit either when push buffer is
 	 * empty or when we reach the timeout. */
-	if (priority < NVHOST_PRIORITY_MEDIUM)
-		(void)nvhost_cdma_flush(&channel->cdma,
+	if (job->priority < NVHOST_PRIORITY_MEDIUM)
+		(void)nvhost_cdma_flush(&job->ch->cdma,
 				NVHOST_CHANNEL_LOW_PRIO_MAX_WAIT);
 
-	return channel_op(channel).submit(channel,
-			hwctx,
-			user_nvmap,
-			gathers, num_gathers,
-			waitchk, waitchk_end, waitchk_mask,
-			unpins, nr_unpins,
-			syncpt_id, syncpt_incrs,
-			timeout_ctx,
-			syncpt_value,
-			null_kickoff);
+	return channel_op(job->ch).submit(job);
 }
 
 struct nvhost_channel *nvhost_getchannel(struct nvhost_channel *ch)
diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h
index b21e3bf035a2..c6d60fbf1189 100644
--- a/drivers/video/tegra/host/nvhost_channel.h
+++ b/drivers/video/tegra/host/nvhost_channel.h
@@ -26,6 +26,7 @@
 #include "nvhost_cdma.h"
 #include "nvhost_acm.h"
 #include "nvhost_hwctx.h"
+#include "nvhost_job.h"
 
 #include <linux/cdev.h>
 #include <linux/io.h>
@@ -78,23 +79,7 @@ int nvhost_channel_init(
 	struct nvhost_channel *ch,
 	struct nvhost_master *dev, int index);
 
-int nvhost_channel_submit(
-	struct nvhost_channel *channel,
-	struct nvhost_hwctx *hwctx,
-	struct nvmap_client *user_nvmap,
-	struct nvhost_channel_gather *gathers,
-	int num_gathers,
-	struct nvhost_waitchk *waitchk,
-	struct nvhost_waitchk *waitchk_end,
-	u32 waitchk_mask,
-	struct nvmap_handle **unpins,
-	int nr_unpins,
-	u32 syncpt_id,
-	u32 syncpt_incrs,
-	struct nvhost_userctx_timeout *timeout_ctx,
-	u32 priority,
-	u32 *syncpt_value,
-	bool null_kickoff);
+int nvhost_channel_submit(struct nvhost_job *job);
 
 struct nvhost_channel *nvhost_getchannel(struct nvhost_channel *ch);
 void nvhost_putchannel(struct nvhost_channel *ch, struct nvhost_hwctx *ctx);
diff --git a/drivers/video/tegra/host/nvhost_job.c b/drivers/video/tegra/host/nvhost_job.c
new file mode 100644
index 000000000000..ac9c5fefa083
--- /dev/null
+++ b/drivers/video/tegra/host/nvhost_job.c
@@ -0,0 +1,321 @@
+/*
+ * drivers/video/tegra/host/nvhost_job.c
+ *
+ * Tegra Graphics Host Job
+ *
+ * Copyright (c) 2010-2011, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/kref.h>
+#include <linux/err.h>
+#include <mach/nvmap.h>
+#include "nvhost_channel.h"
+#include "nvhost_job.h"
+#include "dev.h"
+
+/* Magic to use to fill freed handle slots */
+#define BAD_MAGIC 0xdeadbeef
+
+static int job_size(struct nvhost_submit_hdr_ext *hdr)
+{
+	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+
+	return sizeof(struct nvhost_job)
+			+ num_pins * sizeof(struct nvmap_pinarray_elem)
+			+ num_pins * sizeof(struct nvmap_handle *)
+			+ num_waitchks * sizeof(struct nvhost_waitchk);
+}
+
+static int gather_size(int num_cmdbufs)
+{
+	return num_cmdbufs * sizeof(struct nvhost_channel_gather);
+}
+
+static void free_gathers(struct nvhost_job *job)
+{
+	if (job->gathers) {
+		nvmap_munmap(job->gather_mem, job->gathers);
+		job->gathers = NULL;
+	}
+	if (job->gather_mem) {
+		nvmap_free(job->nvmap, job->gather_mem);
+		job->gather_mem = NULL;
+	}
+}
+
+/*
+ * Allocate and kernel-map gather storage for num_cmdbufs gathers.
+ * Returns 0 on success (also when num_cmdbufs is 0) or a negative error.
+ */
+static int alloc_gathers(struct nvhost_job *job, int num_cmdbufs)
+{
+	int err;
+
+	job->gather_mem = NULL;
+	job->gathers = NULL;
+	job->gather_mem_size = 0;
+	if (!num_cmdbufs)
+		return 0;
+
+	job->gather_mem = nvmap_alloc(job->nvmap, gather_size(num_cmdbufs),
+			32, NVMAP_HANDLE_CACHEABLE);
+	if (IS_ERR_OR_NULL(job->gather_mem)) {
+		/* PTR_ERR(NULL) is 0: never report a NULL handle as success */
+		err = job->gather_mem ? PTR_ERR(job->gather_mem) : -ENOMEM;
+		job->gather_mem = NULL;
+		return err;
+	}
+
+	/* Map memory to kernel */
+	job->gathers = nvmap_mmap(job->gather_mem);
+	if (IS_ERR_OR_NULL(job->gathers)) {
+		err = job->gathers ? PTR_ERR(job->gathers) : -ENOMEM;
+		job->gathers = NULL;
+		free_gathers(job);
+		return err;
+	}
+
+	/* only advertise a size once the buffer is really usable */
+	job->gather_mem_size = gather_size(num_cmdbufs);
+	return 0;
+}
+
+static int realloc_gathers(struct nvhost_job *oldjob,
+		struct nvhost_job *newjob,
+		int num_cmdbufs)
+{
+	int err = 0;
+
+	/* Check if we can reuse gather buffer */
+	if (oldjob->gather_mem_size < gather_size(num_cmdbufs)
+			|| oldjob->nvmap != newjob->nvmap) {
+		free_gathers(oldjob);
+		err = alloc_gathers(newjob, num_cmdbufs);
+	} else {
+		newjob->gather_mem = oldjob->gather_mem;
+		newjob->gathers = oldjob->gathers;
+		newjob->gather_mem_size = oldjob->gather_mem_size;
+
+		oldjob->gather_mem = NULL;
+		oldjob->gathers = NULL;
+		oldjob->gather_mem_size = 0;
+	}
+	return err;
+}
+
+static void init_fields(struct nvhost_job *job,
+		struct nvhost_submit_hdr_ext *hdr,
+		int priority)
+{
+	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+	void *mem = job;
+
+	/* First init state to zero */
+	job->num_gathers = 0;
+	job->num_pins = 0;
+	job->num_unpins = 0;
+	job->num_waitchk = 0;
+	job->waitchk_mask = 0;
+	job->syncpt_id = 0;
+	job->syncpt_incrs = 0;
+	job->syncpt_end = 0;
+	job->priority = priority;
+	job->null_kickoff = false;
+	job->first_get = 0;
+	job->num_slots = 0;
+
+	/* Redistribute memory to the structs */
+	mem += sizeof(struct nvhost_job);
+	if (num_pins) {
+		job->pinarray = mem;
+		mem += num_pins * sizeof(struct nvmap_pinarray_elem);
+		job->unpins = mem;
+		mem += num_pins * sizeof(struct nvmap_handle *);
+	} else {
+		job->pinarray = NULL;
+		job->unpins = NULL;
+	}
+
+	job->waitchk = num_waitchks ? mem : NULL;
+
+	/* Copy information from header */
+	if (hdr) {
+		job->waitchk_mask = hdr->waitchk_mask;
+		job->syncpt_id = hdr->syncpt_id;
+		job->syncpt_incrs = hdr->syncpt_incrs;
+	}
+}
+
+struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
+		struct nvhost_hwctx *hwctx,
+		struct nvhost_submit_hdr_ext *hdr,
+		struct nvmap_client *nvmap,
+		int priority,
+		struct nvhost_userctx_timeout *timeout)
+{
+	struct nvhost_job *job = NULL;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int err = 0;
+
+	job = kzalloc(job_size(hdr), GFP_KERNEL);
+	if (!job)
+		goto error;
+
+	kref_init(&job->ref);
+	job->ch = ch;
+	job->hwctx = hwctx;
+	job->timeout = timeout;
+	job->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
+
+	err = alloc_gathers(job, num_cmdbufs);
+	if (err)
+		goto error;
+
+	init_fields(job, hdr, priority);
+
+	return job;
+
+error:
+	if (job)
+		nvhost_job_put(job);
+	return NULL;
+}
+
+struct nvhost_job *nvhost_job_realloc(
+		struct nvhost_job *oldjob,
+		struct nvhost_submit_hdr_ext *hdr,
+		struct nvmap_client *nvmap,
+		int priority)
+{
+	struct nvhost_job *newjob = NULL;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int err = 0;
+
+	newjob = kzalloc(job_size(hdr), GFP_KERNEL);
+	if (!newjob)
+		goto error;
+	kref_init(&newjob->ref);
+	newjob->ch = oldjob->ch;
+	newjob->hwctx = oldjob->hwctx;
+	newjob->timeout = oldjob->timeout;
+	newjob->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
+
+	err = realloc_gathers(oldjob, newjob, num_cmdbufs);
+	if (err)
+		goto error;
+
+	nvhost_job_put(oldjob);
+
+	init_fields(newjob, hdr, priority);
+
+	return newjob;
+
+error:
+	if (newjob)
+		nvhost_job_put(newjob);
+	if (oldjob)
+		nvhost_job_put(oldjob);
+	return NULL;
+}
+
+void nvhost_job_get(struct nvhost_job *job)
+{
+	kref_get(&job->ref);
+}
+
+/* kref release callback: tears down the job once the last ref is dropped */
+static void job_free(struct kref *ref)
+{
+	struct nvhost_job *job = container_of(ref, struct nvhost_job, ref);
+
+	/* unmap and release the gather buffer, if there is one */
+	free_gathers(job);
+	/* drop the client reference held in job->nvmap, if any */
+	if (job->nvmap)
+		nvmap_client_put(job->nvmap);
+	kfree(job);
+}
+
+void nvhost_job_put(struct nvhost_job *job)
+{
+	kref_put(&job->ref, job_free);
+}
+
+void nvhost_job_add_gather(struct nvhost_job *job,
+		u32 mem_id, u32 words, u32 offset)
+{
+	struct nvmap_pinarray_elem *pin;
+	struct nvhost_channel_gather *cur_gather =
+			&job->gathers[job->num_gathers];
+
+	pin = &job->pinarray[job->num_pins++];
+	pin->patch_mem = (u32)nvmap_ref_to_handle(job->gather_mem);
+	pin->patch_offset = (void *)&(cur_gather->mem) - (void *)job->gathers;
+	pin->pin_mem = mem_id;
+	pin->pin_offset = offset;
+	cur_gather->words = words;
+	cur_gather->mem_id = mem_id;
+	cur_gather->offset = offset;
+	job->num_gathers += 1;
+}
+
+/* Pins job memory; returns 0 or the negative value from nvmap_pin_array() */
+int nvhost_job_pin(struct nvhost_job *job)
+{
+	/* pin mem handles and patch physical addresses */
+	int ret = nvmap_pin_array(job->nvmap,
+				nvmap_ref_to_handle(job->gather_mem),
+				job->pinarray, job->num_pins,
+				job->unpins);
+
+	/* a negative count would wrap in nvhost_job_unpin()'s memset size */
+	job->num_unpins = ret < 0 ? 0 : ret;
+
+	return ret < 0 ? ret : 0;
+}
+
+void nvhost_job_unpin(struct nvhost_job *job)
+{
+	nvmap_unpin_handles(job->nvmap, job->unpins,
+			job->num_unpins);
+	memset(job->unpins, BAD_MAGIC,
+			job->num_unpins * sizeof(struct nvmap_handle *));
+}
+
+/**
+ * Dump one job's sync queue bookkeeping to the debug log; job->timeout
+ * is dereferenced unconditionally, so it must not be NULL.
+ */
+void nvhost_job_dump(struct device *dev, struct nvhost_job *job)
+{
+	/* syncpt_id and syncpt_end are u32, hence %u rather than %d */
+	dev_dbg(dev, "    SYNCPT_ID   %u\n", job->syncpt_id);
+	dev_dbg(dev, "    SYNCPT_VAL  %u\n", job->syncpt_end);
+	dev_dbg(dev, "    FIRST_GET   0x%x\n",
+		job->first_get);
+	dev_dbg(dev, "    TIMEOUT     %d\n",
+		job->timeout->timeout);
+	dev_dbg(dev, "    TIMEOUT_CTX 0x%p\n",
+		job->timeout);
+	dev_dbg(dev, "    NUM_SLOTS   %d\n",
+		job->num_slots);
+	dev_dbg(dev, "    NUM_HANDLES %d\n",
+		job->num_unpins);
+}
diff --git a/drivers/video/tegra/host/nvhost_job.h b/drivers/video/tegra/host/nvhost_job.h
new file mode 100644
index 000000000000..42e1c46d4525
--- /dev/null
+++ b/drivers/video/tegra/host/nvhost_job.h
@@ -0,0 +1,139 @@
+/*
+ * drivers/video/tegra/host/nvhost_job.h
+ *
+ * Tegra Graphics Host Job
+ *
+ * Copyright (c) 2010, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __NVHOST_JOB_H
+#define __NVHOST_JOB_H
+
+#include <linux/nvhost_ioctl.h>
+
+struct nvhost_channel;
+struct nvhost_hwctx;
+struct nvmap_client;
+struct nvhost_waitchk;
+struct nvmap_handle;
+struct nvhost_userctx_timeout;
+
+/*
+ * Each submit is tracked as a nvhost_job.
+ */
+struct nvhost_job {
+	/* When refcount goes to zero, job can be freed */
+	struct kref ref;
+
+	/* Channel where job is submitted to */
+	struct nvhost_channel *ch;
+
+	/* Hardware context valid for this client */
+	struct nvhost_hwctx *hwctx;
+	struct nvhost_userctx_timeout *timeout;
+
+	/* Nvmap to be used for pinning & unpinning memory */
+	struct nvmap_client *nvmap;
+
+	/* Gathers and their memory */
+	struct nvmap_handle_ref *gather_mem;
+	struct nvhost_channel_gather *gathers;
+	int num_gathers;
+	int gather_mem_size;
+
+	/* Wait checks to be processed at submit time */
+	struct nvhost_waitchk *waitchk;
+	int num_waitchk;
+	u32 waitchk_mask;
+
+	/* Array of handles to be pinned & unpinned */
+	struct nvmap_pinarray_elem *pinarray;
+	int num_pins;
+	struct nvmap_handle **unpins;
+	int num_unpins;
+
+	/* Sync point id, number of increments and end related to the submit */
+	u32 syncpt_id;
+	u32 syncpt_incrs;
+	u32 syncpt_end;
+
+	/* Priority of this submit. */
+	int priority;
+
+	/* Null kickoff prevents submit from being sent to hardware */
+	bool null_kickoff;
+
+	/* Index and number of slots used in the push buffer */
+	int first_get;
+	int num_slots;
+};
+
+/*
+ * Allocate memory for a job. Just enough memory will be allocated to
+ * accommodate the submit announced in the submit header.
+ */
+struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
+		struct nvhost_hwctx *hwctx,
+		struct nvhost_submit_hdr_ext *hdr,
+		struct nvmap_client *nvmap,
+		int priority,
+		struct nvhost_userctx_timeout *timeout);
+
+/*
+ * Allocate memory for a job. Just enough memory will be allocated to
+ * accommodate the submit announced in the submit header. Gather memory from
+ * oldjob is reused when possible, and nvhost_job_put() is called on oldjob.
+ */
+struct nvhost_job *nvhost_job_realloc(struct nvhost_job *oldjob,
+		struct nvhost_submit_hdr_ext *hdr,
+		struct nvmap_client *nvmap,
+		int priority);
+
+/*
+ * Add a gather to a job.
+ */
+void nvhost_job_add_gather(struct nvhost_job *job,
+		u32 mem_id, u32 words, u32 offset);
+
+/*
+ * Take a reference to an nvhost_job.
+ */
+void nvhost_job_get(struct nvhost_job *job);
+
+/*
+ * Drop a reference to the job; the job is freed when the count reaches zero.
+ */
+void nvhost_job_put(struct nvhost_job *job);
+
+/*
+ * Pin memory related to job. This handles relocation of addresses to the
+ * host1x address space. Handles both the gather memory and any other memory
+ * referred to from the gather buffers.
+ */
+int nvhost_job_pin(struct nvhost_job *job);
+
+/*
+ * Unpin memory related to job.
+ */
+void nvhost_job_unpin(struct nvhost_job *job);
+
+/*
+ * Dump contents of job to debug output.
+ */
+void nvhost_job_dump(struct device *dev, struct nvhost_job *job);
+
+#endif
diff --git a/drivers/video/tegra/host/nvhost_syncpt.c b/drivers/video/tegra/host/nvhost_syncpt.c
index 51d79f06d178..0fa6d3e1ce20 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.c
+++ b/drivers/video/tegra/host/nvhost_syncpt.c
@@ -240,7 +240,8 @@ int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
 			     struct nvmap_client *nvmap,
 			     u32 waitchk_mask,
 			     struct nvhost_waitchk *wait,
-			     struct nvhost_waitchk *waitend)
+			     int num_waitchk)
 {
-	return syncpt_op(sp).wait_check(sp, nvmap, waitchk_mask, wait, waitend);
+	return syncpt_op(sp).wait_check(sp, nvmap,
+			waitchk_mask, wait, num_waitchk);
 }
diff --git a/drivers/video/tegra/host/nvhost_syncpt.h b/drivers/video/tegra/host/nvhost_syncpt.h
index 790ecc5eb4ea..0dfb11775980 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.h
+++ b/drivers/video/tegra/host/nvhost_syncpt.h
@@ -155,7 +155,7 @@ int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
 			struct nvmap_client *nvmap,
 			u32 mask,
 			struct nvhost_waitchk *wait,
-			struct nvhost_waitchk *waitend);
+			int num_waitchk);
 
 void nvhost_syncpt_debug(struct nvhost_syncpt *sp);
 
diff --git a/drivers/video/tegra/host/t20/cdma_t20.c b/drivers/video/tegra/host/t20/cdma_t20.c
index 7585261ef529..fc2afdb26189 100644
--- a/drivers/video/tegra/host/t20/cdma_t20.c
+++ b/drivers/video/tegra/host/t20/cdma_t20.c
@@ -630,6 +630,7 @@ static void t20_cdma_timeout_handler(struct work_struct *work)
 	cdma_op(cdma).timeout_teardown_begin(cdma);
 
 	nvhost_cdma_update_sync_queue(cdma, sp, &dev->pdev->dev);
+	mutex_unlock(&cdma->lock);
 }
 
 int nvhost_init_t20_cdma_support(struct nvhost_master *host)
diff --git a/drivers/video/tegra/host/t20/channel_t20.c b/drivers/video/tegra/host/t20/channel_t20.c
index dfb6b2539912..132bdb526bbc 100644
--- a/drivers/video/tegra/host/t20/channel_t20.c
+++ b/drivers/video/tegra/host/t20/channel_t20.c
@@ -189,25 +189,12 @@ static void t20_channel_sync_waitbases(struct nvhost_channel *ch, u32 syncpt_val
 	}
 }
 
-static int t20_channel_submit(struct nvhost_channel *channel,
-			      struct nvhost_hwctx *hwctx,
-			      struct nvmap_client *user_nvmap,
-			      struct nvhost_channel_gather *gathers,
-			      int num_gathers,
-			      struct nvhost_waitchk *waitchk,
-			      struct nvhost_waitchk *waitchk_end,
-			      u32 waitchk_mask,
-			      struct nvmap_handle **unpins,
-			      int nr_unpins,
-			      u32 syncpt_id,
-			      u32 syncpt_incrs,
-			      struct nvhost_userctx_timeout *timeout,
-			      u32 *syncpt_value,
-			      bool null_kickoff)
+static int t20_channel_submit(struct nvhost_job *job)
 {
 	struct nvhost_hwctx *hwctx_to_save = NULL;
-	struct nvhost_syncpt *sp = &channel->dev->syncpt;
-	u32 user_syncpt_incrs = syncpt_incrs;
+	struct nvhost_channel *channel = job->ch;
+	struct nvhost_syncpt *sp = &job->ch->dev->syncpt;
+	u32 user_syncpt_incrs = job->syncpt_incrs;
 	bool need_restore = false;
 	u32 syncval;
 	int err;
@@ -227,7 +214,7 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 		channel->mod.desc->busy(&channel->mod);
 
 	/* before error checks, return current max */
-	*syncpt_value = nvhost_syncpt_read_max(sp, syncpt_id);
+	job->syncpt_end = nvhost_syncpt_read_max(sp, job->syncpt_id);
 
 	/* get submit lock */
 	err = mutex_lock_interruptible(&channel->submitlock);
@@ -237,7 +224,7 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 	}
 
 	/* If we are going to need a restore, allocate a waiter for it */
-	if (channel->cur_ctx != hwctx && hwctx && hwctx->valid) {
+	if (channel->cur_ctx != job->hwctx && job->hwctx && job->hwctx->valid) {
 		ctxrestore_waiter = nvhost_intr_alloc_waiter();
 		if (!ctxrestore_waiter) {
 			mutex_unlock(&channel->submitlock);
@@ -249,11 +236,12 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 	}
 
 	/* remove stale waits */
-	if (waitchk != waitchk_end) {
+	if (job->num_waitchk) {
 		err = nvhost_syncpt_wait_check(sp,
-					       user_nvmap,
-					       waitchk_mask,
-					       waitchk, waitchk_end);
+					       job->nvmap,
+					       job->waitchk_mask,
+					       job->waitchk,
+					       job->num_waitchk);
 		if (err) {
 			dev_warn(&channel->dev->pdev->dev,
 				 "nvhost_syncpt_wait_check failed: %d\n", err);
@@ -264,19 +252,19 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 	}
 
 	/* begin a CDMA submit */
-	err = nvhost_cdma_begin(&channel->cdma, timeout);
+	err = nvhost_cdma_begin(&channel->cdma, job->timeout);
 	if (err) {
 		mutex_unlock(&channel->submitlock);
 		nvhost_module_idle(&channel->mod);
 		goto done;
 	}
 
-	t20_channel_sync_waitbases(channel, *syncpt_value);
+	t20_channel_sync_waitbases(channel, job->syncpt_end);
 
 	/* context switch */
-	if (channel->cur_ctx != hwctx) {
+	if (channel->cur_ctx != job->hwctx) {
 		trace_nvhost_channel_context_switch(channel->desc->name,
-		  channel->cur_ctx, hwctx);
+		  channel->cur_ctx, job->hwctx);
 		hwctx_to_save = channel->cur_ctx;
 		if (hwctx_to_save && hwctx_to_save->timeout &&
 			hwctx_to_save->timeout->has_timedout) {
@@ -286,22 +274,24 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 				__func__, channel->cur_ctx->timeout);
 		}
 		if (hwctx_to_save) {
-			syncpt_incrs += hwctx_to_save->save_incrs;
+			job->syncpt_incrs += hwctx_to_save->save_incrs;
 			hwctx_to_save->valid = true;
 			channel->ctxhandler.get(hwctx_to_save);
 		}
-		channel->cur_ctx = hwctx;
+		channel->cur_ctx = job->hwctx;
 		if (need_restore)
-			syncpt_incrs += channel->cur_ctx->restore_incrs;
+			job->syncpt_incrs += channel->cur_ctx->restore_incrs;
 	}
 
 	/* get absolute sync value */
-	if (BIT(syncpt_id) & sp->client_managed)
+	if (BIT(job->syncpt_id) & sp->client_managed)
 		syncval = nvhost_syncpt_set_max(sp,
-						syncpt_id, syncpt_incrs);
+				job->syncpt_id, job->syncpt_incrs);
 	else
 		syncval = nvhost_syncpt_incr_max(sp,
-						syncpt_id, syncpt_incrs);
+				job->syncpt_id, job->syncpt_incrs);
+
+	job->syncpt_end = syncval;
 
 	/* push save buffer (pre-gather setup depends on unit) */
 	if (hwctx_to_save)
@@ -323,14 +313,14 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 			nvhost_opcode_setclass(channel->desc->class, 0, 0),
 			NVHOST_OPCODE_NOOP);
 
-	if (null_kickoff) {
+	if (job->null_kickoff) {
 		int incr;
 		u32 op_incr;
 
 		/* TODO ideally we'd also perform host waits here */
 
 		/* push increments that correspond to nulled out commands */
-		op_incr = nvhost_opcode_imm(0, 0x100 | syncpt_id);
+		op_incr = nvhost_opcode_imm(0, 0x100 | job->syncpt_id);
 		for (incr = 0; incr < (user_syncpt_incrs >> 1); incr++)
 			nvhost_cdma_push(&channel->cdma, op_incr, op_incr);
 		if (user_syncpt_incrs & 1)
@@ -350,30 +340,30 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 	} else {
 		/* push user gathers */
 		int i = 0;
-		for ( ; i < num_gathers; i++) {
+		for ( ; i < job->num_gathers; i++) {
+			u32 op1 = nvhost_opcode_gather(job->gathers[i].words);
+			u32 op2 = job->gathers[i].mem;
 			nvhost_cdma_push_gather(&channel->cdma,
-					user_nvmap,
-					unpins[i/2],
-					nvhost_opcode_gather(gathers[i].words),
-					gathers[i].mem);
+					job->nvmap, job->unpins[i/2],
+					op1, op2);
 		}
 	}
 
 	/* end CDMA submit & stash pinned hMems into sync queue */
-	nvhost_cdma_end(&channel->cdma, user_nvmap,
-			syncpt_id, syncval, unpins, nr_unpins,
-			timeout);
+	nvhost_cdma_end(&channel->cdma, job);
 
 	trace_nvhost_channel_submitted(channel->desc->name,
-			syncval-syncpt_incrs, syncval);
+			syncval - job->syncpt_incrs, syncval);
 
 	/*
 	 * schedule a context save interrupt (to drain the host FIFO
 	 * if necessary, and to release the restore buffer)
 	 */
 	if (hwctx_to_save) {
-		err = nvhost_intr_add_action(&channel->dev->intr, syncpt_id,
-			syncval - syncpt_incrs + hwctx_to_save->save_thresh,
+		err = nvhost_intr_add_action(&channel->dev->intr,
+			job->syncpt_id,
+			syncval - job->syncpt_incrs
+				+ hwctx_to_save->save_thresh,
 			NVHOST_INTR_ACTION_CTXSAVE, hwctx_to_save,
 			ctxsave_waiter,
 			NULL);
@@ -383,7 +373,8 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 
 	if (need_restore) {
 		BUG_ON(!ctxrestore_waiter);
-		err = nvhost_intr_add_action(&channel->dev->intr, syncpt_id,
+		err = nvhost_intr_add_action(&channel->dev->intr,
+			job->syncpt_id,
 			syncval - user_syncpt_incrs,
 			NVHOST_INTR_ACTION_CTXRESTORE, channel->cur_ctx,
 			ctxrestore_waiter,
@@ -393,7 +384,8 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 	}
 
 	/* schedule a submit complete interrupt */
-	err = nvhost_intr_add_action(&channel->dev->intr, syncpt_id, syncval,
+	err = nvhost_intr_add_action(&channel->dev->intr, job->syncpt_id,
+			syncval,
 			NVHOST_INTR_ACTION_SUBMIT_COMPLETE, channel,
 			completed_waiter,
 			NULL);
@@ -402,8 +394,6 @@ static int t20_channel_submit(struct nvhost_channel *channel,
 
 	mutex_unlock(&channel->submitlock);
 
-	*syncpt_value = syncval;
-
 done:
 	kfree(ctxrestore_waiter);
 	kfree(ctxsave_waiter);
@@ -425,6 +415,7 @@ static int t20_channel_read_3d_reg(
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 	void *ref;
 	void *ctx_waiter, *read_waiter, *completed_waiter;
+	struct nvhost_job *job;
 	u32 syncval;
 	int err;
 
@@ -436,6 +427,14 @@ static int t20_channel_read_3d_reg(
 		goto done;
 	}
 
+	job = nvhost_job_alloc(channel, hwctx,
+			NULL,
+			channel->dev->nvmap, 0, timeout);
+	if (!job) {
+		err = -ENOMEM;
+		goto done;
+	}
+
 	/* keep module powered */
 	nvhost_module_busy(&channel->mod);
 
@@ -464,6 +463,10 @@ static int t20_channel_read_3d_reg(
 	syncval = nvhost_syncpt_incr_max(&channel->dev->syncpt,
 		NVSYNCPT_3D, syncpt_incrs);
 
+	job->syncpt_id = NVSYNCPT_3D;
+	job->syncpt_incrs = syncpt_incrs;
+	job->syncpt_end = syncval;
+
 	/* begin a CDMA submit */
 	nvhost_cdma_begin(&channel->cdma, timeout);
 
@@ -513,9 +516,9 @@ static int t20_channel_read_3d_reg(
 		nvhost_opcode_imm(NV_CLASS_HOST_INCR_SYNCPT, NVSYNCPT_3D));
 
 	/* end CDMA submit  */
-	nvhost_cdma_end(&channel->cdma, channel->dev->nvmap,
-			NVSYNCPT_3D, syncval, NULL, 0,
-			timeout);
+	nvhost_cdma_end(&channel->cdma, job);
+	nvhost_job_put(job);
+	job = NULL;
 
 	/*
 	 * schedule a context save interrupt (to drain the host FIFO
diff --git a/drivers/video/tegra/host/t20/hardware_t20.h b/drivers/video/tegra/host/t20/hardware_t20.h
index 25f2e7791451..4cc83f79ec81 100644
--- a/drivers/video/tegra/host/t20/hardware_t20.h
+++ b/drivers/video/tegra/host/t20/hardware_t20.h
@@ -283,9 +283,11 @@ static inline u32 nvhost_mask2(unsigned x, unsigned y)
 int nvhost_drain_read_fifo(void __iomem *chan_regs,
 		u32 *ptr, unsigned int count, unsigned int *pending);
 
-/* Size of the sync queue. If it is too small, we won't be able to queue up
- * many command buffers. If it is too large, we waste memory. */
-#define NVHOST_SYNC_QUEUE_SIZE 8192
+/*
+ * Size of the sync queue. It is sized for the case where every submit
+ * consists of only one gather.
+ */
+#define NVHOST_SYNC_QUEUE_SIZE 512
 
 /* Number of gathers we allow to be queued up per channel. Must be a
  * power of two. Currently sized such that pushbuffer is 4KB (512*8B). */
diff --git a/drivers/video/tegra/host/t20/syncpt_t20.c b/drivers/video/tegra/host/t20/syncpt_t20.c
index 7b471738f9d9..259e80fd25f6 100644
--- a/drivers/video/tegra/host/t20/syncpt_t20.c
+++ b/drivers/video/tegra/host/t20/syncpt_t20.c
@@ -120,7 +120,7 @@ static int t20_syncpt_wait_check(struct nvhost_syncpt *sp,
 				 struct nvmap_client *nvmap,
 				 u32 waitchk_mask,
 				 struct nvhost_waitchk *wait,
-				 struct nvhost_waitchk *waitend)
+				 int num_waitchk)
 {
 	u32 idx;
 	int err = 0;
@@ -131,10 +131,10 @@ static int t20_syncpt_wait_check(struct nvhost_syncpt *sp,
 			nvhost_syncpt_update_min(sp, idx);
 	}
 
-	BUG_ON(!wait && !waitend);
+	BUG_ON(!wait && !num_waitchk);
 
 	/* compare syncpt vs wait threshold */
-	while (wait != waitend) {
+	while (num_waitchk) {
 		u32 syncpt, override;
 
 		BUG_ON(wait->syncpt_id >= NV_HOST1X_SYNCPT_NB_PTS);
@@ -164,7 +164,9 @@ static int t20_syncpt_wait_check(struct nvhost_syncpt *sp,
 			if (err)
 				break;
 		}
+
 		wait++;
+		num_waitchk--;
 	}
 	return err;
 }
diff --git a/drivers/video/tegra/host/t20/t20.c b/drivers/video/tegra/host/t20/t20.c
index bb8d1819a5da..605f4edf015f 100644
--- a/drivers/video/tegra/host/t20/t20.c
+++ b/drivers/video/tegra/host/t20/t20.c
@@ -74,6 +74,7 @@ int nvhost_t20_save_context(struct nvhost_module *mod, u32 syncpt_id)
 	int err = 0;
 	void *ref;
 	void *ctx_waiter = NULL, *wakeup_waiter = NULL;
+	struct nvhost_job *job;
 
 	ctx_waiter = nvhost_intr_alloc_waiter();
 	wakeup_waiter = nvhost_intr_alloc_waiter();
@@ -92,6 +93,15 @@ int nvhost_t20_save_context(struct nvhost_module *mod, u32 syncpt_id)
 		goto done;
 	}
 
+	/* nvhost_job_alloc() fails with NULL, and PTR_ERR(NULL) would be 0 */
+	job = nvhost_job_alloc(ch, hwctx_to_save, NULL,
+			ch->dev->nvmap, 0, hwctx_to_save->timeout);
+	if (IS_ERR_OR_NULL(job)) {
+		err = job ? PTR_ERR(job) : -ENOMEM;
+		mutex_unlock(&ch->submitlock);
+		goto done;
+	}
+
 	err = nvhost_cdma_begin(&ch->cdma, hwctx_to_save->timeout);
 	if (err) {
 		mutex_unlock(&ch->submitlock);
@@ -106,9 +116,14 @@ int nvhost_t20_save_context(struct nvhost_module *mod, u32 syncpt_id)
 	syncpt_val = nvhost_syncpt_incr_max(&ch->dev->syncpt,
 					syncpt_id, syncpt_incrs);
 
+	job->syncpt_id = syncpt_id;
+	job->syncpt_incrs = syncpt_incrs;
+	job->syncpt_end = syncpt_val;
+
 	ch->ctxhandler.save_push(&ch->cdma, hwctx_to_save);
-	nvhost_cdma_end(&ch->cdma, ch->dev->nvmap, syncpt_id, syncpt_val,
-			NULL, 0, hwctx_to_save->timeout);
+	nvhost_cdma_end(&ch->cdma, job);
+	nvhost_job_put(job);
+	job = NULL;
 
 	err = nvhost_intr_add_action(&ch->dev->intr, syncpt_id,
 			syncpt_val - syncpt_incrs + hwctx_to_save->save_thresh,