20 files changed, 474 insertions, 413 deletions
diff --git a/drivers/video/tegra/host/bus_client.c b/drivers/video/tegra/host/bus_client.c
index fd632a6ea9c5..87aa9c64d363 100644
--- a/drivers/video/tegra/host/bus_client.c
+++ b/drivers/video/tegra/host/bus_client.c
@@ -141,12 +141,6 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
 	priv->clientid = atomic_add_return(1,
 			&nvhost_get_host(ch->dev)->clientid);
 	priv->timeout = MAX_STUCK_CHECK_COUNT * SYNCPT_CHECK_PERIOD;
-
-	priv->job = nvhost_job_alloc(ch, priv->hwctx, &priv->hdr,
-			NULL, priv->priority, priv->clientid);
-	if (!priv->job)
-		goto fail;
-
 	return 0;
 fail:
 	nvhost_channelrelease(inode, filp);
@@ -166,7 +160,7 @@ static int set_submit(struct nvhost_channel_userctx *ctx)
 		return -EFAULT;
 	}
 
-	ctx->job = nvhost_job_realloc(ctx->job,
+	ctx->job = nvhost_job_alloc(ctx->ch,
 			ctx->hwctx,
 			&ctx->hdr,
 			ctx->nvmap,
@@ -238,17 +232,28 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 				cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
 			hdr->num_cmdbufs--;
 		} else if (hdr->num_relocs) {
-			consumed = sizeof(struct nvhost_reloc);
-			if (remaining < consumed)
+			int numrelocs = remaining / sizeof(struct nvhost_reloc);
+			if (!numrelocs)
 				break;
-			if (copy_from_user(&job->pinarray[job->num_pins],
+			numrelocs = min_t(int, numrelocs, priv->hdr.num_relocs);
+			consumed = numrelocs * sizeof(struct nvhost_reloc);
+			if (copy_from_user(&job->relocarray[job->num_relocs],
 					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			trace_nvhost_channel_write_reloc(chname);
-			job->num_pins++;
-			hdr->num_relocs--;
+			while (numrelocs) {
+				struct nvhost_reloc *reloc =
+					&job->relocarray[job->num_relocs];
+				trace_nvhost_channel_write_reloc(chname,
+					reloc->cmdbuf_mem,
+					reloc->cmdbuf_offset,
+					reloc->target,
+					reloc->target_offset);
+				job->num_relocs++;
+				hdr->num_relocs--;
+				numrelocs--;
+			}
 		} else if (hdr->num_waitchks) {
 			int numwaitchks =
 				(remaining / sizeof(struct nvhost_waitchk));
@@ -269,17 +274,19 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 			hdr->num_waitchks -= numwaitchks;
 		} else if (priv->num_relocshifts) {
 			int next_shift =
-				job->num_pins - priv->num_relocshifts;
-			consumed = sizeof(struct nvhost_reloc_shift);
-			if (remaining < consumed)
+				job->num_relocs - priv->num_relocshifts;
+			int num =
+				(remaining / sizeof(struct nvhost_reloc_shift));
+			if (!num)
 				break;
-			if (copy_from_user(
-					&job->pinarray[next_shift].reloc_shift,
+			num = min_t(int, num, priv->num_relocshifts);
+			consumed = num * sizeof(struct nvhost_reloc_shift);
+			if (copy_from_user(&job->relocshiftarray[next_shift],
 					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			priv->num_relocshifts--;
+			priv->num_relocshifts -= num;
 		} else {
 			err = -EFAULT;
 			break;
@@ -302,7 +309,7 @@ static int nvhost_ioctl_channel_flush(
 	struct nvhost_get_param_args *args,
 	int null_kickoff)
 {
-	struct device *device = &ctx->ch->dev->dev;
+	struct nvhost_device *ndev = to_nvhost_device(&ctx->ch->dev->dev);
 	int err;
 
 	trace_nvhost_ioctl_channel_flush(ctx->ch->dev->name);
@@ -312,13 +319,13 @@ static int nvhost_ioctl_channel_flush(
 	    ctx->hdr.num_cmdbufs ||
 	    ctx->hdr.num_waitchks) {
 		reset_submit(ctx);
-		dev_err(device, "channel submit out of sync\n");
+		dev_err(&ndev->dev, "channel submit out of sync\n");
 		return -EFAULT;
 	}
 
-	err = nvhost_job_pin(ctx->job);
+	err = nvhost_job_pin(ctx->job, &nvhost_get_host(ndev)->syncpt);
 	if (err) {
-		dev_warn(device, "nvhost_job_pin failed: %d\n", err);
+		dev_warn(&ndev->dev, "nvhost_job_pin failed: %d\n", err);
 		return err;
 	}
 
@@ -337,6 +344,9 @@ static int nvhost_ioctl_channel_flush(
 	if (err)
 		nvhost_job_unpin(ctx->job);
 
+	nvhost_job_put(ctx->job);
+	ctx->job = NULL;
+
 	return err;
 }
 
diff --git a/drivers/video/tegra/host/chip_support.h b/drivers/video/tegra/host/chip_support.h
index edc5f6a51574..d69e1c4bccb9 100644
--- a/drivers/video/tegra/host/chip_support.h
+++ b/drivers/video/tegra/host/chip_support.h
@@ -28,10 +28,9 @@ struct output;
 struct nvhost_master;
 struct nvhost_intr;
 struct nvhost_syncpt;
-struct nvhost_waitchk;
 struct nvhost_userctx_timeout;
 struct nvhost_channel;
-struct nvmap_handle;
+struct nvmap_handle_ref;
 struct nvmap_client;
 struct nvhost_hwctx;
 struct nvhost_cdma;
@@ -77,7 +76,7 @@ struct nvhost_chip_support {
 		void (*destroy)(struct push_buffer *);
 		void (*push_to)(struct push_buffer *,
 				struct nvmap_client *,
-				struct nvmap_handle *,
+				struct nvmap_handle_ref *,
 				u32 op1, u32 op2);
 		void (*pop_from)(struct push_buffer *,
 				 unsigned int slots);
@@ -106,11 +105,8 @@ struct nvhost_chip_support {
 		void (*read_wait_base)(struct nvhost_syncpt *, u32 id);
 		u32 (*update_min)(struct nvhost_syncpt *, u32 id);
 		void (*cpu_incr)(struct nvhost_syncpt *, u32 id);
-		int (*wait_check)(struct nvhost_syncpt *sp,
-				  struct nvmap_client *nvmap,
-				  u32 waitchk_mask,
-				  struct nvhost_waitchk *wait,
-				  int num_waitchk);
+		int (*patch_wait)(struct nvhost_syncpt *sp,
+				void *patch_addr);
 		void (*debug)(struct nvhost_syncpt *);
 		const char * (*name)(struct nvhost_syncpt *, u32 id);
 		int (*mutex_try_lock)(struct nvhost_syncpt *,
diff --git a/drivers/video/tegra/host/debug.c b/drivers/video/tegra/host/debug.c
index 8a26f92c79f6..820eac85521d 100644
--- a/drivers/video/tegra/host/debug.c
+++ b/drivers/video/tegra/host/debug.c
@@ -106,13 +106,53 @@ static void show_all(struct nvhost_master *m, struct output *o)
 	nvhost_get_chip_ops()->debug.show_mlocks(m, o);
 	show_syncpts(m, o);
 	nvhost_debug_output(o, "---- channels ----\n");
-	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o, show_channels);
+	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o,
+			show_channels);
 
 	nvhost_module_idle(m->dev);
 }
 
 #ifdef CONFIG_DEBUG_FS
-static int nvhost_debug_show(struct seq_file *s, void *unused)
+static int show_channels_no_fifo(struct device *dev, void *data)
+{
+	struct nvhost_device *ndev = to_nvhost_device(dev);
+	struct output *out = data;
+	struct nvhost_master *host;
+	struct nvhost_channel *chan;
+
+	/* bus_for_each_dev() callback: returning 0 continues the walk */
+	if (!ndev)
+		return 0;
+	host = nvhost_get_host(ndev);
+	chan = ndev->channel;
+	if (!chan)
+		return 0;
+
+	mutex_lock(&chan->reflock);
+	if (chan->refcount) {
+		mutex_lock(&chan->cdma.lock);
+		nvhost_get_chip_ops()->debug.show_channel_cdma(host, chan,
+				out, ndev->index);
+		mutex_unlock(&chan->cdma.lock);
+	}
+	mutex_unlock(&chan->reflock);
+	return 0;
+}
+
+static void show_all_no_fifo(struct nvhost_master *m, struct output *o)
+{
+	nvhost_module_busy(m->dev);
+	/* mlocks, syncpts, then each bus device via show_channels_no_fifo */
+	nvhost_get_chip_ops()->debug.show_mlocks(m, o);
+	show_syncpts(m, o);
+	nvhost_debug_output(o, "---- channels ----\n");
+	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o,
+			show_channels_no_fifo);
+
+	nvhost_module_idle(m->dev);
+}
+
+static int nvhost_debug_show_all(struct seq_file *s, void *unused)
 {
 	struct output o = {
 		.fn = write_to_seqfile,
@@ -121,6 +161,27 @@ static int nvhost_debug_show(struct seq_file *s, void *unused)
 	show_all(s->private, &o);
 	return 0;
 }
+static int nvhost_debug_show(struct seq_file *s, void *unused)
+{
+	struct nvhost_master *master = s->private;
+	struct output o = { .fn = write_to_seqfile, .ctx = s };
+
+	/* route the dump into the seq_file through write_to_seqfile */
+	show_all_no_fifo(master, &o);
+	return 0;
+}
+
+static int nvhost_debug_open_all(struct inode *inode, struct file *file)
+{
+	return single_open(file, nvhost_debug_show_all, inode->i_private);
+}
+
+static const struct file_operations nvhost_debug_all_fops = {
+	.open		= nvhost_debug_open_all,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
 static int nvhost_debug_open(struct inode *inode, struct file *file)
 {
@@ -140,6 +201,8 @@ void nvhost_debug_init(struct nvhost_master *master)
 
 	debugfs_create_file("status", S_IRUGO, de,
 			master, &nvhost_debug_fops);
+	debugfs_create_file("status_all", S_IRUGO, de,
+			master, &nvhost_debug_all_fops);
 
 	debugfs_create_u32("null_kickoff_pid", S_IRUGO|S_IWUSR, de,
 			&nvhost_debug_null_kickoff_pid);
diff --git a/drivers/video/tegra/host/gr3d/gr3d_t20.c b/drivers/video/tegra/host/gr3d/gr3d_t20.c
index c0efac03b882..5645f5b2b0c6 100644
--- a/drivers/video/tegra/host/gr3d/gr3d_t20.c
+++ b/drivers/video/tegra/host/gr3d/gr3d_t20.c
@@ -138,7 +138,7 @@ static void save_push_v0(struct nvhost_hwctx *nctx, struct nvhost_cdma *cdma)
 
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			p->save_buf->handle,
+			p->save_buf,
 			0,
 			nvhost_opcode_gather(p->save_size),
 			p->save_phys);
diff --git a/drivers/video/tegra/host/gr3d/gr3d_t30.c b/drivers/video/tegra/host/gr3d/gr3d_t30.c
index 93d98dfa645c..57f4c779eff8 100644
--- a/drivers/video/tegra/host/gr3d/gr3d_t30.c
+++ b/drivers/video/tegra/host/gr3d/gr3d_t30.c
@@ -145,7 +145,7 @@ static void save_push_v1(struct nvhost_hwctx *nctx, struct nvhost_cdma *cdma)
 	/* gather the save buffer */
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			p->save_buf->handle,
+			p->save_buf,
 			0,
 			nvhost_opcode_gather(p->save_size),
 			p->save_phys);
diff --git a/drivers/video/tegra/host/host1x/host1x_cdma.c b/drivers/video/tegra/host/host1x/host1x_cdma.c
index fcb1f05f0025..4569c3d62494 100644
--- a/drivers/video/tegra/host/host1x/host1x_cdma.c
+++ b/drivers/video/tegra/host/host1x/host1x_cdma.c
@@ -137,7 +137,7 @@ static void push_buffer_destroy(struct push_buffer *pb)
  */
 static void push_buffer_push_to(struct push_buffer *pb,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle, u32 op1, u32 op2)
+		struct nvmap_handle_ref *handle, u32 op1, u32 op2)
 {
 	u32 cur = pb->cur;
 	u32 *p = (u32 *)((u32)pb->mapped + cur);
diff --git a/drivers/video/tegra/host/host1x/host1x_channel.c b/drivers/video/tegra/host/host1x/host1x_channel.c
index 8c4a7a5c74ad..c72e6478b806 100644
--- a/drivers/video/tegra/host/host1x/host1x_channel.c
+++ b/drivers/video/tegra/host/host1x/host1x_channel.c
@@ -143,7 +143,7 @@ static void submit_ctxrestore(struct nvhost_job *job)
 	/* Send restore buffer to channel */
 	nvhost_cdma_push_gather(&ch->cdma,
 		host->nvmap,
-		nvmap_ref_to_handle(ctx->restore),
+		ctx->restore,
 		0,
 		nvhost_opcode_gather(ctx->restore_size),
 		ctx->restore_phys);
@@ -188,7 +188,7 @@ void submit_gathers(struct nvhost_job *job)
 		u32 op2 = job->gathers[i].mem;
 		nvhost_cdma_push_gather(&job->ch->cdma,
 				job->nvmap,
-				nvmap_id_to_handle(job->gathers[i].mem_id),
+				job->gathers[i].ref,
 				job->gathers[i].offset,
 				op1, op2);
 	}
@@ -242,22 +242,6 @@ int host1x_channel_submit(struct nvhost_job *job)
 		goto error;
 	}
 
-	/* remove stale waits */
-	if (job->num_waitchk) {
-		err = nvhost_syncpt_wait_check(sp,
-					       job->nvmap,
-					       job->waitchk_mask,
-					       job->waitchk,
-					       job->num_waitchk);
-		if (err) {
-			dev_warn(&ch->dev->dev,
-				 "nvhost_syncpt_wait_check failed: %d\n", err);
-			mutex_unlock(&ch->submitlock);
-			nvhost_module_idle(ch->dev);
-			goto error;
-		}
-	}
-
 	/* begin a CDMA submit */
 	err = nvhost_cdma_begin(&ch->cdma, job);
 	if (err) {
diff --git a/drivers/video/tegra/host/host1x/host1x_debug.c b/drivers/video/tegra/host/host1x/host1x_debug.c
index 76483d82528b..7de342298c4d 100644
--- a/drivers/video/tegra/host/host1x/host1x_debug.c
+++ b/drivers/video/tegra/host/host1x/host1x_debug.c
@@ -28,8 +28,8 @@
 #include "host1x_hardware.h"
 #include "nvhost_cdma.h"
 #include "nvhost_channel.h"
-#include "../../nvmap/nvmap.h"
 #include "host1x_cdma.h"
+#include "nvhost_job.h"
 
 #define NVHOST_DEBUG_MAX_PAGE_OFFSET 102400
 
@@ -160,6 +160,34 @@ static void show_channel_word(struct output *o, int *state, int *count,
 	}
 }
 
+static void do_show_channel_gather(struct output *o,
+		phys_addr_t phys_addr,
+		u32 words, struct nvhost_cdma *cdma,
+		phys_addr_t pin_addr, u32 *map_addr)
+{
+	/* Byte offset of the gather start from the pinned base address */
+	u32 offset;
+	int state, count, i;
+
+	offset = phys_addr - pin_addr;
+	/*
+	 * Sometimes we are given a different hardware address for the same
+	 * page - in that case the offset comes out as an invalid number and
+	 * we just have to bail out.
+	 */
+	if (offset > NVHOST_DEBUG_MAX_PAGE_OFFSET) {
+		nvhost_debug_output(o, "[address mismatch]\n");
+	} else {
+		/* A GATHER buffer always starts with commands */
+		state = NVHOST_DBG_STATE_CMD;
+		for (i = 0; i < words; i++)
+			show_channel_word(o, &state, &count,
+					phys_addr + i * 4,
+					*(map_addr + offset/4 + i),
+					cdma);
+	}
+}
+
 static void show_channel_gather(struct output *o, u32 addr,
 		phys_addr_t phys_addr,
 		u32 words, struct nvhost_cdma *cdma)
@@ -169,81 +197,36 @@ static void show_channel_gather(struct output *o, u32 addr,
 	struct push_buffer *pb = &cdma->push_buffer;
 	u32 cur = addr - pb->phys;
 	struct nvmap_client_handle *nvmap = &pb->nvmap[cur/8];
-	struct nvmap_handle_ref ref;
 	u32 *map_addr, offset;
 	phys_addr_t pin_addr;
-	int state, count, i;
 
-	if (!nvmap->handle || !nvmap->client
-			|| atomic_read(&nvmap->handle->ref) < 1) {
+	if (!nvmap || !nvmap->handle || !nvmap->client) {
 		nvhost_debug_output(o, "[already deallocated]\n");
 		return;
 	}
 
-	/* Create a fake nvmap_handle_ref - nvmap requires it
-	 * but accesses only the first field - nvmap_handle */
-	ref.handle = nvmap->handle;
-
-	map_addr = nvmap_mmap(&ref);
+	map_addr = nvmap_mmap(nvmap->handle);
 	if (!map_addr) {
 		nvhost_debug_output(o, "[could not mmap]\n");
 		return;
 	}
 
 	/* Get base address from nvmap */
-	pin_addr = nvmap_pin(nvmap->client, &ref);
+	pin_addr = nvmap_pin(nvmap->client, nvmap->handle);
 	if (IS_ERR_VALUE(pin_addr)) {
 		nvhost_debug_output(o, "[couldn't pin]\n");
-		nvmap_munmap(&ref, map_addr);
+		nvmap_munmap(nvmap->handle, map_addr);
 		return;
 	}
 
 	offset = phys_addr - pin_addr;
-	/*
-	 * Sometimes we're given different hardware address to the same
-	 * page - in these cases the offset will get an invalid number and
-	 * we just have to bail out.
-	 */
-	if (offset > NVHOST_DEBUG_MAX_PAGE_OFFSET) {
-		nvhost_debug_output(o, "[address mismatch]\n");
-	} else {
-		/* GATHER buffer starts always with commands */
-		state = NVHOST_DBG_STATE_CMD;
-		for (i = 0; i < words; i++)
-			show_channel_word(o, &state, &count,
-					phys_addr + i * 4,
-					*(map_addr + offset/4 + i),
-					cdma);
-	}
-	nvmap_unpin(nvmap->client, &ref);
-	nvmap_munmap(&ref, map_addr);
+	do_show_channel_gather(o, phys_addr, words, cdma,
+			pin_addr, map_addr);
+	nvmap_unpin(nvmap->client, nvmap->handle);
+	nvmap_munmap(nvmap->handle, map_addr);
 #endif
 }
 
-static void show_channel_pair(struct output *o, u32 addr,
-		u32 w0, u32 w1, struct nvhost_cdma *cdma)
-{
-	int state = NVHOST_DBG_STATE_CMD;
-	int count;
-
-	show_channel_word(o, &state, &count, addr, w0, cdma);
-	show_channel_word(o, &state, &count, addr+4, w1, cdma);
-}
-
-/**
- * Retrieve the op pair at a slot offset from a DMA address
- */
-static void cdma_peek(struct nvhost_cdma *cdma,
-		      u32 dmaget, int slot, u32 *out)
-{
-	u32 offset = dmaget - cdma->push_buffer.phys;
-	u32 *p = cdma->push_buffer.mapped;
-
-	offset = ((offset + slot * 8) & (PUSH_BUFFER_SIZE - 1)) >> 2;
-	out[0] = p[offset];
-	out[1] = p[offset + 1];
-}
-
 u32 previous_oppair(struct nvhost_cdma *cdma, u32 cur)
 {
 	u32 pb = cdma->push_buffer.phys;
@@ -253,6 +236,42 @@ u32 previous_oppair(struct nvhost_cdma *cdma, u32 cur)
 	return prev;
 }
 
+void show_channel_gathers(struct output *o, struct nvhost_cdma *cdma)
+{
+	struct nvhost_job *job;
+
+	list_for_each_entry(job, &cdma->sync_queue, list) {
+		int i;
+		nvhost_debug_output(o, "\n%p: JOB, syncpt_id=%d, syncpt_val=%d,"
+				" first_get=%08x, timeout=%d, ctx=%p,"
+				" num_slots=%d, num_handles=%d\n",
+				job,
+				job->syncpt_id,
+				job->syncpt_end,
+				job->first_get,
+				job->timeout,
+				job->hwctx,
+				job->num_slots,
+				job->num_unpins);
+		/* map each gather only long enough to print its words */
+		for (i = 0; i < job->num_gathers; i++) {
+			struct nvhost_job_gather *g = &job->gathers[i];
+			u32 *mapped = nvmap_mmap(g->ref);
+			/* an ERR_PTR must not be dereferenced either */
+			if (IS_ERR_OR_NULL(mapped)) {
+				nvhost_debug_output(o, "[could not mmap]\n");
+				continue;
+			}
+			nvhost_debug_output(o, "    GATHER at %08x, %d words\n",
+				g->mem, g->words);
+
+			do_show_channel_gather(o, g->mem + g->offset,
+					g->words, cdma, g->mem, mapped);
+			nvmap_munmap(g->ref, mapped);
+		}
+	}
+}
+
 static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 	struct nvhost_channel *ch, struct output *o, int chid)
 {
@@ -261,7 +280,6 @@ static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 	u32 dmaput, dmaget, dmactrl;
 	u32 cbstat, cbread;
 	u32 val, base, baseval;
-	u32 pbw[2];
 
 	dmaput = readl(channel->aperture + HOST1X_CHANNEL_DMAPUT);
 	dmaget = readl(channel->aperture + HOST1X_CHANNEL_DMAGET);
@@ -310,9 +328,7 @@ static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 		dmaput, dmaget, dmactrl);
 	nvhost_debug_output(o, "CBREAD %08x, CBSTAT %08x\n", cbread, cbstat);
 
-	cdma_peek(cdma, dmaget, -1, pbw);
-	show_channel_pair(o, previous_oppair(cdma, dmaget),
-		pbw[0], pbw[1], &channel->cdma);
+	show_channel_gathers(o, cdma);
 	nvhost_debug_output(o, "\n");
 }
 
diff --git a/drivers/video/tegra/host/host1x/host1x_syncpt.c b/drivers/video/tegra/host/host1x/host1x_syncpt.c
index b7d6587acc61..4cc8e9e212fa 100644
--- a/drivers/video/tegra/host/host1x/host1x_syncpt.c
+++ b/drivers/video/tegra/host/host1x/host1x_syncpt.c
@@ -103,62 +103,14 @@ static void t20_syncpt_cpu_incr(struct nvhost_syncpt *sp, u32 id)
 	wmb();
 }
 
-/* check for old WAITs to be removed (avoiding a wrap) */
-static int t20_syncpt_wait_check(struct nvhost_syncpt *sp,
-				 struct nvmap_client *nvmap,
-				 u32 waitchk_mask,
-				 struct nvhost_waitchk *wait,
-				 int num_waitchk)
+/* replace the wait at patch_addr with one on NVSYNCPT_GRAPHICS_HOST, 0 */
+static int host1x_syncpt_patch_wait(struct nvhost_syncpt *sp,
+		void *patch_addr)
 {
-	u32 idx;
-	int err = 0;
-
-	/* get current syncpt values */
-	for (idx = 0; idx < NV_HOST1X_SYNCPT_NB_PTS; idx++) {
-		if (BIT(idx) & waitchk_mask)
-			nvhost_syncpt_update_min(sp, idx);
-	}
-
-	BUG_ON(!wait && !num_waitchk);
-
-	/* compare syncpt vs wait threshold */
-	while (num_waitchk) {
-		u32 override;
-
-		BUG_ON(wait->syncpt_id >= NV_HOST1X_SYNCPT_NB_PTS);
-		trace_nvhost_syncpt_wait_check(wait->mem, wait->offset,
-				wait->syncpt_id, wait->thresh);
-		if (nvhost_syncpt_is_expired(sp,
-					wait->syncpt_id, wait->thresh)) {
-			/*
-			 * NULL an already satisfied WAIT_SYNCPT host method,
-			 * by patching its args in the command stream. The
-			 * method data is changed to reference a reserved
-			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
-			 * syncpt with a matching threshold value of 0, so
-			 * is guaranteed to be popped by the host HW.
-			 */
-			dev_dbg(&syncpt_to_dev(sp)->dev->dev,
-			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
-			    wait->syncpt_id,
-			    syncpt_op().name(sp, wait->syncpt_id),
-			    wait->thresh,
-			    nvhost_syncpt_read_min(sp, wait->syncpt_id));
-
-			/* patch the wait */
-			override = nvhost_class_host_wait_syncpt(
-					NVSYNCPT_GRAPHICS_HOST, 0);
-			err = nvmap_patch_word(nvmap,
-					(struct nvmap_handle *)wait->mem,
-					wait->offset, override);
-			if (err)
-				break;
-		}
-
-		wait++;
-		num_waitchk--;
-	}
-	return err;
+	u32 override = nvhost_class_host_wait_syncpt(
+			NVSYNCPT_GRAPHICS_HOST, 0);
+	__raw_writel(override, patch_addr);
+	return 0;
 }
 
 
@@ -241,7 +193,7 @@ int host1x_init_syncpt_support(struct nvhost_master *host,
 	op->syncpt.read_wait_base = t20_syncpt_read_wait_base;
 	op->syncpt.update_min = t20_syncpt_update_min;
 	op->syncpt.cpu_incr = t20_syncpt_cpu_incr;
-	op->syncpt.wait_check = t20_syncpt_wait_check;
+	op->syncpt.patch_wait = host1x_syncpt_patch_wait;
 	op->syncpt.debug = t20_syncpt_debug;
 	op->syncpt.name = t20_syncpt_name;
 	op->syncpt.mutex_try_lock = syncpt_mutex_try_lock;
diff --git a/drivers/video/tegra/host/mpe/mpe.c b/drivers/video/tegra/host/mpe/mpe.c
index d8c9da7e9a76..3fe2fcd8bb50 100644
--- a/drivers/video/tegra/host/mpe/mpe.c
+++ b/drivers/video/tegra/host/mpe/mpe.c
@@ -502,7 +502,7 @@ static void ctxmpe_save_push(struct nvhost_hwctx *nctx,
 	struct host1x_hwctx_handler *h = host1x_hwctx_handler(ctx);
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			h->save_buf->handle,
+			h->save_buf,
 			0,
 			nvhost_opcode_gather(h->save_size),
 			h->save_phys);
diff --git a/drivers/video/tegra/host/nvhost_cdma.c b/drivers/video/tegra/host/nvhost_cdma.c
index a72e18f16ac7..c87415bf5ac2 100644
--- a/drivers/video/tegra/host/nvhost_cdma.c
+++ b/drivers/video/tegra/host/nvhost_cdma.c
@@ -53,6 +53,18 @@ static void add_to_sync_queue(struct nvhost_cdma *cdma,
 	job->num_slots = nr_slots;
 	nvhost_job_get(job);
 	list_add_tail(&job->list, &cdma->sync_queue);
+
+	switch (job->priority) {
+	case NVHOST_PRIORITY_HIGH:
+		cdma->high_prio_count++;
+		break;
+	case NVHOST_PRIORITY_MEDIUM:
+		cdma->med_prio_count++;
+		break;
+	case NVHOST_PRIORITY_LOW:
+		cdma->low_prio_count++;
+		break;
+	}
 }
 
 /**
@@ -200,6 +212,19 @@ static void update_cdma_locked(struct nvhost_cdma *cdma)
 		}
 
 		list_del(&job->list);
+
+		switch (job->priority) {
+		case NVHOST_PRIORITY_HIGH:
+			cdma->high_prio_count--;
+			break;
+		case NVHOST_PRIORITY_MEDIUM:
+			cdma->med_prio_count--;
+			break;
+		case NVHOST_PRIORITY_LOW:
+			cdma->low_prio_count--;
+			break;
+		}
+
 		nvhost_job_put(job);
 	}
 
@@ -371,15 +396,13 @@ int nvhost_cdma_begin(struct nvhost_cdma *cdma, struct nvhost_job *job)
 }
 
 static void trace_write_gather(struct nvhost_cdma *cdma,
-		struct nvmap_handle *handle,
+		struct nvmap_handle_ref *ref,
 		u32 offset, u32 words)
 {
-	struct nvmap_handle_ref ref;
 	void *mem = NULL;
 
 	if (nvhost_debug_trace_cmdbuf) {
-		ref.handle = handle;
-		mem = nvmap_mmap(&ref);
+		mem = nvmap_mmap(ref);
 		if (IS_ERR_OR_NULL(mem))
 			mem = NULL;
 	};
@@ -393,12 +416,12 @@ static void trace_write_gather(struct nvhost_cdma *cdma,
 		for (i = 0; i < words; i += TRACE_MAX_LENGTH) {
 			trace_nvhost_cdma_push_gather(
 				cdma_to_channel(cdma)->dev->name,
-				(u32)handle,
+				(u32)ref->handle,
 				min(words - i, TRACE_MAX_LENGTH),
 				offset + i * sizeof(u32),
 				mem);
 		}
-		nvmap_munmap(&ref, mem);
+		nvmap_munmap(ref, mem);
 	}
 }
 
@@ -421,7 +444,7 @@ void nvhost_cdma_push(struct nvhost_cdma *cdma, u32 op1, u32 op2)
  */
 void nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle,
+		struct nvmap_handle_ref *handle,
 		u32 offset, u32 op1, u32 op2)
 {
 	u32 slots_free = cdma->slots_free;
@@ -468,6 +491,12 @@ void nvhost_cdma_end(struct nvhost_cdma *cdma,
 	if (job->timeout && was_idle)
 		cdma_start_timer_locked(cdma, job);
 
+	trace_nvhost_cdma_end(job->ch->dev->name,
+			job->priority,
+			job->ch->cdma.high_prio_count,
+			job->ch->cdma.med_prio_count,
+			job->ch->cdma.low_prio_count);
+
 	mutex_unlock(&cdma->lock);
 }
 
@@ -492,6 +521,8 @@ int nvhost_cdma_flush(struct nvhost_cdma *cdma, int timeout)
 	unsigned int space, err = 0;
 	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout);
 
+	trace_nvhost_cdma_flush(cdma_to_channel(cdma)->dev->name, timeout);
+
 	/*
 	 * Wait for at most timeout ms. Recalculate timeout at each iteration
 	 * to better keep within given timeout.
diff --git a/drivers/video/tegra/host/nvhost_cdma.h b/drivers/video/tegra/host/nvhost_cdma.h
index e6f51179150f..2056774a7bc7 100644
--- a/drivers/video/tegra/host/nvhost_cdma.h
+++ b/drivers/video/tegra/host/nvhost_cdma.h
@@ -48,7 +48,7 @@ struct nvhost_job;
 
 struct nvmap_client_handle {
 	struct nvmap_client *client;
-	struct nvmap_handle *handle;
+	struct nvmap_handle_ref *handle;
 };
 
 struct push_buffer {
@@ -99,6 +99,9 @@ struct nvhost_cdma {
 	struct buffer_timeout timeout;	/* channel's timeout state/wq */
 	bool running;
 	bool torndown;
+	int high_prio_count;
+	int med_prio_count;
+	int low_prio_count;
 };
 
 #define cdma_to_channel(cdma) container_of(cdma, struct nvhost_channel, cdma)
@@ -113,7 +116,7 @@ int	nvhost_cdma_begin(struct nvhost_cdma *cdma, struct nvhost_job *job);
 void	nvhost_cdma_push(struct nvhost_cdma *cdma, u32 op1, u32 op2);
 void	nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle, u32 offset, u32 op1, u32 op2);
+		struct nvmap_handle_ref *handle, u32 offset, u32 op1, u32 op2);
 void	nvhost_cdma_end(struct nvhost_cdma *cdma,
 		struct nvhost_job *job);
 void	nvhost_cdma_update(struct nvhost_cdma *cdma);
diff --git a/drivers/video/tegra/host/nvhost_channel.c b/drivers/video/tegra/host/nvhost_channel.c
index ef8886fe4652..ad303cf0a22d 100644
--- a/drivers/video/tegra/host/nvhost_channel.c
+++ b/drivers/video/tegra/host/nvhost_channel.c
@@ -51,10 +51,26 @@ int nvhost_channel_init(struct nvhost_channel *ch,
 
 int nvhost_channel_submit(struct nvhost_job *job)
 {
-	/* Low priority submits wait until sync queue is empty. Ignores result
-	 * from nvhost_cdma_flush, as we submit either when push buffer is
-	 * empty or when we reach the timeout. */
-	if (job->priority < NVHOST_PRIORITY_MEDIUM)
+	/*
+	 * If higher-priority jobs are queued, wait until the queue is empty.
+	 * The result of nvhost_cdma_flush() is ignored: we submit either when
+	 * the push buffer is empty or when the timeout is reached.
+	 */
+	int higher_count = 0;
+
+	switch (job->priority) {
+	case NVHOST_PRIORITY_HIGH:
+		higher_count = 0;
+		break;
+	case NVHOST_PRIORITY_MEDIUM:
+		higher_count = job->ch->cdma.high_prio_count;
+		break;
+	case NVHOST_PRIORITY_LOW:
+		higher_count = job->ch->cdma.high_prio_count
+			+ job->ch->cdma.med_prio_count;
+		break;
+	}
+	if (higher_count > 0)
 		(void)nvhost_cdma_flush(&job->ch->cdma,
 				NVHOST_CHANNEL_LOW_PRIO_MAX_WAIT);
 
diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h
index eac51731547b..b3a904d5a3ee 100644
--- a/drivers/video/tegra/host/nvhost_channel.h
+++ b/drivers/video/tegra/host/nvhost_channel.h
@@ -31,18 +31,10 @@
 #define NVHOST_MAX_POWERGATE_IDS	2
 
 struct nvhost_master;
-struct nvhost_waitchk;
 struct nvhost_device;
 struct nvhost_channel;
 struct nvhost_hwctx;
 
-struct nvhost_channel_gather {
-	u32 words;
-	phys_addr_t mem;
-	u32 mem_id;
-	int offset;
-};
-
 struct nvhost_channel {
 	int refcount;
 	int chid;
diff --git a/drivers/video/tegra/host/nvhost_intr.c b/drivers/video/tegra/host/nvhost_intr.c
index ba821f694cb4..af2e3ad1bdb5 100644
--- a/drivers/video/tegra/host/nvhost_intr.c
+++ b/drivers/video/tegra/host/nvhost_intr.c
@@ -128,12 +128,16 @@ static void action_submit_complete(struct nvhost_waitlist *waiter)
 	struct nvhost_channel *channel = waiter->data;
 	int nr_completed = waiter->count;
 
+	nvhost_cdma_update(&channel->cdma);
+	nvhost_module_idle_mult(channel->dev, nr_completed);
+
 	/*  Add nr_completed to trace */
 	trace_nvhost_channel_submit_complete(channel->dev->name,
-			nr_completed, waiter->thresh);
+			nr_completed, waiter->thresh,
+			channel->cdma.high_prio_count,
+			channel->cdma.med_prio_count,
+			channel->cdma.low_prio_count);
 
-	nvhost_cdma_update(&channel->cdma);
-	nvhost_module_idle_mult(channel->dev, nr_completed);
 }
 
 static void action_ctxsave(struct nvhost_waitlist *waiter)
diff --git a/drivers/video/tegra/host/nvhost_job.c b/drivers/video/tegra/host/nvhost_job.c
index 71f2ab0e751f..e029449b6184 100644
--- a/drivers/video/tegra/host/nvhost_job.c
+++ b/drivers/video/tegra/host/nvhost_job.c
@@ -23,9 +23,11 @@
 #include <linux/err.h>
 #include <linux/vmalloc.h>
 #include <linux/nvmap.h>
+#include <trace/events/nvhost.h>
 #include "nvhost_channel.h"
 #include "nvhost_job.h"
 #include "nvhost_hwctx.h"
+#include "nvhost_syncpt.h"
 #include "dev.h"
 
 /* Magic to use to fill freed handle slots */
@@ -33,128 +35,44 @@
 
 static int job_size(struct nvhost_submit_hdr_ext *hdr)
 {
-	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_relocs = hdr ? hdr->num_relocs : 0;
 	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int num_unpins = num_cmdbufs + num_relocs;
 
 	return sizeof(struct nvhost_job)
-			+ num_pins * sizeof(struct nvmap_pinarray_elem)
-			+ num_pins * sizeof(struct nvmap_handle *)
-			+ num_waitchks * sizeof(struct nvhost_waitchk);
-}
-
-static int gather_size(int num_cmdbufs)
-{
-	return num_cmdbufs * sizeof(struct nvhost_channel_gather);
-}
-
-static void free_gathers(struct nvhost_job *job)
-{
-	if (job->gathers) {
-		nvmap_munmap(job->gather_mem, job->gathers);
-		job->gathers = NULL;
-	}
-	if (job->gather_mem) {
-		nvmap_free(job->nvmap, job->gather_mem);
-		job->gather_mem = NULL;
-	}
-}
-
-static int alloc_gathers(struct nvhost_job *job,
-		int num_cmdbufs)
-{
-	int err = 0;
-
-	job->gather_mem = NULL;
-	job->gathers = NULL;
-	job->gather_mem_size = 0;
-
-	if (num_cmdbufs) {
-		/* Allocate memory */
-		job->gather_mem = nvmap_alloc(job->nvmap,
-				gather_size(num_cmdbufs),
-				32, NVMAP_HANDLE_CACHEABLE, 0);
-		if (IS_ERR_OR_NULL(job->gather_mem)) {
-			err = job->gather_mem ? PTR_ERR(job->gather_mem) : -ENOMEM;
-			job->gather_mem = NULL;
-			goto error;
-		}
-		job->gather_mem_size = gather_size(num_cmdbufs);
-
-		/* Map memory to kernel */
-		job->gathers = nvmap_mmap(job->gather_mem);
-		if (IS_ERR_OR_NULL(job->gathers)) {
-			err = job->gathers ? PTR_ERR(job->gathers) : -ENOMEM;
-			job->gathers = NULL;
-			goto error;
-		}
-	}
-
-	return 0;
-
-error:
-	free_gathers(job);
-	return err;
-}
-
-static int realloc_gathers(struct nvhost_job *oldjob,
-		struct nvhost_job *newjob,
-		int num_cmdbufs)
-{
-	int err = 0;
-
-	/* Check if we can reuse gather buffer */
-	if (oldjob->gather_mem_size < gather_size(num_cmdbufs)
-			|| oldjob->nvmap != newjob->nvmap) {
-		free_gathers(oldjob);
-		err = alloc_gathers(newjob, num_cmdbufs);
-	} else {
-		newjob->gather_mem = oldjob->gather_mem;
-		newjob->gathers = oldjob->gathers;
-		newjob->gather_mem_size = oldjob->gather_mem_size;
-
-		oldjob->gather_mem = NULL;
-		oldjob->gathers = NULL;
-		oldjob->gather_mem_size = 0;
-	}
-	return err;
+			+ num_relocs * sizeof(struct nvhost_reloc)
+			+ num_relocs * sizeof(struct nvhost_reloc_shift)
+			+ num_unpins * sizeof(struct nvmap_handle_ref *)
+			+ num_waitchks * sizeof(struct nvhost_waitchk)
+			+ num_cmdbufs * sizeof(struct nvhost_job_gather);
 }
 
 static void init_fields(struct nvhost_job *job,
 		struct nvhost_submit_hdr_ext *hdr,
 		int priority, int clientid)
 {
-	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_relocs = hdr ? hdr->num_relocs : 0;
 	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int num_unpins = num_cmdbufs + num_relocs;
 	void *mem = job;
 
 	/* First init state to zero */
-	job->num_gathers = 0;
-	job->num_pins = 0;
-	job->num_unpins = 0;
-	job->num_waitchk = 0;
-	job->waitchk_mask = 0;
-	job->syncpt_id = 0;
-	job->syncpt_incrs = 0;
-	job->syncpt_end = 0;
 	job->priority = priority;
 	job->clientid = clientid;
-	job->null_kickoff = false;
-	job->first_get = 0;
-	job->num_slots = 0;
 
 	/* Redistribute memory to the structs */
 	mem += sizeof(struct nvhost_job);
-	if (num_pins) {
-		job->pinarray = mem;
-		mem += num_pins * sizeof(struct nvmap_pinarray_elem);
-		job->unpins = mem;
-		mem += num_pins * sizeof(struct nvmap_handle *);
-	} else {
-		job->pinarray = NULL;
-		job->unpins = NULL;
-	}
-
+	job->relocarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct nvhost_reloc);
+	job->relocshiftarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct nvhost_reloc_shift);
+	job->unpins = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(struct nvmap_handle_ref *);
 	job->waitchk = num_waitchks ? mem : NULL;
+	mem += num_waitchks * sizeof(struct nvhost_waitchk);
+	job->gathers = num_cmdbufs ? mem : NULL;
 
 	/* Copy information from header */
 	if (hdr) {
@@ -172,8 +90,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		int clientid)
 {
 	struct nvhost_job *job = NULL;
-	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
-	int err = 0;
 
 	job = vzalloc(job_size(hdr));
 	if (!job)
@@ -186,10 +102,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		hwctx->h->get(hwctx);
 	job->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
 
-	err = alloc_gathers(job, num_cmdbufs);
-	if (err)
-		goto error;
-
 	init_fields(job, hdr, priority, clientid);
 
 	return job;
@@ -200,46 +112,6 @@ error:
 	return NULL;
 }
 
-struct nvhost_job *nvhost_job_realloc(
-		struct nvhost_job *oldjob,
-		struct nvhost_hwctx *hwctx,
-		struct nvhost_submit_hdr_ext *hdr,
-		struct nvmap_client *nvmap,
-		int priority, int clientid)
-{
-	struct nvhost_job *newjob = NULL;
-	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
-	int err = 0;
-
-	newjob = vzalloc(job_size(hdr));
-	if (!newjob)
-		goto error;
-	kref_init(&newjob->ref);
-	newjob->ch = oldjob->ch;
-	newjob->hwctx = hwctx;
-	if (hwctx)
-		newjob->hwctx->h->get(newjob->hwctx);
-	newjob->timeout = oldjob->timeout;
-	newjob->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
-
-	err = realloc_gathers(oldjob, newjob, num_cmdbufs);
-	if (err)
-		goto error;
-
-	nvhost_job_put(oldjob);
-
-	init_fields(newjob, hdr, priority, clientid);
-
-	return newjob;
-
-error:
-	if (newjob)
-		nvhost_job_put(newjob);
-	if (oldjob)
-		nvhost_job_put(oldjob);
-	return NULL;
-}
-
 void nvhost_job_get(struct nvhost_job *job)
 {
 	kref_get(&job->ref);
@@ -253,10 +125,6 @@ static void job_free(struct kref *ref)
 		job->hwctxref->h->put(job->hwctxref);
 	if (job->hwctx)
 		job->hwctx->h->put(job->hwctx);
-	if (job->gathers)
-		nvmap_munmap(job->gather_mem, job->gathers);
-	if (job->gather_mem)
-		nvmap_free(job->nvmap, job->gather_mem);
 	if (job->nvmap)
 		nvmap_client_put(job->nvmap);
 	vfree(job);
@@ -280,42 +148,177 @@ void nvhost_job_put(struct nvhost_job *job)
 void nvhost_job_add_gather(struct nvhost_job *job,
 		u32 mem_id, u32 words, u32 offset)
 {
-	struct nvmap_pinarray_elem *pin;
-	struct nvhost_channel_gather *cur_gather =
+	struct nvhost_job_gather *cur_gather =
 			&job->gathers[job->num_gathers];
 
-	pin = &job->pinarray[job->num_pins++];
-	pin->patch_mem = (u32)nvmap_ref_to_handle(job->gather_mem);
-	pin->patch_offset = (void *)&(cur_gather->mem) - (void *)job->gathers;
-	pin->pin_mem = nvmap_convert_handle_u2k(mem_id);
-	pin->pin_offset = offset;
 	cur_gather->words = words;
 	cur_gather->mem_id = mem_id;
 	cur_gather->offset = offset;
 	job->num_gathers += 1;
 }
 
-int nvhost_job_pin(struct nvhost_job *job)
+static int do_relocs(struct nvhost_job *job, u32 cmdbuf_mem, void *cmdbuf_addr)
 {
-	int err = 0;
+	phys_addr_t target_phys;
+	int i;
+	u32 mem_id = 0;
+	struct nvmap_handle_ref *target_ref = NULL;
+
+	/* pin targets and patch the relocs that belong to gather cmdbuf_mem */
+	for (i = 0; i < job->num_relocs; i++) {
+		struct nvhost_reloc *reloc = &job->relocarray[i];
+		struct nvhost_reloc_shift *shift = &job->relocshiftarray[i];
+
+		/* skip all other gathers */
+		if (cmdbuf_mem != reloc->cmdbuf_mem)
+			continue;
+
+		/* pin on first use, or when the target differs from previous */
+		if (!target_ref || reloc->target != mem_id) {
+			target_ref = nvmap_duplicate_handle_id(job->nvmap,
+					reloc->target);
+			if (IS_ERR(target_ref))
+				return PTR_ERR(target_ref);
+
+			target_phys = nvmap_pin(job->nvmap, target_ref);
+			if (IS_ERR((void *)target_phys)) {
+				nvmap_free(job->nvmap, target_ref);
+				return target_phys;
+			}
+
+			mem_id = reloc->target;
+			job->unpins[job->num_unpins++] = target_ref;
+		}
 
-	/* pin mem handles and patch physical addresses */
-	job->num_unpins = nvmap_pin_array(job->nvmap,
-				nvmap_ref_to_handle(job->gather_mem),
-				job->pinarray, job->num_pins,
-				job->unpins);
-	if (job->num_unpins < 0)
-		err = job->num_unpins;
+		__raw_writel(
+			(target_phys + reloc->target_offset) >> shift->shift,
+			(cmdbuf_addr + reloc->cmdbuf_offset));
+
+		/* Different gathers might have same mem_id. This ensures we
+		 * perform reloc only once per gather memid. */
+		reloc->cmdbuf_mem = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * For the gather identified by patch_mem (accessed through patch_addr), patch
+ * out each waitchk whose syncpt threshold has already expired (to avoid a wrap
+ * condition in the HW). Visited entries get mem = 0. Always returns 0.
+ */
+static int do_waitchks(struct nvhost_job *job, struct nvhost_syncpt *sp,
+		u32 patch_mem, void *patch_addr)
+{
+	int i;
+
+	/* compare syncpt vs wait threshold */
+	for (i = 0; i < job->num_waitchk; i++) {
+		struct nvhost_waitchk *wait = &job->waitchk[i];
+
+		/* skip all other gathers */
+		if (patch_mem != wait->mem)
+			continue;
+
+		trace_nvhost_syncpt_wait_check(wait->mem, wait->offset,
+				wait->syncpt_id, wait->thresh,
+				nvhost_syncpt_read(sp, wait->syncpt_id));
+		if (nvhost_syncpt_is_expired(sp,
+					wait->syncpt_id, wait->thresh)) {
+			/*
+			 * NULL an already satisfied WAIT_SYNCPT host method,
+			 * by patching its args in the command stream. The
+			 * method data is changed to reference a reserved
+			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
+			 * syncpt with a matching threshold value of 0, so
+			 * is guaranteed to be popped by the host HW.
+			 */
+			dev_dbg(&syncpt_to_dev(sp)->dev->dev,
+			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
+			    wait->syncpt_id,
+			    syncpt_op().name(sp, wait->syncpt_id),
+			    wait->thresh,
+			    nvhost_syncpt_read_min(sp, wait->syncpt_id));
+
+			/* patch the wait */
+			nvhost_syncpt_patch_wait(sp,
+					(patch_addr + wait->offset));
+		}
+
+		wait->mem = 0;
+	}
+	return 0;
+}
+
+int nvhost_job_pin(struct nvhost_job *job, struct nvhost_syncpt *sp)
+{
+	int err = 0, i = 0;
+	phys_addr_t gather_phys = 0;
+	void *gather_addr = NULL;
+	unsigned long waitchk_mask = job->waitchk_mask;
+
+	/* get current syncpt values for waitchk (size argument is in bits) */
+	for_each_set_bit(i, &waitchk_mask, BITS_PER_LONG)
+		nvhost_syncpt_update_min(sp, i);
+
+	/* pin gathers, then patch their relocs and expired waits */
+	for (i = 0; i < job->num_gathers; i++) {
+		struct nvhost_job_gather *g = &job->gathers[i];
+
+		/* process each gather mem only once */
+		if (!g->ref) {
+			g->ref = nvmap_duplicate_handle_id(job->nvmap,
+					job->gathers[i].mem_id);
+			if (IS_ERR(g->ref)) {
+				err = PTR_ERR(g->ref);
+				g->ref = NULL;
+				break;
+			}
+
+			gather_phys = nvmap_pin(job->nvmap, g->ref);
+			if (IS_ERR((void *)gather_phys)) {
+				nvmap_free(job->nvmap, g->ref);
+				err = gather_phys;
+				break;
+			}
+
+			/* store the gather ref into unpin array */
+			job->unpins[job->num_unpins++] = g->ref;
+
+			gather_addr = nvmap_mmap(g->ref);
+			if (!gather_addr) {
+				err = -ENOMEM;
+				break;
+			}
+
+			err = do_relocs(job, g->mem_id, gather_addr);
+			if (!err)
+				err = do_waitchks(job, sp,
+						g->mem_id, gather_addr);
+			nvmap_munmap(g->ref, gather_addr);
+
+			if (err)
+				break;
+		}
+		g->mem = gather_phys + g->offset;
+	}
+	wmb();
 
 	return err;
 }
 
 void nvhost_job_unpin(struct nvhost_job *job)
 {
-	nvmap_unpin_handles(job->nvmap, job->unpins,
-			job->num_unpins);
+	int i;
+
+	for (i = 0; i < job->num_unpins; i++) {
+		nvmap_unpin(job->nvmap, job->unpins[i]);
+		nvmap_free(job->nvmap, job->unpins[i]);
+	}
+
 	memset(job->unpins, BAD_MAGIC,
-			job->num_unpins * sizeof(struct nvmap_handle *));
+			job->num_unpins * sizeof(struct nvmap_handle_ref *));
+	job->num_unpins = 0;
 }
 
 /**
diff --git a/drivers/video/tegra/host/nvhost_job.h b/drivers/video/tegra/host/nvhost_job.h
index ad9d1af60da1..ec1366337279 100644
--- a/drivers/video/tegra/host/nvhost_job.h
+++ b/drivers/video/tegra/host/nvhost_job.h
@@ -27,7 +27,15 @@ struct nvhost_channel;
 struct nvhost_hwctx;
 struct nvmap_client;
 struct nvhost_waitchk;
-struct nvmap_handle;
+struct nvhost_syncpt;
+
+struct nvhost_job_gather {
+	u32 words;		/* words value given to nvhost_job_add_gather() */
+	phys_addr_t mem;	/* pinned address + offset, set by nvhost_job_pin() */
+	u32 mem_id;		/* id of the gather buffer, as submitted */
+	int offset;		/* offset given to nvhost_job_add_gather() */
+	struct nvmap_handle_ref *ref;	/* ref duplicated in nvhost_job_pin() */
+};
 
 /*
  * Each submit is tracked as a nvhost_job.
@@ -50,10 +58,8 @@ struct nvhost_job {
 	struct nvmap_client *nvmap;
 
 	/* Gathers and their memory */
-	struct nvmap_handle_ref *gather_mem;
-	struct nvhost_channel_gather *gathers;
+	struct nvhost_job_gather *gathers;
 	int num_gathers;
-	int gather_mem_size;
 
 	/* Wait checks to be processed at submit time */
 	struct nvhost_waitchk *waitchk;
@@ -61,9 +67,10 @@ struct nvhost_job {
 	u32 waitchk_mask;
 
 	/* Array of handles to be pinned & unpinned */
-	struct nvmap_pinarray_elem *pinarray;
-	int num_pins;
-	struct nvmap_handle **unpins;
+	struct nvhost_reloc *relocarray;
+	struct nvhost_reloc_shift *relocshiftarray;
+	int num_relocs;
+	struct nvmap_handle_ref **unpins;
 	int num_unpins;
 
 	/* Sync point id, number of increments and end related to the submit */
@@ -99,17 +106,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		int priority, int clientid);
 
 /*
- * Allocate memory for a job. Just enough memory will be allocated to
- * accomodate the submit announced in submit header. Gather memory from
- * oldjob will be reused, and nvhost_job_put() will be called to it.
- */
-struct nvhost_job *nvhost_job_realloc(struct nvhost_job *oldjob,
-		struct nvhost_hwctx *hwctx,
-		struct nvhost_submit_hdr_ext *hdr,
-		struct nvmap_client *nvmap,
-		int priority, int clientid);
-
-/*
  * Add a gather to a job.
  */
 void nvhost_job_add_gather(struct nvhost_job *job,
@@ -134,8 +130,11 @@ void nvhost_job_put(struct nvhost_job *job);
  * Pin memory related to job. This handles relocation of addresses to the
  * host1x address space. Handles both the gather memory and any other memory
  * referred to from the gather buffers.
+ *
+ * Handles also patching out host waits that would wait for an expired sync
+ * point value.
  */
-int nvhost_job_pin(struct nvhost_job *job);
+int nvhost_job_pin(struct nvhost_job *job, struct nvhost_syncpt *sp);
 
 /*
  * Unpin memory related to job.
diff --git a/drivers/video/tegra/host/nvhost_syncpt.c b/drivers/video/tegra/host/nvhost_syncpt.c
index 4835d22881b8..7550512b0214 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.c
+++ b/drivers/video/tegra/host/nvhost_syncpt.c
@@ -74,7 +74,7 @@ u32 nvhost_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 
 	BUG_ON(!syncpt_op().update_min);
 
-	return syncpt_op().update_min(sp, id);
+	val = syncpt_op().update_min(sp, id);
 	trace_nvhost_syncpt_update_min(id, val);
 
 	return val;
@@ -130,6 +130,19 @@ void nvhost_syncpt_incr(struct nvhost_syncpt *sp, u32 id)
 }
 
 /**
+ * Updates sync point from hardware, and returns true if syncpoint is expired,
+ * false if we may need to wait
+ */
+static bool syncpt_update_min_is_expired(
+	struct nvhost_syncpt *sp,
+	u32 id,
+	u32 thresh)
+{
+	syncpt_op().update_min(sp, id);
+	return nvhost_syncpt_is_expired(sp, id, thresh);
+}
+
+/**
  * Main entrypoint for syncpoint value waits.
  */
 int nvhost_syncpt_wait_timeout(struct nvhost_syncpt *sp, u32 id,
@@ -190,9 +203,9 @@ int nvhost_syncpt_wait_timeout(struct nvhost_syncpt *sp, u32 id,
 	while (timeout) {
 		u32 check = min_t(u32, SYNCPT_CHECK_PERIOD, timeout);
 		int remain = wait_event_interruptible_timeout(wq,
-				nvhost_syncpt_is_expired(sp, id, thresh),
+				syncpt_update_min_is_expired(sp, id, thresh),
 				check);
-		if (remain > 0) {
+		if (remain > 0 || nvhost_syncpt_is_expired(sp, id, thresh)) {
 			if (value)
 				*value = nvhost_syncpt_read_min(sp, id);
 			err = 0;
@@ -317,15 +330,10 @@ void nvhost_mutex_unlock(struct nvhost_syncpt *sp, int idx)
 	atomic_dec(&sp->lock_counts[idx]);
 }
 
-/* check for old WAITs to be removed (avoiding a wrap) */
-int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
-			     struct nvmap_client *nvmap,
-			     u32 waitchk_mask,
-			     struct nvhost_waitchk *wait,
-			     int num_waitchk)
+/* remove a wait pointed to by patch_addr */
+int nvhost_syncpt_patch_wait(struct nvhost_syncpt *sp, void *patch_addr)
 {
-	return syncpt_op().wait_check(sp, nvmap,
-			waitchk_mask, wait, num_waitchk);
+	return syncpt_op().patch_wait(sp, patch_addr);
 }
 
 /* Displays the current value of the sync point via sysfs */
diff --git a/drivers/video/tegra/host/nvhost_syncpt.h b/drivers/video/tegra/host/nvhost_syncpt.h
index b770ed91c76c..b58921bffa9c 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.h
+++ b/drivers/video/tegra/host/nvhost_syncpt.h
@@ -136,23 +136,7 @@ static inline int nvhost_syncpt_wait(struct nvhost_syncpt *sp, u32 id, u32 thres
 					  MAX_SCHEDULE_TIMEOUT, NULL);
 }
 
-/*
- * Check driver supplied waitchk structs for syncpt thresholds
- * that have already been satisfied and NULL the comparison (to
- * avoid a wrap condition in the HW).
- *
- * @param: sp - global shadowed syncpt struct
- * @param: nvmap - needed to access command buffer
- * @param: mask - bit mask of syncpt IDs referenced in WAITs
- * @param: wait - start of filled in array of waitchk structs
- * @param: waitend - end ptr (one beyond last valid waitchk)
- */
-struct nvhost_waitchk;
-int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
-			struct nvmap_client *nvmap,
-			u32 mask,
-			struct nvhost_waitchk *wait,
-			int num_waitchk);
+int nvhost_syncpt_patch_wait(struct nvhost_syncpt *sp, void *patch_addr);
 
 void nvhost_syncpt_debug(struct nvhost_syncpt *sp);
 
diff --git a/drivers/video/tegra/host/t30/t30.c b/drivers/video/tegra/host/t30/t30.c
index 257ba0849277..b2768741546a 100644
--- a/drivers/video/tegra/host/t30/t30.c
+++ b/drivers/video/tegra/host/t30/t30.c
@@ -95,7 +95,7 @@ struct nvhost_device t30_devices[] = {
 	.waitbases	= BIT(NVWAITBASE_2D_0) | BIT(NVWAITBASE_2D_1),
 	.modulemutexes	= BIT(NVMODMUTEX_2D_FULL) | BIT(NVMODMUTEX_2D_SIMPLE) |
 			  BIT(NVMODMUTEX_2D_SB_A) | BIT(NVMODMUTEX_2D_SB_B),
-	.clocks 	= { {"gr2d", UINT_MAX},
+	.clocks 	= { {"gr2d", 0},
 			    {"epp", 0},
 			    {"emc", 300000000} },
 	NVHOST_MODULE_NO_POWERGATE_IDS,