28 files changed, 581 insertions, 800 deletions
diff --git a/drivers/video/tegra/dc/dc.c b/drivers/video/tegra/dc/dc.c
index 55d3fccf56b2..df54578a5b5a 100644
--- a/drivers/video/tegra/dc/dc.c
+++ b/drivers/video/tegra/dc/dc.c
@@ -583,6 +583,23 @@ static unsigned int tegra_dc_has_multiple_dc(void)
 	return (cnt > 1);
 }
 
+/* Get the line stride of a window by reading DC_WIN_LINE_STRIDE back.
+ * return: stride size in bytes for window win, or 0 if dc is not enabled. */
+int tegra_dc_get_stride(struct tegra_dc *dc, unsigned win)
+{
+	u32 tmp;
+
+	if (!dc->enabled)
+		return 0;
+	/* presumably DC_N_WINDOWS is the window count: last valid win is N-1 */
+	BUG_ON(win >= DC_N_WINDOWS);
+	tegra_dc_writel(dc, WINDOW_A_SELECT << win,
+		DC_CMD_DISPLAY_WINDOW_HEADER);
+	tmp = tegra_dc_readl(dc, DC_WIN_LINE_STRIDE);
+	return GET_LINE_STRIDE(tmp);
+}
+EXPORT_SYMBOL(tegra_dc_get_stride);
+
 struct tegra_dc *tegra_dc_get_dc(unsigned idx)
 {
 	if (idx < TEGRA_MAX_DC)
diff --git a/drivers/video/tegra/dc/dc_reg.h b/drivers/video/tegra/dc/dc_reg.h
index ded64de2decc..0b628fc7a14a 100644
--- a/drivers/video/tegra/dc/dc_reg.h
+++ b/drivers/video/tegra/dc/dc_reg.h
@@ -431,6 +431,8 @@
 #define DC_WIN_LINE_STRIDE			0x70a
 #define  LINE_STRIDE(x)		(x)
 #define  UV_LINE_STRIDE(x)	(((x) & 0xffff) << 16)
+#define  GET_LINE_STRIDE(x)	((x) & 0xffff)
+#define  GET_UV_LINE_STRIDE(x)	(((x) >> 16) & 0xffff)
 #define DC_WIN_BUF_STRIDE			0x70b
 #define DC_WIN_UV_BUF_STRIDE			0x70c
 #define DC_WIN_BUFFER_ADDR_MODE			0x70d
diff --git a/drivers/video/tegra/dc/dsi.c b/drivers/video/tegra/dc/dsi.c
index 69cc60f70f1c..e402c416b779 100644
--- a/drivers/video/tegra/dc/dsi.c
+++ b/drivers/video/tegra/dc/dsi.c
@@ -1,7 +1,7 @@
 /*
  * drivers/video/tegra/dc/dsi.c
  *
- * Copyright (c) 2011, NVIDIA Corporation.
+ * Copyright (c) 2011-2012, NVIDIA Corporation.
  *
  * This software is licensed under the terms of the GNU General Public
  * License version 2, as published by the Free Software Foundation, and
@@ -31,6 +31,7 @@
 #include <mach/dc.h>
 #include <mach/fb.h>
 #include <mach/csi.h>
+#include <mach/iomap.h>
 #include <linux/nvhost.h>
 
 #include "dc_reg.h"
@@ -38,6 +39,9 @@
 #include "dsi_regs.h"
 #include "dsi.h"
 
+#define APB_MISC_GP_MIPI_PAD_CTRL_0 	(TEGRA_APB_MISC_BASE + 0x820)
+#define DSIB_MODE_ENABLE		0x2
+
 #define DSI_USE_SYNC_POINTS		1
 #define S_TO_MS(x)			(1000 * (x))
 
@@ -1621,6 +1625,15 @@ static void tegra_dsi_pad_calibration(struct tegra_dc_dsi_data *dsi)
 	tegra_vi_csi_writel(val, CSI_CIL_PAD_CONFIG);
 }
 
+/* Set DSIB_MODE_ENABLE in the APB_MISC_GP_MIPI_PAD_CTRL_0 register. */
+static void tegra_dsi_panelB_enable(void)
+{
+	unsigned int val = readl(IO_ADDRESS(APB_MISC_GP_MIPI_PAD_CTRL_0));
+
+	val |= DSIB_MODE_ENABLE;
+	writel(val, IO_ADDRESS(APB_MISC_GP_MIPI_PAD_CTRL_0));
+}
+
 static int tegra_dsi_init_hw(struct tegra_dc *dc,
 						struct tegra_dc_dsi_data *dsi)
 {
@@ -1634,7 +1647,7 @@ static int tegra_dsi_init_hw(struct tegra_dc *dc,
 
 	tegra_dsi_set_dsi_clk(dc, dsi, dsi->target_lp_clk_khz);
 	if (dsi->info.dsi_instance) {
-		/* TODO:Set the misc register*/
+		tegra_dsi_panelB_enable();
 	}
 
 	/* TODO: only need to change the timing for bta */
diff --git a/drivers/video/tegra/fb.c b/drivers/video/tegra/fb.c
index 50aa9b383059..1193a2eb8c52 100644
--- a/drivers/video/tegra/fb.c
+++ b/drivers/video/tegra/fb.c
@@ -44,7 +44,7 @@
 #include "dc/dc_priv.h"
 
-/* Pad pitch to 16-byte boundary. */
-#define TEGRA_LINEAR_PITCH_ALIGNMENT 16
+/* Pad pitch to 32-byte boundary. */
+#define TEGRA_LINEAR_PITCH_ALIGNMENT 32
 
 struct tegra_fb_info {
 	struct tegra_dc_win	*win;
@@ -527,6 +527,7 @@ struct tegra_fb_info *tegra_fb_register(struct nvhost_device *ndev,
 	unsigned long fb_size = 0;
 	unsigned long fb_phys = 0;
 	int ret = 0;
+	unsigned stride;
 
 	win = tegra_dc_get_window(dc, fb_data->win);
 	if (!win) {
@@ -560,6 +561,11 @@ struct tegra_fb_info *tegra_fb_register(struct nvhost_device *ndev,
 		tegra_fb->valid = true;
 	}
 
+	stride = tegra_dc_get_stride(dc, 0);
+	if (!stride) /* no stride from the dc: pad the pitch of this mode */
+		stride = round_up(fb_data->xres * fb_data->bits_per_pixel / 8,
+			TEGRA_LINEAR_PITCH_ALIGNMENT);
+
 	info->fbops = &tegra_fb_ops;
 	info->pseudo_palette = pseudo_palette;
 	info->screen_base = fb_base;
@@ -574,9 +580,7 @@ struct tegra_fb_info *tegra_fb_register(struct nvhost_device *ndev,
 	info->fix.smem_start	= fb_phys;
 	info->fix.smem_len	= fb_size;
 	info->fix.line_length = fb_data->xres * fb_data->bits_per_pixel / 8;
-	/* Pad the stride to 16-byte boundary. */
-	info->fix.line_length = round_up(info->fix.line_length,
-					TEGRA_LINEAR_PITCH_ALIGNMENT);
+	info->fix.line_length = stride;
 
 	info->var.xres			= fb_data->xres;
 	info->var.yres			= fb_data->yres;
diff --git a/drivers/video/tegra/host/bus_client.c b/drivers/video/tegra/host/bus_client.c
index fd632a6ea9c5..87aa9c64d363 100644
--- a/drivers/video/tegra/host/bus_client.c
+++ b/drivers/video/tegra/host/bus_client.c
@@ -141,12 +141,6 @@ static int nvhost_channelopen(struct inode *inode, struct file *filp)
 	priv->clientid = atomic_add_return(1,
 			&nvhost_get_host(ch->dev)->clientid);
 	priv->timeout = MAX_STUCK_CHECK_COUNT * SYNCPT_CHECK_PERIOD;
-
-	priv->job = nvhost_job_alloc(ch, priv->hwctx, &priv->hdr,
-			NULL, priv->priority, priv->clientid);
-	if (!priv->job)
-		goto fail;
-
 	return 0;
 fail:
 	nvhost_channelrelease(inode, filp);
@@ -166,7 +160,7 @@ static int set_submit(struct nvhost_channel_userctx *ctx)
 		return -EFAULT;
 	}
 
-	ctx->job = nvhost_job_realloc(ctx->job,
+	ctx->job = nvhost_job_alloc(ctx->ch,
 			ctx->hwctx,
 			&ctx->hdr,
 			ctx->nvmap,
@@ -238,17 +232,28 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 				cmdbuf.mem, cmdbuf.words, cmdbuf.offset);
 			hdr->num_cmdbufs--;
 		} else if (hdr->num_relocs) {
-			consumed = sizeof(struct nvhost_reloc);
-			if (remaining < consumed)
+			int numrelocs = remaining / sizeof(struct nvhost_reloc);
+			if (!numrelocs)
 				break;
-			if (copy_from_user(&job->pinarray[job->num_pins],
+			numrelocs = min_t(int, numrelocs, priv->hdr.num_relocs);
+			consumed = numrelocs * sizeof(struct nvhost_reloc);
+			if (copy_from_user(&job->relocarray[job->num_relocs],
 					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			trace_nvhost_channel_write_reloc(chname);
-			job->num_pins++;
-			hdr->num_relocs--;
+			while (numrelocs) {
+				struct nvhost_reloc *reloc =
+					&job->relocarray[job->num_relocs];
+				trace_nvhost_channel_write_reloc(chname,
+					reloc->cmdbuf_mem,
+					reloc->cmdbuf_offset,
+					reloc->target,
+					reloc->target_offset);
+				job->num_relocs++;
+				hdr->num_relocs--;
+				numrelocs--;
+			}
 		} else if (hdr->num_waitchks) {
 			int numwaitchks =
 				(remaining / sizeof(struct nvhost_waitchk));
@@ -269,17 +274,19 @@ static ssize_t nvhost_channelwrite(struct file *filp, const char __user *buf,
 			hdr->num_waitchks -= numwaitchks;
 		} else if (priv->num_relocshifts) {
 			int next_shift =
-				job->num_pins - priv->num_relocshifts;
-			consumed = sizeof(struct nvhost_reloc_shift);
-			if (remaining < consumed)
+				job->num_relocs - priv->num_relocshifts;
+			int num =
+				(remaining / sizeof(struct nvhost_reloc_shift));
+			if (!num)
 				break;
-			if (copy_from_user(
-					&job->pinarray[next_shift].reloc_shift,
+			num = min_t(int, num, priv->num_relocshifts);
+			consumed = num * sizeof(struct nvhost_reloc_shift);
+			if (copy_from_user(&job->relocshiftarray[next_shift],
 					buf, consumed)) {
 				err = -EFAULT;
 				break;
 			}
-			priv->num_relocshifts--;
+			priv->num_relocshifts -= num;
 		} else {
 			err = -EFAULT;
 			break;
@@ -302,7 +309,7 @@ static int nvhost_ioctl_channel_flush(
 	struct nvhost_get_param_args *args,
 	int null_kickoff)
 {
-	struct device *device = &ctx->ch->dev->dev;
+	struct nvhost_device *ndev = to_nvhost_device(&ctx->ch->dev->dev);
 	int err;
 
 	trace_nvhost_ioctl_channel_flush(ctx->ch->dev->name);
@@ -312,13 +319,13 @@ static int nvhost_ioctl_channel_flush(
 	    ctx->hdr.num_cmdbufs ||
 	    ctx->hdr.num_waitchks) {
 		reset_submit(ctx);
-		dev_err(device, "channel submit out of sync\n");
+		dev_err(&ndev->dev, "channel submit out of sync\n");
 		return -EFAULT;
 	}
 
-	err = nvhost_job_pin(ctx->job);
+	err = nvhost_job_pin(ctx->job, &nvhost_get_host(ndev)->syncpt);
 	if (err) {
-		dev_warn(device, "nvhost_job_pin failed: %d\n", err);
+		dev_warn(&ndev->dev, "nvhost_job_pin failed: %d\n", err);
 		return err;
 	}
 
@@ -337,6 +344,9 @@ static int nvhost_ioctl_channel_flush(
 	if (err)
 		nvhost_job_unpin(ctx->job);
 
+	nvhost_job_put(ctx->job);
+	ctx->job = NULL;
+
 	return err;
 }
 
diff --git a/drivers/video/tegra/host/chip_support.h b/drivers/video/tegra/host/chip_support.h
index edc5f6a51574..d69e1c4bccb9 100644
--- a/drivers/video/tegra/host/chip_support.h
+++ b/drivers/video/tegra/host/chip_support.h
@@ -28,10 +28,9 @@ struct output;
 struct nvhost_master;
 struct nvhost_intr;
 struct nvhost_syncpt;
-struct nvhost_waitchk;
 struct nvhost_userctx_timeout;
 struct nvhost_channel;
-struct nvmap_handle;
+struct nvmap_handle_ref;
 struct nvmap_client;
 struct nvhost_hwctx;
 struct nvhost_cdma;
@@ -77,7 +76,7 @@ struct nvhost_chip_support {
 		void (*destroy)(struct push_buffer *);
 		void (*push_to)(struct push_buffer *,
 				struct nvmap_client *,
-				struct nvmap_handle *,
+				struct nvmap_handle_ref *,
 				u32 op1, u32 op2);
 		void (*pop_from)(struct push_buffer *,
 				 unsigned int slots);
@@ -106,11 +105,8 @@ struct nvhost_chip_support {
 		void (*read_wait_base)(struct nvhost_syncpt *, u32 id);
 		u32 (*update_min)(struct nvhost_syncpt *, u32 id);
 		void (*cpu_incr)(struct nvhost_syncpt *, u32 id);
-		int (*wait_check)(struct nvhost_syncpt *sp,
-				  struct nvmap_client *nvmap,
-				  u32 waitchk_mask,
-				  struct nvhost_waitchk *wait,
-				  int num_waitchk);
+		int (*patch_wait)(struct nvhost_syncpt *sp,
+				void *patch_addr);
 		void (*debug)(struct nvhost_syncpt *);
 		const char * (*name)(struct nvhost_syncpt *, u32 id);
 		int (*mutex_try_lock)(struct nvhost_syncpt *,
diff --git a/drivers/video/tegra/host/debug.c b/drivers/video/tegra/host/debug.c
index 8a26f92c79f6..820eac85521d 100644
--- a/drivers/video/tegra/host/debug.c
+++ b/drivers/video/tegra/host/debug.c
@@ -106,13 +106,53 @@ static void show_all(struct nvhost_master *m, struct output *o)
 	nvhost_get_chip_ops()->debug.show_mlocks(m, o);
 	show_syncpts(m, o);
 	nvhost_debug_output(o, "---- channels ----\n");
-	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o, show_channels);
+	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o,
+			show_channels);
 
 	nvhost_module_idle(m->dev);
 }
 
 #ifdef CONFIG_DEBUG_FS
-static int nvhost_debug_show(struct seq_file *s, void *unused)
+/* bus_for_each_dev() callback: show CDMA state of a referenced channel. */
+static int show_channels_no_fifo(struct device *dev, void *data)
+{
+	struct nvhost_device *ndev = to_nvhost_device(dev);
+	struct nvhost_channel *chan;
+	struct nvhost_master *host;
+	struct output *out = data;
+
+	if (!ndev)
+		return 0;
+	host = nvhost_get_host(ndev);
+	chan = ndev->channel;
+	if (!chan)
+		return 0;
+
+	mutex_lock(&chan->reflock);
+	if (chan->refcount) {
+		mutex_lock(&chan->cdma.lock);
+		nvhost_get_chip_ops()->debug.show_channel_cdma(host,
+				chan, out, ndev->index);
+		mutex_unlock(&chan->cdma.lock);
+	}
+	mutex_unlock(&chan->reflock);
+	return 0;
+}
+
+static void show_all_no_fifo(struct nvhost_master *m, struct output *o)
+{
+	nvhost_module_busy(m->dev);
+
+	nvhost_get_chip_ops()->debug.show_mlocks(m, o);
+	show_syncpts(m, o);
+	nvhost_debug_output(o, "---- channels ----\n");
+	bus_for_each_dev(&(nvhost_bus_get())->nvhost_bus_type, NULL, o,
+			show_channels_no_fifo);
+
+	nvhost_module_idle(m->dev);
+}
+
+static int nvhost_debug_show_all(struct seq_file *s, void *unused)
 {
 	struct output o = {
 		.fn = write_to_seqfile,
@@ -121,6 +161,27 @@ static int nvhost_debug_show(struct seq_file *s, void *unused)
 	show_all(s->private, &o);
 	return 0;
 }
+static int nvhost_debug_show(struct seq_file *s, void *unused)
+{
+	struct output o = {
+		.fn = write_to_seqfile,
+		.ctx = s
+	};
+	show_all_no_fifo(s->private, &o);
+	return 0;
+}
+
+static int nvhost_debug_open_all(struct inode *inode, struct file *file)
+{
+	return single_open(file, nvhost_debug_show_all, inode->i_private);
+}
+
+static const struct file_operations nvhost_debug_all_fops = {
+	.open		= nvhost_debug_open_all,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
 
 static int nvhost_debug_open(struct inode *inode, struct file *file)
 {
@@ -140,6 +201,8 @@ void nvhost_debug_init(struct nvhost_master *master)
 
 	debugfs_create_file("status", S_IRUGO, de,
 			master, &nvhost_debug_fops);
+	debugfs_create_file("status_all", S_IRUGO, de,
+			master, &nvhost_debug_all_fops);
 
 	debugfs_create_u32("null_kickoff_pid", S_IRUGO|S_IWUSR, de,
 			&nvhost_debug_null_kickoff_pid);
diff --git a/drivers/video/tegra/host/gr3d/gr3d_t20.c b/drivers/video/tegra/host/gr3d/gr3d_t20.c
index c0efac03b882..5645f5b2b0c6 100644
--- a/drivers/video/tegra/host/gr3d/gr3d_t20.c
+++ b/drivers/video/tegra/host/gr3d/gr3d_t20.c
@@ -138,7 +138,7 @@ static void save_push_v0(struct nvhost_hwctx *nctx, struct nvhost_cdma *cdma)
 
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			p->save_buf->handle,
+			p->save_buf,
 			0,
 			nvhost_opcode_gather(p->save_size),
 			p->save_phys);
diff --git a/drivers/video/tegra/host/gr3d/gr3d_t30.c b/drivers/video/tegra/host/gr3d/gr3d_t30.c
index 93d98dfa645c..57f4c779eff8 100644
--- a/drivers/video/tegra/host/gr3d/gr3d_t30.c
+++ b/drivers/video/tegra/host/gr3d/gr3d_t30.c
@@ -145,7 +145,7 @@ static void save_push_v1(struct nvhost_hwctx *nctx, struct nvhost_cdma *cdma)
 	/* gather the save buffer */
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			p->save_buf->handle,
+			p->save_buf,
 			0,
 			nvhost_opcode_gather(p->save_size),
 			p->save_phys);
diff --git a/drivers/video/tegra/host/host1x/host1x_cdma.c b/drivers/video/tegra/host/host1x/host1x_cdma.c
index fcb1f05f0025..4569c3d62494 100644
--- a/drivers/video/tegra/host/host1x/host1x_cdma.c
+++ b/drivers/video/tegra/host/host1x/host1x_cdma.c
@@ -137,7 +137,7 @@ static void push_buffer_destroy(struct push_buffer *pb)
  */
 static void push_buffer_push_to(struct push_buffer *pb,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle, u32 op1, u32 op2)
+		struct nvmap_handle_ref *handle, u32 op1, u32 op2)
 {
 	u32 cur = pb->cur;
 	u32 *p = (u32 *)((u32)pb->mapped + cur);
diff --git a/drivers/video/tegra/host/host1x/host1x_channel.c b/drivers/video/tegra/host/host1x/host1x_channel.c
index 8c4a7a5c74ad..c72e6478b806 100644
--- a/drivers/video/tegra/host/host1x/host1x_channel.c
+++ b/drivers/video/tegra/host/host1x/host1x_channel.c
@@ -143,7 +143,7 @@ static void submit_ctxrestore(struct nvhost_job *job)
 	/* Send restore buffer to channel */
 	nvhost_cdma_push_gather(&ch->cdma,
 		host->nvmap,
-		nvmap_ref_to_handle(ctx->restore),
+		ctx->restore,
 		0,
 		nvhost_opcode_gather(ctx->restore_size),
 		ctx->restore_phys);
@@ -188,7 +188,7 @@ void submit_gathers(struct nvhost_job *job)
 		u32 op2 = job->gathers[i].mem;
 		nvhost_cdma_push_gather(&job->ch->cdma,
 				job->nvmap,
-				nvmap_id_to_handle(job->gathers[i].mem_id),
+				job->gathers[i].ref,
 				job->gathers[i].offset,
 				op1, op2);
 	}
@@ -242,22 +242,6 @@ int host1x_channel_submit(struct nvhost_job *job)
 		goto error;
 	}
 
-	/* remove stale waits */
-	if (job->num_waitchk) {
-		err = nvhost_syncpt_wait_check(sp,
-					       job->nvmap,
-					       job->waitchk_mask,
-					       job->waitchk,
-					       job->num_waitchk);
-		if (err) {
-			dev_warn(&ch->dev->dev,
-				 "nvhost_syncpt_wait_check failed: %d\n", err);
-			mutex_unlock(&ch->submitlock);
-			nvhost_module_idle(ch->dev);
-			goto error;
-		}
-	}
-
 	/* begin a CDMA submit */
 	err = nvhost_cdma_begin(&ch->cdma, job);
 	if (err) {
diff --git a/drivers/video/tegra/host/host1x/host1x_debug.c b/drivers/video/tegra/host/host1x/host1x_debug.c
index 76483d82528b..7de342298c4d 100644
--- a/drivers/video/tegra/host/host1x/host1x_debug.c
+++ b/drivers/video/tegra/host/host1x/host1x_debug.c
@@ -28,8 +28,8 @@
 #include "host1x_hardware.h"
 #include "nvhost_cdma.h"
 #include "nvhost_channel.h"
-#include "../../nvmap/nvmap.h"
 #include "host1x_cdma.h"
+#include "nvhost_job.h"
 
 #define NVHOST_DEBUG_MAX_PAGE_OFFSET 102400
 
@@ -160,6 +160,34 @@ static void show_channel_word(struct output *o, int *state, int *count,
 	}
 }
 
+/* Decode and print the words of one gather mapped at map_addr. */
+static void do_show_channel_gather(struct output *o,
+		phys_addr_t phys_addr,
+		u32 words, struct nvhost_cdma *cdma,
+		phys_addr_t pin_addr, u32 *map_addr)
+{
+	u32 offset = phys_addr - pin_addr;
+	/* GATHER buffer starts always with commands */
+	int state = NVHOST_DBG_STATE_CMD;
+	int count, i;
+	u32 *word;
+
+	/*
+	 * Sometimes we're given different hardware address to the same
+	 * page - in these cases the offset will get an invalid number and
+	 * we just have to bail out.
+	 */
+	if (offset > NVHOST_DEBUG_MAX_PAGE_OFFSET) {
+		nvhost_debug_output(o, "[address mismatch]\n");
+		return;
+	}
+
+	word = map_addr + offset / 4;
+	for (i = 0; i < words; i++)
+		show_channel_word(o, &state, &count,
+				phys_addr + i * 4, word[i], cdma);
+}
+
 static void show_channel_gather(struct output *o, u32 addr,
 		phys_addr_t phys_addr,
 		u32 words, struct nvhost_cdma *cdma)
@@ -169,81 +197,36 @@ static void show_channel_gather(struct output *o, u32 addr,
 	struct push_buffer *pb = &cdma->push_buffer;
 	u32 cur = addr - pb->phys;
 	struct nvmap_client_handle *nvmap = &pb->nvmap[cur/8];
-	struct nvmap_handle_ref ref;
 	u32 *map_addr, offset;
 	phys_addr_t pin_addr;
-	int state, count, i;
 
-	if (!nvmap->handle || !nvmap->client
-			|| atomic_read(&nvmap->handle->ref) < 1) {
+	if (!nvmap || !nvmap->handle || !nvmap->client) {
 		nvhost_debug_output(o, "[already deallocated]\n");
 		return;
 	}
 
-	/* Create a fake nvmap_handle_ref - nvmap requires it
-	 * but accesses only the first field - nvmap_handle */
-	ref.handle = nvmap->handle;
-
-	map_addr = nvmap_mmap(&ref);
+	map_addr = nvmap_mmap(nvmap->handle);
 	if (!map_addr) {
 		nvhost_debug_output(o, "[could not mmap]\n");
 		return;
 	}
 
 	/* Get base address from nvmap */
-	pin_addr = nvmap_pin(nvmap->client, &ref);
+	pin_addr = nvmap_pin(nvmap->client, nvmap->handle);
 	if (IS_ERR_VALUE(pin_addr)) {
 		nvhost_debug_output(o, "[couldn't pin]\n");
-		nvmap_munmap(&ref, map_addr);
+		nvmap_munmap(nvmap->handle, map_addr);
 		return;
 	}
 
 	offset = phys_addr - pin_addr;
-	/*
-	 * Sometimes we're given different hardware address to the same
-	 * page - in these cases the offset will get an invalid number and
-	 * we just have to bail out.
-	 */
-	if (offset > NVHOST_DEBUG_MAX_PAGE_OFFSET) {
-		nvhost_debug_output(o, "[address mismatch]\n");
-	} else {
-		/* GATHER buffer starts always with commands */
-		state = NVHOST_DBG_STATE_CMD;
-		for (i = 0; i < words; i++)
-			show_channel_word(o, &state, &count,
-					phys_addr + i * 4,
-					*(map_addr + offset/4 + i),
-					cdma);
-	}
-	nvmap_unpin(nvmap->client, &ref);
-	nvmap_munmap(&ref, map_addr);
+	do_show_channel_gather(o, phys_addr, words, cdma,
+			pin_addr, map_addr);
+	nvmap_unpin(nvmap->client, nvmap->handle);
+	nvmap_munmap(nvmap->handle, map_addr);
 #endif
 }
 
-static void show_channel_pair(struct output *o, u32 addr,
-		u32 w0, u32 w1, struct nvhost_cdma *cdma)
-{
-	int state = NVHOST_DBG_STATE_CMD;
-	int count;
-
-	show_channel_word(o, &state, &count, addr, w0, cdma);
-	show_channel_word(o, &state, &count, addr+4, w1, cdma);
-}
-
-/**
- * Retrieve the op pair at a slot offset from a DMA address
- */
-static void cdma_peek(struct nvhost_cdma *cdma,
-		      u32 dmaget, int slot, u32 *out)
-{
-	u32 offset = dmaget - cdma->push_buffer.phys;
-	u32 *p = cdma->push_buffer.mapped;
-
-	offset = ((offset + slot * 8) & (PUSH_BUFFER_SIZE - 1)) >> 2;
-	out[0] = p[offset];
-	out[1] = p[offset + 1];
-}
-
 u32 previous_oppair(struct nvhost_cdma *cdma, u32 cur)
 {
 	u32 pb = cdma->push_buffer.phys;
@@ -253,6 +236,42 @@ u32 previous_oppair(struct nvhost_cdma *cdma, u32 cur)
 	return prev;
 }
 
+/* Print every job on cdma's sync queue along with its decoded gathers. */
+void show_channel_gathers(struct output *o, struct nvhost_cdma *cdma)
+{
+	struct nvhost_job *job;
+
+	list_for_each_entry(job, &cdma->sync_queue, list) {
+		struct nvhost_job_gather *gather = job->gathers;
+		int idx;
+
+		nvhost_debug_output(o, "\n%p: JOB, syncpt_id=%d, syncpt_val=%d,"
+				" first_get=%08x, timeout=%d, ctx=%p,"
+				" num_slots=%d, num_handles=%d\n",
+				job, job->syncpt_id, job->syncpt_end,
+				job->first_get, job->timeout, job->hwctx,
+				job->num_slots, job->num_unpins);
+
+		for (idx = 0; idx < job->num_gathers; idx++, gather++) {
+			u32 *words = nvmap_mmap(gather->ref);
+
+			if (!words) {
+				nvhost_debug_output(o, "[could not mmap]\n");
+				continue;
+			}
+
+			nvhost_debug_output(o, "    GATHER at %08x, %d words\n",
+				gather->mem, gather->words);
+			/* gather->mem is passed as both base and pin address */
+			do_show_channel_gather(o,
+					gather->mem + gather->offset,
+					gather->words, cdma,
+					gather->mem, words);
+			nvmap_munmap(gather->ref, words);
+		}
+	}
+}
+
 static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 	struct nvhost_channel *ch, struct output *o, int chid)
 {
@@ -261,7 +280,6 @@ static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 	u32 dmaput, dmaget, dmactrl;
 	u32 cbstat, cbread;
 	u32 val, base, baseval;
-	u32 pbw[2];
 
 	dmaput = readl(channel->aperture + HOST1X_CHANNEL_DMAPUT);
 	dmaget = readl(channel->aperture + HOST1X_CHANNEL_DMAGET);
@@ -310,9 +328,7 @@ static void t20_debug_show_channel_cdma(struct nvhost_master *m,
 		dmaput, dmaget, dmactrl);
 	nvhost_debug_output(o, "CBREAD %08x, CBSTAT %08x\n", cbread, cbstat);
 
-	cdma_peek(cdma, dmaget, -1, pbw);
-	show_channel_pair(o, previous_oppair(cdma, dmaget),
-		pbw[0], pbw[1], &channel->cdma);
+	show_channel_gathers(o, cdma);
 	nvhost_debug_output(o, "\n");
 }
 
diff --git a/drivers/video/tegra/host/host1x/host1x_syncpt.c b/drivers/video/tegra/host/host1x/host1x_syncpt.c
index b7d6587acc61..4cc8e9e212fa 100644
--- a/drivers/video/tegra/host/host1x/host1x_syncpt.c
+++ b/drivers/video/tegra/host/host1x/host1x_syncpt.c
@@ -103,62 +103,14 @@ static void t20_syncpt_cpu_incr(struct nvhost_syncpt *sp, u32 id)
 	wmb();
 }
 
-/* check for old WAITs to be removed (avoiding a wrap) */
-static int t20_syncpt_wait_check(struct nvhost_syncpt *sp,
-				 struct nvmap_client *nvmap,
-				 u32 waitchk_mask,
-				 struct nvhost_waitchk *wait,
-				 int num_waitchk)
+/* remove a wait pointed to by patch_addr */
+static int host1x_syncpt_patch_wait(struct nvhost_syncpt *sp,
+		void *patch_addr)
 {
-	u32 idx;
-	int err = 0;
-
-	/* get current syncpt values */
-	for (idx = 0; idx < NV_HOST1X_SYNCPT_NB_PTS; idx++) {
-		if (BIT(idx) & waitchk_mask)
-			nvhost_syncpt_update_min(sp, idx);
-	}
-
-	BUG_ON(!wait && !num_waitchk);
-
-	/* compare syncpt vs wait threshold */
-	while (num_waitchk) {
-		u32 override;
-
-		BUG_ON(wait->syncpt_id >= NV_HOST1X_SYNCPT_NB_PTS);
-		trace_nvhost_syncpt_wait_check(wait->mem, wait->offset,
-				wait->syncpt_id, wait->thresh);
-		if (nvhost_syncpt_is_expired(sp,
-					wait->syncpt_id, wait->thresh)) {
-			/*
-			 * NULL an already satisfied WAIT_SYNCPT host method,
-			 * by patching its args in the command stream. The
-			 * method data is changed to reference a reserved
-			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
-			 * syncpt with a matching threshold value of 0, so
-			 * is guaranteed to be popped by the host HW.
-			 */
-			dev_dbg(&syncpt_to_dev(sp)->dev->dev,
-			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
-			    wait->syncpt_id,
-			    syncpt_op().name(sp, wait->syncpt_id),
-			    wait->thresh,
-			    nvhost_syncpt_read_min(sp, wait->syncpt_id));
-
-			/* patch the wait */
-			override = nvhost_class_host_wait_syncpt(
-					NVSYNCPT_GRAPHICS_HOST, 0);
-			err = nvmap_patch_word(nvmap,
-					(struct nvmap_handle *)wait->mem,
-					wait->offset, override);
-			if (err)
-				break;
-		}
-
-		wait++;
-		num_waitchk--;
-	}
-	return err;
+	u32 override = nvhost_class_host_wait_syncpt(
+			NVSYNCPT_GRAPHICS_HOST, 0);
+	__raw_writel(override, patch_addr);
+	return 0;
 }
 
 
@@ -241,7 +193,7 @@ int host1x_init_syncpt_support(struct nvhost_master *host,
 	op->syncpt.read_wait_base = t20_syncpt_read_wait_base;
 	op->syncpt.update_min = t20_syncpt_update_min;
 	op->syncpt.cpu_incr = t20_syncpt_cpu_incr;
-	op->syncpt.wait_check = t20_syncpt_wait_check;
+	op->syncpt.patch_wait = host1x_syncpt_patch_wait;
 	op->syncpt.debug = t20_syncpt_debug;
 	op->syncpt.name = t20_syncpt_name;
 	op->syncpt.mutex_try_lock = syncpt_mutex_try_lock;
diff --git a/drivers/video/tegra/host/mpe/mpe.c b/drivers/video/tegra/host/mpe/mpe.c
index d8c9da7e9a76..3fe2fcd8bb50 100644
--- a/drivers/video/tegra/host/mpe/mpe.c
+++ b/drivers/video/tegra/host/mpe/mpe.c
@@ -502,7 +502,7 @@ static void ctxmpe_save_push(struct nvhost_hwctx *nctx,
 	struct host1x_hwctx_handler *h = host1x_hwctx_handler(ctx);
 	nvhost_cdma_push_gather(cdma,
 			nvhost_get_host(nctx->channel->dev)->nvmap,
-			h->save_buf->handle,
+			h->save_buf,
 			0,
 			nvhost_opcode_gather(h->save_size),
 			h->save_phys);
diff --git a/drivers/video/tegra/host/nvhost_cdma.c b/drivers/video/tegra/host/nvhost_cdma.c
index a72e18f16ac7..c87415bf5ac2 100644
--- a/drivers/video/tegra/host/nvhost_cdma.c
+++ b/drivers/video/tegra/host/nvhost_cdma.c
@@ -53,6 +53,18 @@ static void add_to_sync_queue(struct nvhost_cdma *cdma,
 	job->num_slots = nr_slots;
 	nvhost_job_get(job);
 	list_add_tail(&job->list, &cdma->sync_queue);
+
+	switch (job->priority) {
+	case NVHOST_PRIORITY_HIGH:
+		cdma->high_prio_count++;
+		break;
+	case NVHOST_PRIORITY_MEDIUM:
+		cdma->med_prio_count++;
+		break;
+	case NVHOST_PRIORITY_LOW:
+		cdma->low_prio_count++;
+		break;
+	}
 }
 
 /**
@@ -200,6 +212,19 @@ static void update_cdma_locked(struct nvhost_cdma *cdma)
 		}
 
 		list_del(&job->list);
+
+		switch (job->priority) {
+		case NVHOST_PRIORITY_HIGH:
+			cdma->high_prio_count--;
+			break;
+		case NVHOST_PRIORITY_MEDIUM:
+			cdma->med_prio_count--;
+			break;
+		case NVHOST_PRIORITY_LOW:
+			cdma->low_prio_count--;
+			break;
+		}
+
 		nvhost_job_put(job);
 	}
 
@@ -371,15 +396,13 @@ int nvhost_cdma_begin(struct nvhost_cdma *cdma, struct nvhost_job *job)
 }
 
 static void trace_write_gather(struct nvhost_cdma *cdma,
-		struct nvmap_handle *handle,
+		struct nvmap_handle_ref *ref,
 		u32 offset, u32 words)
 {
-	struct nvmap_handle_ref ref;
 	void *mem = NULL;
 
 	if (nvhost_debug_trace_cmdbuf) {
-		ref.handle = handle;
-		mem = nvmap_mmap(&ref);
+		mem = nvmap_mmap(ref);
 		if (IS_ERR_OR_NULL(mem))
 			mem = NULL;
 	};
@@ -393,12 +416,12 @@ static void trace_write_gather(struct nvhost_cdma *cdma,
 		for (i = 0; i < words; i += TRACE_MAX_LENGTH) {
 			trace_nvhost_cdma_push_gather(
 				cdma_to_channel(cdma)->dev->name,
-				(u32)handle,
+				(u32)ref->handle,
 				min(words - i, TRACE_MAX_LENGTH),
 				offset + i * sizeof(u32),
 				mem);
 		}
-		nvmap_munmap(&ref, mem);
+		nvmap_munmap(ref, mem);
 	}
 }
 
@@ -421,7 +444,7 @@ void nvhost_cdma_push(struct nvhost_cdma *cdma, u32 op1, u32 op2)
  */
 void nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle,
+		struct nvmap_handle_ref *handle,
 		u32 offset, u32 op1, u32 op2)
 {
 	u32 slots_free = cdma->slots_free;
@@ -468,6 +491,12 @@ void nvhost_cdma_end(struct nvhost_cdma *cdma,
 	if (job->timeout && was_idle)
 		cdma_start_timer_locked(cdma, job);
 
+	trace_nvhost_cdma_end(job->ch->dev->name,
+			job->priority,
+			job->ch->cdma.high_prio_count,
+			job->ch->cdma.med_prio_count,
+			job->ch->cdma.low_prio_count);
+
 	mutex_unlock(&cdma->lock);
 }
 
@@ -492,6 +521,8 @@ int nvhost_cdma_flush(struct nvhost_cdma *cdma, int timeout)
 	unsigned int space, err = 0;
 	unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout);
 
+	trace_nvhost_cdma_flush(cdma_to_channel(cdma)->dev->name, timeout);
+
 	/*
 	 * Wait for at most timeout ms. Recalculate timeout at each iteration
 	 * to better keep within given timeout.
diff --git a/drivers/video/tegra/host/nvhost_cdma.h b/drivers/video/tegra/host/nvhost_cdma.h
index e6f51179150f..2056774a7bc7 100644
--- a/drivers/video/tegra/host/nvhost_cdma.h
+++ b/drivers/video/tegra/host/nvhost_cdma.h
@@ -48,7 +48,7 @@ struct nvhost_job;
 
 struct nvmap_client_handle {
 	struct nvmap_client *client;
-	struct nvmap_handle *handle;
+	struct nvmap_handle_ref *handle;
 };
 
 struct push_buffer {
@@ -99,6 +99,9 @@ struct nvhost_cdma {
 	struct buffer_timeout timeout;	/* channel's timeout state/wq */
 	bool running;
 	bool torndown;
+	int high_prio_count;
+	int med_prio_count;
+	int low_prio_count;
 };
 
 #define cdma_to_channel(cdma) container_of(cdma, struct nvhost_channel, cdma)
@@ -113,7 +116,7 @@ int	nvhost_cdma_begin(struct nvhost_cdma *cdma, struct nvhost_job *job);
 void	nvhost_cdma_push(struct nvhost_cdma *cdma, u32 op1, u32 op2);
 void	nvhost_cdma_push_gather(struct nvhost_cdma *cdma,
 		struct nvmap_client *client,
-		struct nvmap_handle *handle, u32 offset, u32 op1, u32 op2);
+		struct nvmap_handle_ref *handle, u32 offset, u32 op1, u32 op2);
 void	nvhost_cdma_end(struct nvhost_cdma *cdma,
 		struct nvhost_job *job);
 void	nvhost_cdma_update(struct nvhost_cdma *cdma);
diff --git a/drivers/video/tegra/host/nvhost_channel.c b/drivers/video/tegra/host/nvhost_channel.c
index ef8886fe4652..ad303cf0a22d 100644
--- a/drivers/video/tegra/host/nvhost_channel.c
+++ b/drivers/video/tegra/host/nvhost_channel.c
@@ -51,10 +51,26 @@ int nvhost_channel_init(struct nvhost_channel *ch,
 
 int nvhost_channel_submit(struct nvhost_job *job)
 {
-	/* Low priority submits wait until sync queue is empty. Ignores result
-	 * from nvhost_cdma_flush, as we submit either when push buffer is
-	 * empty or when we reach the timeout. */
-	if (job->priority < NVHOST_PRIORITY_MEDIUM)
+	/*
+	 * Check if queue has higher priority jobs running. If so, wait until
+	 * queue is empty. Ignores result from nvhost_cdma_flush, as we submit
+	 * either when push buffer is empty or when we reach the timeout.
+	 */
+	int higher_count = 0;
+
+	switch (job->priority) {
+	case NVHOST_PRIORITY_HIGH:
+		higher_count = 0;
+		break;
+	case NVHOST_PRIORITY_MEDIUM:
+		higher_count = job->ch->cdma.high_prio_count;
+		break;
+	case NVHOST_PRIORITY_LOW:
+		higher_count = job->ch->cdma.high_prio_count
+			+ job->ch->cdma.med_prio_count;
+		break;
+	}
+	if (higher_count > 0)
 		(void)nvhost_cdma_flush(&job->ch->cdma,
 				NVHOST_CHANNEL_LOW_PRIO_MAX_WAIT);
 
diff --git a/drivers/video/tegra/host/nvhost_channel.h b/drivers/video/tegra/host/nvhost_channel.h
index eac51731547b..b3a904d5a3ee 100644
--- a/drivers/video/tegra/host/nvhost_channel.h
+++ b/drivers/video/tegra/host/nvhost_channel.h
@@ -31,18 +31,10 @@
 #define NVHOST_MAX_POWERGATE_IDS	2
 
 struct nvhost_master;
-struct nvhost_waitchk;
 struct nvhost_device;
 struct nvhost_channel;
 struct nvhost_hwctx;
 
-struct nvhost_channel_gather {
-	u32 words;
-	phys_addr_t mem;
-	u32 mem_id;
-	int offset;
-};
-
 struct nvhost_channel {
 	int refcount;
 	int chid;
diff --git a/drivers/video/tegra/host/nvhost_intr.c b/drivers/video/tegra/host/nvhost_intr.c
index ba821f694cb4..af2e3ad1bdb5 100644
--- a/drivers/video/tegra/host/nvhost_intr.c
+++ b/drivers/video/tegra/host/nvhost_intr.c
@@ -128,12 +128,16 @@ static void action_submit_complete(struct nvhost_waitlist *waiter)
 	struct nvhost_channel *channel = waiter->data;
 	int nr_completed = waiter->count;
 
+	nvhost_cdma_update(&channel->cdma);
+	nvhost_module_idle_mult(channel->dev, nr_completed);
+
 	/*  Add nr_completed to trace */
 	trace_nvhost_channel_submit_complete(channel->dev->name,
-			nr_completed, waiter->thresh);
+			nr_completed, waiter->thresh,
+			channel->cdma.high_prio_count,
+			channel->cdma.med_prio_count,
+			channel->cdma.low_prio_count);
 
-	nvhost_cdma_update(&channel->cdma);
-	nvhost_module_idle_mult(channel->dev, nr_completed);
 }
 
 static void action_ctxsave(struct nvhost_waitlist *waiter)
diff --git a/drivers/video/tegra/host/nvhost_job.c b/drivers/video/tegra/host/nvhost_job.c
index 71f2ab0e751f..e029449b6184 100644
--- a/drivers/video/tegra/host/nvhost_job.c
+++ b/drivers/video/tegra/host/nvhost_job.c
@@ -23,9 +23,11 @@
 #include <linux/err.h>
 #include <linux/vmalloc.h>
 #include <linux/nvmap.h>
+#include <trace/events/nvhost.h>
 #include "nvhost_channel.h"
 #include "nvhost_job.h"
 #include "nvhost_hwctx.h"
+#include "nvhost_syncpt.h"
 #include "dev.h"
 
 /* Magic to use to fill freed handle slots */
@@ -33,128 +35,44 @@
 
 static int job_size(struct nvhost_submit_hdr_ext *hdr)
 {
-	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_relocs = hdr ? hdr->num_relocs : 0;
 	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int num_unpins = num_cmdbufs + num_relocs;
 
 	return sizeof(struct nvhost_job)
-			+ num_pins * sizeof(struct nvmap_pinarray_elem)
-			+ num_pins * sizeof(struct nvmap_handle *)
-			+ num_waitchks * sizeof(struct nvhost_waitchk);
-}
-
-static int gather_size(int num_cmdbufs)
-{
-	return num_cmdbufs * sizeof(struct nvhost_channel_gather);
-}
-
-static void free_gathers(struct nvhost_job *job)
-{
-	if (job->gathers) {
-		nvmap_munmap(job->gather_mem, job->gathers);
-		job->gathers = NULL;
-	}
-	if (job->gather_mem) {
-		nvmap_free(job->nvmap, job->gather_mem);
-		job->gather_mem = NULL;
-	}
-}
-
-static int alloc_gathers(struct nvhost_job *job,
-		int num_cmdbufs)
-{
-	int err = 0;
-
-	job->gather_mem = NULL;
-	job->gathers = NULL;
-	job->gather_mem_size = 0;
-
-	if (num_cmdbufs) {
-		/* Allocate memory */
-		job->gather_mem = nvmap_alloc(job->nvmap,
-				gather_size(num_cmdbufs),
-				32, NVMAP_HANDLE_CACHEABLE, 0);
-		if (IS_ERR_OR_NULL(job->gather_mem)) {
-			err = job->gather_mem ? PTR_ERR(job->gather_mem) : -ENOMEM;
-			job->gather_mem = NULL;
-			goto error;
-		}
-		job->gather_mem_size = gather_size(num_cmdbufs);
-
-		/* Map memory to kernel */
-		job->gathers = nvmap_mmap(job->gather_mem);
-		if (IS_ERR_OR_NULL(job->gathers)) {
-			err = job->gathers ? PTR_ERR(job->gathers) : -ENOMEM;
-			job->gathers = NULL;
-			goto error;
-		}
-	}
-
-	return 0;
-
-error:
-	free_gathers(job);
-	return err;
-}
-
-static int realloc_gathers(struct nvhost_job *oldjob,
-		struct nvhost_job *newjob,
-		int num_cmdbufs)
-{
-	int err = 0;
-
-	/* Check if we can reuse gather buffer */
-	if (oldjob->gather_mem_size < gather_size(num_cmdbufs)
-			|| oldjob->nvmap != newjob->nvmap) {
-		free_gathers(oldjob);
-		err = alloc_gathers(newjob, num_cmdbufs);
-	} else {
-		newjob->gather_mem = oldjob->gather_mem;
-		newjob->gathers = oldjob->gathers;
-		newjob->gather_mem_size = oldjob->gather_mem_size;
-
-		oldjob->gather_mem = NULL;
-		oldjob->gathers = NULL;
-		oldjob->gather_mem_size = 0;
-	}
-	return err;
+			+ num_relocs * sizeof(struct nvhost_reloc)
+			+ num_relocs * sizeof(struct nvhost_reloc_shift)
+			+ num_unpins * sizeof(struct nvmap_handle_ref *)
+			+ num_waitchks * sizeof(struct nvhost_waitchk)
+			+ num_cmdbufs * sizeof(struct nvhost_job_gather);
 }
 
 static void init_fields(struct nvhost_job *job,
 		struct nvhost_submit_hdr_ext *hdr,
 		int priority, int clientid)
 {
-	int num_pins = hdr ? (hdr->num_relocs + hdr->num_cmdbufs)*2 : 0;
+	int num_relocs = hdr ? hdr->num_relocs : 0;
 	int num_waitchks = hdr ? hdr->num_waitchks : 0;
+	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
+	int num_unpins = num_cmdbufs + num_relocs;
 	void *mem = job;
 
 	/* First init state to zero */
-	job->num_gathers = 0;
-	job->num_pins = 0;
-	job->num_unpins = 0;
-	job->num_waitchk = 0;
-	job->waitchk_mask = 0;
-	job->syncpt_id = 0;
-	job->syncpt_incrs = 0;
-	job->syncpt_end = 0;
 	job->priority = priority;
 	job->clientid = clientid;
-	job->null_kickoff = false;
-	job->first_get = 0;
-	job->num_slots = 0;
 
 	/* Redistribute memory to the structs */
 	mem += sizeof(struct nvhost_job);
-	if (num_pins) {
-		job->pinarray = mem;
-		mem += num_pins * sizeof(struct nvmap_pinarray_elem);
-		job->unpins = mem;
-		mem += num_pins * sizeof(struct nvmap_handle *);
-	} else {
-		job->pinarray = NULL;
-		job->unpins = NULL;
-	}
-
+	job->relocarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct nvhost_reloc);
+	job->relocshiftarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct nvhost_reloc_shift);
+	job->unpins = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(struct nvmap_handle_ref *);
 	job->waitchk = num_waitchks ? mem : NULL;
+	mem += num_waitchks * sizeof(struct nvhost_waitchk);
+	job->gathers = num_cmdbufs ? mem : NULL;
 
 	/* Copy information from header */
 	if (hdr) {
@@ -172,8 +90,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		int clientid)
 {
 	struct nvhost_job *job = NULL;
-	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
-	int err = 0;
 
 	job = vzalloc(job_size(hdr));
 	if (!job)
@@ -186,10 +102,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		hwctx->h->get(hwctx);
 	job->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
 
-	err = alloc_gathers(job, num_cmdbufs);
-	if (err)
-		goto error;
-
 	init_fields(job, hdr, priority, clientid);
 
 	return job;
@@ -200,46 +112,6 @@ error:
 	return NULL;
 }
 
-struct nvhost_job *nvhost_job_realloc(
-		struct nvhost_job *oldjob,
-		struct nvhost_hwctx *hwctx,
-		struct nvhost_submit_hdr_ext *hdr,
-		struct nvmap_client *nvmap,
-		int priority, int clientid)
-{
-	struct nvhost_job *newjob = NULL;
-	int num_cmdbufs = hdr ? hdr->num_cmdbufs : 0;
-	int err = 0;
-
-	newjob = vzalloc(job_size(hdr));
-	if (!newjob)
-		goto error;
-	kref_init(&newjob->ref);
-	newjob->ch = oldjob->ch;
-	newjob->hwctx = hwctx;
-	if (hwctx)
-		newjob->hwctx->h->get(newjob->hwctx);
-	newjob->timeout = oldjob->timeout;
-	newjob->nvmap = nvmap ? nvmap_client_get(nvmap) : NULL;
-
-	err = realloc_gathers(oldjob, newjob, num_cmdbufs);
-	if (err)
-		goto error;
-
-	nvhost_job_put(oldjob);
-
-	init_fields(newjob, hdr, priority, clientid);
-
-	return newjob;
-
-error:
-	if (newjob)
-		nvhost_job_put(newjob);
-	if (oldjob)
-		nvhost_job_put(oldjob);
-	return NULL;
-}
-
 void nvhost_job_get(struct nvhost_job *job)
 {
 	kref_get(&job->ref);
@@ -253,10 +125,6 @@ static void job_free(struct kref *ref)
 		job->hwctxref->h->put(job->hwctxref);
 	if (job->hwctx)
 		job->hwctx->h->put(job->hwctx);
-	if (job->gathers)
-		nvmap_munmap(job->gather_mem, job->gathers);
-	if (job->gather_mem)
-		nvmap_free(job->nvmap, job->gather_mem);
 	if (job->nvmap)
 		nvmap_client_put(job->nvmap);
 	vfree(job);
@@ -280,42 +148,177 @@ void nvhost_job_put(struct nvhost_job *job)
 void nvhost_job_add_gather(struct nvhost_job *job,
 		u32 mem_id, u32 words, u32 offset)
 {
-	struct nvmap_pinarray_elem *pin;
-	struct nvhost_channel_gather *cur_gather =
+	struct nvhost_job_gather *cur_gather =
 			&job->gathers[job->num_gathers];
 
-	pin = &job->pinarray[job->num_pins++];
-	pin->patch_mem = (u32)nvmap_ref_to_handle(job->gather_mem);
-	pin->patch_offset = (void *)&(cur_gather->mem) - (void *)job->gathers;
-	pin->pin_mem = nvmap_convert_handle_u2k(mem_id);
-	pin->pin_offset = offset;
 	cur_gather->words = words;
 	cur_gather->mem_id = mem_id;
 	cur_gather->offset = offset;
 	job->num_gathers += 1;
 }
 
-int nvhost_job_pin(struct nvhost_job *job)
+static int do_relocs(struct nvhost_job *job, u32 cmdbuf_mem, void *cmdbuf_addr)
 {
-	int err = 0;
+	phys_addr_t target_phys = 0;
+	int i;
+	u32 mem_id = 0;
+	struct nvmap_handle_ref *target_ref = NULL;
+
+	/* pin targets and patch addresses of relocs in gather cmdbuf_mem */
+	for (i = 0; i < job->num_relocs; i++) {
+		struct nvhost_reloc *reloc = &job->relocarray[i];
+		struct nvhost_reloc_shift *shift = &job->relocshiftarray[i];
+
+		/* skip relocs that belong to other gathers */
+		if (cmdbuf_mem != reloc->cmdbuf_mem)
+			continue;
+
+		/* pin on first match or when target differs from previous */
+		if (!target_ref || reloc->target != mem_id) {
+			target_ref = nvmap_duplicate_handle_id(job->nvmap,
+					reloc->target);
+			if (IS_ERR(target_ref))
+				return PTR_ERR(target_ref);
+
+			target_phys = nvmap_pin(job->nvmap, target_ref);
+			if (IS_ERR((void *)target_phys)) {
+				nvmap_free(job->nvmap, target_ref);
+				return target_phys;
+			}
+
+			mem_id = reloc->target;
+			job->unpins[job->num_unpins++] = target_ref;
+		}
 
-	/* pin mem handles and patch physical addresses */
-	job->num_unpins = nvmap_pin_array(job->nvmap,
-				nvmap_ref_to_handle(job->gather_mem),
-				job->pinarray, job->num_pins,
-				job->unpins);
-	if (job->num_unpins < 0)
-		err = job->num_unpins;
+		__raw_writel(
+			(target_phys + reloc->target_offset) >> shift->shift,
+			(cmdbuf_addr + reloc->cmdbuf_offset));
+
+		/* Different gathers might have same mem_id. This ensures we
+		 * perform reloc only once per gather memid. */
+		reloc->cmdbuf_mem = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * For each waitchk of @job located in gather @patch_mem (mapped at
+ * @patch_addr), patch out the wait if its threshold on @sp has already
+ * expired, to avoid a wrap condition in the HW. Always returns 0.
+ */
+static int do_waitchks(struct nvhost_job *job, struct nvhost_syncpt *sp,
+		u32 patch_mem, void *patch_addr)
+{
+	int i;
+
+	/* compare syncpt vs wait threshold */
+	for (i = 0; i < job->num_waitchk; i++) {
+		struct nvhost_waitchk *wait = &job->waitchk[i];
+
+		/* skip waits located in other gathers */
+		if (patch_mem != wait->mem)
+			continue;
+
+		trace_nvhost_syncpt_wait_check(wait->mem, wait->offset,
+				wait->syncpt_id, wait->thresh,
+				nvhost_syncpt_read(sp, wait->syncpt_id));
+		if (nvhost_syncpt_is_expired(sp,
+					wait->syncpt_id, wait->thresh)) {
+			/*
+			 * NULL an already satisfied WAIT_SYNCPT host method,
+			 * by patching its args in the command stream. The
+			 * method data is changed to reference a reserved
+			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
+			 * syncpt with a matching threshold value of 0, so
+			 * is guaranteed to be popped by the host HW.
+			 */
+			dev_dbg(&syncpt_to_dev(sp)->dev->dev,
+			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
+			    wait->syncpt_id,
+			    syncpt_op().name(sp, wait->syncpt_id),
+			    wait->thresh,
+			    nvhost_syncpt_read_min(sp, wait->syncpt_id));
+
+			/* patch the wait at its offset in the mapped gather */
+			nvhost_syncpt_patch_wait(sp,
+					(patch_addr + wait->offset));
+		}
+
+		wait->mem = 0;	/* handled: no longer matches a nonzero mem */
+	}
+	return 0;
+}
+
+int nvhost_job_pin(struct nvhost_job *job, struct nvhost_syncpt *sp)
+{
+	int err = 0, i = 0;
+	phys_addr_t gather_phys = 0;
+	void *gather_addr = NULL;
+	unsigned long waitchk_mask = job->waitchk_mask;
+
+	/* get current syncpt values for waitchk; size arg is in bits */
+	for_each_set_bit(i, &waitchk_mask, BITS_PER_LONG)
+		nvhost_syncpt_update_min(sp, i);
+
+	/* pin gathers */
+	for (i = 0; i < job->num_gathers; i++) {
+		struct nvhost_job_gather *g = &job->gathers[i];
+		struct nvmap_handle_ref *ref;
+
+		/* process each gather mem only once */
+		if (!g->ref) {
+			ref = nvmap_duplicate_handle_id(job->nvmap, g->mem_id);
+			if (IS_ERR(ref)) {
+				err = PTR_ERR(ref);
+				break;
+			}
+
+			gather_phys = nvmap_pin(job->nvmap, ref);
+			if (IS_ERR((void *)gather_phys)) {
+				nvmap_free(job->nvmap, ref);
+				err = gather_phys;
+				break;
+			}
+
+			/* publish ref only once pinned; unpin array frees it */
+			g->ref = ref;
+			job->unpins[job->num_unpins++] = ref;
+
+			gather_addr = nvmap_mmap(g->ref);
+			if (!gather_addr) {
+				err = -ENOMEM;
+				break;
+			}
+
+			err = do_relocs(job, g->mem_id, gather_addr);
+			if (!err)
+				err = do_waitchks(job, sp,
+						g->mem_id, gather_addr);
+			nvmap_munmap(g->ref, gather_addr);
+
+			if (err)
+				break;
+		}
+		g->mem = gather_phys + g->offset;
+	}
+	wmb();
 
 	return err;
 }
 
 void nvhost_job_unpin(struct nvhost_job *job)
 {
-	nvmap_unpin_handles(job->nvmap, job->unpins,
-			job->num_unpins);
+	int i;
+
+	for (i = 0; i < job->num_unpins; i++) {
+		nvmap_unpin(job->nvmap, job->unpins[i]);
+		nvmap_free(job->nvmap, job->unpins[i]);
+	}
+
 	memset(job->unpins, BAD_MAGIC,
-			job->num_unpins * sizeof(struct nvmap_handle *));
+			job->num_unpins * sizeof(struct nvmap_handle_ref *));
+	job->num_unpins = 0;
 }
 
 /**
diff --git a/drivers/video/tegra/host/nvhost_job.h b/drivers/video/tegra/host/nvhost_job.h
index ad9d1af60da1..ec1366337279 100644
--- a/drivers/video/tegra/host/nvhost_job.h
+++ b/drivers/video/tegra/host/nvhost_job.h
@@ -27,7 +27,15 @@ struct nvhost_channel;
 struct nvhost_hwctx;
 struct nvmap_client;
 struct nvhost_waitchk;
-struct nvmap_handle;
+struct nvhost_syncpt;
+
+struct nvhost_job_gather {	/* one gather added to a job */
+	u32 words;		/* length given to nvhost_job_add_gather() */
+	phys_addr_t mem;	/* pinned address + offset, set at pin time */
+	u32 mem_id;		/* nvmap id of the gather buffer */
+	int offset;		/* offset added to the pinned address */
+	struct nvmap_handle_ref *ref;	/* pinned handle ref, NULL until pin */
+};
 
 /*
  * Each submit is tracked as a nvhost_job.
@@ -50,10 +58,8 @@ struct nvhost_job {
 	struct nvmap_client *nvmap;
 
 	/* Gathers and their memory */
-	struct nvmap_handle_ref *gather_mem;
-	struct nvhost_channel_gather *gathers;
+	struct nvhost_job_gather *gathers;
 	int num_gathers;
-	int gather_mem_size;
 
 	/* Wait checks to be processed at submit time */
 	struct nvhost_waitchk *waitchk;
@@ -61,9 +67,10 @@ struct nvhost_job {
 	u32 waitchk_mask;
 
 	/* Array of handles to be pinned & unpinned */
-	struct nvmap_pinarray_elem *pinarray;
-	int num_pins;
-	struct nvmap_handle **unpins;
+	struct nvhost_reloc *relocarray;
+	struct nvhost_reloc_shift *relocshiftarray;
+	int num_relocs;
+	struct nvmap_handle_ref **unpins;
 	int num_unpins;
 
 	/* Sync point id, number of increments and end related to the submit */
@@ -99,17 +106,6 @@ struct nvhost_job *nvhost_job_alloc(struct nvhost_channel *ch,
 		int priority, int clientid);
 
 /*
- * Allocate memory for a job. Just enough memory will be allocated to
- * accomodate the submit announced in submit header. Gather memory from
- * oldjob will be reused, and nvhost_job_put() will be called to it.
- */
-struct nvhost_job *nvhost_job_realloc(struct nvhost_job *oldjob,
-		struct nvhost_hwctx *hwctx,
-		struct nvhost_submit_hdr_ext *hdr,
-		struct nvmap_client *nvmap,
-		int priority, int clientid);
-
-/*
  * Add a gather to a job.
  */
 void nvhost_job_add_gather(struct nvhost_job *job,
@@ -134,8 +130,11 @@ void nvhost_job_put(struct nvhost_job *job);
  * Pin memory related to job. This handles relocation of addresses to the
  * host1x address space. Handles both the gather memory and any other memory
  * referred to from the gather buffers.
+ *
+ * Handles also patching out host waits that would wait for an expired sync
+ * point value.
  */
-int nvhost_job_pin(struct nvhost_job *job);
+int nvhost_job_pin(struct nvhost_job *job, struct nvhost_syncpt *sp);
 
 /*
  * Unpin memory related to job.
diff --git a/drivers/video/tegra/host/nvhost_syncpt.c b/drivers/video/tegra/host/nvhost_syncpt.c
index 4835d22881b8..7550512b0214 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.c
+++ b/drivers/video/tegra/host/nvhost_syncpt.c
@@ -74,7 +74,7 @@ u32 nvhost_syncpt_update_min(struct nvhost_syncpt *sp, u32 id)
 
 	BUG_ON(!syncpt_op().update_min);
 
-	return syncpt_op().update_min(sp, id);
+	val = syncpt_op().update_min(sp, id);
 	trace_nvhost_syncpt_update_min(id, val);
 
 	return val;
@@ -130,6 +130,19 @@ void nvhost_syncpt_incr(struct nvhost_syncpt *sp, u32 id)
 }
 
 /**
+ * Updates sync point from hardware, and returns true if syncpoint is expired,
+ * false if we may need to wait
+ */
+static bool syncpt_update_min_is_expired(
+	struct nvhost_syncpt *sp,
+	u32 id,
+	u32 thresh)
+{
+	syncpt_op().update_min(sp, id);
+	return nvhost_syncpt_is_expired(sp, id, thresh);
+}
+
+/**
  * Main entrypoint for syncpoint value waits.
  */
 int nvhost_syncpt_wait_timeout(struct nvhost_syncpt *sp, u32 id,
@@ -190,9 +203,9 @@ int nvhost_syncpt_wait_timeout(struct nvhost_syncpt *sp, u32 id,
 	while (timeout) {
 		u32 check = min_t(u32, SYNCPT_CHECK_PERIOD, timeout);
 		int remain = wait_event_interruptible_timeout(wq,
-				nvhost_syncpt_is_expired(sp, id, thresh),
+				syncpt_update_min_is_expired(sp, id, thresh),
 				check);
-		if (remain > 0) {
+		if (remain > 0 || nvhost_syncpt_is_expired(sp, id, thresh)) {
 			if (value)
 				*value = nvhost_syncpt_read_min(sp, id);
 			err = 0;
@@ -317,15 +330,10 @@ void nvhost_mutex_unlock(struct nvhost_syncpt *sp, int idx)
 	atomic_dec(&sp->lock_counts[idx]);
 }
 
-/* check for old WAITs to be removed (avoiding a wrap) */
-int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
-			     struct nvmap_client *nvmap,
-			     u32 waitchk_mask,
-			     struct nvhost_waitchk *wait,
-			     int num_waitchk)
+/* remove a wait pointed to by patch_addr */
+int nvhost_syncpt_patch_wait(struct nvhost_syncpt *sp, void *patch_addr)
 {
-	return syncpt_op().wait_check(sp, nvmap,
-			waitchk_mask, wait, num_waitchk);
+	return syncpt_op().patch_wait(sp, patch_addr);
 }
 
 /* Displays the current value of the sync point via sysfs */
diff --git a/drivers/video/tegra/host/nvhost_syncpt.h b/drivers/video/tegra/host/nvhost_syncpt.h
index b770ed91c76c..b58921bffa9c 100644
--- a/drivers/video/tegra/host/nvhost_syncpt.h
+++ b/drivers/video/tegra/host/nvhost_syncpt.h
@@ -136,23 +136,7 @@ static inline int nvhost_syncpt_wait(struct nvhost_syncpt *sp, u32 id, u32 thres
 					  MAX_SCHEDULE_TIMEOUT, NULL);
 }
 
-/*
- * Check driver supplied waitchk structs for syncpt thresholds
- * that have already been satisfied and NULL the comparison (to
- * avoid a wrap condition in the HW).
- *
- * @param: sp - global shadowed syncpt struct
- * @param: nvmap - needed to access command buffer
- * @param: mask - bit mask of syncpt IDs referenced in WAITs
- * @param: wait - start of filled in array of waitchk structs
- * @param: waitend - end ptr (one beyond last valid waitchk)
- */
-struct nvhost_waitchk;
-int nvhost_syncpt_wait_check(struct nvhost_syncpt *sp,
-			struct nvmap_client *nvmap,
-			u32 mask,
-			struct nvhost_waitchk *wait,
-			int num_waitchk);
+int nvhost_syncpt_patch_wait(struct nvhost_syncpt *sp, void *patch_addr);
 
 void nvhost_syncpt_debug(struct nvhost_syncpt *sp);
 
diff --git a/drivers/video/tegra/host/t30/t30.c b/drivers/video/tegra/host/t30/t30.c
index 257ba0849277..b2768741546a 100644
--- a/drivers/video/tegra/host/t30/t30.c
+++ b/drivers/video/tegra/host/t30/t30.c
@@ -95,7 +95,7 @@ struct nvhost_device t30_devices[] = {
 	.waitbases	= BIT(NVWAITBASE_2D_0) | BIT(NVWAITBASE_2D_1),
 	.modulemutexes	= BIT(NVMODMUTEX_2D_FULL) | BIT(NVMODMUTEX_2D_SIMPLE) |
 			  BIT(NVMODMUTEX_2D_SB_A) | BIT(NVMODMUTEX_2D_SB_B),
-	.clocks 	= { {"gr2d", UINT_MAX},
+	.clocks 	= { {"gr2d", 0},
 			    {"epp", 0},
 			    {"emc", 300000000} },
 	NVHOST_MODULE_NO_POWERGATE_IDS,
diff --git a/drivers/video/tegra/nvmap/nvmap.c b/drivers/video/tegra/nvmap/nvmap.c
index a0c4156668e5..b7fd695d04ee 100644
--- a/drivers/video/tegra/nvmap/nvmap.c
+++ b/drivers/video/tegra/nvmap/nvmap.c
@@ -352,225 +352,6 @@ static phys_addr_t handle_phys(struct nvmap_handle *h)
 	return addr;
 }
 
-/* stores the physical address (+offset) of each handle relocation entry
- * into its output location. see nvmap_pin_array for more details.
- *
- * each entry in arr (i.e., each relocation request) specifies two handles:
- * the handle to pin (pin), and the handle where the address of pin should be
- * written (patch). in pseudocode, this loop basically looks like:
- *
- * for (i = 0; i < nr; i++) {
- *     (pin, pin_offset, patch, patch_offset) = arr[i];
- *     patch[patch_offset] = address_of(pin) + pin_offset;
- * }
- */
-static int nvmap_reloc_pin_array(struct nvmap_client *client,
-				 const struct nvmap_pinarray_elem *arr,
-				 int nr, struct nvmap_handle *gather)
-{
-	struct nvmap_handle *last_patch = NULL;
-	unsigned int last_pfn = 0;
-	pte_t **pte;
-	void *addr;
-	int i;
-
-	pte = nvmap_alloc_pte(client->dev, &addr);
-	if (IS_ERR(pte))
-		return PTR_ERR(pte);
-
-	for (i = 0; i < nr; i++) {
-		struct nvmap_handle *patch;
-		struct nvmap_handle *pin;
-		phys_addr_t reloc_addr;
-		phys_addr_t phys;
-		unsigned int pfn;
-
-		/* all of the handles are validated and get'ted prior to
-		 * calling this function, so casting is safe here */
-		pin = (struct nvmap_handle *)arr[i].pin_mem;
-
-		if (arr[i].patch_mem == (unsigned long)last_patch) {
-			patch = last_patch;
-		} else if (arr[i].patch_mem == (unsigned long)gather) {
-			patch = gather;
-		} else {
-			if (last_patch)
-				nvmap_handle_put(last_patch);
-
-			patch = nvmap_get_handle_id(client, arr[i].patch_mem);
-			if (!patch) {
-				nvmap_free_pte(client->dev, pte);
-				return -EPERM;
-			}
-			last_patch = patch;
-		}
-
-		if (patch->heap_pgalloc) {
-			unsigned int page = arr[i].patch_offset >> PAGE_SHIFT;
-			phys = page_to_phys(patch->pgalloc.pages[page]);
-			phys += (arr[i].patch_offset & ~PAGE_MASK);
-		} else {
-			phys = patch->carveout->base + arr[i].patch_offset;
-		}
-
-		pfn = __phys_to_pfn(phys);
-		if (pfn != last_pfn) {
-			pgprot_t prot = nvmap_pgprot(patch, pgprot_kernel);
-			phys_addr_t kaddr = (phys_addr_t)addr;
-			set_pte_at(&init_mm, kaddr, *pte, pfn_pte(pfn, prot));
-			flush_tlb_kernel_page(kaddr);
-			last_pfn = pfn;
-		}
-
-		reloc_addr = handle_phys(pin) + arr[i].pin_offset;
-		reloc_addr >>= arr[i].reloc_shift;
-		__raw_writel(reloc_addr, addr + (phys & ~PAGE_MASK));
-	}
-
-	nvmap_free_pte(client->dev, pte);
-
-	if (last_patch)
-		nvmap_handle_put(last_patch);
-
-	wmb();
-
-	return 0;
-}
-
-static int nvmap_validate_get_pin_array(struct nvmap_client *client,
-					const struct nvmap_pinarray_elem *arr,
-					int nr, struct nvmap_handle **h)
-{
-	int i;
-	int ret = 0;
-	int count = 0;
-
-	nvmap_ref_lock(client);
-
-	for (i = 0; i < nr; i++) {
-		struct nvmap_handle_ref *ref;
-
-		if (need_resched()) {
-			nvmap_ref_unlock(client);
-			schedule();
-			nvmap_ref_lock(client);
-		}
-
-		ref = _nvmap_validate_id_locked(client, arr[i].pin_mem);
-
-		if (!ref)
-			nvmap_warn(client, "falied to validate id\n");
-		else if (!ref->handle)
-			nvmap_warn(client, "id had no associated handle\n");
-		else if (!ref->handle->alloc)
-			nvmap_warn(client, "handle had no allocation\n");
-
-		if (!ref || !ref->handle || !ref->handle->alloc) {
-			ret = -EPERM;
-			break;
-		}
-
-		/* a handle may be referenced multiple times in arr, but
-		 * it will only be pinned once; this ensures that the
-		 * minimum number of sync-queue slots in the host driver
-		 * are dedicated to storing unpin lists, which allows
-		 * for greater parallelism between the CPU and graphics
-		 * processor */
-		if (ref->handle->flags & NVMAP_HANDLE_VISITED)
-			continue;
-
-		ref->handle->flags |= NVMAP_HANDLE_VISITED;
-
-		h[count] = nvmap_handle_get(ref->handle);
-		BUG_ON(!h[count]);
-		count++;
-	}
-
-	nvmap_ref_unlock(client);
-
-	if (ret) {
-		for (i = 0; i < count; i++) {
-			h[i]->flags &= ~NVMAP_HANDLE_VISITED;
-			nvmap_handle_put(h[i]);
-		}
-	}
-
-	return ret ?: count;
-}
-
-/* a typical mechanism host1x clients use for using the Tegra graphics
- * processor is to build a command buffer which contains relocatable
- * memory handle commands, and rely on the kernel to convert these in-place
- * to addresses which are understood by the GPU hardware.
- *
- * this is implemented by having clients provide a sideband array
- * of relocatable handles (+ offsets) and the location in the command
- * buffer handle to patch with the GPU address when the client submits
- * its command buffer to the host1x driver.
- *
- * the host driver also uses this relocation mechanism internally to
- * relocate the client's (unpinned) command buffers into host-addressable
- * memory.
- *
- * @client: nvmap_client which should be used for validation; should be
- *          owned by the process which is submitting command buffers
- * @gather: special handle for relocated command buffer outputs used
- *          internally by the host driver. if this handle is encountered
- *          as an output handle in the relocation array, it is assumed
- *          to be a known-good output and is not validated.
- * @arr:    array of ((relocatable handle, offset), (output handle, offset))
- *          tuples.
- * @nr:     number of entries in arr
- * @unique_arr: list of nvmap_handle objects which were pinned by
- *              nvmap_pin_array. must be unpinned by the caller after the
- *              command buffers referenced in gather have completed.
- */
-int nvmap_pin_array(struct nvmap_client *client, struct nvmap_handle *gather,
-		    const struct nvmap_pinarray_elem *arr, int nr,
-		    struct nvmap_handle **unique_arr)
-{
-	int count = 0;
-	int ret = 0;
-	int i;
-
-	if (mutex_lock_interruptible(&client->share->pin_lock)) {
-		nvmap_warn(client, "%s interrupted when acquiring pin lock\n",
-			   current->group_leader->comm);
-		return -EINTR;
-	}
-
-	count = nvmap_validate_get_pin_array(client, arr, nr, unique_arr);
-	if (count < 0) {
-		mutex_unlock(&client->share->pin_lock);
-		nvmap_warn(client, "failed to validate pin array\n");
-		return count;
-	}
-
-	for (i = 0; i < count; i++)
-		unique_arr[i]->flags &= ~NVMAP_HANDLE_VISITED;
-
-	ret = wait_pin_array_locked(client, unique_arr, count);
-
-	mutex_unlock(&client->share->pin_lock);
-
-	if (!ret)
-		ret = nvmap_reloc_pin_array(client, arr, nr, gather);
-
-	if (WARN_ON(ret)) {
-		for (i = 0; i < count; i++)
-			nvmap_handle_put(unique_arr[i]);
-		return ret;
-	} else {
-		for (i = 0; i < count; i++) {
-			if (unique_arr[i]->heap_pgalloc &&
-			    unique_arr[i]->pgalloc.dirty)
-				map_iovmm_area(unique_arr[i]);
-		}
-	}
-
-	return count;
-}
-
 phys_addr_t nvmap_pin(struct nvmap_client *client,
 			struct nvmap_handle_ref *ref)
 {
@@ -820,52 +601,3 @@ void nvmap_free(struct nvmap_client *client, struct nvmap_handle_ref *r)
 
 	nvmap_free_handle_id(client, nvmap_ref_to_id(r));
 }
-
-/*
- * create a mapping to the user's buffer and write it
- * (uses similar logic from nvmap_reloc_pin_array to map the cmdbuf)
- */
-int nvmap_patch_word(struct nvmap_client *client,
-				struct nvmap_handle *patch,
-				u32 patch_offset, u32 patch_value)
-{
-	phys_addr_t phys;
-	unsigned long kaddr;
-	unsigned int pfn;
-	void *addr;
-	pte_t **pte;
-	pgprot_t prot;
-
-	if (patch_offset >= patch->size) {
-		nvmap_warn(client, "read/write outside of handle\n");
-		return -EFAULT;
-	}
-
-	pte = nvmap_alloc_pte(client->dev, &addr);
-	if (IS_ERR(pte))
-		return PTR_ERR(pte);
-
-	/* derive physaddr of cmdbuf WAIT to patch */
-	if (patch->heap_pgalloc) {
-		unsigned int page = patch_offset >> PAGE_SHIFT;
-		phys = page_to_phys(patch->pgalloc.pages[page]);
-		phys += (patch_offset & ~PAGE_MASK);
-	} else {
-		phys = patch->carveout->base + patch_offset;
-	}
-
-	pfn = __phys_to_pfn(phys);
-	prot = nvmap_pgprot(patch, pgprot_kernel);
-	kaddr = (unsigned long)addr;
-
-	/* write PTE, so addr points to cmdbuf PFN */
-	set_pte_at(&init_mm, kaddr, *pte, pfn_pte(pfn, prot));
-	flush_tlb_kernel_page(kaddr);
-
-	/* write patch_value to addr + page offset */
-	__raw_writel(patch_value, addr + (phys & ~PAGE_MASK));
-
-	nvmap_free_pte(client->dev, pte);
-	wmb();
-	return 0;
-}
diff --git a/drivers/video/tegra/nvmap/nvmap.h b/drivers/video/tegra/nvmap/nvmap.h
index b0fb70f64a5c..25403f5e7098 100644
--- a/drivers/video/tegra/nvmap/nvmap.h
+++ b/drivers/video/tegra/nvmap/nvmap.h
@@ -37,6 +37,8 @@ struct nvmap_device;
 struct page;
 struct tegra_iovmm_area;
 
+void _nvmap_handle_free(struct nvmap_handle *h);
+
 #if defined(CONFIG_TEGRA_NVMAP)
 #define nvmap_err(_client, _fmt, ...)				\
 	dev_err(nvmap_client_to_device(_client),		\
@@ -162,7 +164,46 @@ static inline void nvmap_ref_unlock(struct nvmap_client *priv)
 {
 	mutex_unlock(&priv->ref_lock);
 }
-#endif /* CONFIG_TEGRA_NVMAP */
+
+static inline struct nvmap_handle *nvmap_handle_get(struct nvmap_handle *h)
+{
+	if (unlikely(atomic_inc_return(&h->ref) <= 1)) {
+		pr_err("%s: %s getting a freed handle\n",
+			__func__, current->group_leader->comm);
+		if (atomic_read(&h->ref) <= 0)
+			return NULL;
+	}
+	return h;
+}
+
+static inline void nvmap_handle_put(struct nvmap_handle *h)
+{
+	int cnt = atomic_dec_return(&h->ref);
+
+	if (WARN_ON(cnt < 0)) {
+		pr_err("%s: %s put to negative references\n",
+			__func__, current->comm);
+	} else if (cnt == 0)
+		_nvmap_handle_free(h);
+}
+
+static inline pgprot_t nvmap_pgprot(struct nvmap_handle *h, pgprot_t prot)
+{
+	if (h->flags == NVMAP_HANDLE_UNCACHEABLE)
+		return pgprot_noncached(prot);
+	else if (h->flags == NVMAP_HANDLE_WRITE_COMBINE)
+		return pgprot_writecombine(prot);
+	else if (h->flags == NVMAP_HANDLE_INNER_CACHEABLE)
+		return pgprot_inner_writeback(prot);
+	return prot;
+}
+
+#else /* CONFIG_TEGRA_NVMAP */
+struct nvmap_handle *nvmap_handle_get(struct nvmap_handle *h);
+void nvmap_handle_put(struct nvmap_handle *h);
+pgprot_t nvmap_pgprot(struct nvmap_handle *h, pgprot_t prot);
+
+#endif /* !CONFIG_TEGRA_NVMAP */
 
 struct device *nvmap_client_to_device(struct nvmap_client *client);
 
@@ -216,51 +257,10 @@ int nvmap_pin_ids(struct nvmap_client *client,
 void nvmap_unpin_ids(struct nvmap_client *priv,
 		     unsigned int nr, const unsigned long *ids);
 
-void _nvmap_handle_free(struct nvmap_handle *h);
-
 int nvmap_handle_remove(struct nvmap_device *dev, struct nvmap_handle *h);
 
 void nvmap_handle_add(struct nvmap_device *dev, struct nvmap_handle *h);
 
-#if defined(CONFIG_TEGRA_NVMAP)
-static inline struct nvmap_handle *nvmap_handle_get(struct nvmap_handle *h)
-{
-	if (unlikely(atomic_inc_return(&h->ref) <= 1)) {
-		pr_err("%s: %s getting a freed handle\n",
-			__func__, current->group_leader->comm);
-		if (atomic_read(&h->ref) <= 0)
-			return NULL;
-	}
-	return h;
-}
-
-static inline void nvmap_handle_put(struct nvmap_handle *h)
-{
-	int cnt = atomic_dec_return(&h->ref);
-
-	if (WARN_ON(cnt < 0)) {
-		pr_err("%s: %s put to negative references\n",
-			__func__, current->comm);
-	} else if (cnt == 0)
-		_nvmap_handle_free(h);
-}
-
-static inline pgprot_t nvmap_pgprot(struct nvmap_handle *h, pgprot_t prot)
-{
-	if (h->flags == NVMAP_HANDLE_UNCACHEABLE)
-		return pgprot_noncached(prot);
-	else if (h->flags == NVMAP_HANDLE_WRITE_COMBINE)
-		return pgprot_writecombine(prot);
-	else if (h->flags == NVMAP_HANDLE_INNER_CACHEABLE)
-		return pgprot_inner_writeback(prot);
-	return prot;
-}
-#else /* CONFIG_TEGRA_NVMAP */
-struct nvmap_handle *nvmap_handle_get(struct nvmap_handle *h);
-void nvmap_handle_put(struct nvmap_handle *h);
-pgprot_t nvmap_pgprot(struct nvmap_handle *h, pgprot_t prot);
-#endif /* !CONFIG_TEGRA_NVMAP */
-
 int is_nvmap_vma(struct vm_area_struct *vma);
 
 struct nvmap_handle_ref *nvmap_alloc_iovm(struct nvmap_client *client,
@@ -268,4 +268,4 @@ struct nvmap_handle_ref *nvmap_alloc_iovm(struct nvmap_client *client,
 
 void nvmap_free_iovm(struct nvmap_client *client, struct nvmap_handle_ref *r);
 
-#endif
+#endif /* __VIDEO_TEGRA_NVMAP_NVMAP_H */
diff --git a/drivers/video/tegra/nvmap/nvmap_dev.c b/drivers/video/tegra/nvmap/nvmap_dev.c
index c78818711f74..9ecce7eeeb17 100644
--- a/drivers/video/tegra/nvmap/nvmap_dev.c
+++ b/drivers/video/tegra/nvmap/nvmap_dev.c
@@ -973,20 +973,17 @@ static void client_stringify(struct nvmap_client *client, struct seq_file *s)
 }
 
 static void allocations_stringify(struct nvmap_client *client,
-				  struct seq_file *s)
+				  struct seq_file *s, bool iovmm)
 {
-	unsigned long base = 0;
 	struct rb_node *n = rb_first(&client->handle_refs);
 
 	for (; n != NULL; n = rb_next(n)) {
 		struct nvmap_handle_ref *ref =
 			rb_entry(n, struct nvmap_handle_ref, node);
 		struct nvmap_handle *handle = ref->handle;
-		if (handle->alloc && !handle->heap_pgalloc) {
-			seq_printf(s, "%-18s %-18s %8lx %10u %8x\n", "", "",
-					(unsigned long)(handle->carveout->base),
-					handle->size, handle->userflags);
-		} else if (handle->alloc && handle->heap_pgalloc) {
+		if (handle->alloc && handle->heap_pgalloc == iovmm) {
+			unsigned long base = iovmm ? 0:
+				(unsigned long)(handle->carveout->base);
 			seq_printf(s, "%-18s %-18s %8lx %10u %8x\n", "", "",
 					base, handle->size, handle->userflags);
 		}
@@ -1010,7 +1007,7 @@ static int nvmap_debug_allocations_show(struct seq_file *s, void *unused)
 			get_client_from_carveout_commit(node, commit);
 		client_stringify(client, s);
 		seq_printf(s, " %10u\n", commit->commit);
-		allocations_stringify(client, s);
+		allocations_stringify(client, s, false);
 		seq_printf(s, "\n");
 		total += commit->commit;
 	}
@@ -1111,14 +1108,14 @@ static int nvmap_debug_iovmm_allocations_show(struct seq_file *s, void *unused)
 	struct nvmap_device *dev = s->private;
 
 	spin_lock_irqsave(&dev->clients_lock, flags);
-	seq_printf(s, "%-18s %18s %8s %10s\n", "CLIENT", "PROCESS", "PID",
-		"SIZE");
+	seq_printf(s, "%-18s %18s %8s %10s %8s\n", "CLIENT", "PROCESS", "PID",
+		"SIZE", "FLAGS");
 	seq_printf(s, "%-18s %18s %8s %10s\n", "", "",
 					"BASE", "SIZE");
 	list_for_each_entry(client, &dev->clients, list) {
 		client_stringify(client, s);
 		seq_printf(s, " %10u\n", atomic_read(&client->iovm_commit));
-		allocations_stringify(client, s);
+		allocations_stringify(client, s, true);
 		seq_printf(s, "\n");
 		total += atomic_read(&client->iovm_commit);
 	}
diff --git a/drivers/video/tegra/nvmap/nvmap_handle.c b/drivers/video/tegra/nvmap/nvmap_handle.c
index 2f24ba515862..56e2dab1820c 100644
--- a/drivers/video/tegra/nvmap/nvmap_handle.c
+++ b/drivers/video/tegra/nvmap/nvmap_handle.c
@@ -38,22 +38,12 @@
 #include <mach/iovmm.h>
 #include <linux/nvmap.h>
 
-#include <linux/vmstat.h>
-#include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/moduleparam.h>
-
 #include "nvmap.h"
 #include "nvmap_mru.h"
 #include "nvmap_common.h"
 
-#define PRINT_CARVEOUT_CONVERSION 0
-#if PRINT_CARVEOUT_CONVERSION
-#define PR_INFO pr_info
-#else
-#define PR_INFO(...)
-#endif
-
 #define NVMAP_SECURE_HEAPS	(NVMAP_HEAP_CARVEOUT_IRAM | NVMAP_HEAP_IOVMM | \
 				 NVMAP_HEAP_CARVEOUT_VPR)
 #ifdef CONFIG_NVMAP_HIGHMEM_ONLY
@@ -646,36 +636,19 @@ fail:
 static void alloc_handle(struct nvmap_client *client,
 			 struct nvmap_handle *h, unsigned int type)
 {
+	unsigned int carveout_mask = NVMAP_HEAP_CARVEOUT_MASK;
+	unsigned int iovmm_mask = NVMAP_HEAP_IOVMM;
+
 	BUG_ON(type & (type - 1));
 
 #ifdef CONFIG_NVMAP_CONVERT_CARVEOUT_TO_IOVMM
-#define __NVMAP_HEAP_CARVEOUT	(NVMAP_HEAP_CARVEOUT_IRAM | NVMAP_HEAP_CARVEOUT_VPR)
-#define __NVMAP_HEAP_IOVMM	(NVMAP_HEAP_IOVMM | NVMAP_HEAP_CARVEOUT_GENERIC)
-	if (type & NVMAP_HEAP_CARVEOUT_GENERIC) {
-#ifdef CONFIG_NVMAP_ALLOW_SYSMEM
-		if (h->size <= PAGE_SIZE) {
-			PR_INFO("###CARVEOUT CONVERTED TO SYSMEM "
-				"0x%x bytes %s(%d)###\n",
-				h->size, current->comm, current->pid);
-			goto sysheap;
-		}
-#endif
-		PR_INFO("###CARVEOUT CONVERTED TO IOVM "
-			"0x%x bytes %s(%d)###\n",
-			h->size, current->comm, current->pid);
-	}
-#else
-#define __NVMAP_HEAP_CARVEOUT	NVMAP_HEAP_CARVEOUT_MASK
-#define __NVMAP_HEAP_IOVMM	NVMAP_HEAP_IOVMM
+	/* Convert generic carveout requests to iovmm requests. */
+	carveout_mask &= ~NVMAP_HEAP_CARVEOUT_GENERIC;
+	iovmm_mask |= NVMAP_HEAP_CARVEOUT_GENERIC;
 #endif
 
-	if (type & __NVMAP_HEAP_CARVEOUT) {
+	if (type & carveout_mask) {
 		struct nvmap_heap_block *b;
-#ifdef CONFIG_NVMAP_CONVERT_CARVEOUT_TO_IOVMM
-		PR_INFO("###IRAM REQUEST RETAINED "
-			"0x%x bytes %s(%d)###\n",
-			h->size, current->comm, current->pid);
-#endif
 		/* Protect handle from relocation */
 		nvmap_usecount_inc(h);
 
@@ -689,7 +662,7 @@ static void alloc_handle(struct nvmap_client *client,
 		}
 		nvmap_usecount_dec(h);
 
-	} else if (type & __NVMAP_HEAP_IOVMM) {
+	} else if (type & iovmm_mask) {
 		size_t reserved = PAGE_ALIGN(h->size);
 		int commit = 0;
 		int ret;
@@ -713,10 +686,6 @@ static void alloc_handle(struct nvmap_client *client,
 		}
 
 	} else if (type & NVMAP_HEAP_SYSMEM) {
-#if defined(CONFIG_NVMAP_CONVERT_CARVEOUT_TO_IOVMM) && \
-	defined(CONFIG_NVMAP_ALLOW_SYSMEM)
-sysheap:
-#endif
 		if (handle_page_alloc(client, h, true) == 0) {
 			BUG_ON(!h->pgalloc.contig);
 			h->heap_pgalloc = true;
@@ -751,10 +720,6 @@ static const unsigned int heap_policy_large[] = {
 	0,
 };
 
-/* Do not override single page policy if there is not much space to
-avoid invoking system oom killer. */
-#define NVMAP_SMALL_POLICY_SYSMEM_THRESHOLD 50000000
-
 int nvmap_alloc_handle_id(struct nvmap_client *client,
 			  unsigned long id, unsigned int heap_mask,
 			  size_t align, unsigned int flags)
@@ -779,32 +744,22 @@ int nvmap_alloc_handle_id(struct nvmap_client *client,
 	h->align = max_t(size_t, align, L1_CACHE_BYTES);
 
 #ifndef CONFIG_TEGRA_IOVMM
+	/* convert iovmm requests to generic carveout. */
 	if (heap_mask & NVMAP_HEAP_IOVMM) {
-		heap_mask &= NVMAP_HEAP_IOVMM;
-		heap_mask |= NVMAP_HEAP_CARVEOUT_GENERIC;
+		heap_mask = (heap_mask & ~NVMAP_HEAP_IOVMM) |
+			    NVMAP_HEAP_CARVEOUT_GENERIC;
 	}
 #endif
-#ifndef CONFIG_NVMAP_CONVERT_CARVEOUT_TO_IOVMM
 #ifdef CONFIG_NVMAP_ALLOW_SYSMEM
 	/* Allow single pages allocations in system memory to save
 	 * carveout space and avoid extra iovm mappings */
 	if (nr_page == 1) {
-		if (heap_mask & NVMAP_HEAP_IOVMM)
+		if (heap_mask &
+		    (NVMAP_HEAP_IOVMM | NVMAP_HEAP_CARVEOUT_GENERIC))
 			heap_mask |= NVMAP_HEAP_SYSMEM;
-		else if (heap_mask & NVMAP_HEAP_CARVEOUT_GENERIC) {
-			/* Calculate size of free physical pages
-			 * managed by kernel */
-			unsigned long freeMem =
-				(global_page_state(NR_FREE_PAGES) +
-				global_page_state(NR_FILE_PAGES) -
-				total_swapcache_pages) << PAGE_SHIFT;
-
-			if (freeMem > NVMAP_SMALL_POLICY_SYSMEM_THRESHOLD)
-				heap_mask |= NVMAP_HEAP_SYSMEM;
-		}
 	}
 #endif
-
+#ifndef CONFIG_NVMAP_CONVERT_CARVEOUT_TO_IOVMM
 	/* This restriction is deprecated as alignments greater than
 	   PAGE_SIZE are now correctly handled, but it is retained for
 	   AP20 compatibility. */