author     Arto Merilainen <amerilainen@nvidia.com>    2014-03-19 09:38:25 +0200
committer  Terje Bergstrom <tbergstrom@nvidia.com>     2014-03-28 04:21:39 -0700
commit     e51e1033bd22dc5ea6a86f6704142baf89a2f7cb (patch)
tree       9b5f65258f5777273f3b62e4f59f8001ed7da543 /drivers/gpu/nvgpu
parent     1428ed474d1acb22321e89301c06be1bb9e5fe17 (diff)
gpu: nvgpu: Add NVIDIA GPU Driver
This patch moves the NVIDIA GPU driver to a new location.

Bug 1482562

Change-Id: I24293810b9d0f1504fd9be00135e21dad656ccb6
Signed-off-by: Arto Merilainen <amerilainen@nvidia.com>
Reviewed-on: http://git-master/r/383722
Reviewed-by: Terje Bergstrom <tbergstrom@nvidia.com>
Diffstat (limited to 'drivers/gpu/nvgpu')
-rw-r--r--  drivers/gpu/nvgpu/Kconfig  60
-rw-r--r--  drivers/gpu/nvgpu/gk20a/Makefile  36
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.c  293
-rw-r--r--  drivers/gpu/nvgpu/gk20a/as_gk20a.h  50
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.c  2111
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_gk20a.h  172
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c  356
-rw-r--r--  drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h  102
-rw-r--r--  drivers/gpu/nvgpu/gk20a/clk_gk20a.c  865
-rw-r--r--  drivers/gpu/nvgpu/gk20a/clk_gk20a.h  94
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c  240
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h  28
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c  699
-rw-r--r--  drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h  83
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.c  295
-rw-r--r--  drivers/gpu/nvgpu/gk20a/debug_gk20a.h  25
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.c  37
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fb_gk20a.h  21
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.c  1836
-rw-r--r--  drivers/gpu/nvgpu/gk20a/fifo_gk20a.h  164
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.c  1681
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a.h  559
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.c  1247
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_allocator.h  177
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c  374
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h  39
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_scale.c  358
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_scale.h  51
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c  335
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c  333
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h  149
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c  256
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.c  6747
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_gk20a.h  406
-rw-r--r--  drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h  179
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal.c  33
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal.h  25
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.c  50
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hal_gk20a.h  28
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h  105
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h  113
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h  85
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h  245
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h  213
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h  565
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h  141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h  1141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h  3173
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h  221
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h  253
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h  469
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h  137
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h  226
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h  226
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h  69
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h  141
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h  737
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h  389
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h  2150
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h  225
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h  101
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h  137
-rw-r--r--  drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h  301
-rw-r--r--  drivers/gpu/nvgpu/gk20a/kind_gk20a.c  424
-rw-r--r--  drivers/gpu/nvgpu/gk20a/kind_gk20a.h  67
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_common.c  243
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.c  203
-rw-r--r--  drivers/gpu/nvgpu/gk20a/ltc_gk20a.h  21
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.c  2984
-rw-r--r--  drivers/gpu/nvgpu/gk20a/mm_gk20a.h  464
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a.h  160
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c  35
-rw-r--r--  drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c  561
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.c  3796
-rw-r--r--  drivers/gpu/nvgpu/gk20a/pmu_gk20a.h  1097
-rw-r--r--  drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c  91
-rw-r--r--  drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h  27
-rw-r--r--  drivers/gpu/nvgpu/gk20a/regops_gk20a.c  704
-rw-r--r--  drivers/gpu/nvgpu/gk20a/regops_gk20a.h  47
-rw-r--r--  drivers/gpu/nvgpu/gk20a/sim_gk20a.h  62
-rw-r--r--  drivers/gpu/nvgpu/gk20a/therm_gk20a.c  142
-rw-r--r--  drivers/gpu/nvgpu/gk20a/therm_gk20a.h  33
82 files changed, 43318 insertions, 0 deletions
diff --git a/drivers/gpu/nvgpu/Kconfig b/drivers/gpu/nvgpu/Kconfig
new file mode 100644
index 000000000000..160ec8be94de
--- /dev/null
+++ b/drivers/gpu/nvgpu/Kconfig
@@ -0,0 +1,60 @@
+config GK20A
+ bool "Nvidia GK20A GPU support"
+ help
+ Choose this option if you have an SoC with integrated
+ Nvidia GPU IP.
+
+config GK20A_DEFAULT_TIMEOUT
+ depends on GK20A
+ int "Default timeout for submits"
+ default 10000
+ help
+ Default timeout for jobs in milliseconds. Set to zero for no timeout.
+
+config GK20A_PMU
+ bool "Support GK20A PMU"
+ depends on GK20A
+ default n
+ help
+ Say Y here to enable GK20A PMU features.
+
+choice
+ depends on GK20A
+ prompt "Enable GK20A frequency scaling"
+ default GK20A_PERFMON
+ optional
+ help
+ Select this entry to enable gk20a frequency scaling.
+
+config GK20A_PERFMON
+ bool "Use Perfmon"
+ help
+ Select this to enable built-in perfmon scaling.
+ The built-in scaling option uses a simple
+ scaling mechanism: increase the frequency when the
+ GPU is busy and decrease it when idle.
+
+config GK20A_DEVFREQ
+ bool "Use Devfreq"
+ help
+ Select this to use devfreq-based scaling.
+ Devfreq is a common framework that allows using
+ a variety of governors and switching between
+ them on the fly. By default, no governor is
+ selected.
+
+endchoice
+
+config GK20A_CYCLE_STATS
+ bool "Support GK20A GPU CYCLE STATS"
+ depends on GK20A
+ default y
+ help
+ Say Y here to enable the cycle stats debugging features.
+
+config GK20A_PHYS_PAGE_TABLES
+ bool "Use physical addressing for gk20a page tables"
+ default y if TEGRA_SIMULATION_PLATFORM
+ help
+ Use physical addressing for gk20a page tables. If this is off, we
+ use SMMU translation.
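(A sketch for orientation only, not part of the patch: a hypothetical config fragment enabling the driver with the options introduced above; the exact selection depends on the target board.)

CONFIG_GK20A=y
CONFIG_GK20A_DEFAULT_TIMEOUT=10000
CONFIG_GK20A_PMU=y
CONFIG_GK20A_PERFMON=y
CONFIG_GK20A_CYCLE_STATS=y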
diff --git a/drivers/gpu/nvgpu/gk20a/Makefile b/drivers/gpu/nvgpu/gk20a/Makefile
new file mode 100644
index 000000000000..f9b06b72eead
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/Makefile
@@ -0,0 +1,36 @@
+
+GCOV_PROFILE := y
+ccflags-y += -Idrivers/devfreq
+ccflags-y += -Wno-multichar
+ccflags-y += -Werror
+
+obj-$(CONFIG_GK20A) += \
+ gk20a.o \
+ as_gk20a.o \
+ ctrl_gk20a.o \
+ fifo_gk20a.o \
+ channel_gk20a.o \
+ channel_sync_gk20a.o \
+ debug_gk20a.o \
+ dbg_gpu_gk20a.o \
+ regops_gk20a.o \
+ gr_gk20a.o \
+ kind_gk20a.o \
+ mm_gk20a.o \
+ pmu_gk20a.o \
+ priv_ring_gk20a.o \
+ clk_gk20a.o \
+ therm_gk20a.o \
+ gr_ctx_gk20a_sim.o \
+ gr_ctx_gk20a.o \
+ gk20a_gating_reglist.o \
+ gk20a_scale.o \
+ gk20a_sysfs.o \
+ ltc_gk20a.o \
+ fb_gk20a.o \
+ hal.o \
+ hal_gk20a.o \
+ gk20a_allocator.o
+
+obj-$(CONFIG_GK20A) += platform_gk20a_generic.o
+obj-$(CONFIG_TEGRA_GK20A) += platform_gk20a_tegra.o
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.c b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
new file mode 100644
index 000000000000..65c26938ea80
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.c
@@ -0,0 +1,293 @@
+/*
+ * drivers/video/tegra/host/gk20a/as_gk20a.c
+ *
+ * GK20A Address Spaces
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/uaccess.h>
+
+#include <trace/events/gk20a.h>
+
+#include "gk20a.h"
+
+/* dumb allocator... */
+static int generate_as_share_id(struct gk20a_as *as)
+{
+ gk20a_dbg_fn("");
+ return ++as->last_share_id;
+}
+/* still dumb */
+static void release_as_share_id(struct gk20a_as *as, int id)
+{
+ gk20a_dbg_fn("");
+ return;
+}
+
+static int gk20a_as_alloc_share(struct gk20a_as *as,
+ struct gk20a_as_share **out)
+{
+ struct gk20a_as_share *as_share;
+ int err = 0;
+
+ gk20a_dbg_fn("");
+
+ *out = 0;
+ as_share = kzalloc(sizeof(*as_share), GFP_KERNEL);
+ if (!as_share)
+ return -ENOMEM;
+
+ as_share->as = as;
+ as_share->id = generate_as_share_id(as_share->as);
+ as_share->ref_cnt.counter = 1;
+
+ /* this will set as_share->vm. */
+ err = gk20a_vm_alloc_share(as_share);
+ if (err)
+ goto failed;
+
+ *out = as_share;
+ return 0;
+
+ failed:
+ kfree(as_share);
+ return err;
+}
+
+/*
+ * channels and the device nodes call this to release.
+ * once the ref_cnt hits zero the share is deleted.
+ */
+int gk20a_as_release_share(struct gk20a_as_share *as_share)
+{
+ int err;
+
+ gk20a_dbg_fn("");
+
+ if (atomic_dec_return(&as_share->ref_cnt) > 0)
+ return 0;
+
+ err = gk20a_vm_release_share(as_share);
+ release_as_share_id(as_share->as, as_share->id);
+ kfree(as_share);
+ return err;
+}
+
+static int gk20a_as_ioctl_bind_channel(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_bind_channel_args *args)
+{
+ int err = 0;
+ struct channel_gk20a *ch;
+
+ gk20a_dbg_fn("");
+
+ ch = gk20a_get_channel_from_file(args->channel_fd);
+ if (!ch || gk20a_channel_as_bound(ch))
+ return -EINVAL;
+
+ atomic_inc(&as_share->ref_cnt);
+
+ /* this will set channel_gk20a->vm */
+ err = gk20a_vm_bind_channel(as_share, ch);
+ if (err) {
+ atomic_dec(&as_share->ref_cnt);
+ return err;
+ }
+
+ return err;
+}
+
+static int gk20a_as_ioctl_alloc_space(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_alloc_space_args *args)
+{
+ gk20a_dbg_fn("");
+ return gk20a_vm_alloc_space(as_share, args);
+}
+
+static int gk20a_as_ioctl_free_space(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_free_space_args *args)
+{
+ gk20a_dbg_fn("");
+ return gk20a_vm_free_space(as_share, args);
+}
+
+static int gk20a_as_ioctl_map_buffer_ex(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_map_buffer_ex_args *args)
+{
+ int i;
+
+ gk20a_dbg_fn("");
+
+ /* ensure that padding is not set. this is required for ensuring that
+ * we can safely use these fields later */
+ for (i = 0; i < ARRAY_SIZE(args->padding); i++)
+ if (args->padding[i])
+ return -EINVAL;
+
+ return gk20a_vm_map_buffer(as_share, args->dmabuf_fd,
+ &args->offset, args->flags,
+ args->kind);
+}
+
+static int gk20a_as_ioctl_map_buffer(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_map_buffer_args *args)
+{
+ gk20a_dbg_fn("");
+ return gk20a_vm_map_buffer(as_share, args->nvmap_handle,
+ &args->o_a.align,
+ args->flags, NV_KIND_DEFAULT);
+ /* args->o_a.offset will be set if !err */
+}
+
+static int gk20a_as_ioctl_unmap_buffer(
+ struct gk20a_as_share *as_share,
+ struct nvhost_as_unmap_buffer_args *args)
+{
+ gk20a_dbg_fn("");
+ return gk20a_vm_unmap_buffer(as_share, args->offset);
+}
+
+int gk20a_as_dev_open(struct inode *inode, struct file *filp)
+{
+ struct gk20a_as_share *as_share;
+ struct gk20a *g;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ g = container_of(inode->i_cdev, struct gk20a, as.cdev);
+
+ err = gk20a_get_client(g);
+ if (err) {
+ gk20a_dbg_fn("fail to get channel!");
+ return err;
+ }
+
+ err = gk20a_as_alloc_share(&g->as, &as_share);
+ if (err) {
+ gk20a_dbg_fn("failed to alloc share");
+ gk20a_put_client(g);
+ return err;
+ }
+
+ filp->private_data = as_share;
+ return 0;
+}
+
+int gk20a_as_dev_release(struct inode *inode, struct file *filp)
+{
+ struct gk20a_as_share *as_share = filp->private_data;
+ int ret;
+ struct gk20a *g = gk20a_from_as(as_share->as);
+
+ gk20a_dbg_fn("");
+
+ ret = gk20a_as_release_share(as_share);
+
+ gk20a_put_client(g);
+
+ return ret;
+}
+
+long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ struct gk20a_as_share *as_share = filp->private_data;
+ struct gk20a *g = gk20a_from_as(as_share->as);
+
+ u8 buf[NVHOST_AS_IOCTL_MAX_ARG_SIZE];
+
+ if ((_IOC_TYPE(cmd) != NVHOST_AS_IOCTL_MAGIC) ||
+ (_IOC_NR(cmd) == 0) ||
+ (_IOC_NR(cmd) > NVHOST_AS_IOCTL_LAST))
+ return -EFAULT;
+
+ BUG_ON(_IOC_SIZE(cmd) > NVHOST_AS_IOCTL_MAX_ARG_SIZE);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
+ return -EFAULT;
+ }
+
+ err = gk20a_channel_busy(g->dev);
+ if (err)
+ return err;
+
+ switch (cmd) {
+ case NVHOST_AS_IOCTL_BIND_CHANNEL:
+ trace_gk20a_as_ioctl_bind_channel(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_bind_channel(as_share,
+ (struct nvhost_as_bind_channel_args *)buf);
+
+ break;
+ case NVHOST32_AS_IOCTL_ALLOC_SPACE:
+ {
+ struct nvhost32_as_alloc_space_args *args32 =
+ (struct nvhost32_as_alloc_space_args *)buf;
+ struct nvhost_as_alloc_space_args args;
+
+ args.pages = args32->pages;
+ args.page_size = args32->page_size;
+ args.flags = args32->flags;
+ args.o_a.offset = args32->o_a.offset;
+ trace_gk20a_as_ioctl_alloc_space(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_alloc_space(as_share, &args);
+ args32->o_a.offset = args.o_a.offset;
+ break;
+ }
+ case NVHOST_AS_IOCTL_ALLOC_SPACE:
+ trace_gk20a_as_ioctl_alloc_space(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_alloc_space(as_share,
+ (struct nvhost_as_alloc_space_args *)buf);
+ break;
+ case NVHOST_AS_IOCTL_FREE_SPACE:
+ trace_gk20a_as_ioctl_free_space(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_free_space(as_share,
+ (struct nvhost_as_free_space_args *)buf);
+ break;
+ case NVHOST_AS_IOCTL_MAP_BUFFER:
+ trace_gk20a_as_ioctl_map_buffer(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_map_buffer(as_share,
+ (struct nvhost_as_map_buffer_args *)buf);
+ break;
+ case NVHOST_AS_IOCTL_MAP_BUFFER_EX:
+ trace_gk20a_as_ioctl_map_buffer(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_map_buffer_ex(as_share,
+ (struct nvhost_as_map_buffer_ex_args *)buf);
+ break;
+ case NVHOST_AS_IOCTL_UNMAP_BUFFER:
+ trace_gk20a_as_ioctl_unmap_buffer(dev_name(dev_from_gk20a(g)));
+ err = gk20a_as_ioctl_unmap_buffer(as_share,
+ (struct nvhost_as_unmap_buffer_args *)buf);
+ break;
+ default:
+ dev_err(dev_from_gk20a(g), "unrecognized as ioctl: 0x%x", cmd);
+ err = -ENOTTY;
+ break;
+ }
+
+ gk20a_channel_idle(g->dev);
+
+ if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ))
+ err = copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd));
+
+ return err;
+}
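(A minimal user-space sketch, for orientation only and not part of the patch. It relies only on the NVHOST_AS_IOCTL_BIND_CHANNEL command and the channel_fd field handled by gk20a_as_ioctl_bind_channel() above; the uapi header path and the way the fds are opened are assumptions.)

#include <sys/ioctl.h>
#include <linux/nvhost_as_ioctl.h>

/* Bind an already-open GPU channel fd to an address-space fd. The kernel
 * side of this request is gk20a_as_dev_ioctl() ->
 * gk20a_as_ioctl_bind_channel(). */
static int bind_channel_to_as(int as_fd, int channel_fd)
{
	struct nvhost_as_bind_channel_args args = { .channel_fd = channel_fd };

	return ioctl(as_fd, NVHOST_AS_IOCTL_BIND_CHANNEL, &args);
}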
diff --git a/drivers/gpu/nvgpu/gk20a/as_gk20a.h b/drivers/gpu/nvgpu/gk20a/as_gk20a.h
new file mode 100644
index 000000000000..be0e97075f5a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/as_gk20a.h
@@ -0,0 +1,50 @@
+/*
+ * drivers/video/tegra/host/gk20a/as_gk20a.h
+ *
+ * GK20A Address Space
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#ifndef __GK20A_AS_H
+#define __GK20A_AS_H
+
+#include <linux/atomic.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+
+#include <linux/nvhost_as_ioctl.h>
+
+struct gk20a_as;
+struct gk20a_as_share;
+struct vm_gk20a;
+
+struct gk20a_as_share {
+ struct gk20a_as *as;
+ atomic_t ref_cnt;
+ int id;
+ struct vm_gk20a *vm;
+};
+
+struct gk20a_as {
+ int last_share_id; /* dummy allocator for now */
+ struct cdev cdev;
+ struct device *node;
+};
+
+int gk20a_as_release_share(struct gk20a_as_share *as_share);
+
+/* struct file_operations driver interface */
+int gk20a_as_dev_open(struct inode *inode, struct file *filp);
+int gk20a_as_dev_release(struct inode *inode, struct file *filp);
+long gk20a_as_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
new file mode 100644
index 000000000000..6056f558359f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.c
@@ -0,0 +1,2111 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_gk20a.c
+ *
+ * GK20A Graphics channel
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/nvhost.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/highmem.h> /* need for nvmap.h*/
+#include <trace/events/gk20a.h>
+#include <linux/scatterlist.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/dma-buf.h>
+
+#include "debug_gk20a.h"
+
+#include "gk20a.h"
+#include "dbg_gpu_gk20a.h"
+
+#include "hw_ram_gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+#include "hw_ccsr_gk20a.h"
+#include "hw_ltc_gk20a.h"
+
+#define NVMAP_HANDLE_PARAM_SIZE 1
+
+static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f);
+static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c);
+
+static void free_priv_cmdbuf(struct channel_gk20a *c,
+ struct priv_cmd_entry *e);
+static void recycle_priv_cmdbuf(struct channel_gk20a *c);
+
+static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c);
+static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c);
+
+static int channel_gk20a_commit_userd(struct channel_gk20a *c);
+static int channel_gk20a_setup_userd(struct channel_gk20a *c);
+static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
+ u64 gpfifo_base, u32 gpfifo_entries);
+
+static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a);
+static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a);
+
+static int channel_gk20a_alloc_inst(struct gk20a *g,
+ struct channel_gk20a *ch);
+static void channel_gk20a_free_inst(struct gk20a *g,
+ struct channel_gk20a *ch);
+
+static int channel_gk20a_update_runlist(struct channel_gk20a *c,
+ bool add);
+static void gk20a_free_error_notifiers(struct channel_gk20a *ch);
+
+static struct channel_gk20a *acquire_unused_channel(struct fifo_gk20a *f)
+{
+ struct channel_gk20a *ch = NULL;
+ int chid;
+
+ mutex_lock(&f->ch_inuse_mutex);
+ for (chid = 0; chid < f->num_channels; chid++) {
+ if (!f->channel[chid].in_use) {
+ f->channel[chid].in_use = true;
+ ch = &f->channel[chid];
+ break;
+ }
+ }
+ mutex_unlock(&f->ch_inuse_mutex);
+
+ return ch;
+}
+
+static void release_used_channel(struct fifo_gk20a *f, struct channel_gk20a *c)
+{
+ mutex_lock(&f->ch_inuse_mutex);
+ f->channel[c->hw_chid].in_use = false;
+ mutex_unlock(&f->ch_inuse_mutex);
+}
+
+int channel_gk20a_commit_va(struct channel_gk20a *c)
+{
+ u64 addr;
+ u32 addr_lo;
+ u32 addr_hi;
+ void *inst_ptr;
+
+ gk20a_dbg_fn("");
+
+ inst_ptr = c->inst_block.cpuva;
+ if (!inst_ptr)
+ return -ENOMEM;
+
+ addr = gk20a_mm_iova_addr(c->vm->pdes.sgt->sgl);
+ addr_lo = u64_lo32(addr >> 12);
+ addr_hi = u64_hi32(addr);
+
+ gk20a_dbg_info("pde pa=0x%llx addr_lo=0x%x addr_hi=0x%x",
+ (u64)addr, addr_lo, addr_hi);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
+ ram_in_page_dir_base_target_vid_mem_f() |
+ ram_in_page_dir_base_vol_true_f() |
+ ram_in_page_dir_base_lo_f(addr_lo));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
+ ram_in_page_dir_base_hi_f(addr_hi));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
+ u64_lo32(c->vm->va_limit) | 0xFFF);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
+ ram_in_adr_limit_hi_f(u64_hi32(c->vm->va_limit)));
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+static int channel_gk20a_commit_userd(struct channel_gk20a *c)
+{
+ u32 addr_lo;
+ u32 addr_hi;
+ void *inst_ptr;
+
+ gk20a_dbg_fn("");
+
+ inst_ptr = c->inst_block.cpuva;
+ if (!inst_ptr)
+ return -ENOMEM;
+
+ addr_lo = u64_lo32(c->userd_iova >> ram_userd_base_shift_v());
+ addr_hi = u64_hi32(c->userd_iova);
+
+ gk20a_dbg_info("channel %d : set ramfc userd 0x%16llx",
+ c->hw_chid, (u64)c->userd_iova);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_w(),
+ pbdma_userd_target_vid_mem_f() |
+ pbdma_userd_addr_f(addr_lo));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_ramfc_w() + ram_fc_userd_hi_w(),
+ pbdma_userd_target_vid_mem_f() |
+ pbdma_userd_hi_addr_f(addr_hi));
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+static int channel_gk20a_set_schedule_params(struct channel_gk20a *c,
+ u32 timeslice_timeout)
+{
+ void *inst_ptr;
+ int shift = 3;
+ int value = timeslice_timeout;
+
+ inst_ptr = c->inst_block.cpuva;
+ if (!inst_ptr)
+ return -ENOMEM;
+
+ /* disable channel */
+ gk20a_writel(c->g, ccsr_channel_r(c->hw_chid),
+ gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
+ ccsr_channel_enable_clr_true_f());
+
+ /* preempt the channel */
+ WARN_ON(gk20a_fifo_preempt_channel(c->g, c->hw_chid));
+
+ /* flush GPU cache */
+ gk20a_mm_l2_flush(c->g, true);
+
+ /* value field is 8 bits long */
+ while (value >= 1 << 8) {
+ value >>= 1;
+ shift++;
+ }
+
+ /* time slice register is only 18bits long */
+ if ((value << shift) >= 1<<19) {
+ pr_err("Requested timeslice value is clamped to 18 bits\n");
+ value = 255;
+ shift = 10;
+ }
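+ /* Illustrative numbers (assumed, not from the original patch): for a
+ * requested timeslice_timeout of 10000, the loop above halves the value
+ * until it fits in 8 bits (10000 -> 5000 -> 2500 -> 1250 -> 625 -> 312
+ * -> 156) while shift grows from 3 to 9; 156 << 9 = 79872, so the clamp
+ * does not trigger. */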
+
+ /* set new timeslice */
+ gk20a_mem_wr32(inst_ptr, ram_fc_eng_timeslice_w(),
+ value | (shift << 12) |
+ fifo_eng_timeslice_enable_true_f());
+
+ /* enable channel */
+ gk20a_writel(c->g, ccsr_channel_r(c->hw_chid),
+ gk20a_readl(c->g, ccsr_channel_r(c->hw_chid)) |
+ ccsr_channel_enable_set_true_f());
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+static int channel_gk20a_setup_ramfc(struct channel_gk20a *c,
+ u64 gpfifo_base, u32 gpfifo_entries)
+{
+ void *inst_ptr;
+
+ gk20a_dbg_fn("");
+
+ inst_ptr = c->inst_block.cpuva;
+ if (!inst_ptr)
+ return -ENOMEM;
+
+ memset(inst_ptr, 0, ram_fc_size_val_v());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_w(),
+ pbdma_gp_base_offset_f(
+ u64_lo32(gpfifo_base >> pbdma_gp_base_rsvd_s())));
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_gp_base_hi_w(),
+ pbdma_gp_base_hi_offset_f(u64_hi32(gpfifo_base)) |
+ pbdma_gp_base_hi_limit2_f(ilog2(gpfifo_entries)));
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_signature_w(),
+ pbdma_signature_hw_valid_f() | pbdma_signature_sw_zero_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_formats_w(),
+ pbdma_formats_gp_fermi0_f() |
+ pbdma_formats_pb_fermi1_f() |
+ pbdma_formats_mp_fermi0_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_pb_header_w(),
+ pbdma_pb_header_priv_user_f() |
+ pbdma_pb_header_method_zero_f() |
+ pbdma_pb_header_subchannel_zero_f() |
+ pbdma_pb_header_level_main_f() |
+ pbdma_pb_header_first_true_f() |
+ pbdma_pb_header_type_inc_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_subdevice_w(),
+ pbdma_subdevice_id_f(1) |
+ pbdma_subdevice_status_active_f() |
+ pbdma_subdevice_channel_dma_enable_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_target_w(), pbdma_target_engine_sw_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_acquire_w(),
+ pbdma_acquire_retry_man_2_f() |
+ pbdma_acquire_retry_exp_2_f() |
+ pbdma_acquire_timeout_exp_max_f() |
+ pbdma_acquire_timeout_man_max_f() |
+ pbdma_acquire_timeout_en_disable_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_eng_timeslice_w(),
+ fifo_eng_timeslice_timeout_128_f() |
+ fifo_eng_timeslice_timescale_3_f() |
+ fifo_eng_timeslice_enable_true_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_pb_timeslice_w(),
+ fifo_pb_timeslice_timeout_16_f() |
+ fifo_pb_timeslice_timescale_0_f() |
+ fifo_pb_timeslice_enable_true_f());
+
+ gk20a_mem_wr32(inst_ptr, ram_fc_chid_w(), ram_fc_chid_id_f(c->hw_chid));
+
+ /* TBD: always priv mode? */
+ gk20a_mem_wr32(inst_ptr, ram_fc_hce_ctrl_w(),
+ pbdma_hce_ctrl_hce_priv_mode_yes_f());
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+static int channel_gk20a_setup_userd(struct channel_gk20a *c)
+{
+ BUG_ON(!c->userd_cpu_va);
+
+ gk20a_dbg_fn("");
+
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_put_hi_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_ref_threshold_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_top_level_get_hi_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_get_hi_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_get_w(), 0);
+ gk20a_mem_wr32(c->userd_cpu_va, ram_userd_gp_put_w(), 0);
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+static void channel_gk20a_bind(struct channel_gk20a *ch_gk20a)
+{
+ struct gk20a *g = ch_gk20a->g;
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_engine_info_gk20a *engine_info =
+ f->engine_info + ENGINE_GR_GK20A;
+
+ u32 inst_ptr = ch_gk20a->inst_block.cpu_pa
+ >> ram_in_base_shift_v();
+
+ gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
+ ch_gk20a->hw_chid, inst_ptr);
+
+ ch_gk20a->bound = true;
+
+ gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
+ (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
+ ~ccsr_channel_runlist_f(~0)) |
+ ccsr_channel_runlist_f(engine_info->runlist_id));
+
+ gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
+ ccsr_channel_inst_ptr_f(inst_ptr) |
+ ccsr_channel_inst_target_vid_mem_f() |
+ ccsr_channel_inst_bind_true_f());
+
+ gk20a_writel(g, ccsr_channel_r(ch_gk20a->hw_chid),
+ (gk20a_readl(g, ccsr_channel_r(ch_gk20a->hw_chid)) &
+ ~ccsr_channel_enable_set_f(~0)) |
+ ccsr_channel_enable_set_true_f());
+}
+
+static void channel_gk20a_unbind(struct channel_gk20a *ch_gk20a)
+{
+ struct gk20a *g = ch_gk20a->g;
+
+ gk20a_dbg_fn("");
+
+ if (ch_gk20a->bound)
+ gk20a_writel(g, ccsr_channel_inst_r(ch_gk20a->hw_chid),
+ ccsr_channel_inst_ptr_f(0) |
+ ccsr_channel_inst_bind_false_f());
+
+ ch_gk20a->bound = false;
+}
+
+static int channel_gk20a_alloc_inst(struct gk20a *g,
+ struct channel_gk20a *ch)
+{
+ struct device *d = dev_from_gk20a(g);
+ int err = 0;
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ ch->inst_block.size = ram_in_alloc_size_v();
+ ch->inst_block.cpuva = dma_alloc_coherent(d,
+ ch->inst_block.size,
+ &iova,
+ GFP_KERNEL);
+ if (!ch->inst_block.cpuva) {
+ gk20a_err(d, "%s: memory allocation failed\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ ch->inst_block.iova = iova;
+ ch->inst_block.cpu_pa = gk20a_get_phys_from_iova(d,
+ ch->inst_block.iova);
+ if (!ch->inst_block.cpu_pa) {
+ gk20a_err(d, "%s: failed to get physical address\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ gk20a_dbg_info("channel %d inst block physical addr: 0x%16llx",
+ ch->hw_chid, (u64)ch->inst_block.cpu_pa);
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+clean_up:
+ gk20a_err(d, "fail");
+ channel_gk20a_free_inst(g, ch);
+ return err;
+}
+
+static void channel_gk20a_free_inst(struct gk20a *g,
+ struct channel_gk20a *ch)
+{
+ struct device *d = dev_from_gk20a(g);
+
+ if (ch->inst_block.cpuva)
+ dma_free_coherent(d, ch->inst_block.size,
+ ch->inst_block.cpuva, ch->inst_block.iova);
+ ch->inst_block.cpuva = NULL;
+ ch->inst_block.iova = 0;
+ memset(&ch->inst_block, 0, sizeof(struct inst_desc));
+}
+
+static int channel_gk20a_update_runlist(struct channel_gk20a *c, bool add)
+{
+ return gk20a_fifo_update_runlist(c->g, 0, c->hw_chid, add, true);
+}
+
+void gk20a_disable_channel_no_update(struct channel_gk20a *ch)
+{
+ /* ensure no fences are pending */
+ if (ch->sync)
+ ch->sync->set_min_eq_max(ch->sync);
+
+ /* disable channel */
+ gk20a_writel(ch->g, ccsr_channel_r(ch->hw_chid),
+ gk20a_readl(ch->g,
+ ccsr_channel_r(ch->hw_chid)) |
+ ccsr_channel_enable_clr_true_f());
+}
+
+static int gk20a_wait_channel_idle(struct channel_gk20a *ch)
+{
+ bool channel_idle = false;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(ch->g));
+
+ do {
+ mutex_lock(&ch->jobs_lock);
+ channel_idle = list_empty(&ch->jobs);
+ mutex_unlock(&ch->jobs_lock);
+ if (channel_idle)
+ break;
+
+ usleep_range(1000, 3000);
+ } while (time_before(jiffies, end_jiffies)
+ || !tegra_platform_is_silicon());
+
+ if (!channel_idle)
+ gk20a_err(dev_from_gk20a(ch->g), "channel jobs not freed");
+
+ return 0;
+}
+
+void gk20a_disable_channel(struct channel_gk20a *ch,
+ bool finish,
+ unsigned long finish_timeout)
+{
+ if (finish) {
+ int err = gk20a_channel_finish(ch, finish_timeout);
+ WARN_ON(err);
+ }
+
+ /* disable the channel from hw and increment syncpoints */
+ gk20a_disable_channel_no_update(ch);
+
+ gk20a_wait_channel_idle(ch);
+
+ /* preempt the channel */
+ gk20a_fifo_preempt_channel(ch->g, ch->hw_chid);
+
+ /* remove channel from runlist */
+ channel_gk20a_update_runlist(ch, false);
+}
+
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+
+static void gk20a_free_cycle_stats_buffer(struct channel_gk20a *ch)
+{
+ /* disable existing cyclestats buffer */
+ mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
+ if (ch->cyclestate.cyclestate_buffer_handler) {
+ dma_buf_vunmap(ch->cyclestate.cyclestate_buffer_handler,
+ ch->cyclestate.cyclestate_buffer);
+ dma_buf_put(ch->cyclestate.cyclestate_buffer_handler);
+ ch->cyclestate.cyclestate_buffer_handler = NULL;
+ ch->cyclestate.cyclestate_buffer = NULL;
+ ch->cyclestate.cyclestate_buffer_size = 0;
+ }
+ mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
+}
+
+static int gk20a_channel_cycle_stats(struct channel_gk20a *ch,
+ struct nvhost_cycle_stats_args *args)
+{
+ struct dma_buf *dmabuf;
+ void *virtual_address;
+
+ if (args->nvmap_handle && !ch->cyclestate.cyclestate_buffer_handler) {
+
+ /* set up new cyclestats buffer */
+ dmabuf = dma_buf_get(args->nvmap_handle);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+ virtual_address = dma_buf_vmap(dmabuf);
+ if (!virtual_address)
+ return -ENOMEM;
+
+ ch->cyclestate.cyclestate_buffer_handler = dmabuf;
+ ch->cyclestate.cyclestate_buffer = virtual_address;
+ ch->cyclestate.cyclestate_buffer_size = dmabuf->size;
+ return 0;
+
+ } else if (!args->nvmap_handle &&
+ ch->cyclestate.cyclestate_buffer_handler) {
+ gk20a_free_cycle_stats_buffer(ch);
+ return 0;
+
+ } else if (!args->nvmap_handle &&
+ !ch->cyclestate.cyclestate_buffer_handler) {
+ /* no request from GL */
+ return 0;
+
+ } else {
+ pr_err("channel already has cyclestats buffer\n");
+ return -EINVAL;
+ }
+}
+#endif
+
+static int gk20a_init_error_notifier(struct channel_gk20a *ch,
+ struct nvhost_set_error_notifier *args) {
+ void *va;
+
+ struct dma_buf *dmabuf;
+
+ if (!args->mem) {
+ pr_err("gk20a_init_error_notifier: invalid memory handle\n");
+ return -EINVAL;
+ }
+
+ dmabuf = dma_buf_get(args->mem);
+
+ if (ch->error_notifier_ref)
+ gk20a_free_error_notifiers(ch);
+
+ if (IS_ERR(dmabuf)) {
+ pr_err("Invalid handle: %d\n", args->mem);
+ return -EINVAL;
+ }
+ /* map handle */
+ va = dma_buf_vmap(dmabuf);
+ if (!va) {
+ dma_buf_put(dmabuf);
+ pr_err("Cannot map notifier handle\n");
+ return -ENOMEM;
+ }
+
+ /* set channel notifiers pointer */
+ ch->error_notifier_ref = dmabuf;
+ ch->error_notifier = va + args->offset;
+ ch->error_notifier_va = va;
+ memset(ch->error_notifier, 0, sizeof(struct nvhost_notification));
+ return 0;
+}
+
+void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error)
+{
+ if (ch->error_notifier_ref) {
+ struct timespec time_data;
+ u64 nsec;
+ getnstimeofday(&time_data);
+ nsec = ((u64)time_data.tv_sec) * 1000000000u +
+ (u64)time_data.tv_nsec;
+ ch->error_notifier->time_stamp.nanoseconds[0] =
+ (u32)nsec;
+ ch->error_notifier->time_stamp.nanoseconds[1] =
+ (u32)(nsec >> 32);
+ ch->error_notifier->info32 = error;
+ ch->error_notifier->status = 0xffff;
+ gk20a_err(dev_from_gk20a(ch->g),
+ "error notifier set to %d\n", error);
+ }
+}
+
+static void gk20a_free_error_notifiers(struct channel_gk20a *ch)
+{
+ if (ch->error_notifier_ref) {
+ dma_buf_vunmap(ch->error_notifier_ref, ch->error_notifier_va);
+ dma_buf_put(ch->error_notifier_ref);
+ ch->error_notifier_ref = 0;
+ ch->error_notifier = 0;
+ ch->error_notifier_va = 0;
+ }
+}
+
+void gk20a_free_channel(struct channel_gk20a *ch, bool finish)
+{
+ struct gk20a *g = ch->g;
+ struct device *d = dev_from_gk20a(g);
+ struct fifo_gk20a *f = &g->fifo;
+ struct gr_gk20a *gr = &g->gr;
+ struct vm_gk20a *ch_vm = ch->vm;
+ unsigned long timeout = gk20a_get_gr_idle_timeout(g);
+ struct dbg_session_gk20a *dbg_s;
+
+ gk20a_dbg_fn("");
+
+ /* if engine reset was deferred, perform it now */
+ mutex_lock(&f->deferred_reset_mutex);
+ if (g->fifo.deferred_reset_pending) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "engine reset was"
+ " deferred, running now");
+ fifo_gk20a_finish_mmu_fault_handling(g, g->fifo.mmu_fault_engines);
+ g->fifo.mmu_fault_engines = 0;
+ g->fifo.deferred_reset_pending = false;
+ }
+ mutex_unlock(&f->deferred_reset_mutex);
+
+ if (!ch->bound)
+ return;
+
+ if (!gk20a_channel_as_bound(ch))
+ goto unbind;
+
+ gk20a_dbg_info("freeing bound channel context, timeout=%ld",
+ timeout);
+
+ gk20a_disable_channel(ch, finish && !ch->has_timedout, timeout);
+
+ gk20a_free_error_notifiers(ch);
+
+ /* release channel ctx */
+ gk20a_free_channel_ctx(ch);
+
+ gk20a_gr_flush_channel_tlb(gr);
+
+ memset(&ch->ramfc, 0, sizeof(struct mem_desc_sub));
+
+ /* free gpfifo */
+ if (ch->gpfifo.gpu_va)
+ gk20a_gmmu_unmap(ch_vm, ch->gpfifo.gpu_va,
+ ch->gpfifo.size, gk20a_mem_flag_none);
+ if (ch->gpfifo.cpu_va)
+ dma_free_coherent(d, ch->gpfifo.size,
+ ch->gpfifo.cpu_va, ch->gpfifo.iova);
+ ch->gpfifo.cpu_va = NULL;
+ ch->gpfifo.iova = 0;
+
+ gk20a_mm_l2_invalidate(ch->g);
+
+ memset(&ch->gpfifo, 0, sizeof(struct gpfifo_desc));
+
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+ gk20a_free_cycle_stats_buffer(ch);
+#endif
+
+ channel_gk20a_free_priv_cmdbuf(ch);
+
+ if (ch->sync) {
+ ch->sync->destroy(ch->sync);
+ ch->sync = NULL;
+ }
+
+ /* release channel binding to the as_share */
+ gk20a_as_release_share(ch_vm->as_share);
+
+unbind:
+ channel_gk20a_unbind(ch);
+ channel_gk20a_free_inst(g, ch);
+
+ ch->vpr = false;
+ ch->vm = NULL;
+ WARN_ON(ch->sync);
+
+ /* unlink all debug sessions */
+ mutex_lock(&ch->dbg_s_lock);
+
+ list_for_each_entry(dbg_s, &ch->dbg_s_list, dbg_s_list_node) {
+ dbg_s->ch = NULL;
+ list_del_init(&dbg_s->dbg_s_list_node);
+ }
+
+ mutex_unlock(&ch->dbg_s_lock);
+
+ /* ALWAYS last */
+ release_used_channel(f, ch);
+}
+
+int gk20a_channel_release(struct inode *inode, struct file *filp)
+{
+ struct channel_gk20a *ch = (struct channel_gk20a *)filp->private_data;
+ struct gk20a *g = ch->g;
+
+ trace_gk20a_channel_release(dev_name(&g->dev->dev));
+
+ gk20a_channel_busy(ch->g->dev);
+ gk20a_free_channel(ch, true);
+ gk20a_channel_idle(ch->g->dev);
+
+ gk20a_put_client(g);
+ filp->private_data = NULL;
+ return 0;
+}
+
+static struct channel_gk20a *gk20a_open_new_channel(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch;
+
+ ch = acquire_unused_channel(f);
+ if (ch == NULL) {
+ /* TBD: we want to make this virtualizable */
+ gk20a_err(dev_from_gk20a(g), "out of hw chids");
+ return 0;
+ }
+
+ ch->g = g;
+
+ if (channel_gk20a_alloc_inst(g, ch)) {
+ ch->in_use = false;
+ gk20a_err(dev_from_gk20a(g),
+ "failed to open gk20a channel, out of inst mem");
+
+ return 0;
+ }
+ g->ops.fifo.bind_channel(ch);
+ ch->pid = current->pid;
+
+ /* reset timeout counter and update timestamp */
+ ch->timeout_accumulated_ms = 0;
+ ch->timeout_gpfifo_get = 0;
+ /* set gr host default timeout */
+ ch->timeout_ms_max = gk20a_get_gr_idle_timeout(g);
+ ch->timeout_debug_dump = true;
+ ch->has_timedout = false;
+
+ /* The channel is *not* runnable at this point. It still needs to have
+ * an address space bound and allocate a gpfifo and grctx. */
+
+ init_waitqueue_head(&ch->notifier_wq);
+ init_waitqueue_head(&ch->semaphore_wq);
+ init_waitqueue_head(&ch->submit_wq);
+
+ return ch;
+}
+
+static int __gk20a_channel_open(struct gk20a *g, struct file *filp)
+{
+ int err;
+ struct channel_gk20a *ch;
+
+ trace_gk20a_channel_open(dev_name(&g->dev->dev));
+
+ err = gk20a_get_client(g);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to get client ref");
+ return err;
+ }
+
+ err = gk20a_channel_busy(g->dev);
+ if (err) {
+ gk20a_put_client(g);
+ gk20a_err(dev_from_gk20a(g), "failed to power on, %d", err);
+ return err;
+ }
+ ch = gk20a_open_new_channel(g);
+ gk20a_channel_idle(g->dev);
+ if (!ch) {
+ gk20a_put_client(g);
+ gk20a_err(dev_from_gk20a(g),
+ "failed to get f");
+ return -ENOMEM;
+ }
+
+ filp->private_data = ch;
+ return 0;
+}
+
+int gk20a_channel_open(struct inode *inode, struct file *filp)
+{
+ struct gk20a *g = container_of(inode->i_cdev,
+ struct gk20a, channel.cdev);
+ return __gk20a_channel_open(g, filp);
+}
+
+/* allocate private cmd buffer.
+ used for inserting commands before/after user submitted buffers. */
+static int channel_gk20a_alloc_priv_cmdbuf(struct channel_gk20a *c)
+{
+ struct device *d = dev_from_gk20a(c->g);
+ struct vm_gk20a *ch_vm = c->vm;
+ struct priv_cmd_queue *q = &c->priv_cmd_q;
+ struct priv_cmd_entry *e;
+ u32 i = 0, size;
+ int err = 0;
+ struct sg_table *sgt;
+ dma_addr_t iova;
+
+ /* Kernel can insert gpfifos before and after user gpfifos.
+ Before user gpfifos, kernel inserts fence_wait, which takes
+ syncpoint_a (2 dwords) + syncpoint_b (2 dwords) = 4 dwords.
+ After user gpfifos, kernel inserts fence_get, which takes
+ wfi (2 dwords) + syncpoint_a (2 dwords) + syncpoint_b (2 dwords)
+ = 6 dwords.
+ Worst case, if the kernel adds both of them for every user gpfifo,
+ the max size of the priv_cmdbuf is:
+ (gpfifo entry number * (2 / 3) * (4 + 6) * 4) bytes. */
+ size = roundup_pow_of_two(
+ c->gpfifo.entry_num * 2 * 10 * sizeof(u32) / 3);
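+ /* Illustrative sizing (assumed numbers, not from the patch): with 1024
+ * gpfifo entries, 1024 * 2 * 10 * 4 / 3 = 27306 bytes, which
+ * roundup_pow_of_two() grows to a 32768-byte queue. */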
+
+ q->mem.base_cpuva = dma_alloc_coherent(d, size,
+ &iova,
+ GFP_KERNEL);
+ if (!q->mem.base_cpuva) {
+ gk20a_err(d, "%s: memory allocation failed\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ q->mem.base_iova = iova;
+ q->mem.size = size;
+
+ err = gk20a_get_sgtable(d, &sgt,
+ q->mem.base_cpuva, q->mem.base_iova, size);
+ if (err) {
+ gk20a_err(d, "%s: failed to create sg table\n", __func__);
+ goto clean_up;
+ }
+
+ memset(q->mem.base_cpuva, 0, size);
+
+ q->base_gpuva = gk20a_gmmu_map(ch_vm, &sgt,
+ size,
+ 0, /* flags */
+ gk20a_mem_flag_none);
+ if (!q->base_gpuva) {
+ gk20a_err(d, "ch %d : failed to map gpu va"
+ "for priv cmd buffer", c->hw_chid);
+ err = -ENOMEM;
+ goto clean_up_sgt;
+ }
+
+ q->size = q->mem.size / sizeof(u32);
+
+ INIT_LIST_HEAD(&q->head);
+ INIT_LIST_HEAD(&q->free);
+
+ /* pre-alloc 25% of priv cmdbuf entries and put them on free list */
+ for (i = 0; i < q->size / 4; i++) {
+ e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
+ if (!e) {
+ gk20a_err(d, "ch %d: fail to pre-alloc cmd entry",
+ c->hw_chid);
+ err = -ENOMEM;
+ goto clean_up_sgt;
+ }
+ e->pre_alloc = true;
+ list_add(&e->list, &q->free);
+ }
+
+ gk20a_free_sgtable(&sgt);
+
+ return 0;
+
+clean_up_sgt:
+ gk20a_free_sgtable(&sgt);
+clean_up:
+ channel_gk20a_free_priv_cmdbuf(c);
+ return err;
+}
+
+static void channel_gk20a_free_priv_cmdbuf(struct channel_gk20a *c)
+{
+ struct device *d = dev_from_gk20a(c->g);
+ struct vm_gk20a *ch_vm = c->vm;
+ struct priv_cmd_queue *q = &c->priv_cmd_q;
+ struct priv_cmd_entry *e;
+ struct list_head *pos, *tmp, *head;
+
+ if (q->size == 0)
+ return;
+
+ if (q->base_gpuva)
+ gk20a_gmmu_unmap(ch_vm, q->base_gpuva,
+ q->mem.size, gk20a_mem_flag_none);
+ if (q->mem.base_cpuva)
+ dma_free_coherent(d, q->mem.size,
+ q->mem.base_cpuva, q->mem.base_iova);
+ q->mem.base_cpuva = NULL;
+ q->mem.base_iova = 0;
+
+ /* free used list */
+ head = &q->head;
+ list_for_each_safe(pos, tmp, head) {
+ e = container_of(pos, struct priv_cmd_entry, list);
+ free_priv_cmdbuf(c, e);
+ }
+
+ /* free free list */
+ head = &q->free;
+ list_for_each_safe(pos, tmp, head) {
+ e = container_of(pos, struct priv_cmd_entry, list);
+ e->pre_alloc = false;
+ free_priv_cmdbuf(c, e);
+ }
+
+ memset(q, 0, sizeof(struct priv_cmd_queue));
+}
+
+/* allocate a cmd buffer with given size. size is number of u32 entries */
+int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 orig_size,
+ struct priv_cmd_entry **entry)
+{
+ struct priv_cmd_queue *q = &c->priv_cmd_q;
+ struct priv_cmd_entry *e;
+ struct list_head *node;
+ u32 free_count;
+ u32 size = orig_size;
+ bool no_retry = false;
+
+ gk20a_dbg_fn("size %d", orig_size);
+
+ *entry = NULL;
+
+ /* if the free space at the end is less than requested, increase the
+ * size so that the real allocation starts from the beginning. */
+ if (q->put + size > q->size)
+ size = orig_size + (q->size - q->put);
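+ /* Example with assumed numbers: if q->size is 8192 words, q->put is
+ * 8000 and orig_size is 300, size grows to 300 + 192 = 492, so the
+ * free-space check below also covers the skipped 192-word tail. */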
+
+ gk20a_dbg_info("ch %d: priv cmd queue get:put %d:%d",
+ c->hw_chid, q->get, q->put);
+
+TRY_AGAIN:
+ free_count = (q->size - (q->put - q->get) - 1) % q->size;
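+ /* Example with assumed numbers: q->size = 8192, put = 100, get = 40
+ * gives free_count = (8192 - 60 - 1) % 8192 = 8131 words; one slot is
+ * always kept free to distinguish a full queue from an empty one. */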
+
+ if (size > free_count) {
+ if (!no_retry) {
+ recycle_priv_cmdbuf(c);
+ no_retry = true;
+ goto TRY_AGAIN;
+ } else
+ return -EAGAIN;
+ }
+
+ if (unlikely(list_empty(&q->free))) {
+
+ gk20a_dbg_info("ch %d: run out of pre-alloc entries",
+ c->hw_chid);
+
+ e = kzalloc(sizeof(struct priv_cmd_entry), GFP_KERNEL);
+ if (!e) {
+ gk20a_err(dev_from_gk20a(c->g),
+ "ch %d: fail to allocate priv cmd entry",
+ c->hw_chid);
+ return -ENOMEM;
+ }
+ } else {
+ node = q->free.next;
+ list_del(node);
+ e = container_of(node, struct priv_cmd_entry, list);
+ }
+
+ e->size = orig_size;
+ e->gp_get = c->gpfifo.get;
+ e->gp_put = c->gpfifo.put;
+ e->gp_wrap = c->gpfifo.wrap;
+
+ /* if we have increased size to skip free space in the end, set put
+ to beginning of cmd buffer (0) + size */
+ if (size != orig_size) {
+ e->ptr = q->mem.base_cpuva;
+ e->gva = q->base_gpuva;
+ q->put = orig_size;
+ } else {
+ e->ptr = q->mem.base_cpuva + q->put;
+ e->gva = q->base_gpuva + q->put * sizeof(u32);
+ q->put = (q->put + orig_size) & (q->size - 1);
+ }
+
+ /* we already handled q->put + size > q->size so BUG_ON this */
+ BUG_ON(q->put > q->size);
+
+ /* add new entry to head since we free from head */
+ list_add(&e->list, &q->head);
+
+ *entry = e;
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+}
+
+/* Don't call this to free an explicit cmd entry.
+ * It doesn't update priv_cmd_queue get/put */
+static void free_priv_cmdbuf(struct channel_gk20a *c,
+ struct priv_cmd_entry *e)
+{
+ struct priv_cmd_queue *q = &c->priv_cmd_q;
+
+ if (!e)
+ return;
+
+ list_del(&e->list);
+
+ if (unlikely(!e->pre_alloc))
+ kfree(e);
+ else {
+ memset(e, 0, sizeof(struct priv_cmd_entry));
+ e->pre_alloc = true;
+ list_add(&e->list, &q->free);
+ }
+}
+
+/* free entries if they're no longer being used */
+static void recycle_priv_cmdbuf(struct channel_gk20a *c)
+{
+ struct priv_cmd_queue *q = &c->priv_cmd_q;
+ struct priv_cmd_entry *e, *tmp;
+ struct list_head *head = &q->head;
+ bool wrap_around, found = false;
+
+ gk20a_dbg_fn("");
+
+ /* Find the most recent free entry. Free it and everything before it */
+ list_for_each_entry(e, head, list) {
+
+ gk20a_dbg_info("ch %d: cmd entry get:put:wrap %d:%d:%d "
+ "curr get:put:wrap %d:%d:%d",
+ c->hw_chid, e->gp_get, e->gp_put, e->gp_wrap,
+ c->gpfifo.get, c->gpfifo.put, c->gpfifo.wrap);
+
+ wrap_around = (c->gpfifo.wrap != e->gp_wrap);
+ if (e->gp_get < e->gp_put) {
+ if (c->gpfifo.get >= e->gp_put ||
+ wrap_around) {
+ found = true;
+ break;
+ } else
+ e->gp_get = c->gpfifo.get;
+ } else if (e->gp_get > e->gp_put) {
+ if (wrap_around &&
+ c->gpfifo.get >= e->gp_put) {
+ found = true;
+ break;
+ } else
+ e->gp_get = c->gpfifo.get;
+ }
+ }
+
+ if (found)
+ q->get = (e->ptr - q->mem.base_cpuva) + e->size;
+ else {
+ gk20a_dbg_info("no free entry recycled");
+ return;
+ }
+
+ list_for_each_entry_safe_continue(e, tmp, head, list) {
+ free_priv_cmdbuf(c, e);
+ }
+
+ gk20a_dbg_fn("done");
+}
+
+
+static int gk20a_alloc_channel_gpfifo(struct channel_gk20a *c,
+ struct nvhost_alloc_gpfifo_args *args)
+{
+ struct gk20a *g = c->g;
+ struct device *d = dev_from_gk20a(g);
+ struct vm_gk20a *ch_vm;
+ u32 gpfifo_size;
+ int err = 0;
+ struct sg_table *sgt;
+ dma_addr_t iova;
+
+ /* Kernel can insert one extra gpfifo entry before user submitted gpfifos
+ and another one after, for internal usage. Triple the requested size. */
+ gpfifo_size = roundup_pow_of_two(args->num_entries * 3);
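+ /* Illustrative example (assumed request size): asking for 512 entries
+ * yields 512 * 3 = 1536, which roundup_pow_of_two() grows to 2048. */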
+
+ if (args->flags & NVHOST_ALLOC_GPFIFO_FLAGS_VPR_ENABLED)
+ c->vpr = true;
+
+ /* an address space needs to have been bound at this point. */
+ if (!gk20a_channel_as_bound(c)) {
+ gk20a_err(d,
+ "not bound to an address space at time of gpfifo"
+ " allocation. Attempting to create and bind to"
+ " one...");
+ return -EINVAL;
+ }
+ ch_vm = c->vm;
+
+ c->cmds_pending = false;
+ c->last_submit_fence.valid = false;
+
+ c->ramfc.offset = 0;
+ c->ramfc.size = ram_in_ramfc_s() / 8;
+
+ if (c->gpfifo.cpu_va) {
+ gk20a_err(d, "channel %d :"
+ "gpfifo already allocated", c->hw_chid);
+ return -EEXIST;
+ }
+
+ c->gpfifo.size = gpfifo_size * sizeof(struct gpfifo);
+ c->gpfifo.cpu_va = (struct gpfifo *)dma_alloc_coherent(d,
+ c->gpfifo.size,
+ &iova,
+ GFP_KERNEL);
+ if (!c->gpfifo.cpu_va) {
+ gk20a_err(d, "%s: memory allocation failed\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ c->gpfifo.iova = iova;
+ c->gpfifo.entry_num = gpfifo_size;
+
+ c->gpfifo.get = c->gpfifo.put = 0;
+
+ err = gk20a_get_sgtable(d, &sgt,
+ c->gpfifo.cpu_va, c->gpfifo.iova, c->gpfifo.size);
+ if (err) {
+ gk20a_err(d, "%s: failed to allocate sg table\n", __func__);
+ goto clean_up;
+ }
+
+ c->gpfifo.gpu_va = gk20a_gmmu_map(ch_vm,
+ &sgt,
+ c->gpfifo.size,
+ 0, /* flags */
+ gk20a_mem_flag_none);
+ if (!c->gpfifo.gpu_va) {
+ gk20a_err(d, "channel %d : failed to map"
+ " gpu_va for gpfifo", c->hw_chid);
+ err = -ENOMEM;
+ goto clean_up_sgt;
+ }
+
+ gk20a_dbg_info("channel %d : gpfifo_base 0x%016llx, size %d",
+ c->hw_chid, c->gpfifo.gpu_va, c->gpfifo.entry_num);
+
+ channel_gk20a_setup_ramfc(c, c->gpfifo.gpu_va, c->gpfifo.entry_num);
+
+ channel_gk20a_setup_userd(c);
+ channel_gk20a_commit_userd(c);
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ /* TBD: setup engine contexts */
+
+ err = channel_gk20a_alloc_priv_cmdbuf(c);
+ if (err)
+ goto clean_up_unmap;
+
+ err = channel_gk20a_update_runlist(c, true);
+ if (err)
+ goto clean_up_unmap;
+
+ gk20a_free_sgtable(&sgt);
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+clean_up_unmap:
+ gk20a_gmmu_unmap(ch_vm, c->gpfifo.gpu_va,
+ c->gpfifo.size, gk20a_mem_flag_none);
+clean_up_sgt:
+ gk20a_free_sgtable(&sgt);
+clean_up:
+ dma_free_coherent(d, c->gpfifo.size,
+ c->gpfifo.cpu_va, c->gpfifo.iova);
+ c->gpfifo.cpu_va = NULL;
+ c->gpfifo.iova = 0;
+ memset(&c->gpfifo, 0, sizeof(struct gpfifo_desc));
+ gk20a_err(d, "fail");
+ return err;
+}
+
+static inline int wfi_cmd_size(void)
+{
+ return 2;
+}
+
+void add_wfi_cmd(struct priv_cmd_entry *cmd, int *i)
+{
+ /* wfi */
+ cmd->ptr[(*i)++] = 0x2001001E;
+ /* handle, ignored */
+ cmd->ptr[(*i)++] = 0x00000000;
+}
+
+static inline bool check_gp_put(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ u32 put;
+ /* gp_put changed unexpectedly since last update? */
+ put = gk20a_bar1_readl(g,
+ c->userd_gpu_va + 4 * ram_userd_gp_put_w());
+ if (c->gpfifo.put != put) {
+ /*TBD: BUG_ON/teardown on this*/
+ gk20a_err(dev_from_gk20a(g), "gp_put changed unexpectedly "
+ "since last update");
+ c->gpfifo.put = put;
+ return false; /* surprise! */
+ }
+ return true; /* checked out ok */
+}
+
+/* Update with this periodically to determine how the gpfifo is draining. */
+static inline u32 update_gp_get(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ u32 new_get = gk20a_bar1_readl(g,
+ c->userd_gpu_va + sizeof(u32) * ram_userd_gp_get_w());
+ if (new_get < c->gpfifo.get)
+ c->gpfifo.wrap = !c->gpfifo.wrap;
+ c->gpfifo.get = new_get;
+ return new_get;
+}
+
+static inline u32 gp_free_count(struct channel_gk20a *c)
+{
+ return (c->gpfifo.entry_num - (c->gpfifo.put - c->gpfifo.get) - 1) %
+ c->gpfifo.entry_num;
+}
+
+bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
+ u32 timeout_delta_ms)
+{
+ u32 gpfifo_get = update_gp_get(ch->g, ch);
+ /* Count consequent timeout isr */
+ if (gpfifo_get == ch->timeout_gpfifo_get) {
+ /* we didn't advance since previous channel timeout check */
+ ch->timeout_accumulated_ms += timeout_delta_ms;
+ } else {
+ /* first timeout isr encountered */
+ ch->timeout_accumulated_ms = timeout_delta_ms;
+ }
+
+ ch->timeout_gpfifo_get = gpfifo_get;
+
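+ /* Rough illustration (assumed values, not from the patch): with a
+ * 10000 ms timeout_ms_max and 100 ms of timeout_delta_ms per ISR,
+ * roughly a hundred consecutive ISRs without gpfifo progress are
+ * needed before this function reports a timeout. */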
+ return ch->g->timeouts_enabled &&
+ ch->timeout_accumulated_ms > ch->timeout_ms_max;
+}
+
+
+/* Issue a syncpoint increment *preceded* by a wait-for-idle
+ * command. All commands on the channel will have been
+ * consumed at the time the fence syncpoint increment occurs.
+ */
+static int gk20a_channel_submit_wfi(struct channel_gk20a *c)
+{
+ struct priv_cmd_entry *cmd = NULL;
+ struct gk20a *g = c->g;
+ u32 free_count;
+ int err;
+
+ if (c->has_timedout)
+ return -ETIMEDOUT;
+
+ if (!c->sync) {
+ c->sync = gk20a_channel_sync_create(c);
+ if (!c->sync)
+ return -ENOMEM;
+ }
+
+ update_gp_get(g, c);
+ free_count = gp_free_count(c);
+ if (unlikely(!free_count)) {
+ gk20a_err(dev_from_gk20a(g),
+ "not enough gpfifo space");
+ return -EAGAIN;
+ }
+
+ err = c->sync->incr_wfi(c->sync, &cmd, &c->last_submit_fence);
+ if (unlikely(err))
+ return err;
+
+ WARN_ON(!c->last_submit_fence.wfi);
+
+ c->gpfifo.cpu_va[c->gpfifo.put].entry0 = u64_lo32(cmd->gva);
+ c->gpfifo.cpu_va[c->gpfifo.put].entry1 = u64_hi32(cmd->gva) |
+ pbdma_gp_entry1_length_f(cmd->size);
+
+ c->gpfifo.put = (c->gpfifo.put + 1) & (c->gpfifo.entry_num - 1);
+
+ /* save gp_put */
+ cmd->gp_put = c->gpfifo.put;
+
+ gk20a_bar1_writel(g,
+ c->userd_gpu_va + 4 * ram_userd_gp_put_w(),
+ c->gpfifo.put);
+
+ gk20a_dbg_info("post-submit put %d, get %d, size %d",
+ c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+ return 0;
+}
+
+static u32 get_gp_free_count(struct channel_gk20a *c)
+{
+ update_gp_get(c->g, c);
+ return gp_free_count(c);
+}
+
+static void trace_write_pushbuffer(struct channel_gk20a *c, struct gpfifo *g)
+{
+ void *mem = NULL;
+ unsigned int words;
+ u64 offset;
+ struct dma_buf *dmabuf = NULL;
+
+ if (gk20a_debug_trace_cmdbuf) {
+ u64 gpu_va = (u64)g->entry0 |
+ (u64)((u64)pbdma_gp_entry1_get_hi_v(g->entry1) << 32);
+ int err;
+
+ words = pbdma_gp_entry1_length_v(g->entry1);
+ err = gk20a_vm_find_buffer(c->vm, gpu_va, &dmabuf, &offset);
+ if (!err)
+ mem = dma_buf_vmap(dmabuf);
+ }
+
+ if (mem) {
+ u32 i;
+ /*
+ * Write in batches of 128 as there seems to be a limit
+ * of how much you can output to ftrace at once.
+ */
+ for (i = 0; i < words; i += 128U) {
+ trace_gk20a_push_cmdbuf(
+ c->g->dev->name,
+ 0,
+ min(words - i, 128U),
+ offset + i * sizeof(u32),
+ mem);
+ }
+ dma_buf_vunmap(dmabuf, mem);
+ }
+}
+
+static int gk20a_channel_add_job(struct channel_gk20a *c,
+ struct gk20a_channel_fence *fence)
+{
+ struct vm_gk20a *vm = c->vm;
+ struct channel_gk20a_job *job = NULL;
+ struct mapped_buffer_node **mapped_buffers = NULL;
+ int err = 0, num_mapped_buffers;
+
+ /* job needs reference to this vm */
+ gk20a_vm_get(vm);
+
+ err = gk20a_vm_get_buffers(vm, &mapped_buffers, &num_mapped_buffers);
+ if (err) {
+ gk20a_vm_put(vm);
+ return err;
+ }
+
+ job = kzalloc(sizeof(*job), GFP_KERNEL);
+ if (!job) {
+ gk20a_vm_put_buffers(vm, mapped_buffers, num_mapped_buffers);
+ gk20a_vm_put(vm);
+ return -ENOMEM;
+ }
+
+ job->num_mapped_buffers = num_mapped_buffers;
+ job->mapped_buffers = mapped_buffers;
+ job->fence = *fence;
+
+ mutex_lock(&c->jobs_lock);
+ list_add_tail(&job->list, &c->jobs);
+ mutex_unlock(&c->jobs_lock);
+
+ return 0;
+}
+
+void gk20a_channel_update(struct channel_gk20a *c, int nr_completed)
+{
+ struct gk20a *g = c->g;
+ struct vm_gk20a *vm = c->vm;
+ struct channel_gk20a_job *job, *n;
+ int i;
+
+ wake_up(&c->submit_wq);
+
+ mutex_lock(&c->jobs_lock);
+ list_for_each_entry_safe(job, n, &c->jobs, list) {
+ bool completed = WARN_ON(!c->sync) ||
+ c->sync->is_expired(c->sync, &job->fence);
+ if (!completed)
+ break;
+
+ gk20a_vm_put_buffers(vm, job->mapped_buffers,
+ job->num_mapped_buffers);
+
+ /* job is done. release its reference to vm */
+ gk20a_vm_put(vm);
+
+ list_del_init(&job->list);
+ kfree(job);
+ gk20a_channel_idle(g->dev);
+ }
+ mutex_unlock(&c->jobs_lock);
+
+ for (i = 0; i < nr_completed; i++)
+ gk20a_channel_idle(c->g->dev);
+}
+
+static int gk20a_submit_channel_gpfifo(struct channel_gk20a *c,
+ struct nvhost_gpfifo *gpfifo,
+ u32 num_entries,
+ struct nvhost_fence *fence,
+ u32 flags)
+{
+ struct gk20a *g = c->g;
+ struct device *d = dev_from_gk20a(g);
+ u32 err = 0;
+ int i;
+ struct priv_cmd_entry *wait_cmd = NULL;
+ struct priv_cmd_entry *incr_cmd = NULL;
+ /* we might need two extra gpfifo entries - one for pre fence
+ * and one for post fence. */
+ const int extra_entries = 2;
+
+ if (c->has_timedout)
+ return -ETIMEDOUT;
+
+ if ((flags & (NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT |
+ NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET)) &&
+ !fence)
+ return -EINVAL;
+
+ if (!c->sync) {
+ c->sync = gk20a_channel_sync_create(c);
+ if (!c->sync)
+ return -ENOMEM;
+ }
+
+#ifdef CONFIG_DEBUG_FS
+ /* update debug settings */
+ if (g->ops.ltc.sync_debugfs)
+ g->ops.ltc.sync_debugfs(g);
+#endif
+
+ gk20a_dbg_info("channel %d", c->hw_chid);
+
+ /* gk20a_channel_update releases this ref. */
+ gk20a_channel_busy(g->dev);
+
+ trace_gk20a_channel_submit_gpfifo(c->g->dev->name,
+ c->hw_chid,
+ num_entries,
+ flags,
+ fence->syncpt_id, fence->value);
+ check_gp_put(g, c);
+ update_gp_get(g, c);
+
+ gk20a_dbg_info("pre-submit put %d, get %d, size %d",
+ c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+ /* Invalidate tlb if it's dirty... */
+ /* TBD: this should be done in the cmd stream, not with PRIs. */
+ /* We don't know what context is currently running... */
+ /* Note also: there can be more than one context associated with the */
+ /* address space (vm). */
+ gk20a_mm_tlb_invalidate(c->vm);
+
+ /* Make sure we have enough space for gpfifo entries. If not,
+ * wait for signals from completed submits */
+ if (gp_free_count(c) < num_entries + extra_entries) {
+ err = wait_event_interruptible(c->submit_wq,
+ get_gp_free_count(c) >= num_entries + extra_entries ||
+ c->has_timedout);
+ }
+
+ if (c->has_timedout) {
+ err = -ETIMEDOUT;
+ goto clean_up;
+ }
+
+ if (err) {
+ gk20a_err(d, "not enough gpfifo space");
+ err = -EAGAIN;
+ goto clean_up;
+ }
+
+ /*
+ * optionally insert syncpt wait in the beginning of gpfifo submission
+ * when user requested and the wait hasn't expired.
+ * validate that the id makes sense, elide if not
+ * the only reason this isn't being unceremoniously killed is to
+ * keep running some tests which trigger this condition
+ */
+ if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_WAIT) {
+ if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)
+ err = c->sync->wait_fd(c->sync, fence->syncpt_id,
+ &wait_cmd);
+ else
+ err = c->sync->wait_syncpt(c->sync, fence->syncpt_id,
+ fence->value, &wait_cmd);
+ }
+ if (err)
+ goto clean_up;
+
+
+ /* always insert syncpt increment at end of gpfifo submission
+ to keep track of method completion for idle railgating */
+ if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET &&
+ flags & NVHOST_SUBMIT_GPFIFO_FLAGS_SYNC_FENCE)
+ err = c->sync->incr_user_fd(c->sync, &incr_cmd,
+ &c->last_submit_fence,
+ &fence->syncpt_id);
+ else if (flags & NVHOST_SUBMIT_GPFIFO_FLAGS_FENCE_GET)
+ err = c->sync->incr_user_syncpt(c->sync, &incr_cmd,
+ &c->last_submit_fence,
+ &fence->syncpt_id,
+ &fence->value);
+ else
+ err = c->sync->incr(c->sync, &incr_cmd,
+ &c->last_submit_fence);
+ if (err)
+ goto clean_up;
+
+ if (wait_cmd) {
+ c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+ u64_lo32(wait_cmd->gva);
+ c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+ u64_hi32(wait_cmd->gva) |
+ pbdma_gp_entry1_length_f(wait_cmd->size);
+ trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
+
+ c->gpfifo.put = (c->gpfifo.put + 1) &
+ (c->gpfifo.entry_num - 1);
+
+ /* save gp_put */
+ wait_cmd->gp_put = c->gpfifo.put;
+ }
+
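+ /* gpfifo.entry_num is expected to be a power of two (an assumption the
+ * masking below relies on), so ANDing the incremented put pointer with
+ * (entry_num - 1) wraps it around the ring. */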
+ for (i = 0; i < num_entries; i++) {
+ c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+ gpfifo[i].entry0; /* cmd buf va low 32 */
+ c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+ gpfifo[i].entry1; /* cmd buf va high 32 | words << 10 */
+ trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
+ c->gpfifo.put = (c->gpfifo.put + 1) &
+ (c->gpfifo.entry_num - 1);
+ }
+
+ if (incr_cmd) {
+ c->gpfifo.cpu_va[c->gpfifo.put].entry0 =
+ u64_lo32(incr_cmd->gva);
+ c->gpfifo.cpu_va[c->gpfifo.put].entry1 =
+ u64_hi32(incr_cmd->gva) |
+ pbdma_gp_entry1_length_f(incr_cmd->size);
+ trace_write_pushbuffer(c, &c->gpfifo.cpu_va[c->gpfifo.put]);
+
+ c->gpfifo.put = (c->gpfifo.put + 1) &
+ (c->gpfifo.entry_num - 1);
+
+ /* save gp_put */
+ incr_cmd->gp_put = c->gpfifo.put;
+ }
+
+ /* Invalidate tlb if it's dirty...
+ * TBD: this should be done in the cmd stream, not with PRIs.
+ * We don't know what context is currently running...
+ * Note also: there can be more than one context associated with the
+ * address space (vm). */
+ gk20a_mm_tlb_invalidate(c->vm);
+
+ trace_gk20a_channel_submitted_gpfifo(c->g->dev->name,
+ c->hw_chid,
+ num_entries,
+ flags,
+ fence->syncpt_id, fence->value);
+
+ /* TODO! Check for errors... */
+ gk20a_channel_add_job(c, &c->last_submit_fence);
+
+ c->cmds_pending = true;
+ gk20a_bar1_writel(g,
+ c->userd_gpu_va + 4 * ram_userd_gp_put_w(),
+ c->gpfifo.put);
+
+ gk20a_dbg_info("post-submit put %d, get %d, size %d",
+ c->gpfifo.put, c->gpfifo.get, c->gpfifo.entry_num);
+
+ gk20a_dbg_fn("done");
+ return err;
+
+clean_up:
+ gk20a_err(d, "fail");
+ free_priv_cmdbuf(c, wait_cmd);
+ free_priv_cmdbuf(c, incr_cmd);
+ gk20a_channel_idle(g->dev);
+ return err;
+}
+
+void gk20a_remove_channel_support(struct channel_gk20a *c)
+{
+
+}
+
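+/* Software-only initialization of per-channel state; no hardware is touched
+ * here. Binding to hardware happens later (see g->ops.fifo.bind_channel()). */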
+int gk20a_init_channel_support(struct gk20a *g, u32 chid)
+{
+ struct channel_gk20a *c = g->fifo.channel+chid;
+ c->g = g;
+ c->in_use = false;
+ c->hw_chid = chid;
+ c->bound = false;
+ c->remove_support = gk20a_remove_channel_support;
+ mutex_init(&c->jobs_lock);
+ INIT_LIST_HEAD(&c->jobs);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+ mutex_init(&c->cyclestate.cyclestate_buffer_mutex);
+#endif
+ INIT_LIST_HEAD(&c->dbg_s_list);
+ mutex_init(&c->dbg_s_lock);
+
+ return 0;
+}
+
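+/* Wait until all work submitted on the channel has completed: if the last
+ * submit did not already end with a wfi-tagged fence, submit a wfi +
+ * increment first, then wait on that fence from the CPU. */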
+int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout)
+{
+ int err = 0;
+
+ if (!ch->cmds_pending)
+ return 0;
+
+ /* Do not wait for a timedout channel */
+ if (ch->has_timedout)
+ return -ETIMEDOUT;
+
+ if (!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi)) {
+ gk20a_dbg_fn("issuing wfi, incr to finish the channel");
+ err = gk20a_channel_submit_wfi(ch);
+ }
+ if (err)
+ return err;
+
+ BUG_ON(!(ch->last_submit_fence.valid && ch->last_submit_fence.wfi));
+
+ gk20a_dbg_fn("waiting for channel to finish thresh:%d",
+ ch->last_submit_fence.thresh);
+
+ err = ch->sync->wait_cpu(ch->sync, &ch->last_submit_fence, timeout);
+ if (WARN_ON(err))
+ dev_warn(dev_from_gk20a(ch->g),
+ "timed out waiting for gk20a channel to finish");
+ else
+ ch->cmds_pending = false;
+
+ return err;
+}
+
+static int gk20a_channel_wait_semaphore(struct channel_gk20a *ch,
+ ulong id, u32 offset,
+ u32 payload, long timeout)
+{
+ struct platform_device *pdev = ch->g->dev;
+ struct dma_buf *dmabuf;
+ void *data;
+ u32 *semaphore;
+ int ret = 0;
+ long remain;
+
+ /* do not wait if channel has timed out */
+ if (ch->has_timedout)
+ return -ETIMEDOUT;
+
+ dmabuf = dma_buf_get(id);
+ if (IS_ERR(dmabuf)) {
+ gk20a_err(&pdev->dev, "invalid semaphore nvmap handle 0x%lx",
+ id);
+ return -EINVAL;
+ }
+
+ data = dma_buf_kmap(dmabuf, offset >> PAGE_SHIFT);
+ if (!data) {
+ gk20a_err(&pdev->dev, "failed to map notifier memory");
+ ret = -EINVAL;
+ goto cleanup_put;
+ }
+
+ semaphore = data + (offset & ~PAGE_MASK);
+
+ remain = wait_event_interruptible_timeout(
+ ch->semaphore_wq,
+ *semaphore == payload || ch->has_timedout,
+ timeout);
+
+ if (remain == 0 && *semaphore != payload)
+ ret = -ETIMEDOUT;
+ else if (remain < 0)
+ ret = remain;
+
+ dma_buf_kunmap(dmabuf, offset >> PAGE_SHIFT, data);
+cleanup_put:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
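+/* Handle the CHANNEL_WAIT ioctl: block, with the requested timeout, until
+ * the channel signals either the user-supplied notifier or the given
+ * semaphore value. */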
+static int gk20a_channel_wait(struct channel_gk20a *ch,
+ struct nvhost_wait_args *args)
+{
+ struct device *d = dev_from_gk20a(ch->g);
+ struct dma_buf *dmabuf;
+ struct notification *notif;
+ void *notif_va;
+ struct timespec tv;
+ u64 jiffies;
+ ulong id;
+ u32 offset;
+ unsigned long timeout;
+ int remain, ret = 0;
+
+ gk20a_dbg_fn("");
+
+ if (ch->has_timedout)
+ return -ETIMEDOUT;
+
+ if (args->timeout == NVHOST_NO_TIMEOUT)
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ else
+ timeout = (u32)msecs_to_jiffies(args->timeout);
+
+ switch (args->type) {
+ case NVHOST_WAIT_TYPE_NOTIFIER:
+ id = args->condition.notifier.nvmap_handle;
+ offset = args->condition.notifier.offset;
+
+ dmabuf = dma_buf_get(id);
+ if (IS_ERR(dmabuf)) {
+ gk20a_err(d, "invalid notifier nvmap handle 0x%lx",
+ id);
+ return -EINVAL;
+ }
+
+ notif_va = dma_buf_vmap(dmabuf);
+ if (!notif_va) {
+ gk20a_err(d, "failed to map notifier memory");
+ dma_buf_put(dmabuf);
+ return -ENOMEM;
+ }
+
+ notif = (struct notification *)((uintptr_t)notif_va + offset);
+
+ /* user should set status pending before
+ * calling this ioctl */
+ remain = wait_event_interruptible_timeout(
+ ch->notifier_wq,
+ notif->status == 0 || ch->has_timedout,
+ timeout);
+
+ if (remain == 0 && notif->status != 0) {
+ ret = -ETIMEDOUT;
+ goto notif_clean_up;
+ } else if (remain < 0) {
+ ret = -EINTR;
+ goto notif_clean_up;
+ }
+
+ /* TBD: fill in correct information */
+ jiffies = get_jiffies_64();
+ jiffies_to_timespec(jiffies, &tv);
+ notif->timestamp.nanoseconds[0] = tv.tv_nsec;
+ notif->timestamp.nanoseconds[1] = tv.tv_sec;
+ notif->info32 = 0xDEADBEEF; /* should be object name */
+ notif->info16 = ch->hw_chid; /* should be method offset */
+
+notif_clean_up:
+ dma_buf_vunmap(dmabuf, notif_va);
+ dma_buf_put(dmabuf);
+ return ret;
+
+ case NVHOST_WAIT_TYPE_SEMAPHORE:
+ ret = gk20a_channel_wait_semaphore(ch,
+ args->condition.semaphore.nvmap_handle,
+ args->condition.semaphore.offset,
+ args->condition.semaphore.payload,
+ timeout);
+
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int gk20a_channel_set_priority(struct channel_gk20a *ch,
+ u32 priority)
+{
+ u32 timeslice_timeout;
+ /* set priority of graphics channel */
+ switch (priority) {
+ case NVHOST_PRIORITY_LOW:
+ /* 64 << 3 = 512us */
+ timeslice_timeout = 64;
+ break;
+ case NVHOST_PRIORITY_MEDIUM:
+ /* 128 << 3 = 1024us */
+ timeslice_timeout = 128;
+ break;
+ case NVHOST_PRIORITY_HIGH:
+ /* 255 << 3 = 2040us */
+ timeslice_timeout = 255;
+ break;
+ default:
+ pr_err("Unsupported priority");
+ return -EINVAL;
+ }
+ channel_gk20a_set_schedule_params(ch,
+ timeslice_timeout);
+ return 0;
+}
+
+static int gk20a_channel_zcull_bind(struct channel_gk20a *ch,
+ struct nvhost_zcull_bind_args *args)
+{
+ struct gk20a *g = ch->g;
+ struct gr_gk20a *gr = &g->gr;
+
+ gk20a_dbg_fn("");
+
+ return gr_gk20a_bind_ctxsw_zcull(g, gr, ch,
+ args->gpu_va, args->mode);
+}
+
+/* in this context the "channel" is the host1x channel which
+ * maps to *all* gk20a channels */
+int gk20a_channel_suspend(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ u32 chid;
+ bool channels_in_use = false;
+ struct device *d = dev_from_gk20a(g);
+ int err;
+
+ gk20a_dbg_fn("");
+
+ /* idle the engine by submitting WFI on non-KEPLER_C channel */
+ for (chid = 0; chid < f->num_channels; chid++) {
+ struct channel_gk20a *c = &f->channel[chid];
+ if (c->in_use && c->obj_class != KEPLER_C) {
+ err = gk20a_channel_submit_wfi(c);
+ if (err) {
+ gk20a_err(d, "cannot idle channel %d\n",
+ chid);
+ return err;
+ }
+
+ c->sync->wait_cpu(c->sync, &c->last_submit_fence,
+ 500000);
+ break;
+ }
+ }
+
+ for (chid = 0; chid < f->num_channels; chid++) {
+ if (f->channel[chid].in_use) {
+
+ gk20a_dbg_info("suspend channel %d", chid);
+ /* disable channel */
+ gk20a_writel(g, ccsr_channel_r(chid),
+ gk20a_readl(g, ccsr_channel_r(chid)) |
+ ccsr_channel_enable_clr_true_f());
+ /* preempt the channel */
+ gk20a_fifo_preempt_channel(g, chid);
+
+ channels_in_use = true;
+ }
+ }
+
+ if (channels_in_use) {
+ gk20a_fifo_update_runlist(g, 0, ~0, false, true);
+
+ for (chid = 0; chid < f->num_channels; chid++) {
+ if (f->channel[chid].in_use)
+ channel_gk20a_unbind(&f->channel[chid]);
+ }
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+/* in this context the "channel" is the host1x channel which
+ * maps to *all* gk20a channels */
+int gk20a_channel_resume(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ u32 chid;
+ bool channels_in_use = false;
+
+ gk20a_dbg_fn("");
+
+ for (chid = 0; chid < f->num_channels; chid++) {
+ if (f->channel[chid].in_use) {
+ gk20a_dbg_info("resume channel %d", chid);
+ g->ops.fifo.bind_channel(&f->channel[chid]);
+ channels_in_use = true;
+ }
+ }
+
+ if (channels_in_use)
+ gk20a_fifo_update_runlist(g, 0, ~0, true, true);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+void gk20a_channel_semaphore_wakeup(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ u32 chid;
+
+ gk20a_dbg_fn("");
+
+ for (chid = 0; chid < f->num_channels; chid++) {
+ struct channel_gk20a *c = g->fifo.channel+chid;
+ if (c->in_use)
+ wake_up_interruptible_all(&c->semaphore_wq);
+ }
+}
+
+static int gk20a_ioctl_channel_submit_gpfifo(
+ struct channel_gk20a *ch,
+ struct nvhost_submit_gpfifo_args *args)
+{
+ void *gpfifo;
+ u32 size;
+ int ret = 0;
+
+ gk20a_dbg_fn("");
+
+ if (ch->has_timedout)
+ return -ETIMEDOUT;
+
+ size = args->num_entries * sizeof(struct nvhost_gpfifo);
+
+ gpfifo = kzalloc(size, GFP_KERNEL);
+ if (!gpfifo)
+ return -ENOMEM;
+
+ if (copy_from_user(gpfifo,
+ (void __user *)(uintptr_t)args->gpfifo, size)) {
+ ret = -EINVAL;
+ goto clean_up;
+ }
+
+ ret = gk20a_submit_channel_gpfifo(ch, gpfifo, args->num_entries,
+ &args->fence, args->flags);
+
+clean_up:
+ kfree(gpfifo);
+ return ret;
+}
+
+void gk20a_init_fifo(struct gpu_ops *gops)
+{
+ gops->fifo.bind_channel = channel_gk20a_bind;
+}
+
+long gk20a_channel_ioctl(struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct channel_gk20a *ch = filp->private_data;
+ struct platform_device *dev = ch->g->dev;
+ u8 buf[NVHOST_IOCTL_CHANNEL_MAX_ARG_SIZE];
+ int err = 0;
+
+ if ((_IOC_TYPE(cmd) != NVHOST_IOCTL_MAGIC) ||
+ (_IOC_NR(cmd) == 0) ||
+ (_IOC_NR(cmd) > NVHOST_IOCTL_CHANNEL_LAST) ||
+ (_IOC_SIZE(cmd) > NVHOST_IOCTL_CHANNEL_MAX_ARG_SIZE))
+ return -EFAULT;
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
+ return -EFAULT;
+ }
+
+ switch (cmd) {
+ case NVHOST_IOCTL_CHANNEL_OPEN:
+ {
+ int fd;
+ struct file *file;
+ char *name;
+
+ err = get_unused_fd_flags(O_RDWR);
+ if (err < 0)
+ break;
+ fd = err;
+
+ name = kasprintf(GFP_KERNEL, "nvhost-%s-fd%d",
+ dev_name(&dev->dev), fd);
+ if (!name) {
+ err = -ENOMEM;
+ put_unused_fd(fd);
+ break;
+ }
+
+ file = anon_inode_getfile(name, filp->f_op, NULL, O_RDWR);
+ kfree(name);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ put_unused_fd(fd);
+ break;
+ }
+ fd_install(fd, file);
+
+ err = __gk20a_channel_open(ch->g, file);
+ if (err) {
+ put_unused_fd(fd);
+ fput(file);
+ break;
+ }
+
+ ((struct nvhost_channel_open_args *)buf)->channel_fd = fd;
+ break;
+ }
+ case NVHOST_IOCTL_CHANNEL_SET_NVMAP_FD:
+ break;
+ case NVHOST_IOCTL_CHANNEL_ALLOC_OBJ_CTX:
+ gk20a_channel_busy(dev);
+ err = gk20a_alloc_obj_ctx(ch,
+ (struct nvhost_alloc_obj_ctx_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+ case NVHOST_IOCTL_CHANNEL_FREE_OBJ_CTX:
+ gk20a_channel_busy(dev);
+ err = gk20a_free_obj_ctx(ch,
+ (struct nvhost_free_obj_ctx_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+ case NVHOST_IOCTL_CHANNEL_ALLOC_GPFIFO:
+ gk20a_channel_busy(dev);
+ err = gk20a_alloc_channel_gpfifo(ch,
+ (struct nvhost_alloc_gpfifo_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+ case NVHOST_IOCTL_CHANNEL_SUBMIT_GPFIFO:
+ err = gk20a_ioctl_channel_submit_gpfifo(ch,
+ (struct nvhost_submit_gpfifo_args *)buf);
+ break;
+ case NVHOST_IOCTL_CHANNEL_WAIT:
+ gk20a_channel_busy(dev);
+ err = gk20a_channel_wait(ch,
+ (struct nvhost_wait_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+ case NVHOST_IOCTL_CHANNEL_ZCULL_BIND:
+ gk20a_channel_busy(dev);
+ err = gk20a_channel_zcull_bind(ch,
+ (struct nvhost_zcull_bind_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+ case NVHOST_IOCTL_CHANNEL_SET_ERROR_NOTIFIER:
+ gk20a_channel_busy(dev);
+ err = gk20a_init_error_notifier(ch,
+ (struct nvhost_set_error_notifier *)buf);
+ gk20a_channel_idle(dev);
+ break;
+#ifdef CONFIG_GK20A_CYCLE_STATS
+ case NVHOST_IOCTL_CHANNEL_CYCLE_STATS:
+ gk20a_channel_busy(dev);
+ err = gk20a_channel_cycle_stats(ch,
+ (struct nvhost_cycle_stats_args *)buf);
+ gk20a_channel_idle(dev);
+ break;
+#endif
+ case NVHOST_IOCTL_CHANNEL_SET_TIMEOUT:
+ {
+ u32 timeout =
+ (u32)((struct nvhost_set_timeout_args *)buf)->timeout;
+ gk20a_dbg(gpu_dbg_gpu_dbg, "setting timeout (%d ms) for chid %d",
+ timeout, ch->hw_chid);
+ ch->timeout_ms_max = timeout;
+ break;
+ }
+ case NVHOST_IOCTL_CHANNEL_SET_TIMEOUT_EX:
+ {
+ u32 timeout =
+ (u32)((struct nvhost_set_timeout_args *)buf)->timeout;
+ bool timeout_debug_dump = !((u32)
+ ((struct nvhost_set_timeout_ex_args *)buf)->flags &
+ (1 << NVHOST_TIMEOUT_FLAG_DISABLE_DUMP));
+ gk20a_dbg(gpu_dbg_gpu_dbg, "setting timeout (%d ms) for chid %d",
+ timeout, ch->hw_chid);
+ ch->timeout_ms_max = timeout;
+ ch->timeout_debug_dump = timeout_debug_dump;
+ break;
+ }
+ case NVHOST_IOCTL_CHANNEL_GET_TIMEDOUT:
+ ((struct nvhost_get_param_args *)buf)->value =
+ ch->has_timedout;
+ break;
+ case NVHOST_IOCTL_CHANNEL_SET_PRIORITY:
+ gk20a_channel_busy(dev);
+ gk20a_channel_set_priority(ch,
+ ((struct nvhost_set_priority_args *)buf)->priority);
+ gk20a_channel_idle(dev);
+ break;
+ default:
+ dev_err(&dev->dev, "unrecognized ioctl cmd: 0x%x", cmd);
+ err = -ENOTTY;
+ break;
+ }
+
+ if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ) &&
+ copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd)))
+ err = -EFAULT;
+
+ return err;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/channel_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
new file mode 100644
index 000000000000..429db85d4177
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_gk20a.h
@@ -0,0 +1,172 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_gk20a.h
+ *
+ * GK20A graphics channel
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __CHANNEL_GK20A_H__
+#define __CHANNEL_GK20A_H__
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/nvhost_ioctl.h>
+struct gk20a;
+struct gr_gk20a;
+struct dbg_session_gk20a;
+
+#include "channel_sync_gk20a.h"
+
+#include "mm_gk20a.h"
+#include "gr_gk20a.h"
+
+struct gpfifo {
+ u32 entry0;
+ u32 entry1;
+};
+
+struct notification {
+ struct {
+ u32 nanoseconds[2];
+ } timestamp;
+ u32 info32;
+ u16 info16;
+ u16 status;
+};
+
+struct fence {
+ u32 hw_chid;
+ u32 syncpt_val;
+};
+
+/* contexts associated with a channel */
+struct channel_ctx_gk20a {
+ struct gr_ctx_desc gr_ctx;
+ struct pm_ctx_desc pm_ctx;
+ struct patch_desc patch_ctx;
+ struct zcull_ctx_desc zcull_ctx;
+ u64 global_ctx_buffer_va[NR_GLOBAL_CTX_BUF_VA];
+ u64 global_ctx_buffer_size[NR_GLOBAL_CTX_BUF_VA];
+ bool global_ctx_buffer_mapped;
+};
+
+struct channel_gk20a_job {
+ struct mapped_buffer_node **mapped_buffers;
+ int num_mapped_buffers;
+ struct gk20a_channel_fence fence;
+ struct list_head list;
+};
+
+/* this is the priv element of struct nvhost_channel */
+struct channel_gk20a {
+ struct gk20a *g;
+ bool in_use;
+ int hw_chid;
+ bool bound;
+ bool first_init;
+ bool vpr;
+ pid_t pid;
+
+ struct list_head jobs;
+ struct mutex jobs_lock;
+
+ struct vm_gk20a *vm;
+
+ struct gpfifo_desc gpfifo;
+
+ struct channel_ctx_gk20a ch_ctx;
+
+ struct inst_desc inst_block;
+ struct mem_desc_sub ramfc;
+
+ void *userd_cpu_va;
+ u64 userd_iova;
+ u64 userd_gpu_va;
+
+ s32 num_objects;
+ u32 obj_class; /* we support only one obj per channel */
+
+ struct priv_cmd_queue priv_cmd_q;
+
+ wait_queue_head_t notifier_wq;
+ wait_queue_head_t semaphore_wq;
+ wait_queue_head_t submit_wq;
+
+ u32 timeout_accumulated_ms;
+ u32 timeout_gpfifo_get;
+
+ bool cmds_pending;
+ struct gk20a_channel_fence last_submit_fence;
+
+ void (*remove_support)(struct channel_gk20a *);
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+ struct {
+ void *cyclestate_buffer;
+ u32 cyclestate_buffer_size;
+ struct dma_buf *cyclestate_buffer_handler;
+ struct mutex cyclestate_buffer_mutex;
+ } cyclestate;
+#endif
+ struct mutex dbg_s_lock;
+ struct list_head dbg_s_list;
+
+ bool has_timedout;
+ u32 timeout_ms_max;
+ bool timeout_debug_dump;
+
+ struct dma_buf *error_notifier_ref;
+ struct nvhost_notification *error_notifier;
+ void *error_notifier_va;
+
+ struct gk20a_channel_sync *sync;
+};
+
+static inline bool gk20a_channel_as_bound(struct channel_gk20a *ch)
+{
+ return !!ch->vm;
+}
+int channel_gk20a_commit_va(struct channel_gk20a *c);
+int gk20a_init_channel_support(struct gk20a *, u32 chid);
+void gk20a_free_channel(struct channel_gk20a *ch, bool finish);
+bool gk20a_channel_update_and_check_timeout(struct channel_gk20a *ch,
+ u32 timeout_delta_ms);
+void gk20a_disable_channel(struct channel_gk20a *ch,
+ bool wait_for_finish,
+ unsigned long finish_timeout);
+void gk20a_disable_channel_no_update(struct channel_gk20a *ch);
+int gk20a_channel_finish(struct channel_gk20a *ch, unsigned long timeout);
+void gk20a_set_error_notifier(struct channel_gk20a *ch, __u32 error);
+void gk20a_channel_semaphore_wakeup(struct gk20a *g);
+int gk20a_channel_alloc_priv_cmdbuf(struct channel_gk20a *c, u32 size,
+ struct priv_cmd_entry **entry);
+
+int gk20a_channel_suspend(struct gk20a *g);
+int gk20a_channel_resume(struct gk20a *g);
+
+/* Channel file operations */
+int gk20a_channel_open(struct inode *inode, struct file *filp);
+long gk20a_channel_ioctl(struct file *filp,
+ unsigned int cmd,
+ unsigned long arg);
+int gk20a_channel_release(struct inode *inode, struct file *filp);
+struct channel_gk20a *gk20a_get_channel_from_file(int fd);
+void gk20a_channel_update(struct channel_gk20a *c, int nr_completed);
+
+void gk20a_init_fifo(struct gpu_ops *gops);
+
+#endif /*__CHANNEL_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
new file mode 100644
index 000000000000..9f9c3ba7ac71
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.c
@@ -0,0 +1,356 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_sync_gk20a.c
+ *
+ * GK20A Channel Synchronization Abstraction
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/gk20a.h>
+
+#include "channel_sync_gk20a.h"
+#include "gk20a.h"
+
+#ifdef CONFIG_SYNC
+#include "../../../staging/android/sync.h"
+#endif
+
+#ifdef CONFIG_TEGRA_GK20A
+#include <linux/nvhost.h>
+#endif
+
+#ifdef CONFIG_TEGRA_GK20A
+
+struct gk20a_channel_syncpt {
+ struct gk20a_channel_sync ops;
+ struct channel_gk20a *c;
+ struct platform_device *host1x_pdev;
+ u32 id;
+};
+
+static void add_wait_cmd(u32 *ptr, u32 id, u32 thresh)
+{
+ /* syncpoint_a */
+ ptr[0] = 0x2001001C;
+ /* payload */
+ ptr[1] = thresh;
+ /* syncpoint_b */
+ ptr[2] = 0x2001001D;
+ /* syncpt_id, switch_en, wait */
+ ptr[3] = (id << 8) | 0x10;
+}
+
+int gk20a_channel_syncpt_wait_cpu(struct gk20a_channel_sync *s,
+ struct gk20a_channel_fence *fence,
+ int timeout)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ if (!fence->valid)
+ return 0;
+ return nvhost_syncpt_wait_timeout_ext(
+ sp->host1x_pdev, sp->id, fence->thresh,
+ timeout, NULL, NULL);
+}
+
+bool gk20a_channel_syncpt_is_expired(struct gk20a_channel_sync *s,
+ struct gk20a_channel_fence *fence)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ if (!fence->valid)
+ return true;
+ return nvhost_syncpt_is_expired_ext(sp->host1x_pdev, sp->id,
+ fence->thresh);
+}
+
+int gk20a_channel_syncpt_wait_syncpt(struct gk20a_channel_sync *s, u32 id,
+ u32 thresh, struct priv_cmd_entry **entry)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ struct priv_cmd_entry *wait_cmd = NULL;
+
+ if (id >= nvhost_syncpt_nb_pts_ext(sp->host1x_pdev)) {
+ dev_warn(dev_from_gk20a(sp->c->g),
+ "invalid wait id in gpfifo submit, elided");
+ return 0;
+ }
+
+ if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev, id, thresh))
+ return 0;
+
+ gk20a_channel_alloc_priv_cmdbuf(sp->c, 4, &wait_cmd);
+ if (wait_cmd == NULL) {
+ gk20a_err(dev_from_gk20a(sp->c->g),
+ "not enough priv cmd buffer space");
+ return -EAGAIN;
+ }
+
+ add_wait_cmd(&wait_cmd->ptr[0], id, thresh);
+
+ *entry = wait_cmd;
+ return 0;
+}
+
+int gk20a_channel_syncpt_wait_fd(struct gk20a_channel_sync *s, int fd,
+ struct priv_cmd_entry **entry)
+{
+#ifdef CONFIG_SYNC
+ int i;
+ int num_wait_cmds;
+ struct sync_pt *pt;
+ struct sync_fence *sync_fence;
+ struct priv_cmd_entry *wait_cmd = NULL;
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ struct channel_gk20a *c = sp->c;
+
+ sync_fence = nvhost_sync_fdget(fd);
+ if (!sync_fence)
+ return -EINVAL;
+
+ num_wait_cmds = nvhost_sync_num_pts(sync_fence);
+ gk20a_channel_alloc_priv_cmdbuf(c, 4 * num_wait_cmds, &wait_cmd);
+ if (wait_cmd == NULL) {
+ gk20a_err(dev_from_gk20a(c->g),
+ "not enough priv cmd buffer space");
+ sync_fence_put(sync_fence);
+ return -EAGAIN;
+ }
+
+ i = 0;
+ list_for_each_entry(pt, &sync_fence->pt_list_head, pt_list) {
+ u32 wait_id = nvhost_sync_pt_id(pt);
+ u32 wait_value = nvhost_sync_pt_thresh(pt);
+
+ if (nvhost_syncpt_is_expired_ext(sp->host1x_pdev,
+ wait_id, wait_value)) {
+ wait_cmd->ptr[i * 4 + 0] = 0;
+ wait_cmd->ptr[i * 4 + 1] = 0;
+ wait_cmd->ptr[i * 4 + 2] = 0;
+ wait_cmd->ptr[i * 4 + 3] = 0;
+ } else
+ add_wait_cmd(&wait_cmd->ptr[i * 4], wait_id,
+ wait_value);
+ i++;
+ }
+ WARN_ON(i != num_wait_cmds);
+ sync_fence_put(sync_fence);
+
+ *entry = wait_cmd;
+ return 0;
+#else
+ return -ENODEV;
+#endif
+}
+
+static void gk20a_channel_syncpt_update(void *priv, int nr_completed)
+{
+ struct channel_gk20a *ch20a = priv;
+ gk20a_channel_update(ch20a, nr_completed);
+}
+
+static int __gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
+ bool gfx_class, bool wfi_cmd,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence)
+{
+ u32 thresh;
+ int incr_cmd_size;
+ int j = 0;
+ int err;
+ struct priv_cmd_entry *incr_cmd = NULL;
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ struct channel_gk20a *c = sp->c;
+
+ /* nvhost action_gpfifo_submit_complete releases this ref. */
+ err = gk20a_channel_busy(c->g->dev);
+ if (err)
+ return err;
+
+ incr_cmd_size = 4;
+ if (wfi_cmd)
+ incr_cmd_size += 2;
+
+ gk20a_channel_alloc_priv_cmdbuf(c, incr_cmd_size, &incr_cmd);
+ if (incr_cmd == NULL) {
+ gk20a_channel_idle(c->g->dev);
+ gk20a_err(dev_from_gk20a(c->g),
+ "not enough priv cmd buffer space");
+ return -EAGAIN;
+ }
+
+ if (gfx_class) {
+ WARN_ON(wfi_cmd); /* It makes no sense to use the gfx class with wfi. */
+ /* setobject KEPLER_C */
+ incr_cmd->ptr[j++] = 0x20010000;
+ incr_cmd->ptr[j++] = KEPLER_C;
+ /* syncpt incr */
+ incr_cmd->ptr[j++] = 0x200100B2;
+ incr_cmd->ptr[j++] = sp->id |
+ (0x1 << 20) | (0x1 << 16);
+ } else {
+ if (wfi_cmd) {
+ /* wfi */
+ incr_cmd->ptr[j++] = 0x2001001E;
+ /* handle, ignored */
+ incr_cmd->ptr[j++] = 0x00000000;
+ }
+ /* syncpoint_a */
+ incr_cmd->ptr[j++] = 0x2001001C;
+ /* payload, ignored */
+ incr_cmd->ptr[j++] = 0;
+ /* syncpoint_b */
+ incr_cmd->ptr[j++] = 0x2001001D;
+ /* syncpt_id, incr */
+ incr_cmd->ptr[j++] = (sp->id << 8) | 0x1;
+ }
+ WARN_ON(j != incr_cmd_size);
+
+ thresh = nvhost_syncpt_incr_max_ext(sp->host1x_pdev, sp->id, 1);
+
+ err = nvhost_intr_register_notifier(sp->host1x_pdev, sp->id, thresh,
+ gk20a_channel_syncpt_update, c);
+
+ /* Adding the interrupt action should never fail. Proper error handling
+ * here would require us to decrement the syncpt max back to its
+ * original value. */
+ if (WARN(err, "failed to set submit complete interrupt")) {
+ gk20a_channel_idle(c->g->dev);
+ err = 0; /* Ignore this error. */
+ }
+
+ fence->thresh = thresh;
+ fence->valid = true;
+ fence->wfi = wfi_cmd;
+ *entry = incr_cmd;
+ return 0;
+}
+
+int gk20a_channel_syncpt_incr_wfi(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence)
+{
+ return __gk20a_channel_syncpt_incr(s,
+ false /* use host class */,
+ true /* wfi */,
+ entry, fence);
+}
+
+int gk20a_channel_syncpt_incr(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ /* Don't add a wfi cmd here since we're not returning
+ * a fence to user space. */
+ return __gk20a_channel_syncpt_incr(s,
+ sp->c->obj_class == KEPLER_C /* may use gfx class */,
+ false /* no wfi */,
+ entry, fence);
+}
+
+int gk20a_channel_syncpt_incr_user_syncpt(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence,
+ u32 *id, u32 *thresh)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ /* Need to do 'host incr + wfi' or 'gfx incr' since we return the fence
+ * to user space. */
+ int err = __gk20a_channel_syncpt_incr(s,
+ sp->c->obj_class == KEPLER_C /* use gfx class? */,
+ sp->c->obj_class != KEPLER_C /* wfi if host class */,
+ entry, fence);
+ if (err)
+ return err;
+ *id = sp->id;
+ *thresh = fence->thresh;
+ return 0;
+}
+
+int gk20a_channel_syncpt_incr_user_fd(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence,
+ int *fd)
+{
+#ifdef CONFIG_SYNC
+ int err;
+ struct nvhost_ctrl_sync_fence_info pt;
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ err = gk20a_channel_syncpt_incr_user_syncpt(s, entry, fence,
+ &pt.id, &pt.thresh);
+ if (err)
+ return err;
+ return nvhost_sync_create_fence_fd(sp->host1x_pdev, &pt, 1,
+ "fence", fd);
+#else
+ return -ENODEV;
+#endif
+}
+
+void gk20a_channel_syncpt_set_min_eq_max(struct gk20a_channel_sync *s)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ nvhost_syncpt_set_min_eq_max_ext(sp->host1x_pdev, sp->id);
+}
+
+static void gk20a_channel_syncpt_destroy(struct gk20a_channel_sync *s)
+{
+ struct gk20a_channel_syncpt *sp =
+ container_of(s, struct gk20a_channel_syncpt, ops);
+ nvhost_free_syncpt(sp->id);
+ kfree(sp);
+}
+
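+/* Allocate a host-managed syncpoint for this channel and wire up the
+ * syncpoint-backed implementation of the sync ops; destroy() releases the
+ * syncpoint again. */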
+static struct gk20a_channel_sync *
+gk20a_channel_syncpt_create(struct channel_gk20a *c)
+{
+ struct gk20a_channel_syncpt *sp;
+
+ sp = kzalloc(sizeof(*sp), GFP_KERNEL);
+ if (!sp)
+ return NULL;
+
+ sp->c = c;
+ sp->host1x_pdev = to_platform_device(c->g->dev->dev.parent);
+ sp->id = nvhost_get_syncpt_host_managed(sp->host1x_pdev, c->hw_chid);
+
+ sp->ops.wait_cpu = gk20a_channel_syncpt_wait_cpu;
+ sp->ops.is_expired = gk20a_channel_syncpt_is_expired;
+ sp->ops.wait_syncpt = gk20a_channel_syncpt_wait_syncpt;
+ sp->ops.wait_fd = gk20a_channel_syncpt_wait_fd;
+ sp->ops.incr = gk20a_channel_syncpt_incr;
+ sp->ops.incr_wfi = gk20a_channel_syncpt_incr_wfi;
+ sp->ops.incr_user_syncpt = gk20a_channel_syncpt_incr_user_syncpt;
+ sp->ops.incr_user_fd = gk20a_channel_syncpt_incr_user_fd;
+ sp->ops.set_min_eq_max = gk20a_channel_syncpt_set_min_eq_max;
+ sp->ops.destroy = gk20a_channel_syncpt_destroy;
+ return &sp->ops;
+}
+#endif /* CONFIG_TEGRA_GK20A */
+
+struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c)
+{
+#ifdef CONFIG_TEGRA_GK20A
+ if (gk20a_platform_has_syncpoints(c->g->dev))
+ return gk20a_channel_syncpt_create(c);
+#endif
+ WARN_ON(1);
+ return NULL;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
new file mode 100644
index 000000000000..69feb89f0c3e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/channel_sync_gk20a.h
@@ -0,0 +1,102 @@
+/*
+ * drivers/video/tegra/host/gk20a/channel_sync_gk20a.h
+ *
+ * GK20A Channel Synchronization Abstraction
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _GK20A_CHANNEL_SYNC_H_
+#define _GK20A_CHANNEL_SYNC_H_
+
+#include <linux/types.h>
+
+struct gk20a_channel_sync;
+struct priv_cmd_entry;
+struct channel_gk20a;
+
+struct gk20a_channel_fence {
+ bool valid;
+ bool wfi; /* was issued with preceding wfi */
+ u32 thresh; /* either semaphore or syncpoint value */
+};
+
+struct gk20a_channel_sync {
+ /* CPU wait for a fence returned by incr_syncpt() or incr_fd(). */
+ int (*wait_cpu)(struct gk20a_channel_sync *s,
+ struct gk20a_channel_fence *fence,
+ int timeout);
+
+ /* Test whether a fence returned by incr_syncpt() or incr_fd() is
+ * expired. */
+ bool (*is_expired)(struct gk20a_channel_sync *s,
+ struct gk20a_channel_fence *fence);
+
+ /* Generate a gpu wait cmdbuf from syncpoint. */
+ int (*wait_syncpt)(struct gk20a_channel_sync *s, u32 id, u32 thresh,
+ struct priv_cmd_entry **entry);
+
+ /* Generate a gpu wait cmdbuf from sync fd. */
+ int (*wait_fd)(struct gk20a_channel_sync *s, int fd,
+ struct priv_cmd_entry **entry);
+
+ /* Increment syncpoint/semaphore.
+ * Returns
+ * - a gpu cmdbuf that performs the increment when executed,
+ * - a fence that can be passed to wait_cpu() and is_expired().
+ */
+ int (*incr)(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence);
+
+ /* Increment syncpoint/semaphore, preceded by a wfi.
+ * Returns
+ * - a gpu cmdbuf that performs the increment when executed,
+ * - a fence that can be passed to wait_cpu() and is_expired().
+ */
+ int (*incr_wfi)(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence);
+
+ /* Increment syncpoint, so that the returned fence represents
+ * work completion (may need wfi) and can be returned to user space.
+ * Returns
+ * - a gpu cmdbuf that performs the increment when executed,
+ * - a fence that can be passed to wait_cpu() and is_expired(),
+ * - a syncpoint id/value pair that can be returned to user space.
+ */
+ int (*incr_user_syncpt)(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence,
+ u32 *id, u32 *thresh);
+
+ /* Increment syncpoint/semaphore, so that the returned fence represents
+ * work completion (may need wfi) and can be returned to user space.
+ * Returns
+ * - a gpu cmdbuf that performs the increment when executed,
+ * - a fence that can be passed to wait_cpu() and is_expired(),
+ * - a sync fd that can be returned to user space.
+ */
+ int (*incr_user_fd)(struct gk20a_channel_sync *s,
+ struct priv_cmd_entry **entry,
+ struct gk20a_channel_fence *fence,
+ int *fd);
+
+ /* Reset the channel syncpoint/semaphore. */
+ void (*set_min_eq_max)(struct gk20a_channel_sync *s);
+
+ /* Free the resources allocated by gk20a_channel_sync_create. */
+ void (*destroy)(struct gk20a_channel_sync *s);
+};
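+
+/* Informational sketch of how the submit path in channel_gk20a.c uses these
+ * ops: wait_fd()/wait_syncpt() build an optional pre-fence wait cmdbuf,
+ * one of incr()/incr_user_syncpt()/incr_user_fd() builds the post-fence
+ * increment, and wait_cpu()/is_expired() are later used to wait for or
+ * poll that fence from the CPU. */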
+
+struct gk20a_channel_sync *gk20a_channel_sync_create(struct channel_gk20a *c);
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/clk_gk20a.c b/drivers/gpu/nvgpu/gk20a/clk_gk20a.c
new file mode 100644
index 000000000000..151a332b8cbd
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/clk_gk20a.c
@@ -0,0 +1,865 @@
+/*
+ * drivers/video/tegra/host/gk20a/clk_gk20a.c
+ *
+ * GK20A Clocks
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/clk.h>
+#include <linux/delay.h> /* for mdelay */
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/clk/tegra.h>
+#include <mach/thermal.h>
+
+#include "gk20a.h"
+#include "hw_trim_gk20a.h"
+#include "hw_timer_gk20a.h"
+
+#define gk20a_dbg_clk(fmt, arg...) \
+ gk20a_dbg(gpu_dbg_clk, fmt, ##arg)
+
+/* from vbios PLL info table */
+struct pll_parms gpc_pll_params = {
+ 144, 2064, /* freq */
+ 1000, 2064, /* vco */
+ 12, 38, /* u */
+ 1, 255, /* M */
+ 8, 255, /* N */
+ 1, 32, /* PL */
+};
+
+static int num_gpu_cooling_freq;
+static struct gpufreq_table_data *gpu_cooling_freq;
+
+struct gpufreq_table_data *tegra_gpufreq_table_get(void)
+{
+ return gpu_cooling_freq;
+}
+
+unsigned int tegra_gpufreq_table_size_get(void)
+{
+ return num_gpu_cooling_freq;
+}
+
+static u8 pl_to_div[] = {
+/* PL: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 */
+/* p: */ 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 12, 16, 20, 24, 32 };
+
+/* Calculate and update M/N/PL as well as pll->freq
+ * ref_clk_f = clk_in_f / src_div = clk_in_f; (src_div = 1 on gk20a)
+ * u_f = ref_clk_f / M;
+ * PLL output = vco_f = u_f * N = ref_clk_f * N / M;
+ * gpc2clk = target clock frequency = vco_f / PL;
+ * gpcclk = gpc2clk / 2; */
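+/* Worked example with hypothetical numbers (not taken from this change):
+ * ref_clk_f = 12 MHz, M = 1, N = 120 and PL index 1 (divider 2) give
+ * vco_f = 12 * 120 / 1 = 1440 MHz, gpc2clk = 1440 / 2 = 720 MHz and
+ * gpcclk = 720 / 2 = 360 MHz. */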
+static int clk_config_pll(struct clk_gk20a *clk, struct pll *pll,
+ struct pll_parms *pll_params, u32 *target_freq, bool best_fit)
+{
+ u32 min_vco_f, max_vco_f;
+ u32 best_M, best_N;
+ u32 low_PL, high_PL, best_PL;
+ u32 m, n, n2;
+ u32 target_vco_f, vco_f;
+ u32 ref_clk_f, target_clk_f, u_f;
+ u32 delta, lwv, best_delta = ~0;
+ int pl;
+
+ BUG_ON(target_freq == NULL);
+
+ gk20a_dbg_fn("request target freq %d MHz", *target_freq);
+
+ ref_clk_f = pll->clk_in;
+ target_clk_f = *target_freq;
+ max_vco_f = pll_params->max_vco;
+ min_vco_f = pll_params->min_vco;
+ best_M = pll_params->max_M;
+ best_N = pll_params->min_N;
+ best_PL = pll_params->min_PL;
+
+ target_vco_f = target_clk_f + target_clk_f / 50;
+ if (max_vco_f < target_vco_f)
+ max_vco_f = target_vco_f;
+
+ high_PL = (max_vco_f + target_vco_f - 1) / target_vco_f;
+ high_PL = min(high_PL, pll_params->max_PL);
+ high_PL = max(high_PL, pll_params->min_PL);
+
+ low_PL = min_vco_f / target_vco_f;
+ low_PL = min(low_PL, pll_params->max_PL);
+ low_PL = max(low_PL, pll_params->min_PL);
+
+ /* Find Indices of high_PL and low_PL */
+ for (pl = 0; pl < 14; pl++) {
+ if (pl_to_div[pl] >= low_PL) {
+ low_PL = pl;
+ break;
+ }
+ }
+ for (pl = 0; pl < 14; pl++) {
+ if (pl_to_div[pl] >= high_PL) {
+ high_PL = pl;
+ break;
+ }
+ }
+ gk20a_dbg_info("low_PL %d(div%d), high_PL %d(div%d)",
+ low_PL, pl_to_div[low_PL], high_PL, pl_to_div[high_PL]);
+
+ for (pl = low_PL; pl <= high_PL; pl++) {
+ target_vco_f = target_clk_f * pl_to_div[pl];
+
+ for (m = pll_params->min_M; m <= pll_params->max_M; m++) {
+ u_f = ref_clk_f / m;
+
+ if (u_f < pll_params->min_u)
+ break;
+ if (u_f > pll_params->max_u)
+ continue;
+
+ n = (target_vco_f * m) / ref_clk_f;
+ n2 = ((target_vco_f * m) + (ref_clk_f - 1)) / ref_clk_f;
+
+ if (n > pll_params->max_N)
+ break;
+
+ for (; n <= n2; n++) {
+ if (n < pll_params->min_N)
+ continue;
+ if (n > pll_params->max_N)
+ break;
+
+ vco_f = ref_clk_f * n / m;
+
+ if (vco_f >= min_vco_f && vco_f <= max_vco_f) {
+ lwv = (vco_f + (pl_to_div[pl] / 2))
+ / pl_to_div[pl];
+ delta = abs(lwv - target_clk_f);
+
+ if (delta < best_delta) {
+ best_delta = delta;
+ best_M = m;
+ best_N = n;
+ best_PL = pl;
+
+ if (best_delta == 0 ||
+ /* 0.45% for non best fit */
+ (!best_fit && (vco_f / best_delta > 218))) {
+ goto found_match;
+ }
+
+ gk20a_dbg_info("delta %d @ M %d, N %d, PL %d",
+ delta, m, n, pl);
+ }
+ }
+ }
+ }
+ }
+
+found_match:
+ BUG_ON(best_delta == ~0);
+
+ if (best_fit && best_delta != 0)
+ gk20a_dbg_clk("no best match for target @ %dMHz on gpc_pll",
+ target_clk_f);
+
+ pll->M = best_M;
+ pll->N = best_N;
+ pll->PL = best_PL;
+
+ /* save current frequency */
+ pll->freq = ref_clk_f * pll->N / (pll->M * pl_to_div[pll->PL]);
+
+ *target_freq = pll->freq;
+
+ gk20a_dbg_clk("actual target freq %d MHz, M %d, N %d, PL %d(div%d)",
+ *target_freq, pll->M, pll->N, pll->PL, pl_to_div[pll->PL]);
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+}
+
+static int clk_slide_gpc_pll(struct gk20a *g, u32 n)
+{
+ u32 data, coeff;
+ u32 nold;
+ int ramp_timeout = 500;
+
+ /* get old coefficients */
+ coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
+ nold = trim_sys_gpcpll_coeff_ndiv_v(coeff);
+
+ /* do nothing if NDIV is same */
+ if (n == nold)
+ return 0;
+
+ /* setup */
+ data = gk20a_readl(g, trim_sys_gpcpll_cfg2_r());
+ data = set_field(data, trim_sys_gpcpll_cfg2_pll_stepa_m(),
+ trim_sys_gpcpll_cfg2_pll_stepa_f(0x2b));
+ gk20a_writel(g, trim_sys_gpcpll_cfg2_r(), data);
+ data = gk20a_readl(g, trim_sys_gpcpll_cfg3_r());
+ data = set_field(data, trim_sys_gpcpll_cfg3_pll_stepb_m(),
+ trim_sys_gpcpll_cfg3_pll_stepb_f(0xb));
+ gk20a_writel(g, trim_sys_gpcpll_cfg3_r(), data);
+
+ /* pll slowdown mode */
+ data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
+ data = set_field(data,
+ trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(),
+ trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_yes_f());
+ gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
+
+ /* new ndiv ready for ramp */
+ coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
+ coeff = set_field(coeff, trim_sys_gpcpll_coeff_ndiv_m(),
+ trim_sys_gpcpll_coeff_ndiv_f(n));
+ udelay(1);
+ gk20a_writel(g, trim_sys_gpcpll_coeff_r(), coeff);
+
+ /* dynamic ramp to new ndiv */
+ data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
+ data = set_field(data,
+ trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(),
+ trim_sys_gpcpll_ndiv_slowdown_en_dynramp_yes_f());
+ udelay(1);
+ gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
+
+ do {
+ udelay(1);
+ ramp_timeout--;
+ data = gk20a_readl(
+ g, trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_r());
+ if (trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_pll_dynramp_done_synced_v(data))
+ break;
+ } while (ramp_timeout > 0);
+
+ /* exit slowdown mode */
+ data = gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
+ data = set_field(data,
+ trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(),
+ trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_no_f());
+ data = set_field(data,
+ trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(),
+ trim_sys_gpcpll_ndiv_slowdown_en_dynramp_no_f());
+ gk20a_writel(g, trim_sys_gpcpll_ndiv_slowdown_r(), data);
+ gk20a_readl(g, trim_sys_gpcpll_ndiv_slowdown_r());
+
+ if (ramp_timeout <= 0) {
+ gk20a_err(dev_from_gk20a(g), "gpcpll dynamic ramp timeout");
+ return -ETIMEDOUT;
+ }
+ return 0;
+}
+
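+/* Program the GPC PLL to the M/N/PL held in clk->gpc_pll. If allow_slide
+ * is set and only NDIV changes, the new frequency is reached with a dynamic
+ * NDIV slide; otherwise: slide down to NDIV_LO, bypass and disable the PLL,
+ * write the new coefficients, re-enable it, wait for lock, switch back to
+ * VCO output and slide up to the target NDIV. */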
+static int clk_program_gpc_pll(struct gk20a *g, struct clk_gk20a *clk,
+ int allow_slide)
+{
+ u32 data, cfg, coeff, timeout;
+ u32 m, n, pl;
+ u32 nlo;
+
+ gk20a_dbg_fn("");
+
+ if (!tegra_platform_is_silicon())
+ return 0;
+
+ /* get old coefficients */
+ coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
+ m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
+ n = trim_sys_gpcpll_coeff_ndiv_v(coeff);
+ pl = trim_sys_gpcpll_coeff_pldiv_v(coeff);
+
+ /* do NDIV slide if there is no change in M and PL */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ if (allow_slide && clk->gpc_pll.M == m && clk->gpc_pll.PL == pl
+ && trim_sys_gpcpll_cfg_enable_v(cfg)) {
+ return clk_slide_gpc_pll(g, clk->gpc_pll.N);
+ }
+
+ /* slide down to NDIV_LO */
+ nlo = DIV_ROUND_UP(m * gpc_pll_params.min_vco, clk->gpc_pll.clk_in);
+ if (allow_slide && trim_sys_gpcpll_cfg_enable_v(cfg)) {
+ int ret = clk_slide_gpc_pll(g, nlo);
+ if (ret)
+ return ret;
+ }
+
+ /* split FO-to-bypass jump in halves by setting out divider 1:2 */
+ data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
+ data = set_field(data, trim_sys_gpc2clk_out_vcodiv_m(),
+ trim_sys_gpc2clk_out_vcodiv_f(2));
+ gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
+
+ /* put PLL in bypass before programming it */
+ data = gk20a_readl(g, trim_sys_sel_vco_r());
+ data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
+ trim_sys_sel_vco_gpc2clk_out_bypass_f());
+ udelay(2);
+ gk20a_writel(g, trim_sys_sel_vco_r(), data);
+
+ /* get out from IDDQ */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ if (trim_sys_gpcpll_cfg_iddq_v(cfg)) {
+ cfg = set_field(cfg, trim_sys_gpcpll_cfg_iddq_m(),
+ trim_sys_gpcpll_cfg_iddq_power_on_v());
+ gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+ gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ udelay(2);
+ }
+
+ /* disable PLL before changing coefficients */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
+ trim_sys_gpcpll_cfg_enable_no_f());
+ gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+ gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+
+ /* change coefficients */
+ nlo = DIV_ROUND_UP(clk->gpc_pll.M * gpc_pll_params.min_vco,
+ clk->gpc_pll.clk_in);
+ coeff = trim_sys_gpcpll_coeff_mdiv_f(clk->gpc_pll.M) |
+ trim_sys_gpcpll_coeff_ndiv_f(allow_slide ?
+ nlo : clk->gpc_pll.N) |
+ trim_sys_gpcpll_coeff_pldiv_f(clk->gpc_pll.PL);
+ gk20a_writel(g, trim_sys_gpcpll_coeff_r(), coeff);
+
+ /* enable PLL after changing coefficients */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
+ trim_sys_gpcpll_cfg_enable_yes_f());
+ gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+
+ /* lock pll */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ if (cfg & trim_sys_gpcpll_cfg_enb_lckdet_power_off_f()) {
+ cfg = set_field(cfg, trim_sys_gpcpll_cfg_enb_lckdet_m(),
+ trim_sys_gpcpll_cfg_enb_lckdet_power_on_f());
+ gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+ }
+
+ /* wait pll lock */
+ timeout = clk->pll_delay / 2 + 1;
+ do {
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ if (cfg & trim_sys_gpcpll_cfg_pll_lock_true_f())
+ goto pll_locked;
+ udelay(2);
+ } while (--timeout > 0);
+
+ /* PLL is messed up. What can we do here? */
+ BUG();
+ return -EBUSY;
+
+pll_locked:
+ /* put PLL back on vco */
+ data = gk20a_readl(g, trim_sys_sel_vco_r());
+ data = set_field(data, trim_sys_sel_vco_gpc2clk_out_m(),
+ trim_sys_sel_vco_gpc2clk_out_vco_f());
+ gk20a_writel(g, trim_sys_sel_vco_r(), data);
+ clk->gpc_pll.enabled = true;
+
+ /* restore out divider 1:1 */
+ data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
+ data = set_field(data, trim_sys_gpc2clk_out_vcodiv_m(),
+ trim_sys_gpc2clk_out_vcodiv_by1_f());
+ udelay(2);
+ gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
+
+ /* slide up to target NDIV */
+ return clk_slide_gpc_pll(g, clk->gpc_pll.N);
+}
+
+static int clk_disable_gpcpll(struct gk20a *g, int allow_slide)
+{
+ u32 cfg, coeff, m, nlo;
+ struct clk_gk20a *clk = &g->clk;
+
+ /* slide to VCO min */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ if (allow_slide && trim_sys_gpcpll_cfg_enable_v(cfg)) {
+ coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
+ m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
+ nlo = DIV_ROUND_UP(m * gpc_pll_params.min_vco,
+ clk->gpc_pll.clk_in);
+ clk_slide_gpc_pll(g, nlo);
+ }
+
+ /* put PLL in bypass before disabling it */
+ cfg = gk20a_readl(g, trim_sys_sel_vco_r());
+ cfg = set_field(cfg, trim_sys_sel_vco_gpc2clk_out_m(),
+ trim_sys_sel_vco_gpc2clk_out_bypass_f());
+ gk20a_writel(g, trim_sys_sel_vco_r(), cfg);
+
+ /* disable PLL */
+ cfg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ cfg = set_field(cfg, trim_sys_gpcpll_cfg_enable_m(),
+ trim_sys_gpcpll_cfg_enable_no_f());
+ gk20a_writel(g, trim_sys_gpcpll_cfg_r(), cfg);
+ gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+
+ clk->gpc_pll.enabled = false;
+ return 0;
+}
+
+static int gk20a_init_clk_reset_enable_hw(struct gk20a *g)
+{
+ gk20a_dbg_fn("");
+ return 0;
+}
+
+struct clk *gk20a_clk_get(struct gk20a *g)
+{
+ if (!g->clk.tegra_clk) {
+ struct clk *clk;
+
+ clk = clk_get_sys("tegra_gk20a", "gpu");
+ if (IS_ERR(clk)) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to get tegra gpu clk tegra_gk20a/gpu");
+ return NULL;
+ }
+ g->clk.tegra_clk = clk;
+ }
+
+ return g->clk.tegra_clk;
+}
+
+static int gk20a_init_clk_setup_sw(struct gk20a *g)
+{
+ struct clk_gk20a *clk = &g->clk;
+ static int initialized;
+ unsigned long *freqs;
+ int err, num_freqs;
+ struct clk *ref;
+ unsigned long ref_rate;
+
+ gk20a_dbg_fn("");
+
+ if (clk->sw_ready) {
+ gk20a_dbg_fn("skip init");
+ return 0;
+ }
+
+ if (!gk20a_clk_get(g))
+ return -EINVAL;
+
+ ref = clk_get_parent(clk_get_parent(clk->tegra_clk));
+ if (IS_ERR(ref)) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to get GPCPLL reference clock");
+ return -EINVAL;
+ }
+ ref_rate = clk_get_rate(ref);
+
+ clk->pll_delay = 300; /* usec */
+
+ clk->gpc_pll.id = GK20A_GPC_PLL;
+ clk->gpc_pll.clk_in = ref_rate / 1000000; /* MHz */
+
+ /* Decide initial frequency */
+ if (!initialized) {
+ initialized = 1;
+ clk->gpc_pll.M = 1;
+ clk->gpc_pll.N = DIV_ROUND_UP(gpc_pll_params.min_vco,
+ clk->gpc_pll.clk_in);
+ clk->gpc_pll.PL = 1;
+ clk->gpc_pll.freq = clk->gpc_pll.clk_in * clk->gpc_pll.N;
+ clk->gpc_pll.freq /= pl_to_div[clk->gpc_pll.PL];
+ }
+
+ err = tegra_dvfs_get_freqs(clk_get_parent(clk->tegra_clk),
+ &freqs, &num_freqs);
+ if (!err) {
+ int i, j;
+
+ /* init j for inverse traversal of frequencies */
+ j = num_freqs - 1;
+
+ gpu_cooling_freq = kzalloc(
+ (1 + num_freqs) * sizeof(*gpu_cooling_freq),
+ GFP_KERNEL);
+ if (!gpu_cooling_freq)
+ return -ENOMEM;
+
+ /* store frequencies in inverse order */
+ for (i = 0; i < num_freqs; ++i, --j) {
+ gpu_cooling_freq[i].index = i;
+ gpu_cooling_freq[i].frequency = freqs[j];
+ }
+
+ /* add 'end of table' marker */
+ gpu_cooling_freq[i].index = i;
+ gpu_cooling_freq[i].frequency = GPUFREQ_TABLE_END;
+
+ /* store number of frequencies */
+ num_gpu_cooling_freq = num_freqs + 1;
+ }
+
+ mutex_init(&clk->clk_mutex);
+
+ clk->sw_ready = true;
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+static int gk20a_init_clk_setup_hw(struct gk20a *g)
+{
+ u32 data;
+
+ gk20a_dbg_fn("");
+
+ data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
+ data = set_field(data,
+ trim_sys_gpc2clk_out_sdiv14_m() |
+ trim_sys_gpc2clk_out_vcodiv_m() |
+ trim_sys_gpc2clk_out_bypdiv_m(),
+ trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f() |
+ trim_sys_gpc2clk_out_vcodiv_by1_f() |
+ trim_sys_gpc2clk_out_bypdiv_f(0));
+ gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
+
+ return 0;
+}
+
+static int set_pll_target(struct gk20a *g, u32 freq, u32 old_freq)
+{
+ struct clk_gk20a *clk = &g->clk;
+
+ if (freq > gpc_pll_params.max_freq)
+ freq = gpc_pll_params.max_freq;
+ else if (freq < gpc_pll_params.min_freq)
+ freq = gpc_pll_params.min_freq;
+
+ if (freq != old_freq) {
+ /* gpc_pll.freq is changed to new value here */
+ if (clk_config_pll(clk, &clk->gpc_pll, &gpc_pll_params,
+ &freq, true)) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to set pll target for %d", freq);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int set_pll_freq(struct gk20a *g, u32 freq, u32 old_freq)
+{
+ struct clk_gk20a *clk = &g->clk;
+ int err = 0;
+
+ gk20a_dbg_fn("curr freq: %dMHz, target freq %dMHz", old_freq, freq);
+
+ if ((freq == old_freq) && clk->gpc_pll.enabled)
+ return 0;
+
+ /* change frequency only if power is on */
+ if (g->clk.clk_hw_on) {
+ err = clk_program_gpc_pll(g, clk, 1);
+ if (err)
+ err = clk_program_gpc_pll(g, clk, 0);
+ }
+
+ /* Just report the error without restoring the PLL, since dvfs could
+ * already have changed the voltage even when it returns an error. */
+ if (err)
+ gk20a_err(dev_from_gk20a(g),
+ "failed to set pll to %d", freq);
+ return err;
+}
+
+static int gk20a_clk_export_set_rate(void *data, unsigned long *rate)
+{
+ u32 old_freq;
+ int ret = -ENODATA;
+ struct gk20a *g = data;
+ struct clk_gk20a *clk = &g->clk;
+
+ if (rate) {
+ mutex_lock(&clk->clk_mutex);
+ old_freq = clk->gpc_pll.freq;
+ ret = set_pll_target(g, rate_gpu_to_gpc2clk(*rate), old_freq);
+ if (!ret && clk->gpc_pll.enabled)
+ ret = set_pll_freq(g, clk->gpc_pll.freq, old_freq);
+ if (!ret)
+ *rate = rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
+ mutex_unlock(&clk->clk_mutex);
+ }
+ return ret;
+}
+
+static int gk20a_clk_export_enable(void *data)
+{
+ int ret;
+ struct gk20a *g = data;
+ struct clk_gk20a *clk = &g->clk;
+
+ mutex_lock(&clk->clk_mutex);
+ ret = set_pll_freq(g, clk->gpc_pll.freq, clk->gpc_pll.freq);
+ mutex_unlock(&clk->clk_mutex);
+ return ret;
+}
+
+static void gk20a_clk_export_disable(void *data)
+{
+ struct gk20a *g = data;
+ struct clk_gk20a *clk = &g->clk;
+
+ mutex_lock(&clk->clk_mutex);
+ if (g->clk.clk_hw_on)
+ clk_disable_gpcpll(g, 1);
+ mutex_unlock(&clk->clk_mutex);
+}
+
+static void gk20a_clk_export_init(void *data, unsigned long *rate, bool *state)
+{
+ struct gk20a *g = data;
+ struct clk_gk20a *clk = &g->clk;
+
+ mutex_lock(&clk->clk_mutex);
+ if (state)
+ *state = clk->gpc_pll.enabled;
+ if (rate)
+ *rate = rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
+ mutex_unlock(&clk->clk_mutex);
+}
+
+static struct tegra_clk_export_ops gk20a_clk_export_ops = {
+ .init = gk20a_clk_export_init,
+ .enable = gk20a_clk_export_enable,
+ .disable = gk20a_clk_export_disable,
+ .set_rate = gk20a_clk_export_set_rate,
+};
+
+static int gk20a_clk_register_export_ops(struct gk20a *g)
+{
+ int ret;
+ struct clk *c;
+
+ if (gk20a_clk_export_ops.data)
+ return 0;
+
+ gk20a_clk_export_ops.data = (void *)g;
+ c = g->clk.tegra_clk;
+ if (!c || !clk_get_parent(c))
+ return -ENOSYS;
+
+ ret = tegra_clk_register_export_ops(clk_get_parent(c),
+ &gk20a_clk_export_ops);
+
+ return ret;
+}
+
+int gk20a_init_clk_support(struct gk20a *g)
+{
+ struct clk_gk20a *clk = &g->clk;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ clk->g = g;
+
+ err = gk20a_init_clk_reset_enable_hw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_clk_setup_sw(g);
+ if (err)
+ return err;
+
+ mutex_lock(&clk->clk_mutex);
+ clk->clk_hw_on = true;
+
+ err = gk20a_init_clk_setup_hw(g);
+ mutex_unlock(&clk->clk_mutex);
+ if (err)
+ return err;
+
+ err = gk20a_clk_register_export_ops(g);
+ if (err)
+ return err;
+
+ /* FIXME: this effectively prevents host level clock gating */
+ err = clk_enable(g->clk.tegra_clk);
+ if (err)
+ return err;
+
+ /* The prev call may not enable PLL if gbus is unbalanced - force it */
+ mutex_lock(&clk->clk_mutex);
+ err = set_pll_freq(g, clk->gpc_pll.freq, clk->gpc_pll.freq);
+ mutex_unlock(&clk->clk_mutex);
+ if (err)
+ return err;
+
+ return err;
+}
+
+unsigned long gk20a_clk_get_rate(struct gk20a *g)
+{
+ struct clk_gk20a *clk = &g->clk;
+ return rate_gpc2clk_to_gpu(clk->gpc_pll.freq);
+}
+
+long gk20a_clk_round_rate(struct gk20a *g, unsigned long rate)
+{
+ /* make sure the clock is available */
+ if (!gk20a_clk_get(g))
+ return rate;
+
+ return clk_round_rate(clk_get_parent(g->clk.tegra_clk), rate);
+}
+
+int gk20a_clk_set_rate(struct gk20a *g, unsigned long rate)
+{
+ return clk_set_rate(g->clk.tegra_clk, rate);
+}
+
+int gk20a_suspend_clk_support(struct gk20a *g)
+{
+ int ret;
+
+ clk_disable(g->clk.tegra_clk);
+
+ /* The prev call may not disable PLL if gbus is unbalanced - force it */
+ mutex_lock(&g->clk.clk_mutex);
+ ret = clk_disable_gpcpll(g, 1);
+ g->clk.clk_hw_on = false;
+ mutex_unlock(&g->clk.clk_mutex);
+ return ret;
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+static int rate_get(void *data, u64 *val)
+{
+ struct gk20a *g = (struct gk20a *)data;
+ *val = (u64)gk20a_clk_get_rate(g);
+ return 0;
+}
+static int rate_set(void *data, u64 val)
+{
+ struct gk20a *g = (struct gk20a *)data;
+ return gk20a_clk_set_rate(g, (u32)val);
+}
+DEFINE_SIMPLE_ATTRIBUTE(rate_fops, rate_get, rate_set, "%llu\n");
+
+static int pll_reg_show(struct seq_file *s, void *data)
+{
+ struct gk20a *g = s->private;
+ u32 reg, m, n, pl, f;
+
+ mutex_lock(&g->clk.clk_mutex);
+ if (!g->clk.clk_hw_on) {
+ seq_printf(s, "gk20a powered down - no access to registers\n");
+ mutex_unlock(&g->clk.clk_mutex);
+ return 0;
+ }
+
+ reg = gk20a_readl(g, trim_sys_gpcpll_cfg_r());
+ seq_printf(s, "cfg = 0x%x : %s : %s\n", reg,
+ trim_sys_gpcpll_cfg_enable_v(reg) ? "enabled" : "disabled",
+ trim_sys_gpcpll_cfg_pll_lock_v(reg) ? "locked" : "unlocked");
+
+ reg = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
+ m = trim_sys_gpcpll_coeff_mdiv_v(reg);
+ n = trim_sys_gpcpll_coeff_ndiv_v(reg);
+ pl = trim_sys_gpcpll_coeff_pldiv_v(reg);
+ f = g->clk.gpc_pll.clk_in * n / (m * pl_to_div[pl]);
+ seq_printf(s, "coef = 0x%x : m = %u : n = %u : pl = %u", reg, m, n, pl);
+ seq_printf(s, " : pll_f(gpu_f) = %u(%u) MHz\n", f, f/2);
+ mutex_unlock(&g->clk.clk_mutex);
+ return 0;
+}
+
+static int pll_reg_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pll_reg_show, inode->i_private);
+}
+
+static const struct file_operations pll_reg_fops = {
+ .open = pll_reg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
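+/* Read the GPCCLK rate with the on-chip clock counter: the counter runs for
+ * ncycle cycles of the input clock, so the measured frequency (in the same
+ * MHz units as clk_in) is count * clkin / ncycle. */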
+static int monitor_get(void *data, u64 *val)
+{
+ struct gk20a *g = (struct gk20a *)data;
+ struct clk_gk20a *clk = &g->clk;
+ int err;
+
+ u32 ncycle = 100; /* count GPCCLK for ncycle of clkin */
+ u32 clkin = clk->gpc_pll.clk_in;
+ u32 count1, count2;
+
+ err = gk20a_busy(g->dev);
+ if (err)
+ return err;
+
+ gk20a_writel(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0),
+ trim_gpc_clk_cntr_ncgpcclk_cfg_reset_asserted_f());
+ gk20a_writel(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0),
+ trim_gpc_clk_cntr_ncgpcclk_cfg_enable_asserted_f() |
+ trim_gpc_clk_cntr_ncgpcclk_cfg_write_en_asserted_f() |
+ trim_gpc_clk_cntr_ncgpcclk_cfg_noofipclks_f(ncycle));
+ /* start */
+
+ /* It should take about 8us to finish 100 cycles of the 12MHz input,
+ * but a delay longer than 100us is required here. */
+ gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cfg_r(0));
+ udelay(2000);
+
+ count1 = gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cnt_r(0));
+ udelay(100);
+ count2 = gk20a_readl(g, trim_gpc_clk_cntr_ncgpcclk_cnt_r(0));
+ *val = (u64)(trim_gpc_clk_cntr_ncgpcclk_cnt_value_v(count2) * clkin / ncycle);
+ gk20a_idle(g->dev);
+
+ if (count1 != count2)
+ return -EBUSY;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(monitor_fops, monitor_get, NULL, "%llu\n");
+
+int clk_gk20a_debugfs_init(struct platform_device *dev)
+{
+ struct dentry *d;
+ struct gk20a_platform *platform = platform_get_drvdata(dev);
+ struct gk20a *g = get_gk20a(dev);
+
+ d = debugfs_create_file(
+ "rate", S_IRUGO|S_IWUSR, platform->debugfs, g, &rate_fops);
+ if (!d)
+ goto err_out;
+
+ d = debugfs_create_file(
+ "pll_reg", S_IRUGO, platform->debugfs, g, &pll_reg_fops);
+ if (!d)
+ goto err_out;
+
+ d = debugfs_create_file(
+ "monitor", S_IRUGO, platform->debugfs, g, &monitor_fops);
+ if (!d)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ pr_err("%s: Failed to make debugfs node\n", __func__);
+ debugfs_remove_recursive(platform->debugfs);
+ return -ENOMEM;
+}
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/gpu/nvgpu/gk20a/clk_gk20a.h b/drivers/gpu/nvgpu/gk20a/clk_gk20a.h
new file mode 100644
index 000000000000..d2665259b0fe
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/clk_gk20a.h
@@ -0,0 +1,94 @@
+/*
+ * drivers/video/tegra/host/gk20a/clk_gk20a.h
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011 - 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_CLK_GK20A_H_
+#define _NVHOST_CLK_GK20A_H_
+
+#include <linux/mutex.h>
+
+#define GPUFREQ_TABLE_END ~(u32)1
+enum {
+ /* only one PLL for gk20a */
+ GK20A_GPC_PLL = 0,
+};
+
+struct pll {
+ u32 id;
+ u32 clk_in; /* MHz */
+ u32 M;
+ u32 N;
+ u32 PL;
+ u32 freq; /* MHz */
+ bool enabled;
+};
+
+struct pll_parms {
+ u32 min_freq, max_freq; /* MHz */
+ u32 min_vco, max_vco; /* MHz */
+ u32 min_u, max_u; /* MHz */
+ u32 min_M, max_M;
+ u32 min_N, max_N;
+ u32 min_PL, max_PL;
+};
+
+struct clk_gk20a {
+ struct gk20a *g;
+ struct clk *tegra_clk;
+ struct pll gpc_pll;
+ u32 pll_delay; /* default PLL settle time */
+ struct mutex clk_mutex;
+ bool sw_ready;
+ bool clk_hw_on;
+};
+
+struct gpufreq_table_data {
+ unsigned int index;
+ unsigned int frequency; /* MHz */
+};
+
+struct gpufreq_table_data *tegra_gpufreq_table_get(void);
+
+unsigned int tegra_gpufreq_table_size_get(void);
+
+int gk20a_init_clk_support(struct gk20a *g);
+
+unsigned long gk20a_clk_get_rate(struct gk20a *g);
+int gk20a_clk_set_rate(struct gk20a *g, unsigned long rate);
+int gk20a_suspend_clk_support(struct gk20a *g);
+struct clk *gk20a_clk_get(struct gk20a *g);
+long gk20a_clk_round_rate(struct gk20a *g, unsigned long rate);
+
+extern struct pll_parms gpc_pll_params;
+
+#define KHZ 1000
+#define MHZ 1000000
+
+static inline unsigned long rate_gpc2clk_to_gpu(unsigned long rate)
+{
+ /* convert the MHz gpc2clk frequency to the Hz GPU (gpcclk) frequency */
+ return (rate * MHZ) / 2;
+}
+static inline unsigned long rate_gpu_to_gpc2clk(unsigned long rate)
+{
+ /* convert the Hz GPU (gpcclk) frequency to the MHz gpc2clk frequency */
+ return (rate * 2) / MHZ;
+}
+
+#endif /* _NVHOST_CLK_GK20A_H_ */
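The two inline helpers above hide both a unit change (MHz vs. Hz) and the factor of two between gpc2clk and the GPU clock. Below is a standalone, illustrative round trip; the 1404 MHz figure is made up, and the helper bodies are copied from the header so the snippet compiles on its own:

    #include <stdio.h>

    #define MHZ 1000000

    /* MHz gpc2clk -> Hz GPU (gpcclk) rate, as in the header above */
    static unsigned long rate_gpc2clk_to_gpu(unsigned long rate)
    {
            return (rate * MHZ) / 2;
    }

    /* Hz GPU (gpcclk) rate -> MHz gpc2clk, as in the header above */
    static unsigned long rate_gpu_to_gpc2clk(unsigned long rate)
    {
            return (rate * 2) / MHZ;
    }

    int main(void)
    {
            unsigned long gpu_hz = rate_gpc2clk_to_gpu(1404);        /* 702000000 */
            unsigned long gpc2clk_mhz = rate_gpu_to_gpc2clk(gpu_hz); /* 1404 */

            printf("%lu Hz <-> %lu MHz\n", gpu_hz, gpc2clk_mhz);
            return 0;
    }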
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
new file mode 100644
index 000000000000..9128959f60a7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.c
@@ -0,0 +1,240 @@
+/*
+ * GK20A Ctrl
+ *
+ * Copyright (c) 2011-2014, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/highmem.h>
+#include <linux/cdev.h>
+#include <linux/nvhost_gpu_ioctl.h>
+
+#include "gk20a.h"
+
+int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp)
+{
+ int err;
+ struct gk20a *g;
+
+ gk20a_dbg_fn("");
+
+ g = container_of(inode->i_cdev,
+ struct gk20a, ctrl.cdev);
+
+ filp->private_data = g->dev;
+
+ err = gk20a_get_client(g);
+ if (err) {
+ gk20a_dbg_fn("fail to get channel!");
+ return err;
+ }
+
+ return 0;
+}
+
+int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp)
+{
+ struct platform_device *dev = filp->private_data;
+
+ gk20a_dbg_fn("");
+
+ gk20a_put_client(get_gk20a(dev));
+ return 0;
+}
+
+static long
+gk20a_ctrl_ioctl_gpu_characteristics(
+ struct gk20a *g,
+ struct nvhost_gpu_get_characteristics *request)
+{
+ struct nvhost_gpu_characteristics *pgpu = &g->gpu_characteristics;
+ long err = 0;
+
+ if (request->gpu_characteristics_buf_size > 0) {
+ size_t write_size = sizeof(*pgpu);
+
+ if (write_size > request->gpu_characteristics_buf_size)
+ write_size = request->gpu_characteristics_buf_size;
+
+ /* copy_to_user() returns the number of uncopied bytes, not an errno */
+ if (copy_to_user((void __user *)(uintptr_t)
+ request->gpu_characteristics_buf_addr,
+ pgpu, write_size))
+ err = -EFAULT;
+ }
+
+ if (err == 0)
+ request->gpu_characteristics_buf_size = sizeof(*pgpu);
+
+ return err;
+}
+
+long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct platform_device *dev = filp->private_data;
+ struct gk20a *g = get_gk20a(dev);
+ struct nvhost_gpu_zcull_get_ctx_size_args *get_ctx_size_args;
+ struct nvhost_gpu_zcull_get_info_args *get_info_args;
+ struct nvhost_gpu_zbc_set_table_args *set_table_args;
+ struct nvhost_gpu_zbc_query_table_args *query_table_args;
+ u8 buf[NVHOST_GPU_IOCTL_MAX_ARG_SIZE];
+ struct gr_zcull_info *zcull_info;
+ struct zbc_entry *zbc_val;
+ struct zbc_query_params *zbc_tbl;
+ int i, err = 0;
+
+ gk20a_dbg_fn("");
+
+ if ((_IOC_TYPE(cmd) != NVHOST_GPU_IOCTL_MAGIC) ||
+ (_IOC_NR(cmd) == 0) ||
+ (_IOC_NR(cmd) > NVHOST_GPU_IOCTL_LAST))
+ return -EFAULT;
+
+ BUG_ON(_IOC_SIZE(cmd) > NVHOST_GPU_IOCTL_MAX_ARG_SIZE);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
+ return -EFAULT;
+ }
+
+ if (!g->gr.sw_ready) {
+ err = gk20a_busy(g->dev);
+ if (err)
+ return err;
+
+ gk20a_idle(g->dev);
+ }
+
+ switch (cmd) {
+ case NVHOST_GPU_IOCTL_ZCULL_GET_CTX_SIZE:
+ get_ctx_size_args = (struct nvhost_gpu_zcull_get_ctx_size_args *)buf;
+
+ get_ctx_size_args->size = gr_gk20a_get_ctxsw_zcull_size(g, &g->gr);
+
+ break;
+ case NVHOST_GPU_IOCTL_ZCULL_GET_INFO:
+ get_info_args = (struct nvhost_gpu_zcull_get_info_args *)buf;
+
+ memset(get_info_args, 0, sizeof(struct nvhost_gpu_zcull_get_info_args));
+
+ zcull_info = kzalloc(sizeof(struct gr_zcull_info), GFP_KERNEL);
+ if (zcull_info == NULL)
+ return -ENOMEM;
+
+ err = gr_gk20a_get_zcull_info(g, &g->gr, zcull_info);
+ if (err) {
+ kfree(zcull_info);
+ break;
+ }
+
+ get_info_args->width_align_pixels = zcull_info->width_align_pixels;
+ get_info_args->height_align_pixels = zcull_info->height_align_pixels;
+ get_info_args->pixel_squares_by_aliquots = zcull_info->pixel_squares_by_aliquots;
+ get_info_args->aliquot_total = zcull_info->aliquot_total;
+ get_info_args->region_byte_multiplier = zcull_info->region_byte_multiplier;
+ get_info_args->region_header_size = zcull_info->region_header_size;
+ get_info_args->subregion_header_size = zcull_info->subregion_header_size;
+ get_info_args->subregion_width_align_pixels = zcull_info->subregion_width_align_pixels;
+ get_info_args->subregion_height_align_pixels = zcull_info->subregion_height_align_pixels;
+ get_info_args->subregion_count = zcull_info->subregion_count;
+
+ kfree(zcull_info);
+ break;
+ case NVHOST_GPU_IOCTL_ZBC_SET_TABLE:
+ set_table_args = (struct nvhost_gpu_zbc_set_table_args *)buf;
+
+ zbc_val = kzalloc(sizeof(struct zbc_entry), GFP_KERNEL);
+ if (zbc_val == NULL)
+ return -ENOMEM;
+
+ zbc_val->format = set_table_args->format;
+ zbc_val->type = set_table_args->type;
+
+ switch (zbc_val->type) {
+ case GK20A_ZBC_TYPE_COLOR:
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ zbc_val->color_ds[i] = set_table_args->color_ds[i];
+ zbc_val->color_l2[i] = set_table_args->color_l2[i];
+ }
+ break;
+ case GK20A_ZBC_TYPE_DEPTH:
+ zbc_val->depth = set_table_args->depth;
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ if (!err) {
+ gk20a_busy(dev);
+ err = gk20a_gr_zbc_set_table(g, &g->gr, zbc_val);
+ gk20a_idle(dev);
+ }
+
+ kfree(zbc_val);
+ break;
+ case NVHOST_GPU_IOCTL_ZBC_QUERY_TABLE:
+ query_table_args = (struct nvhost_gpu_zbc_query_table_args *)buf;
+
+ zbc_tbl = kzalloc(sizeof(struct zbc_query_params), GFP_KERNEL);
+ if (zbc_tbl == NULL)
+ return -ENOMEM;
+
+ zbc_tbl->type = query_table_args->type;
+ zbc_tbl->index_size = query_table_args->index_size;
+
+ err = gr_gk20a_query_zbc(g, &g->gr, zbc_tbl);
+
+ if (!err) {
+ switch (zbc_tbl->type) {
+ case GK20A_ZBC_TYPE_COLOR:
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ query_table_args->color_ds[i] = zbc_tbl->color_ds[i];
+ query_table_args->color_l2[i] = zbc_tbl->color_l2[i];
+ }
+ break;
+ case GK20A_ZBC_TYPE_DEPTH:
+ query_table_args->depth = zbc_tbl->depth;
+ break;
+ case GK20A_ZBC_TYPE_INVALID:
+ query_table_args->index_size = zbc_tbl->index_size;
+ break;
+ default:
+ err = -EINVAL;
+ }
+ if (!err) {
+ query_table_args->format = zbc_tbl->format;
+ query_table_args->ref_cnt = zbc_tbl->ref_cnt;
+ }
+ }
+
+ kfree(zbc_tbl);
+ break;
+
+ case NVHOST_GPU_IOCTL_GET_CHARACTERISTICS:
+ err = gk20a_ctrl_ioctl_gpu_characteristics(
+ g, (struct nvhost_gpu_get_characteristics *)buf);
+ break;
+
+ default:
+ gk20a_err(dev_from_gk20a(g), "unrecognized gpu ioctl cmd: 0x%x", cmd);
+ err = -ENOTTY;
+ break;
+ }
+
+ if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ) &&
+ copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd)))
+ err = -EFAULT;
+
+ return err;
+}
+
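For context, the GET_CHARACTERISTICS handler above supports the usual two-step size query: calling with gpu_characteristics_buf_size set to 0 makes the driver report the size it can fill, and a second call with a real buffer copies the data. The user-space sketch below assumes the nvhost_gpu_ioctl.h uapi header and a /dev/nvhost-ctrl-gpu device node, neither of which is defined in this file:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/nvhost_gpu_ioctl.h>

    int main(void)
    {
            struct nvhost_gpu_get_characteristics req;
            struct nvhost_gpu_characteristics chars;
            int fd = open("/dev/nvhost-ctrl-gpu", O_RDONLY);  /* node name is an assumption */

            if (fd < 0)
                    return 1;

            memset(&req, 0, sizeof(req));
            /* size query: buf_size == 0, the driver fills in the real size */
            if (ioctl(fd, NVHOST_GPU_IOCTL_GET_CHARACTERISTICS, &req) == 0) {
                    req.gpu_characteristics_buf_size = sizeof(chars);
                    req.gpu_characteristics_buf_addr = (uintptr_t)&chars;
                    if (ioctl(fd, NVHOST_GPU_IOCTL_GET_CHARACTERISTICS, &req) == 0)
                            printf("characteristics: %llu bytes\n",
                                   (unsigned long long)req.gpu_characteristics_buf_size);
            }
            close(fd);
            return 0;
    }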
diff --git a/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h
new file mode 100644
index 000000000000..ac9c253ec696
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ctrl_gk20a.h
@@ -0,0 +1,28 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a_ctrl.h
+ *
+ * GK20A Ctrl
+ *
+ * Copyright (c) 2011-2012, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_GK20A_CTRL_H_
+#define _NVHOST_GK20A_CTRL_H_
+
+int gk20a_ctrl_dev_open(struct inode *inode, struct file *filp);
+int gk20a_ctrl_dev_release(struct inode *inode, struct file *filp);
+long gk20a_ctrl_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+
+#endif /* _NVHOST_GK20A_CTRL_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
new file mode 100644
index 000000000000..da7d733e3fd0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.c
@@ -0,0 +1,699 @@
+/*
+ * Tegra GK20A GPU Debugger/Profiler Driver
+ *
+ * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/uaccess.h>
+#include <linux/nvhost.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
+
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "dbg_gpu_gk20a.h"
+#include "regops_gk20a.h"
+#include "hw_therm_gk20a.h"
+
+struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a = {
+ .exec_reg_ops = exec_regops_gk20a,
+};
+
+/* silly allocator - just increment session id */
+static atomic_t session_id = ATOMIC_INIT(0);
+static int generate_session_id(void)
+{
+ return atomic_add_return(1, &session_id);
+}
+
+static int alloc_session(struct dbg_session_gk20a **_dbg_s)
+{
+ struct dbg_session_gk20a *dbg_s;
+ *_dbg_s = NULL;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ dbg_s = kzalloc(sizeof(*dbg_s), GFP_KERNEL);
+ if (!dbg_s)
+ return -ENOMEM;
+
+ dbg_s->id = generate_session_id();
+ dbg_s->ops = &dbg_gpu_session_ops_gk20a;
+ *_dbg_s = dbg_s;
+ return 0;
+}
+
+int gk20a_dbg_gpu_do_dev_open(struct inode *inode, struct file *filp, bool is_profiler)
+{
+ struct dbg_session_gk20a *dbg_session;
+ struct gk20a *g;
+
+ struct platform_device *pdev;
+ struct device *dev;
+
+ int err;
+
+ if (!is_profiler)
+ g = container_of(inode->i_cdev,
+ struct gk20a, dbg.cdev);
+ else
+ g = container_of(inode->i_cdev,
+ struct gk20a, prof.cdev);
+ pdev = g->dev;
+ dev = &pdev->dev;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "dbg session: %s", dev_name(dev));
+
+ err = alloc_session(&dbg_session);
+ if (err)
+ return err;
+
+ filp->private_data = dbg_session;
+ dbg_session->pdev = pdev;
+ dbg_session->dev = dev;
+ dbg_session->g = g;
+ dbg_session->is_profiler = is_profiler;
+ dbg_session->is_pg_disabled = false;
+
+ INIT_LIST_HEAD(&dbg_session->dbg_s_list_node);
+ init_waitqueue_head(&dbg_session->dbg_events.wait_queue);
+ dbg_session->dbg_events.events_enabled = false;
+ dbg_session->dbg_events.num_pending_events = 0;
+
+ return 0;
+}
+
+/* Used in scenarios where the debugger session can take just the per-channel
+ * lock for performance, but the profiler session must take the per-GPU lock
+ * since it might not have an associated channel. */
+static void gk20a_dbg_session_mutex_lock(struct dbg_session_gk20a *dbg_s)
+{
+ if (dbg_s->is_profiler)
+ mutex_lock(&dbg_s->g->dbg_sessions_lock);
+ else
+ mutex_lock(&dbg_s->ch->dbg_s_lock);
+}
+
+static void gk20a_dbg_session_mutex_unlock(struct dbg_session_gk20a *dbg_s)
+{
+ if (dbg_s->is_profiler)
+ mutex_unlock(&dbg_s->g->dbg_sessions_lock);
+ else
+ mutex_unlock(&dbg_s->ch->dbg_s_lock);
+}
+
+static void gk20a_dbg_gpu_events_enable(struct dbg_session_gk20a *dbg_s)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ gk20a_dbg_session_mutex_lock(dbg_s);
+
+ dbg_s->dbg_events.events_enabled = true;
+ dbg_s->dbg_events.num_pending_events = 0;
+
+ gk20a_dbg_session_mutex_unlock(dbg_s);
+}
+
+static void gk20a_dbg_gpu_events_disable(struct dbg_session_gk20a *dbg_s)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ gk20a_dbg_session_mutex_lock(dbg_s);
+
+ dbg_s->dbg_events.events_enabled = false;
+ dbg_s->dbg_events.num_pending_events = 0;
+
+ gk20a_dbg_session_mutex_unlock(dbg_s);
+}
+
+static void gk20a_dbg_gpu_events_clear(struct dbg_session_gk20a *dbg_s)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ gk20a_dbg_session_mutex_lock(dbg_s);
+
+ if (dbg_s->dbg_events.events_enabled &&
+ dbg_s->dbg_events.num_pending_events > 0)
+ dbg_s->dbg_events.num_pending_events--;
+
+ gk20a_dbg_session_mutex_unlock(dbg_s);
+}
+
+static int gk20a_dbg_gpu_events_ctrl(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_events_ctrl_args *args)
+{
+ int ret = 0;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "dbg events ctrl cmd %d", args->cmd);
+
+ if (!dbg_s->ch) {
+ gk20a_err(dev_from_gk20a(dbg_s->g),
+ "no channel bound to dbg session\n");
+ return -EINVAL;
+ }
+
+ switch (args->cmd) {
+ case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_ENABLE:
+ gk20a_dbg_gpu_events_enable(dbg_s);
+ break;
+
+ case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_DISABLE:
+ gk20a_dbg_gpu_events_disable(dbg_s);
+ break;
+
+ case NVHOST_DBG_GPU_EVENTS_CTRL_CMD_CLEAR:
+ gk20a_dbg_gpu_events_clear(dbg_s);
+ break;
+
+ default:
+ gk20a_err(dev_from_gk20a(dbg_s->g),
+ "unrecognized dbg gpu events ctrl cmd: 0x%x",
+ args->cmd);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait)
+{
+ unsigned int mask = 0;
+ struct dbg_session_gk20a *dbg_s = filep->private_data;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ poll_wait(filep, &dbg_s->dbg_events.wait_queue, wait);
+
+ gk20a_dbg_session_mutex_lock(dbg_s);
+
+ if (dbg_s->dbg_events.events_enabled &&
+ dbg_s->dbg_events.num_pending_events > 0) {
+ gk20a_dbg(gpu_dbg_gpu_dbg, "found pending event on session id %d",
+ dbg_s->id);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "%d events pending",
+ dbg_s->dbg_events.num_pending_events);
+ mask = (POLLPRI | POLLIN);
+ }
+
+ gk20a_dbg_session_mutex_unlock(dbg_s);
+
+ return mask;
+}
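The poll handler above only reports readiness; a client still has to block on the file descriptor itself. The fragment below is a minimal user-space sketch of that side, assuming a debugger device fd obtained elsewhere (the device node name is not part of this file):

    #include <poll.h>

    /* returns 1 when a debug event is pending, 0 on timeout, <0 on error */
    static int wait_for_dbg_event(int dbg_fd, int timeout_ms)
    {
            struct pollfd pfd = { .fd = dbg_fd, .events = POLLPRI | POLLIN };
            int ret = poll(&pfd, 1, timeout_ms);

            if (ret <= 0)
                    return ret;
            /* gk20a_dbg_gpu_dev_poll() sets POLLPRI | POLLIN when events are
             * enabled and at least one is pending */
            return (pfd.revents & (POLLPRI | POLLIN)) ? 1 : 0;
    }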
+
+int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+ return gk20a_dbg_gpu_do_dev_open(inode, filp, false /* not profiler */);
+}
+
+int gk20a_prof_gpu_dev_open(struct inode *inode, struct file *filp)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+ return gk20a_dbg_gpu_do_dev_open(inode, filp, true /* is profiler */);
+}
+
+void gk20a_dbg_gpu_post_events(struct channel_gk20a *ch)
+{
+ struct dbg_session_gk20a *dbg_s;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ /* guard against the session list being modified */
+ mutex_lock(&ch->dbg_s_lock);
+
+ list_for_each_entry(dbg_s, &ch->dbg_s_list, dbg_s_list_node) {
+ if (dbg_s->dbg_events.events_enabled) {
+ gk20a_dbg(gpu_dbg_gpu_dbg, "posting event on session id %d",
+ dbg_s->id);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "%d events pending",
+ dbg_s->dbg_events.num_pending_events);
+
+ dbg_s->dbg_events.num_pending_events++;
+
+ wake_up_interruptible_all(&dbg_s->dbg_events.wait_queue);
+ }
+ }
+
+ mutex_unlock(&ch->dbg_s_lock);
+}
+
+
+static int dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
+ __u32 powermode);
+
+static int dbg_unbind_channel_gk20a(struct dbg_session_gk20a *dbg_s)
+{
+ struct channel_gk20a *ch_gk20a = dbg_s->ch;
+ struct gk20a *g = dbg_s->g;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ /* wasn't bound to start with ? */
+ if (!ch_gk20a) {
+ gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "not bound already?");
+ return -ENODEV;
+ }
+
+ mutex_lock(&g->dbg_sessions_lock);
+ mutex_lock(&ch_gk20a->dbg_s_lock);
+
+ --g->dbg_sessions;
+
+ /* Powergate enable is called here because a dbg_session that issued
+  * the powergate disable ioctl may be killed without ever issuing the
+  * matching powergate enable ioctl.
+  */
+ dbg_set_powergate(dbg_s, NVHOST_DBG_GPU_POWERGATE_MODE_ENABLE);
+
+ dbg_s->ch = NULL;
+ fput(dbg_s->ch_f);
+ dbg_s->ch_f = NULL;
+
+ list_del_init(&dbg_s->dbg_s_list_node);
+
+ mutex_unlock(&ch_gk20a->dbg_s_lock);
+ mutex_unlock(&g->dbg_sessions_lock);
+
+ return 0;
+}
+
+int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp)
+{
+ struct dbg_session_gk20a *dbg_s = filp->private_data;
+
+ gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "%s", dev_name(dbg_s->dev));
+
+ /* unbind if it was bound; free the session either way so an
+  * unbound session is not leaked on release */
+ if (dbg_s->ch)
+ dbg_unbind_channel_gk20a(dbg_s);
+
+ kfree(dbg_s);
+ return 0;
+}
+
+static int dbg_bind_channel_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_bind_channel_args *args)
+{
+ struct file *f;
+ struct gk20a *g;
+ struct channel_gk20a *ch;
+
+ gk20a_dbg(gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s fd=%d",
+ dev_name(dbg_s->dev), args->channel_fd);
+
+ if (args->channel_fd == ~0)
+ return dbg_unbind_channel_gk20a(dbg_s);
+
+ /* get_file_channel takes this reference too, but it also releases it;
+  * by holding our own reference here we keep the channel file from
+  * disappearing while the debugger session is active */
+ f = fget(args->channel_fd);
+ if (!f)
+ return -ENODEV;
+
+ ch = gk20a_get_channel_from_file(args->channel_fd);
+ if (!ch) {
+ gk20a_dbg_fn("no channel found for fd");
+ fput(f);
+ return -EINVAL;
+ }
+
+ g = dbg_s->g;
+ gk20a_dbg_fn("%s hwchid=%d", dev_name(dbg_s->dev), ch->hw_chid);
+
+ mutex_lock(&g->dbg_sessions_lock);
+ mutex_lock(&ch->dbg_s_lock);
+
+ dbg_s->ch_f = f;
+ dbg_s->ch = ch;
+ list_add(&dbg_s->dbg_s_list_node, &dbg_s->ch->dbg_s_list);
+
+ g->dbg_sessions++;
+
+ mutex_unlock(&ch->dbg_s_lock);
+ mutex_unlock(&g->dbg_sessions_lock);
+ return 0;
+}
+
+static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_exec_reg_ops_args *args);
+
+static int nvhost_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_powergate_args *args);
+
+static int nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *args);
+
+long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ struct dbg_session_gk20a *dbg_s = filp->private_data;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ u8 buf[NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE];
+ int err = 0;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ if ((_IOC_TYPE(cmd) != NVHOST_DBG_GPU_IOCTL_MAGIC) ||
+ (_IOC_NR(cmd) == 0) ||
+ (_IOC_NR(cmd) > NVHOST_DBG_GPU_IOCTL_LAST))
+ return -EFAULT;
+
+ BUG_ON(_IOC_SIZE(cmd) > NVHOST_DBG_GPU_IOCTL_MAX_ARG_SIZE);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ if (copy_from_user(buf, (void __user *)arg, _IOC_SIZE(cmd)))
+ return -EFAULT;
+ }
+
+ if (!g->gr.sw_ready) {
+ err = gk20a_busy(g->dev);
+ if (err)
+ return err;
+
+ gk20a_idle(g->dev);
+ }
+
+ switch (cmd) {
+ case NVHOST_DBG_GPU_IOCTL_BIND_CHANNEL:
+ err = dbg_bind_channel_gk20a(dbg_s,
+ (struct nvhost_dbg_gpu_bind_channel_args *)buf);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
+ break;
+
+ case NVHOST_DBG_GPU_IOCTL_REG_OPS:
+ err = nvhost_ioctl_channel_reg_ops(dbg_s,
+ (struct nvhost_dbg_gpu_exec_reg_ops_args *)buf);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
+ break;
+
+ case NVHOST_DBG_GPU_IOCTL_POWERGATE:
+ err = nvhost_ioctl_powergate_gk20a(dbg_s,
+ (struct nvhost_dbg_gpu_powergate_args *)buf);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
+ break;
+
+ case NVHOST_DBG_GPU_IOCTL_EVENTS_CTRL:
+ err = gk20a_dbg_gpu_events_ctrl(dbg_s,
+ (struct nvhost_dbg_gpu_events_ctrl_args *)buf);
+ break;
+
+ case NVHOST_DBG_GPU_IOCTL_SMPC_CTXSW_MODE:
+ err = nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(dbg_s,
+ (struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *)buf);
+ break;
+
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "unrecognized dbg gpu ioctl cmd: 0x%x",
+ cmd);
+ err = -ENOTTY;
+ break;
+ }
+
+ if ((err == 0) && (_IOC_DIR(cmd) & _IOC_READ) &&
+ copy_to_user((void __user *)arg, buf, _IOC_SIZE(cmd)))
+ err = -EFAULT;
+
+ return err;
+}
+
+/* In order to perform a context relative op the context has
+ * to be created already... which would imply that the
+ * context switch mechanism has already been put in place.
+ * So by the time we perform such an operation it should always
+ * be possible to query for the appropriate context offsets, etc.
+ *
+ * But note: while the dbg_gpu bind requires a channel fd,
+ * it doesn't require an allocated gr/compute obj at that point...
+ */
+static bool gr_context_info_available(struct dbg_session_gk20a *dbg_s,
+ struct gr_gk20a *gr)
+{
+ bool initialized;
+
+ mutex_lock(&gr->ctx_mutex);
+ initialized = gr->ctx_vars.golden_image_initialized;
+ mutex_unlock(&gr->ctx_mutex);
+
+ return initialized;
+}
+
+static int nvhost_ioctl_channel_reg_ops(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_exec_reg_ops_args *args)
+{
+ int err;
+ struct device *dev = dbg_s->dev;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ struct nvhost_dbg_gpu_reg_op *ops;
+ u64 ops_size = sizeof(ops[0]) * args->num_ops;
+
+ gk20a_dbg_fn("%d ops, total size %llu", args->num_ops, ops_size);
+
+ if (!dbg_s->ops) {
+ gk20a_err(dev, "can't call reg_ops on an unbound debugger session");
+ return -EINVAL;
+ }
+
+ if (!dbg_s->is_profiler && !dbg_s->ch) {
+ gk20a_err(dev, "bind a channel before regops for a debugging session");
+ return -EINVAL;
+ }
+
+ /* be sure that ctx info is in place */
+ if (!gr_context_info_available(dbg_s, &g->gr)) {
+ gk20a_err(dev, "gr context data not available\n");
+ return -ENODEV;
+ }
+
+ ops = kzalloc(ops_size, GFP_KERNEL);
+ if (!ops) {
+ gk20a_err(dev, "Allocating memory failed!");
+ return -ENOMEM;
+ }
+
+ gk20a_dbg_fn("Copying regops from userspace");
+
+ if (copy_from_user(ops, (void *)(uintptr_t)args->ops, ops_size)) {
+ dev_err(dev, "copy_from_user failed!");
+ err = -EFAULT;
+ goto clean_up;
+ }
+
+ /* since exec_reg_ops sends methods to the ucode, it must take the
+ * global gpu lock to protect against mixing methods from debug sessions
+ * on other channels */
+ mutex_lock(&g->dbg_sessions_lock);
+
+ err = dbg_s->ops->exec_reg_ops(dbg_s, ops, args->num_ops);
+
+ mutex_unlock(&g->dbg_sessions_lock);
+
+ if (err) {
+ gk20a_err(dev, "dbg regops failed");
+ goto clean_up;
+ }
+
+ gk20a_dbg_fn("Copying result to userspace");
+
+ if (copy_to_user((void *)(uintptr_t)args->ops, ops, ops_size)) {
+ dev_err(dev, "copy_to_user failed!");
+ err = -EFAULT;
+ goto clean_up;
+ }
+ err = 0;
+ /* fall through so ops is freed on the success path as well */
+ clean_up:
+ kfree(ops);
+ return err;
+}
+
+static int dbg_set_powergate(struct dbg_session_gk20a *dbg_s,
+ __u32 powermode)
+{
+ int err = 0;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+
+ /* This function must be called with g->dbg_sessions_lock held */
+
+ gk20a_dbg(gpu_dbg_fn|gpu_dbg_gpu_dbg, "%s powergate mode = %d",
+ dev_name(dbg_s->dev), powermode);
+
+ switch (powermode) {
+ case NVHOST_DBG_GPU_POWERGATE_MODE_DISABLE:
+ /* save off current powergate, clk state.
+ * set gpu module's can_powergate = 0.
+ * set gpu module's clk to max.
+ * while *a* debug session is active there will be no power or
+ * clocking state changes allowed from mainline code (but they
+ * should be saved).
+ */
+ /* Only disable powergating if this dbg_session hasn't already
+  * requested it and the global powergating_disabled_refcount is zero.
+  */
+
+ if ((dbg_s->is_pg_disabled == false) &&
+ (g->dbg_powergating_disabled_refcount++ == 0)) {
+
+ gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "module busy");
+ gk20a_busy(g->dev);
+ gk20a_channel_busy(dbg_s->pdev);
+
+ g->ops.clock_gating.slcg_gr_load_gating_prod(g,
+ false);
+ g->ops.clock_gating.slcg_perf_load_gating_prod(g,
+ false);
+ gr_gk20a_init_blcg_mode(g, BLCG_RUN, ENGINE_GR_GK20A);
+
+ g->elcg_enabled = false;
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
+
+ gk20a_pmu_disable_elpg(g);
+ }
+
+ dbg_s->is_pg_disabled = true;
+ break;
+
+ case NVHOST_DBG_GPU_POWERGATE_MODE_ENABLE:
+ /* restore (can) powergate, clk state */
+ /* release pending exceptions to fault/be handled as usual */
+ /*TBD: ordering of these? */
+
+ /* Re-enable powergating only when no other session still wants it
+  * disabled and this dbg session had requested the disable via ioctl.
+  */
+ if (dbg_s->is_pg_disabled &&
+ --g->dbg_powergating_disabled_refcount == 0) {
+
+ g->elcg_enabled = true;
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
+ gr_gk20a_init_blcg_mode(g, BLCG_AUTO, ENGINE_GR_GK20A);
+
+ g->ops.clock_gating.slcg_gr_load_gating_prod(g,
+ g->slcg_enabled);
+ g->ops.clock_gating.slcg_perf_load_gating_prod(g,
+ g->slcg_enabled);
+
+ gk20a_pmu_enable_elpg(g);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg | gpu_dbg_fn, "module idle");
+ gk20a_channel_idle(dbg_s->pdev);
+ gk20a_idle(g->dev);
+ }
+
+ dbg_s->is_pg_disabled = false;
+ break;
+
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "unrecognized dbg gpu powergate mode: 0x%x",
+ powermode);
+ err = -ENOTTY;
+ break;
+ }
+
+ return err;
+}
+
+static int nvhost_ioctl_powergate_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_powergate_args *args)
+{
+ int err;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ gk20a_dbg_fn("%s powergate mode = %d",
+ dev_name(dbg_s->dev), args->mode);
+
+ mutex_lock(&g->dbg_sessions_lock);
+ err = dbg_set_powergate(dbg_s, args->mode);
+ mutex_unlock(&g->dbg_sessions_lock);
+ return err;
+}
+
+static int nvhost_dbg_gpu_ioctl_smpc_ctxsw_mode(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_smpc_ctxsw_mode_args *args)
+{
+ int err;
+ struct gk20a *g = get_gk20a(dbg_s->pdev);
+ struct channel_gk20a *ch_gk20a;
+
+ gk20a_dbg_fn("%s smpc ctxsw mode = %d",
+ dev_name(dbg_s->dev), args->mode);
+
+ /* Take the global lock, since we'll be doing global regops */
+ mutex_lock(&g->dbg_sessions_lock);
+
+ ch_gk20a = dbg_s->ch;
+
+ if (!ch_gk20a) {
+ gk20a_err(dev_from_gk20a(dbg_s->g),
+ "no bound channel for smpc ctxsw mode update\n");
+ err = -EINVAL;
+ goto clean_up;
+ }
+
+ err = gr_gk20a_update_smpc_ctxsw_mode(g, ch_gk20a,
+ args->mode == NVHOST_DBG_GPU_SMPC_CTXSW_MODE_CTXSW);
+ if (err) {
+ gk20a_err(dev_from_gk20a(dbg_s->g),
+ "error (%d) during smpc ctxsw mode update\n", err);
+ goto clean_up;
+ }
+ /* The following regops are a hack/workaround (WAR) to make up for the
+  * fact that we just scribbled into the ctxsw image without really
+  * knowing whether it had already been swapped in/out once or not, etc.
+  */
+ {
+ struct nvhost_dbg_gpu_reg_op ops[4];
+ int i;
+ for (i = 0; i < ARRAY_SIZE(ops); i++) {
+ ops[i].op = NVHOST_DBG_GPU_REG_OP_WRITE_32;
+ ops[i].type = NVHOST_DBG_GPU_REG_OP_TYPE_GR_CTX;
+ ops[i].status = NVHOST_DBG_GPU_REG_OP_STATUS_SUCCESS;
+ ops[i].value_hi = 0;
+ ops[i].and_n_mask_lo = 0;
+ ops[i].and_n_mask_hi = 0;
+ }
+ /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control_sel1_r();*/
+ ops[0].offset = 0x00419e08;
+ ops[0].value_lo = 0x1d;
+
+ /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control5_r(); */
+ ops[1].offset = 0x00419e58;
+ ops[1].value_lo = 0x1;
+
+ /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter_control3_r(); */
+ ops[2].offset = 0x00419e68;
+ ops[2].value_lo = 0xaaaa;
+
+ /* gr_pri_gpcs_tpcs_sm_dsm_perf_counter4_control_r(); */
+ ops[3].offset = 0x00419f40;
+ ops[3].value_lo = 0x18;
+
+ err = dbg_s->ops->exec_reg_ops(dbg_s, ops, ARRAY_SIZE(ops));
+ }
+
+ clean_up:
+ mutex_unlock(&g->dbg_sessions_lock);
+ return err;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
new file mode 100644
index 000000000000..49827608436c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/dbg_gpu_gk20a.h
@@ -0,0 +1,83 @@
+/*
+ * Tegra GK20A GPU Debugger Driver
+ *
+ * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __DBG_GPU_GK20A_H_
+#define __DBG_GPU_GK20A_H_
+#include <linux/poll.h>
+
+/* module debug driver interface */
+int gk20a_dbg_gpu_dev_release(struct inode *inode, struct file *filp);
+int gk20a_dbg_gpu_dev_open(struct inode *inode, struct file *filp);
+long gk20a_dbg_gpu_dev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+unsigned int gk20a_dbg_gpu_dev_poll(struct file *filep, poll_table *wait);
+
+/* used by profiler driver interface */
+int gk20a_prof_gpu_dev_open(struct inode *inode, struct file *filp);
+
+/* used by the interrupt handler to post events */
+void gk20a_dbg_gpu_post_events(struct channel_gk20a *fault_ch);
+
+struct dbg_gpu_session_ops {
+ int (*exec_reg_ops)(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops);
+};
+
+struct dbg_gpu_session_events {
+ wait_queue_head_t wait_queue;
+ bool events_enabled;
+ int num_pending_events;
+};
+
+struct dbg_session_gk20a {
+ /* dbg session id used for trace/prints */
+ int id;
+
+ /* profiler session, if any */
+ bool is_profiler;
+
+ /* power enabled or disabled */
+ bool is_pg_disabled;
+
+ /*
+ * There can be different versions of the whitelists for the global
+ * and per-context register sets, as well as for the debugger and
+ * profiler interfaces.
+ */
+ struct regops_whitelist *global;
+ struct regops_whitelist *per_context;
+
+ /* gpu module vagaries */
+ struct device *dev;
+ struct platform_device *pdev;
+ struct gk20a *g;
+
+ /* bound channel, if any */
+ struct file *ch_f;
+ struct channel_gk20a *ch;
+
+ /* session operations */
+ struct dbg_gpu_session_ops *ops;
+
+ /* event support */
+ struct dbg_gpu_session_events dbg_events;
+ struct list_head dbg_s_list_node;
+};
+
+extern struct dbg_gpu_session_ops dbg_gpu_session_ops_gk20a;
+
+#endif /* __DBG_GPU_GK20A_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.c b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
new file mode 100644
index 000000000000..c5b6953cfd02
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.c
@@ -0,0 +1,295 @@
+/*
+ * drivers/video/tegra/host/t20/debug_gk20a.c
+ *
+ * Copyright (C) 2011-2014 NVIDIA Corporation. All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/nvhost.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/io.h>
+
+#include "gk20a.h"
+#include "debug_gk20a.h"
+
+#include "hw_ram_gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_ccsr_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+
+unsigned int gk20a_debug_trace_cmdbuf;
+struct platform_device *gk20a_device;
+
+struct gk20a_debug_output {
+ void (*fn)(void *ctx, const char *str, size_t len);
+ void *ctx;
+ char buf[256];
+};
+
+static const char * const ccsr_chan_status_str[] = {
+ "idle",
+ "pending",
+ "pending_ctx_reload",
+ "pending_acquire",
+ "pending_acq_ctx_reload",
+ "on_pbdma",
+ "on_pbdma_and_eng",
+ "on_eng",
+ "on_eng_pending_acquire",
+ "on_eng_pending",
+ "on_pbdma_ctx_reload",
+ "on_pbdma_and_eng_ctx_reload",
+ "on_eng_ctx_reload",
+ "on_eng_pending_ctx_reload",
+ "on_eng_pending_acq_ctx_reload",
+};
+
+static const char * const chan_status_str[] = {
+ "invalid",
+ "valid",
+ "chsw_load",
+ "chsw_save",
+ "chsw_switch",
+};
+
+static const char * const ctx_status_str[] = {
+ "invalid",
+ "valid",
+ NULL,
+ NULL,
+ NULL,
+ "ctxsw_load",
+ "ctxsw_save",
+ "ctxsw_switch",
+};
+
+static inline void gk20a_debug_write_printk(void *ctx, const char *str,
+ size_t len)
+{
+ pr_info("%s", str);
+}
+
+static inline void gk20a_debug_write_to_seqfile(void *ctx, const char *str,
+ size_t len)
+{
+ seq_write((struct seq_file *)ctx, str, len);
+}
+
+void gk20a_debug_output(struct gk20a_debug_output *o, const char *fmt, ...)
+{
+ va_list args;
+ int len;
+
+ va_start(args, fmt);
+ len = vsnprintf(o->buf, sizeof(o->buf), fmt, args);
+ va_end(args);
+ o->fn(o->ctx, o->buf, len);
+}
+
+static void gk20a_debug_show_channel(struct gk20a *g,
+ struct gk20a_debug_output *o,
+ struct channel_gk20a *ch)
+{
+ u32 channel = gk20a_readl(g, ccsr_channel_r(ch->hw_chid));
+ u32 status = ccsr_channel_status_v(channel);
+ u32 syncpointa, syncpointb;
+ void *inst_ptr;
+
+ inst_ptr = ch->inst_block.cpuva;
+ if (!inst_ptr)
+ return;
+
+ syncpointa = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointa_w());
+ syncpointb = gk20a_mem_rd32(inst_ptr, ram_fc_syncpointb_w());
+
+ gk20a_debug_output(o, "%d-%s, pid %d: ", ch->hw_chid,
+ ch->g->dev->name,
+ ch->pid);
+ gk20a_debug_output(o, "%s in use %s %s\n",
+ ccsr_channel_enable_v(channel) ? "" : "not",
+ ccsr_chan_status_str[status],
+ ccsr_channel_busy_v(channel) ? "busy" : "not busy");
+ gk20a_debug_output(o, "TOP: %016llx PUT: %016llx GET: %016llx "
+ "FETCH: %016llx\nHEADER: %08x COUNT: %08x\n"
+ "SYNCPOINT %08x %08x SEMAPHORE %08x %08x %08x %08x\n",
+ (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_top_level_get_w()) +
+ ((u64)gk20a_mem_rd32(inst_ptr,
+ ram_fc_pb_top_level_get_hi_w()) << 32ULL),
+ (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_w()) +
+ ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_put_hi_w()) << 32ULL),
+ (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_w()) +
+ ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_get_hi_w()) << 32ULL),
+ (u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_w()) +
+ ((u64)gk20a_mem_rd32(inst_ptr, ram_fc_pb_fetch_hi_w()) << 32ULL),
+ gk20a_mem_rd32(inst_ptr, ram_fc_pb_header_w()),
+ gk20a_mem_rd32(inst_ptr, ram_fc_pb_count_w()),
+ syncpointa,
+ syncpointb,
+ gk20a_mem_rd32(inst_ptr, ram_fc_semaphorea_w()),
+ gk20a_mem_rd32(inst_ptr, ram_fc_semaphoreb_w()),
+ gk20a_mem_rd32(inst_ptr, ram_fc_semaphorec_w()),
+ gk20a_mem_rd32(inst_ptr, ram_fc_semaphored_w()));
+
+ if ((pbdma_syncpointb_op_v(syncpointb) == pbdma_syncpointb_op_wait_v())
+ && (pbdma_syncpointb_wait_switch_v(syncpointb) ==
+ pbdma_syncpointb_wait_switch_en_v()))
+ gk20a_debug_output(o, "Waiting on syncpt %u (%s) val %u\n",
+ pbdma_syncpointb_syncpt_index_v(syncpointb),
+ nvhost_syncpt_get_name(
+ to_platform_device(g->dev->dev.parent),
+ pbdma_syncpointb_syncpt_index_v(syncpointb)),
+ pbdma_syncpointa_payload_v(syncpointa));
+
+ gk20a_debug_output(o, "\n");
+}
+
+void gk20a_debug_show_dump(struct platform_device *pdev,
+ struct gk20a_debug_output *o)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(pdev);
+ struct gk20a *g = platform->g;
+ struct fifo_gk20a *f = &g->fifo;
+ u32 chid;
+ int i;
+
+ gk20a_busy(g->dev);
+ for (i = 0; i < fifo_pbdma_status__size_1_v(); i++) {
+ u32 status = gk20a_readl(g, fifo_pbdma_status_r(i));
+ u32 chan_status = fifo_pbdma_status_chan_status_v(status);
+
+ gk20a_debug_output(o, "%s pbdma %d: ", g->dev->name, i);
+ gk20a_debug_output(o,
+ "id: %d (%s), next_id: %d (%s) status: %s\n",
+ fifo_pbdma_status_id_v(status),
+ fifo_pbdma_status_id_type_v(status) ?
+ "tsg" : "channel",
+ fifo_pbdma_status_next_id_v(status),
+ fifo_pbdma_status_next_id_type_v(status) ?
+ "tsg" : "channel",
+ chan_status_str[chan_status]);
+ gk20a_debug_output(o, "PUT: %016llx GET: %016llx "
+ "FETCH: %08x HEADER: %08x\n",
+ (u64)gk20a_readl(g, pbdma_put_r(i)) +
+ ((u64)gk20a_readl(g, pbdma_put_hi_r(i)) << 32ULL),
+ (u64)gk20a_readl(g, pbdma_get_r(i)) +
+ ((u64)gk20a_readl(g, pbdma_get_hi_r(i)) << 32ULL),
+ gk20a_readl(g, pbdma_gp_fetch_r(i)),
+ gk20a_readl(g, pbdma_pb_header_r(i)));
+ }
+ gk20a_debug_output(o, "\n");
+
+ for (i = 0; i < fifo_engine_status__size_1_v(); i++) {
+ u32 status = gk20a_readl(g, fifo_engine_status_r(i));
+ u32 ctx_status = fifo_engine_status_ctx_status_v(status);
+
+ gk20a_debug_output(o, "%s eng %d: ", g->dev->name, i);
+ gk20a_debug_output(o,
+ "id: %d (%s), next_id: %d (%s), ctx: %s ",
+ fifo_engine_status_id_v(status),
+ fifo_engine_status_id_type_v(status) ?
+ "tsg" : "channel",
+ fifo_engine_status_next_id_v(status),
+ fifo_engine_status_next_id_type_v(status) ?
+ "tsg" : "channel",
+ ctx_status_str[ctx_status]);
+
+ if (fifo_engine_status_faulted_v(status))
+ gk20a_debug_output(o, "faulted ");
+ if (fifo_engine_status_engine_v(status))
+ gk20a_debug_output(o, "busy ");
+ gk20a_debug_output(o, "\n");
+ }
+ gk20a_debug_output(o, "\n");
+
+ for (chid = 0; chid < f->num_channels; chid++) {
+ if (f->channel[chid].in_use) {
+ struct channel_gk20a *gpu_ch = &f->channel[chid];
+ gk20a_debug_show_channel(g, o, gpu_ch);
+ }
+ }
+ gk20a_idle(g->dev);
+}
+
+void gk20a_debug_dump(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(pdev);
+ struct gk20a_debug_output o = {
+ .fn = gk20a_debug_write_printk
+ };
+
+ if (platform->dump_platform_dependencies)
+ platform->dump_platform_dependencies(pdev);
+
+ gk20a_debug_show_dump(pdev, &o);
+}
+
+void gk20a_debug_dump_device(struct platform_device *pdev)
+{
+ struct gk20a_debug_output o = {
+ .fn = gk20a_debug_write_printk
+ };
+
+ /* Dump the first device if no info is provided */
+ if (!pdev && gk20a_device)
+ pdev = gk20a_device;
+
+ gk20a_debug_show_dump(pdev, &o);
+}
+EXPORT_SYMBOL(gk20a_debug_dump_device);
+
+static int gk20a_debug_show(struct seq_file *s, void *unused)
+{
+ struct platform_device *pdev = s->private;
+ struct gk20a_debug_output o = {
+ .fn = gk20a_debug_write_to_seqfile,
+ .ctx = s,
+ };
+ gk20a_debug_show_dump(pdev, &o);
+ return 0;
+}
+
+static int gk20a_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, gk20a_debug_show, inode->i_private);
+}
+
+static const struct file_operations gk20a_debug_fops = {
+ .open = gk20a_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+void gk20a_debug_init(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+
+ /* Store the first device */
+ if (!gk20a_device)
+ gk20a_device = pdev;
+
+ platform->debugfs = debugfs_create_dir(pdev->name, NULL);
+
+ debugfs_create_file("status", S_IRUGO, platform->debugfs,
+ pdev, &gk20a_debug_fops);
+ debugfs_create_u32("trace_cmdbuf", S_IRUGO|S_IWUSR, platform->debugfs,
+ &gk20a_debug_trace_cmdbuf);
+
+#if defined(GK20A_DEBUG)
+ debugfs_create_u32("dbg_mask", S_IRUGO|S_IWUSR, platform->debugfs,
+ &gk20a_dbg_mask);
+ debugfs_create_u32("dbg_ftrace", S_IRUGO|S_IWUSR, platform->debugfs,
+ &gk20a_dbg_ftrace);
+#endif
+}
diff --git a/drivers/gpu/nvgpu/gk20a/debug_gk20a.h b/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
new file mode 100644
index 000000000000..cd2e09c31f91
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/debug_gk20a.h
@@ -0,0 +1,25 @@
+/*
+ * GK20A Debug functionality
+ *
+ * Copyright (C) 2011-2014 NVIDIA CORPORATION. All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _DEBUG_GK20A_H_
+#define _DEBUG_GK20A_H_
+
+extern unsigned int gk20a_debug_trace_cmdbuf;
+
+void gk20a_debug_dump(struct platform_device *pdev);
+void gk20a_debug_init(struct platform_device *pdev);
+
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.c b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
new file mode 100644
index 000000000000..52f2db4d9e28
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.c
@@ -0,0 +1,37 @@
+/*
+ * GK20A memory interface
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/types.h>
+
+#include "gk20a.h"
+#include "kind_gk20a.h"
+#include "hw_mc_gk20a.h"
+
+static void fb_gk20a_reset(struct gk20a *g)
+{
+ gk20a_dbg_info("reset gk20a fb");
+
+ gk20a_reset(g, mc_enable_pfb_enabled_f()
+ | mc_enable_l2_enabled_f()
+ | mc_enable_xbar_enabled_f()
+ | mc_enable_hub_enabled_f());
+}
+
+void gk20a_init_fb(struct gpu_ops *gops)
+{
+ gops->fb.reset = fb_gk20a_reset;
+ gk20a_init_uncompressed_kind_map();
+ gk20a_init_kind_attr();
+}
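The init function above illustrates the HAL pattern used throughout this driver: each unit exposes an init hook that installs its function pointers into the shared gpu_ops table. The sketch below is illustrative only; apart from gk20a_init_fb() itself, the wrapper function and the other units mentioned in the comment are assumptions, not code from this patch:

    #include "gk20a.h"      /* struct gpu_ops */
    #include "fb_gk20a.h"

    /* hypothetical helper showing how a chip HAL might populate its ops */
    static int example_init_hal_fb(struct gpu_ops *gops)
    {
            gk20a_init_fb(gops);  /* installs fb_gk20a_reset as gops->fb.reset */
            /* other units (fifo, gr, ltc, ...) install their ops the same way */
            return 0;
    }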
diff --git a/drivers/gpu/nvgpu/gk20a/fb_gk20a.h b/drivers/gpu/nvgpu/gk20a/fb_gk20a.h
new file mode 100644
index 000000000000..34c21c9b2e13
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fb_gk20a.h
@@ -0,0 +1,21 @@
+/*
+ * GK20A FB
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _NVHOST_GK20A_FB
+#define _NVHOST_GK20A_FB
+struct gk20a;
+
+void gk20a_init_fb(struct gpu_ops *gops);
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
new file mode 100644
index 000000000000..5575b995a100
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.c
@@ -0,0 +1,1836 @@
+/*
+ * drivers/video/tegra/host/gk20a/fifo_gk20a.c
+ *
+ * GK20A Graphics FIFO (gr host)
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <trace/events/gk20a.h>
+#include <linux/dma-mapping.h>
+#include <linux/nvhost.h>
+
+#include "gk20a.h"
+#include "debug_gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+#include "hw_ccsr_gk20a.h"
+#include "hw_ram_gk20a.h"
+#include "hw_proj_gk20a.h"
+#include "hw_top_gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+ u32 hw_chid, bool add,
+ bool wait_for_finish);
+static void gk20a_fifo_handle_mmu_fault_thread(struct work_struct *work);
+
+/*
+ * Link engine IDs to MMU IDs and vice versa.
+ */
+
+static inline u32 gk20a_engine_id_to_mmu_id(u32 engine_id)
+{
+ switch (engine_id) {
+ case ENGINE_GR_GK20A:
+ return 0x00;
+ case ENGINE_CE2_GK20A:
+ return 0x1b;
+ default:
+ return ~0;
+ }
+}
+
+static inline u32 gk20a_mmu_id_to_engine_id(u32 mmu_id)
+{
+ switch (mmu_id) {
+ case 0x00:
+ return ENGINE_GR_GK20A;
+ case 0x1b:
+ return ENGINE_CE2_GK20A;
+ default:
+ return ~0;
+ }
+}
+
+
+static int init_engine_info(struct fifo_gk20a *f)
+{
+ struct gk20a *g = f->g;
+ struct device *d = dev_from_gk20a(g);
+ struct fifo_engine_info_gk20a *gr_info;
+ const u32 gr_sw_id = ENGINE_GR_GK20A;
+ u32 i;
+ u32 max_info_entries = top_device_info__size_1_v();
+
+ gk20a_dbg_fn("");
+
+ /* all we really care about finding is the graphics entry;
+  * especially early on in simulation the device info may claim more */
+ f->num_engines = 1;
+
+ gr_info = f->engine_info + gr_sw_id;
+
+ gr_info->sw_id = gr_sw_id;
+ gr_info->name = "gr";
+ gr_info->dev_info_id = top_device_info_type_enum_graphics_v();
+ gr_info->mmu_fault_id = fifo_intr_mmu_fault_eng_id_graphics_v();
+ gr_info->runlist_id = ~0;
+ gr_info->pbdma_id = ~0;
+ gr_info->engine_id = ~0;
+
+ for (i = 0; i < max_info_entries; i++) {
+ u32 table_entry = gk20a_readl(f->g, top_device_info_r(i));
+ u32 entry = top_device_info_entry_v(table_entry);
+ u32 engine_enum = top_device_info_type_enum_v(table_entry);
+ u32 table_entry2 = 0;
+
+ if (entry == top_device_info_entry_not_valid_v())
+ continue;
+
+ if (top_device_info_chain_v(table_entry) ==
+ top_device_info_chain_enable_v()) {
+
+ table_entry2 = gk20a_readl(f->g,
+ top_device_info_r(++i));
+
+ engine_enum = top_device_info_type_enum_v(table_entry2);
+ }
+
+ /* we only care about GR engine here */
+ if (entry == top_device_info_entry_enum_v() &&
+ engine_enum == gr_info->dev_info_id) {
+ int pbdma_id;
+ u32 runlist_bit;
+
+ gr_info->runlist_id =
+ top_device_info_runlist_enum_v(table_entry);
+ gk20a_dbg_info("gr info: runlist_id %d", gr_info->runlist_id);
+
+ gr_info->engine_id =
+ top_device_info_engine_enum_v(table_entry);
+ gk20a_dbg_info("gr info: engine_id %d", gr_info->engine_id);
+
+ runlist_bit = 1 << gr_info->runlist_id;
+
+ for (pbdma_id = 0; pbdma_id < f->num_pbdma; pbdma_id++) {
+ gk20a_dbg_info("gr info: pbdma_map[%d]=%d",
+ pbdma_id, f->pbdma_map[pbdma_id]);
+ if (f->pbdma_map[pbdma_id] & runlist_bit)
+ break;
+ }
+
+ if (pbdma_id == f->num_pbdma) {
+ gk20a_err(d, "busted pbmda map");
+ return -EINVAL;
+ }
+ gr_info->pbdma_id = pbdma_id;
+
+ break;
+ }
+ }
+
+ if (gr_info->runlist_id == ~0) {
+ gk20a_err(d, "busted device info");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+void gk20a_remove_fifo_support(struct fifo_gk20a *f)
+{
+ struct gk20a *g = f->g;
+ struct device *d = dev_from_gk20a(g);
+ struct fifo_engine_info_gk20a *engine_info;
+ struct fifo_runlist_info_gk20a *runlist;
+ u32 runlist_id;
+ u32 i;
+
+ gk20a_dbg_fn("");
+
+ if (f->channel) {
+ int c;
+ for (c = 0; c < f->num_channels; c++) {
+ if (f->channel[c].remove_support)
+ f->channel[c].remove_support(f->channel+c);
+ }
+ kfree(f->channel);
+ }
+ if (f->userd.gpu_va)
+ gk20a_gmmu_unmap(&g->mm.bar1.vm,
+ f->userd.gpu_va,
+ f->userd.size,
+ gk20a_mem_flag_none);
+
+ if (f->userd.sgt)
+ gk20a_free_sgtable(&f->userd.sgt);
+
+ if (f->userd.cpuva)
+ dma_free_coherent(d,
+ f->userd_total_size,
+ f->userd.cpuva,
+ f->userd.iova);
+ f->userd.cpuva = NULL;
+ f->userd.iova = 0;
+
+ engine_info = f->engine_info + ENGINE_GR_GK20A;
+ runlist_id = engine_info->runlist_id;
+ runlist = &f->runlist_info[runlist_id];
+
+ for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
+ if (runlist->mem[i].cpuva)
+ dma_free_coherent(d,
+ runlist->mem[i].size,
+ runlist->mem[i].cpuva,
+ runlist->mem[i].iova);
+ runlist->mem[i].cpuva = NULL;
+ runlist->mem[i].iova = 0;
+ }
+
+ kfree(runlist->active_channels);
+
+ kfree(f->runlist_info);
+ kfree(f->pbdma_map);
+ kfree(f->engine_info);
+}
+
+/* reads info from hardware and fills in pbdma exception info record */
+static inline void get_exception_pbdma_info(
+ struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info)
+{
+ struct fifo_pbdma_exception_info_gk20a *e =
+ &eng_info->pbdma_exception_info;
+
+ u32 pbdma_status_r = e->status_r = gk20a_readl(g,
+ fifo_pbdma_status_r(eng_info->pbdma_id));
+ e->id = fifo_pbdma_status_id_v(pbdma_status_r); /* vs. id_hw_v()? */
+ e->id_is_chid = fifo_pbdma_status_id_type_v(pbdma_status_r) ==
+ fifo_pbdma_status_id_type_chid_v();
+ e->chan_status_v = fifo_pbdma_status_chan_status_v(pbdma_status_r);
+ e->next_id_is_chid =
+ fifo_pbdma_status_next_id_type_v(pbdma_status_r) ==
+ fifo_pbdma_status_next_id_type_chid_v();
+ e->next_id = fifo_pbdma_status_next_id_v(pbdma_status_r);
+ e->chsw_in_progress =
+ fifo_pbdma_status_chsw_v(pbdma_status_r) ==
+ fifo_pbdma_status_chsw_in_progress_v();
+}
+
+static void fifo_pbdma_exception_status(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info)
+{
+ struct fifo_pbdma_exception_info_gk20a *e;
+ get_exception_pbdma_info(g, eng_info);
+ e = &eng_info->pbdma_exception_info;
+
+ gk20a_dbg_fn("pbdma_id %d, "
+ "id_type %s, id %d, chan_status %d, "
+ "next_id_type %s, next_id %d, "
+ "chsw_in_progress %d",
+ eng_info->pbdma_id,
+ e->id_is_chid ? "chid" : "tsgid", e->id, e->chan_status_v,
+ e->next_id_is_chid ? "chid" : "tsgid", e->next_id,
+ e->chsw_in_progress);
+}
+
+/* reads info from hardware and fills in engine exception info record */
+static inline void get_exception_engine_info(
+ struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info)
+{
+ struct fifo_engine_exception_info_gk20a *e =
+ &eng_info->engine_exception_info;
+ u32 engine_status_r = e->status_r =
+ gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
+ e->id = fifo_engine_status_id_v(engine_status_r); /* vs. id_hw_v()? */
+ e->id_is_chid = fifo_engine_status_id_type_v(engine_status_r) ==
+ fifo_engine_status_id_type_chid_v();
+ e->ctx_status_v = fifo_engine_status_ctx_status_v(engine_status_r);
+ e->faulted =
+ fifo_engine_status_faulted_v(engine_status_r) ==
+ fifo_engine_status_faulted_true_v();
+ e->idle =
+ fifo_engine_status_engine_v(engine_status_r) ==
+ fifo_engine_status_engine_idle_v();
+ e->ctxsw_in_progress =
+ fifo_engine_status_ctxsw_v(engine_status_r) ==
+ fifo_engine_status_ctxsw_in_progress_v();
+}
+
+static void fifo_engine_exception_status(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info)
+{
+ struct fifo_engine_exception_info_gk20a *e;
+ get_exception_engine_info(g, eng_info);
+ e = &eng_info->engine_exception_info;
+
+ gk20a_dbg_fn("engine_id %d, id_type %s, id %d, ctx_status %d, "
+ "faulted %d, idle %d, ctxsw_in_progress %d, ",
+ eng_info->engine_id, e->id_is_chid ? "chid" : "tsgid",
+ e->id, e->ctx_status_v,
+ e->faulted, e->idle, e->ctxsw_in_progress);
+}
+
+static int init_runlist(struct gk20a *g, struct fifo_gk20a *f)
+{
+ struct fifo_engine_info_gk20a *engine_info;
+ struct fifo_runlist_info_gk20a *runlist;
+ struct device *d = dev_from_gk20a(g);
+ u32 runlist_id;
+ u32 i;
+ u64 runlist_size;
+
+ gk20a_dbg_fn("");
+
+ f->max_runlists = fifo_eng_runlist_base__size_1_v();
+ f->runlist_info = kzalloc(sizeof(struct fifo_runlist_info_gk20a) *
+ f->max_runlists, GFP_KERNEL);
+ if (!f->runlist_info)
+ goto clean_up;
+
+ engine_info = f->engine_info + ENGINE_GR_GK20A;
+ runlist_id = engine_info->runlist_id;
+ runlist = &f->runlist_info[runlist_id];
+
+ runlist->active_channels =
+ kzalloc(DIV_ROUND_UP(f->num_channels, BITS_PER_BYTE),
+ GFP_KERNEL);
+ if (!runlist->active_channels)
+ goto clean_up_runlist_info;
+
+ runlist_size = ram_rl_entry_size_v() * f->num_channels;
+ for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
+ dma_addr_t iova;
+
+ runlist->mem[i].cpuva =
+ dma_alloc_coherent(d,
+ runlist_size,
+ &iova,
+ GFP_KERNEL);
+ if (!runlist->mem[i].cpuva) {
+ dev_err(d, "memory allocation failed\n");
+ goto clean_up_runlist;
+ }
+ runlist->mem[i].iova = iova;
+ runlist->mem[i].size = runlist_size;
+ }
+ mutex_init(&runlist->mutex);
+ init_waitqueue_head(&runlist->runlist_wq);
+
+ /* None of the buffers is pinned if this value doesn't change.
+    Otherwise, one of them (cur_buffer) must have been pinned. */
+ runlist->cur_buffer = MAX_RUNLIST_BUFFERS;
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+clean_up_runlist:
+ for (i = 0; i < MAX_RUNLIST_BUFFERS; i++) {
+ if (runlist->mem[i].cpuva)
+ dma_free_coherent(d,
+ runlist->mem[i].size,
+ runlist->mem[i].cpuva,
+ runlist->mem[i].iova);
+ runlist->mem[i].cpuva = NULL;
+ runlist->mem[i].iova = 0;
+ }
+
+ kfree(runlist->active_channels);
+ runlist->active_channels = NULL;
+
+clean_up_runlist_info:
+ kfree(f->runlist_info);
+ f->runlist_info = NULL;
+
+clean_up:
+ gk20a_dbg_fn("fail");
+ return -ENOMEM;
+}
+
+#define GRFIFO_TIMEOUT_CHECK_PERIOD_US 100000
+
+int gk20a_init_fifo_reset_enable_hw(struct gk20a *g)
+{
+ u32 intr_stall;
+ u32 mask;
+ u32 timeout;
+ int i;
+
+ gk20a_dbg_fn("");
+ /* enable pmc pfifo */
+ gk20a_reset(g, mc_enable_pfifo_enabled_f()
+ | mc_enable_ce2_enabled_f());
+
+ /* enable pbdma */
+ mask = 0;
+ for (i = 0; i < proj_host_num_pbdma_v(); ++i)
+ mask |= mc_enable_pb_sel_f(mc_enable_pb_0_enabled_v(), i);
+ gk20a_writel(g, mc_enable_pb_r(), mask);
+
+ /* enable pfifo interrupt */
+ gk20a_writel(g, fifo_intr_0_r(), 0xFFFFFFFF);
+ gk20a_writel(g, fifo_intr_en_0_r(), 0x7FFFFFFF);
+ gk20a_writel(g, fifo_intr_en_1_r(), 0x80000000);
+
+ /* enable pbdma interrupt */
+ mask = 0;
+ for (i = 0; i < proj_host_num_pbdma_v(); i++) {
+ intr_stall = gk20a_readl(g, pbdma_intr_stall_r(i));
+ intr_stall &= ~pbdma_intr_stall_lbreq_enabled_f();
+ gk20a_writel(g, pbdma_intr_stall_r(i), intr_stall);
+ gk20a_writel(g, pbdma_intr_0_r(i), 0xFFFFFFFF);
+ gk20a_writel(g, pbdma_intr_en_0_r(i),
+ (~0) & ~pbdma_intr_en_0_lbreq_enabled_f());
+ gk20a_writel(g, pbdma_intr_1_r(i), 0xFFFFFFFF);
+ gk20a_writel(g, pbdma_intr_en_1_r(i), 0xFFFFFFFF);
+ }
+
+ /* TBD: apply overrides */
+
+ /* TBD: BLCG prod */
+
+ /* reset runlist interrupts */
+ gk20a_writel(g, fifo_intr_runlist_r(), ~0);
+
+ /* TBD: do we need those? */
+ timeout = gk20a_readl(g, fifo_fb_timeout_r());
+ timeout = set_field(timeout, fifo_fb_timeout_period_m(),
+ fifo_fb_timeout_period_max_f());
+ gk20a_writel(g, fifo_fb_timeout_r(), timeout);
+
+ if (tegra_platform_is_silicon()) {
+ timeout = gk20a_readl(g, fifo_pb_timeout_r());
+ timeout &= ~fifo_pb_timeout_detection_enabled_f();
+ gk20a_writel(g, fifo_pb_timeout_r(), timeout);
+ }
+
+ timeout = GRFIFO_TIMEOUT_CHECK_PERIOD_US |
+ fifo_eng_timeout_detection_enabled_f();
+ gk20a_writel(g, fifo_eng_timeout_r(), timeout);
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+}
+
+static void gk20a_init_fifo_pbdma_intr_descs(struct fifo_gk20a *f)
+{
+ /* These are all errors which indicate something really wrong
+ * going on in the device. */
+ f->intr.pbdma.device_fatal_0 =
+ pbdma_intr_0_memreq_pending_f() |
+ pbdma_intr_0_memack_timeout_pending_f() |
+ pbdma_intr_0_memack_extra_pending_f() |
+ pbdma_intr_0_memdat_timeout_pending_f() |
+ pbdma_intr_0_memdat_extra_pending_f() |
+ pbdma_intr_0_memflush_pending_f() |
+ pbdma_intr_0_memop_pending_f() |
+ pbdma_intr_0_lbconnect_pending_f() |
+ pbdma_intr_0_lbreq_pending_f() |
+ pbdma_intr_0_lback_timeout_pending_f() |
+ pbdma_intr_0_lback_extra_pending_f() |
+ pbdma_intr_0_lbdat_timeout_pending_f() |
+ pbdma_intr_0_lbdat_extra_pending_f() |
+ pbdma_intr_0_xbarconnect_pending_f() |
+ pbdma_intr_0_pri_pending_f();
+
+	/* These are data parsing or framing errors, or other errors which
+	 * can be recovered from with intervention... or by just resetting
+	 * the channel. */
+ f->intr.pbdma.channel_fatal_0 =
+ pbdma_intr_0_gpfifo_pending_f() |
+ pbdma_intr_0_gpptr_pending_f() |
+ pbdma_intr_0_gpentry_pending_f() |
+ pbdma_intr_0_gpcrc_pending_f() |
+ pbdma_intr_0_pbptr_pending_f() |
+ pbdma_intr_0_pbentry_pending_f() |
+ pbdma_intr_0_pbcrc_pending_f() |
+ pbdma_intr_0_method_pending_f() |
+ pbdma_intr_0_methodcrc_pending_f() |
+ pbdma_intr_0_pbseg_pending_f() |
+ pbdma_intr_0_signature_pending_f();
+
+ /* Can be used for sw-methods, or represents
+ * a recoverable timeout. */
+ f->intr.pbdma.restartable_0 =
+ pbdma_intr_0_device_pending_f() |
+ pbdma_intr_0_acquire_pending_f();
+}
+
+static int gk20a_init_fifo_setup_sw(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct device *d = dev_from_gk20a(g);
+ int chid, i, err = 0;
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ if (f->sw_ready) {
+ gk20a_dbg_fn("skip init");
+ return 0;
+ }
+
+ f->g = g;
+
+ INIT_WORK(&f->fault_restore_thread,
+ gk20a_fifo_handle_mmu_fault_thread);
+ mutex_init(&f->intr.isr.mutex);
+ gk20a_init_fifo_pbdma_intr_descs(f); /* just filling in data/tables */
+
+ f->num_channels = ccsr_channel__size_1_v();
+ f->num_pbdma = proj_host_num_pbdma_v();
+ f->max_engines = ENGINE_INVAL_GK20A;
+
+ f->userd_entry_size = 1 << ram_userd_base_shift_v();
+ f->userd_total_size = f->userd_entry_size * f->num_channels;
+
+ f->userd.cpuva = dma_alloc_coherent(d,
+ f->userd_total_size,
+ &iova,
+ GFP_KERNEL);
+ if (!f->userd.cpuva) {
+ dev_err(d, "memory allocation failed\n");
+ goto clean_up;
+ }
+
+ f->userd.iova = iova;
+ err = gk20a_get_sgtable(d, &f->userd.sgt,
+ f->userd.cpuva, f->userd.iova,
+ f->userd_total_size);
+ if (err) {
+ dev_err(d, "failed to create sg table\n");
+ goto clean_up;
+ }
+
+ /* bar1 va */
+ f->userd.gpu_va = gk20a_gmmu_map(&g->mm.bar1.vm,
+ &f->userd.sgt,
+ f->userd_total_size,
+ 0, /* flags */
+ gk20a_mem_flag_none);
+ if (!f->userd.gpu_va) {
+ dev_err(d, "gmmu mapping failed\n");
+ goto clean_up;
+ }
+
+ gk20a_dbg(gpu_dbg_map, "userd bar1 va = 0x%llx", f->userd.gpu_va);
+
+ f->userd.size = f->userd_total_size;
+
+ f->channel = kzalloc(f->num_channels * sizeof(*f->channel),
+ GFP_KERNEL);
+ f->pbdma_map = kzalloc(f->num_pbdma * sizeof(*f->pbdma_map),
+ GFP_KERNEL);
+ f->engine_info = kzalloc(f->max_engines * sizeof(*f->engine_info),
+ GFP_KERNEL);
+
+ if (!(f->channel && f->pbdma_map && f->engine_info)) {
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ /* pbdma map needs to be in place before calling engine info init */
+ for (i = 0; i < f->num_pbdma; ++i)
+ f->pbdma_map[i] = gk20a_readl(g, fifo_pbdma_map_r(i));
+
+ init_engine_info(f);
+
+ init_runlist(g, f);
+
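+	/* Carve the contiguous userd block into per-channel slices, recording
+	 * the cpu, SMMU iova and bar1 gpu views of each slice. */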
+ for (chid = 0; chid < f->num_channels; chid++) {
+ f->channel[chid].userd_cpu_va =
+ f->userd.cpuva + chid * f->userd_entry_size;
+ f->channel[chid].userd_iova =
+ NV_MC_SMMU_VADDR_TRANSLATE(f->userd.iova)
+ + chid * f->userd_entry_size;
+ f->channel[chid].userd_gpu_va =
+ f->userd.gpu_va + chid * f->userd_entry_size;
+
+ gk20a_init_channel_support(g, chid);
+ }
+ mutex_init(&f->ch_inuse_mutex);
+
+ f->remove_support = gk20a_remove_fifo_support;
+
+ f->deferred_reset_pending = false;
+ mutex_init(&f->deferred_reset_mutex);
+
+ f->sw_ready = true;
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+clean_up:
+ gk20a_dbg_fn("fail");
+ if (f->userd.gpu_va)
+ gk20a_gmmu_unmap(&g->mm.bar1.vm,
+ f->userd.gpu_va,
+ f->userd.size,
+ gk20a_mem_flag_none);
+ if (f->userd.sgt)
+ gk20a_free_sgtable(&f->userd.sgt);
+ if (f->userd.cpuva)
+ dma_free_coherent(d,
+ f->userd_total_size,
+ f->userd.cpuva,
+ f->userd.iova);
+ f->userd.cpuva = NULL;
+ f->userd.iova = 0;
+
+ memset(&f->userd, 0, sizeof(struct userd_desc));
+
+ kfree(f->channel);
+ f->channel = NULL;
+ kfree(f->pbdma_map);
+ f->pbdma_map = NULL;
+ kfree(f->engine_info);
+ f->engine_info = NULL;
+
+ return err;
+}
+
+static void gk20a_fifo_handle_runlist_event(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_runlist_info_gk20a *runlist;
+ unsigned long runlist_event;
+ u32 runlist_id;
+
+ runlist_event = gk20a_readl(g, fifo_intr_runlist_r());
+ gk20a_writel(g, fifo_intr_runlist_r(), runlist_event);
+
+ for_each_set_bit(runlist_id, &runlist_event, f->max_runlists) {
+ runlist = &f->runlist_info[runlist_id];
+ wake_up(&runlist->runlist_wq);
+ }
+
+}
+
+static int gk20a_init_fifo_setup_hw(struct gk20a *g)
+{
+ struct fifo_gk20a *f = &g->fifo;
+
+ gk20a_dbg_fn("");
+
+ /* test write, read through bar1 @ userd region before
+ * turning on the snooping */
+ {
+ struct fifo_gk20a *f = &g->fifo;
+ u32 v, v1 = 0x33, v2 = 0x55;
+
+ u32 bar1_vaddr = f->userd.gpu_va;
+ volatile u32 *cpu_vaddr = f->userd.cpuva;
+
+ gk20a_dbg_info("test bar1 @ vaddr 0x%x",
+ bar1_vaddr);
+
+ v = gk20a_bar1_readl(g, bar1_vaddr);
+
+ *cpu_vaddr = v1;
+ smp_mb();
+
+ if (v1 != gk20a_bar1_readl(g, bar1_vaddr)) {
+ gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
+ return -EINVAL;
+ }
+
+ gk20a_bar1_writel(g, bar1_vaddr, v2);
+
+ if (v2 != gk20a_bar1_readl(g, bar1_vaddr)) {
+ gk20a_err(dev_from_gk20a(g), "bar1 broken @ gk20a!");
+ return -EINVAL;
+ }
+
+ /* is it visible to the cpu? */
+ if (*cpu_vaddr != v2) {
+ gk20a_err(dev_from_gk20a(g),
+ "cpu didn't see bar1 write @ %p!",
+ cpu_vaddr);
+ }
+
+ /* put it back */
+ gk20a_bar1_writel(g, bar1_vaddr, v);
+ }
+
+ /*XXX all manner of flushes and caching worries, etc */
+
+ /* set the base for the userd region now */
+ gk20a_writel(g, fifo_bar1_base_r(),
+ fifo_bar1_base_ptr_f(f->userd.gpu_va >> 12) |
+ fifo_bar1_base_valid_true_f());
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+}
+
+int gk20a_init_fifo_support(struct gk20a *g)
+{
+ u32 err;
+
+ err = gk20a_init_fifo_setup_sw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_fifo_setup_hw(g);
+ if (err)
+ return err;
+
+ return err;
+}
+
+static struct channel_gk20a *
+channel_from_inst_ptr(struct fifo_gk20a *f, u64 inst_ptr)
+{
+ int ci;
+ if (unlikely(!f->channel))
+ return NULL;
+ for (ci = 0; ci < f->num_channels; ci++) {
+ struct channel_gk20a *c = f->channel+ci;
+ if (c->inst_block.cpuva &&
+ (inst_ptr == c->inst_block.cpu_pa))
+ return f->channel+ci;
+ }
+ return NULL;
+}
+
+/* fault info/descriptions.
+ * tbd: move to setup
+ */
+static const char * const fault_type_descs[] = {
+ "pde", /*fifo_intr_mmu_fault_info_type_pde_v() == 0 */
+ "pde size",
+ "pte",
+ "va limit viol",
+ "unbound inst",
+ "priv viol",
+ "ro viol",
+ "wo viol",
+ "pitch mask",
+ "work creation",
+ "bad aperture",
+ "compression failure",
+ "bad kind",
+ "region viol",
+ "dual ptes",
+ "poisoned",
+};
+/* engine descriptions */
+static const char * const engine_subid_descs[] = {
+ "gpc",
+ "hub",
+};
+
+static const char * const hub_client_descs[] = {
+ "vip", "ce0", "ce1", "dniso", "fe", "fecs", "host", "host cpu",
+ "host cpu nb", "iso", "mmu", "mspdec", "msppp", "msvld",
+ "niso", "p2p", "pd", "perf", "pmu", "raster twod", "scc",
+ "scc nb", "sec", "ssync", "gr copy", "ce2", "xv", "mmu nb",
+ "msenc", "d falcon", "sked", "a falcon", "n/a",
+};
+
+static const char * const gpc_client_descs[] = {
+ "l1 0", "t1 0", "pe 0",
+ "l1 1", "t1 1", "pe 1",
+ "l1 2", "t1 2", "pe 2",
+ "l1 3", "t1 3", "pe 3",
+ "rast", "gcc", "gpccs",
+ "prop 0", "prop 1", "prop 2", "prop 3",
+ "l1 4", "t1 4", "pe 4",
+ "l1 5", "t1 5", "pe 5",
+ "l1 6", "t1 6", "pe 6",
+ "l1 7", "t1 7", "pe 7",
+ "gpm",
+ "ltp utlb 0", "ltp utlb 1", "ltp utlb 2", "ltp utlb 3",
+ "rgg utlb",
+};
+
+/* reads info from hardware and fills in mmu fault info record */
+static inline void get_exception_mmu_fault_info(
+ struct gk20a *g, u32 engine_id,
+ struct fifo_mmu_fault_info_gk20a *f)
+{
+ u32 fault_info_v;
+
+ gk20a_dbg_fn("engine_id %d", engine_id);
+
+ memset(f, 0, sizeof(*f));
+
+ f->fault_info_v = fault_info_v = gk20a_readl(g,
+ fifo_intr_mmu_fault_info_r(engine_id));
+ f->fault_type_v =
+ fifo_intr_mmu_fault_info_type_v(fault_info_v);
+ f->engine_subid_v =
+ fifo_intr_mmu_fault_info_engine_subid_v(fault_info_v);
+ f->client_v = fifo_intr_mmu_fault_info_client_v(fault_info_v);
+
+ BUG_ON(f->fault_type_v >= ARRAY_SIZE(fault_type_descs));
+ f->fault_type_desc = fault_type_descs[f->fault_type_v];
+
+ BUG_ON(f->engine_subid_v >= ARRAY_SIZE(engine_subid_descs));
+ f->engine_subid_desc = engine_subid_descs[f->engine_subid_v];
+
+ if (f->engine_subid_v ==
+ fifo_intr_mmu_fault_info_engine_subid_hub_v()) {
+
+ BUG_ON(f->client_v >= ARRAY_SIZE(hub_client_descs));
+ f->client_desc = hub_client_descs[f->client_v];
+ } else if (f->engine_subid_v ==
+ fifo_intr_mmu_fault_info_engine_subid_gpc_v()) {
+ BUG_ON(f->client_v >= ARRAY_SIZE(gpc_client_descs));
+ f->client_desc = gpc_client_descs[f->client_v];
+ } else {
+ BUG_ON(1);
+ }
+
+ f->fault_hi_v = gk20a_readl(g, fifo_intr_mmu_fault_hi_r(engine_id));
+ f->fault_lo_v = gk20a_readl(g, fifo_intr_mmu_fault_lo_r(engine_id));
+ /* note:ignoring aperture on gk20a... */
+ f->inst_ptr = fifo_intr_mmu_fault_inst_ptr_v(
+ gk20a_readl(g, fifo_intr_mmu_fault_inst_r(engine_id)));
+ /* note: inst_ptr is a 40b phys addr. */
+ f->inst_ptr <<= fifo_intr_mmu_fault_inst_ptr_align_shift_v();
+}
+
+static void gk20a_fifo_reset_engine(struct gk20a *g, u32 engine_id)
+{
+ gk20a_dbg_fn("");
+
+ if (engine_id == top_device_info_type_enum_graphics_v()) {
+		/* resetting the engine using mc_enable_r() is not enough;
+		 * we do the full init sequence */
+ gk20a_gr_reset(g);
+ }
+ if (engine_id == top_device_info_type_enum_copy0_v())
+ gk20a_reset(g, mc_enable_ce2_m());
+}
+
+static void gk20a_fifo_handle_mmu_fault_thread(struct work_struct *work)
+{
+ struct fifo_gk20a *f = container_of(work, struct fifo_gk20a,
+ fault_restore_thread);
+ struct gk20a *g = f->g;
+ int i;
+
+ /* Reinitialise FECS and GR */
+ gk20a_init_pmu_setup_hw2(g);
+
+ /* It is safe to enable ELPG again. */
+ gk20a_pmu_enable_elpg(g);
+
+ /* Restore the runlist */
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ gk20a_fifo_update_runlist_locked(g, i, ~0, true, true);
+
+ /* unlock all runlists */
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ mutex_unlock(&g->fifo.runlist_info[i].mutex);
+
+}
+
+static void gk20a_fifo_handle_chsw_fault(struct gk20a *g)
+{
+ u32 intr;
+
+ intr = gk20a_readl(g, fifo_intr_chsw_error_r());
+ gk20a_err(dev_from_gk20a(g), "chsw: %08x\n", intr);
+ gk20a_fecs_dump_falcon_stats(g);
+ gk20a_writel(g, fifo_intr_chsw_error_r(), intr);
+}
+
+static void gk20a_fifo_handle_dropped_mmu_fault(struct gk20a *g)
+{
+ struct device *dev = dev_from_gk20a(g);
+ u32 fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
+ gk20a_err(dev, "dropped mmu fault (0x%08x)", fault_id);
+}
+
+static bool gk20a_fifo_should_defer_engine_reset(struct gk20a *g, u32 engine_id,
+ struct fifo_mmu_fault_info_gk20a *f, bool fake_fault)
+{
+	/* channel recovery is only deferred if an sm debugger
+	   is attached and MMU debug mode is enabled */
+ if (!gk20a_gr_sm_debugger_attached(g) ||
+ !gk20a_mm_mmu_debug_mode_enabled(g))
+ return false;
+
+ /* if this fault is fake (due to RC recovery), don't defer recovery */
+ if (fake_fault)
+ return false;
+
+ if (engine_id != ENGINE_GR_GK20A ||
+ f->engine_subid_v != fifo_intr_mmu_fault_info_engine_subid_gpc_v())
+ return false;
+
+ return true;
+}
+
+void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
+ unsigned long fault_id) {
+ u32 engine_mmu_id;
+ int i;
+
+ /* reset engines */
+ for_each_set_bit(engine_mmu_id, &fault_id, 32) {
+ u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
+ if (engine_id != ~0)
+ gk20a_fifo_reset_engine(g, engine_id);
+ }
+
+ /* CLEAR the runlists. Do not wait for runlist to start as
+ * some engines may not be available right now */
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ gk20a_fifo_update_runlist_locked(g, i, ~0, false, false);
+
+ /* clear interrupt */
+ gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
+
+ /* resume scheduler */
+ gk20a_writel(g, fifo_error_sched_disable_r(),
+ gk20a_readl(g, fifo_error_sched_disable_r()));
+
+ /* Spawn a work to enable PMU and restore runlists */
+ schedule_work(&g->fifo.fault_restore_thread);
+}
+
+static bool gk20a_fifo_set_ctx_mmu_error(struct gk20a *g,
+ struct channel_gk20a *ch) {
+ bool verbose = true;
+ if (!ch)
+ return verbose;
+
+ gk20a_err(dev_from_gk20a(g),
+ "channel %d generated a mmu fault",
+ ch->hw_chid);
+ if (ch->error_notifier) {
+ u32 err = ch->error_notifier->info32;
+ if (ch->error_notifier->status == 0xffff) {
+			/* If the error code is already set, this mmu fault
+			 * was triggered as part of recovery from another
+			 * error condition.
+			 * Don't overwrite the error flag. */
+ /* Fifo timeout debug spew is controlled by user */
+ if (err == NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT)
+ verbose = ch->timeout_debug_dump;
+ } else {
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_FIFO_ERROR_MMU_ERR_FLT);
+ }
+ }
+ /* mark channel as faulted */
+ ch->has_timedout = true;
+ wmb();
+ /* unblock pending waits */
+ wake_up(&ch->semaphore_wq);
+ wake_up(&ch->notifier_wq);
+ wake_up(&ch->submit_wq);
+ return verbose;
+}
+
+
+static bool gk20a_fifo_handle_mmu_fault(struct gk20a *g)
+{
+ bool fake_fault;
+ unsigned long fault_id;
+ unsigned long engine_mmu_id;
+ int i;
+ bool verbose = true;
+ gk20a_dbg_fn("");
+
+ g->fifo.deferred_reset_pending = false;
+
+ /* Disable ELPG */
+ gk20a_pmu_disable_elpg(g);
+
+ /* If we have recovery in progress, MMU fault id is invalid */
+ if (g->fifo.mmu_fault_engines) {
+ fault_id = g->fifo.mmu_fault_engines;
+ g->fifo.mmu_fault_engines = 0;
+ fake_fault = true;
+ } else {
+ fault_id = gk20a_readl(g, fifo_intr_mmu_fault_id_r());
+ fake_fault = false;
+ gk20a_debug_dump(g->dev);
+ }
+
+	/* lock all runlists. Note that the locks are released in
+	 * gk20a_fifo_handle_mmu_fault_thread() */
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ mutex_lock(&g->fifo.runlist_info[i].mutex);
+
+ /* go through all faulted engines */
+ for_each_set_bit(engine_mmu_id, &fault_id, 32) {
+ /* bits in fifo_intr_mmu_fault_id_r do not correspond 1:1 to
+ * engines. Convert engine_mmu_id to engine_id */
+ u32 engine_id = gk20a_mmu_id_to_engine_id(engine_mmu_id);
+ struct fifo_runlist_info_gk20a *runlist = g->fifo.runlist_info;
+ struct fifo_mmu_fault_info_gk20a f;
+ struct channel_gk20a *ch = NULL;
+
+ get_exception_mmu_fault_info(g, engine_mmu_id, &f);
+ trace_gk20a_mmu_fault(f.fault_hi_v,
+ f.fault_lo_v,
+ f.fault_info_v,
+ f.inst_ptr,
+ engine_id,
+ f.engine_subid_desc,
+ f.client_desc,
+ f.fault_type_desc);
+ gk20a_err(dev_from_gk20a(g), "mmu fault on engine %d, "
+ "engine subid %d (%s), client %d (%s), "
+ "addr 0x%08x:0x%08x, type %d (%s), info 0x%08x,"
+ "inst_ptr 0x%llx\n",
+ engine_id,
+ f.engine_subid_v, f.engine_subid_desc,
+ f.client_v, f.client_desc,
+ f.fault_hi_v, f.fault_lo_v,
+ f.fault_type_v, f.fault_type_desc,
+ f.fault_info_v, f.inst_ptr);
+
+ /* get the channel */
+ if (fake_fault) {
+ /* read and parse engine status */
+ u32 status = gk20a_readl(g,
+ fifo_engine_status_r(engine_id));
+ u32 ctx_status =
+ fifo_engine_status_ctx_status_v(status);
+ bool type_ch = fifo_pbdma_status_id_type_v(status) ==
+ fifo_pbdma_status_id_type_chid_v();
+
+ /* use next_id if context load is failing */
+ u32 id = (ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+ fifo_engine_status_next_id_v(status) :
+ fifo_engine_status_id_v(status);
+
+ if (type_ch) {
+ ch = g->fifo.channel + id;
+ } else {
+ gk20a_err(dev_from_gk20a(g), "non-chid type not supported");
+ WARN_ON(1);
+ }
+ } else {
+ /* read channel based on instruction pointer */
+ ch = channel_from_inst_ptr(&g->fifo, f.inst_ptr);
+ }
+
+ if (ch) {
+ if (ch->in_use) {
+ /* disable the channel from hw and increment
+ * syncpoints */
+ gk20a_disable_channel_no_update(ch);
+
+ /* remove the channel from runlist */
+ clear_bit(ch->hw_chid,
+ runlist->active_channels);
+ }
+
+ /* check if engine reset should be deferred */
+ if (gk20a_fifo_should_defer_engine_reset(g, engine_id, &f, fake_fault)) {
+ g->fifo.mmu_fault_engines = fault_id;
+
+ /* handled during channel free */
+ g->fifo.deferred_reset_pending = true;
+ } else
+ verbose = gk20a_fifo_set_ctx_mmu_error(g, ch);
+
+ } else if (f.inst_ptr ==
+ g->mm.bar1.inst_block.cpu_pa) {
+ gk20a_err(dev_from_gk20a(g), "mmu fault from bar1");
+ } else if (f.inst_ptr ==
+ g->mm.pmu.inst_block.cpu_pa) {
+ gk20a_err(dev_from_gk20a(g), "mmu fault from pmu");
+ } else
+ gk20a_err(dev_from_gk20a(g), "couldn't locate channel for mmu fault");
+ }
+
+ if (g->fifo.deferred_reset_pending) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "sm debugger attached,"
+ " deferring channel recovery to channel free");
+ /* clear interrupt */
+ gk20a_writel(g, fifo_intr_mmu_fault_id_r(), fault_id);
+ return verbose;
+ }
+
+ /* resetting the engines and clearing the runlists is done in
+ a separate function to allow deferred reset. */
+ fifo_gk20a_finish_mmu_fault_handling(g, fault_id);
+ return verbose;
+}
+
+static void gk20a_fifo_get_faulty_channel(struct gk20a *g, int engine_id,
+ u32 *chid, bool *type_ch)
+{
+ u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
+ u32 ctx_status = fifo_engine_status_ctx_status_v(status);
+
+ *type_ch = fifo_pbdma_status_id_type_v(status) ==
+ fifo_pbdma_status_id_type_chid_v();
+ /* use next_id if context load is failing */
+ *chid = (ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+ fifo_engine_status_next_id_v(status) :
+ fifo_engine_status_id_v(status);
+}
+
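+/* Recover the given engines by forcing an mmu fault on them; the fault is
+ * then serviced as a "fake" fault by gk20a_fifo_handle_mmu_fault(), which
+ * picks the engine set up from g->fifo.mmu_fault_engines. */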
+void gk20a_fifo_recover(struct gk20a *g, u32 __engine_ids,
+ bool verbose)
+{
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ unsigned long delay = GR_IDLE_CHECK_DEFAULT;
+ unsigned long engine_id, i;
+ unsigned long _engine_ids = __engine_ids;
+ unsigned long engine_ids = 0;
+ int ret;
+
+ if (verbose)
+ gk20a_debug_dump(g->dev);
+
+ /* store faulted engines in advance */
+ g->fifo.mmu_fault_engines = 0;
+ for_each_set_bit(engine_id, &_engine_ids, 32) {
+ bool ref_type_ch;
+ int ref_chid;
+ gk20a_fifo_get_faulty_channel(g, engine_id, &ref_chid,
+ &ref_type_ch);
+
+		/* Reset *all* engines that use the
+		 * same channel as the faulty engine */
+ for (i = 0; i < g->fifo.max_engines; i++) {
+ bool type_ch;
+ u32 chid;
+ gk20a_fifo_get_faulty_channel(g, i, &chid, &type_ch);
+ if (ref_type_ch == type_ch && ref_chid == chid) {
+ engine_ids |= BIT(i);
+ g->fifo.mmu_fault_engines |=
+ BIT(gk20a_engine_id_to_mmu_id(i));
+ }
+ }
+
+ }
+
+ /* trigger faults for all bad engines */
+ for_each_set_bit(engine_id, &engine_ids, 32) {
+ if (engine_id > g->fifo.max_engines) {
+ WARN_ON(true);
+ break;
+ }
+
+ gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id),
+ fifo_trigger_mmu_fault_id_f(
+ gk20a_engine_id_to_mmu_id(engine_id)) |
+ fifo_trigger_mmu_fault_enable_f(1));
+ }
+
+ /* Wait for MMU fault to trigger */
+ ret = -EBUSY;
+ do {
+ if (gk20a_readl(g, fifo_intr_0_r()) &
+ fifo_intr_0_mmu_fault_pending_f()) {
+ ret = 0;
+ break;
+ }
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+ } while (time_before(jiffies, end_jiffies) ||
+ !tegra_platform_is_silicon());
+
+ if (ret)
+ gk20a_err(dev_from_gk20a(g), "mmu fault timeout");
+
+ /* release mmu fault trigger */
+ for_each_set_bit(engine_id, &engine_ids, 32)
+ gk20a_writel(g, fifo_trigger_mmu_fault_r(engine_id), 0);
+}
+
+
+static bool gk20a_fifo_handle_sched_error(struct gk20a *g)
+{
+ u32 sched_error;
+ u32 engine_id;
+ int id = -1;
+ bool non_chid = false;
+
+ /* read and reset the scheduler error register */
+ sched_error = gk20a_readl(g, fifo_intr_sched_error_r());
+ gk20a_writel(g, fifo_intr_0_r(), fifo_intr_0_sched_error_reset_f());
+
+ for (engine_id = 0; engine_id < g->fifo.max_engines; engine_id++) {
+ u32 status = gk20a_readl(g, fifo_engine_status_r(engine_id));
+ u32 ctx_status = fifo_engine_status_ctx_status_v(status);
+ bool failing_engine;
+
+ /* we are interested in busy engines */
+ failing_engine = fifo_engine_status_engine_v(status) ==
+ fifo_engine_status_engine_busy_v();
+
+ /* ..that are doing context switch */
+ failing_engine = failing_engine &&
+ (ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_switch_v()
+ || ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_save_v()
+ || ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_load_v());
+
+ if (failing_engine) {
+ id = (ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+ fifo_engine_status_next_id_v(status) :
+ fifo_engine_status_id_v(status);
+ non_chid = fifo_pbdma_status_id_type_v(status) !=
+ fifo_pbdma_status_id_type_chid_v();
+ break;
+ }
+ }
+
+ /* could not find the engine - should never happen */
+ if (unlikely(engine_id >= g->fifo.max_engines))
+ goto err;
+
+ if (fifo_intr_sched_error_code_f(sched_error) ==
+ fifo_intr_sched_error_code_ctxsw_timeout_v()) {
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[id];
+
+ if (non_chid) {
+ gk20a_fifo_recover(g, BIT(engine_id), true);
+ goto err;
+ }
+
+ if (gk20a_channel_update_and_check_timeout(ch,
+ GRFIFO_TIMEOUT_CHECK_PERIOD_US / 1000)) {
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+ gk20a_err(dev_from_gk20a(g),
+ "fifo sched ctxsw timeout error:"
+ "engine = %u, ch = %d", engine_id, id);
+ gk20a_fifo_recover(g, BIT(engine_id),
+ ch->timeout_debug_dump);
+ } else {
+ gk20a_warn(dev_from_gk20a(g),
+ "fifo is waiting for ctx switch for %d ms,"
+ "ch = %d\n",
+ ch->timeout_accumulated_ms,
+ id);
+ }
+ return ch->timeout_debug_dump;
+ }
+err:
+ gk20a_err(dev_from_gk20a(g), "fifo sched error : 0x%08x, engine=%u, %s=%d",
+ sched_error, engine_id, non_chid ? "non-ch" : "ch", id);
+
+ return true;
+}
+
+static u32 fifo_error_isr(struct gk20a *g, u32 fifo_intr)
+{
+ bool print_channel_reset_log = false, reset_engine = false;
+ struct device *dev = dev_from_gk20a(g);
+ u32 handled = 0;
+
+ gk20a_dbg_fn("");
+
+ if (fifo_intr & fifo_intr_0_pio_error_pending_f()) {
+ /* pio mode is unused. this shouldn't happen, ever. */
+ /* should we clear it or just leave it pending? */
+ gk20a_err(dev, "fifo pio error!\n");
+ BUG_ON(1);
+ }
+
+ if (fifo_intr & fifo_intr_0_bind_error_pending_f()) {
+ u32 bind_error = gk20a_readl(g, fifo_intr_bind_error_r());
+ gk20a_err(dev, "fifo bind error: 0x%08x", bind_error);
+ print_channel_reset_log = true;
+ handled |= fifo_intr_0_bind_error_pending_f();
+ }
+
+ if (fifo_intr & fifo_intr_0_sched_error_pending_f()) {
+ print_channel_reset_log = gk20a_fifo_handle_sched_error(g);
+ handled |= fifo_intr_0_sched_error_pending_f();
+ }
+
+ if (fifo_intr & fifo_intr_0_chsw_error_pending_f()) {
+ gk20a_fifo_handle_chsw_fault(g);
+ handled |= fifo_intr_0_chsw_error_pending_f();
+ }
+
+ if (fifo_intr & fifo_intr_0_mmu_fault_pending_f()) {
+ print_channel_reset_log = gk20a_fifo_handle_mmu_fault(g);
+ reset_engine = true;
+ handled |= fifo_intr_0_mmu_fault_pending_f();
+ }
+
+ if (fifo_intr & fifo_intr_0_dropped_mmu_fault_pending_f()) {
+ gk20a_fifo_handle_dropped_mmu_fault(g);
+ handled |= fifo_intr_0_dropped_mmu_fault_pending_f();
+ }
+
+ print_channel_reset_log = !g->fifo.deferred_reset_pending
+ && print_channel_reset_log;
+
+ if (print_channel_reset_log) {
+ int engine_id;
+ gk20a_err(dev_from_gk20a(g),
+			"channel reset initiated from %s", __func__);
+ for (engine_id = 0;
+ engine_id < g->fifo.max_engines;
+ engine_id++) {
+ gk20a_dbg_fn("enum:%d -> engine_id:%d", engine_id,
+ g->fifo.engine_info[engine_id].engine_id);
+ fifo_pbdma_exception_status(g,
+ &g->fifo.engine_info[engine_id]);
+ fifo_engine_exception_status(g,
+ &g->fifo.engine_info[engine_id]);
+ }
+ }
+
+ return handled;
+}
+
+
+static u32 gk20a_fifo_handle_pbdma_intr(struct device *dev,
+ struct gk20a *g,
+ struct fifo_gk20a *f,
+ u32 pbdma_id)
+{
+ u32 pbdma_intr_0 = gk20a_readl(g, pbdma_intr_0_r(pbdma_id));
+ u32 pbdma_intr_1 = gk20a_readl(g, pbdma_intr_1_r(pbdma_id));
+ u32 handled = 0;
+ bool reset_device = false;
+ bool reset_channel = false;
+
+ gk20a_dbg_fn("");
+
+ gk20a_dbg(gpu_dbg_intr, "pbdma id intr pending %d %08x %08x", pbdma_id,
+ pbdma_intr_0, pbdma_intr_1);
+ if (pbdma_intr_0) {
+ if (f->intr.pbdma.device_fatal_0 & pbdma_intr_0) {
+ dev_err(dev, "unrecoverable device error: "
+ "pbdma_intr_0(%d):0x%08x", pbdma_id, pbdma_intr_0);
+ reset_device = true;
+ /* TODO: disable pbdma intrs */
+ handled |= f->intr.pbdma.device_fatal_0 & pbdma_intr_0;
+ }
+ if (f->intr.pbdma.channel_fatal_0 & pbdma_intr_0) {
+ dev_warn(dev, "channel error: "
+ "pbdma_intr_0(%d):0x%08x", pbdma_id, pbdma_intr_0);
+ reset_channel = true;
+ /* TODO: clear pbdma channel errors */
+ handled |= f->intr.pbdma.channel_fatal_0 & pbdma_intr_0;
+ }
+ if (f->intr.pbdma.restartable_0 & pbdma_intr_0) {
+ dev_warn(dev, "sw method: %08x %08x",
+ gk20a_readl(g, pbdma_method0_r(0)),
+ gk20a_readl(g, pbdma_method0_r(0)+4));
+ gk20a_writel(g, pbdma_method0_r(0), 0);
+ gk20a_writel(g, pbdma_method0_r(0)+4, 0);
+ handled |= f->intr.pbdma.restartable_0 & pbdma_intr_0;
+ }
+
+ gk20a_writel(g, pbdma_intr_0_r(pbdma_id), pbdma_intr_0);
+ }
+
+ /* all intrs in _intr_1 are "host copy engine" related,
+ * which gk20a doesn't have. for now just make them channel fatal. */
+ if (pbdma_intr_1) {
+ dev_err(dev, "channel hce error: pbdma_intr_1(%d): 0x%08x",
+ pbdma_id, pbdma_intr_1);
+ reset_channel = true;
+ gk20a_writel(g, pbdma_intr_1_r(pbdma_id), pbdma_intr_1);
+ }
+
+ return handled;
+}
+
+static u32 fifo_channel_isr(struct gk20a *g, u32 fifo_intr)
+{
+ gk20a_channel_semaphore_wakeup(g);
+ return fifo_intr_0_channel_intr_pending_f();
+}
+
+
+static u32 fifo_pbdma_isr(struct gk20a *g, u32 fifo_intr)
+{
+ struct device *dev = dev_from_gk20a(g);
+ struct fifo_gk20a *f = &g->fifo;
+ u32 clear_intr = 0, i;
+ u32 pbdma_pending = gk20a_readl(g, fifo_intr_pbdma_id_r());
+
+ for (i = 0; i < fifo_intr_pbdma_id_status__size_1_v(); i++) {
+ if (fifo_intr_pbdma_id_status_f(pbdma_pending, i)) {
+ gk20a_dbg(gpu_dbg_intr, "pbdma id %d intr pending", i);
+ clear_intr |=
+ gk20a_fifo_handle_pbdma_intr(dev, g, f, i);
+ }
+ }
+ return fifo_intr_0_pbdma_intr_pending_f();
+}
+
+void gk20a_fifo_isr(struct gk20a *g)
+{
+ u32 error_intr_mask =
+ fifo_intr_0_bind_error_pending_f() |
+ fifo_intr_0_sched_error_pending_f() |
+ fifo_intr_0_chsw_error_pending_f() |
+ fifo_intr_0_fb_flush_timeout_pending_f() |
+ fifo_intr_0_dropped_mmu_fault_pending_f() |
+ fifo_intr_0_mmu_fault_pending_f() |
+ fifo_intr_0_lb_error_pending_f() |
+ fifo_intr_0_pio_error_pending_f();
+
+ u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
+ u32 clear_intr = 0;
+
+ /* note we're not actually in an "isr", but rather
+ * in a threaded interrupt context... */
+ mutex_lock(&g->fifo.intr.isr.mutex);
+
+ gk20a_dbg(gpu_dbg_intr, "fifo isr %08x\n", fifo_intr);
+
+ /* handle runlist update */
+ if (fifo_intr & fifo_intr_0_runlist_event_pending_f()) {
+ gk20a_fifo_handle_runlist_event(g);
+ clear_intr |= fifo_intr_0_runlist_event_pending_f();
+ }
+ if (fifo_intr & fifo_intr_0_pbdma_intr_pending_f())
+ clear_intr |= fifo_pbdma_isr(g, fifo_intr);
+
+ if (unlikely(fifo_intr & error_intr_mask))
+ clear_intr = fifo_error_isr(g, fifo_intr);
+
+ gk20a_writel(g, fifo_intr_0_r(), clear_intr);
+
+ mutex_unlock(&g->fifo.intr.isr.mutex);
+
+ return;
+}
+
+void gk20a_fifo_nonstall_isr(struct gk20a *g)
+{
+ u32 fifo_intr = gk20a_readl(g, fifo_intr_0_r());
+ u32 clear_intr = 0;
+
+ gk20a_dbg(gpu_dbg_intr, "fifo nonstall isr %08x\n", fifo_intr);
+
+ if (fifo_intr & fifo_intr_0_channel_intr_pending_f())
+ clear_intr |= fifo_channel_isr(g, fifo_intr);
+
+ gk20a_writel(g, fifo_intr_0_r(), clear_intr);
+
+ return;
+}
+
+int gk20a_fifo_preempt_channel(struct gk20a *g, u32 hw_chid)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ unsigned long end_jiffies = jiffies
+ + msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 delay = GR_IDLE_CHECK_DEFAULT;
+ u32 ret = 0;
+ u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+ u32 elpg_off = 0;
+ u32 i;
+
+ gk20a_dbg_fn("%d", hw_chid);
+
+ /* we have no idea which runlist we are using. lock all */
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ mutex_lock(&f->runlist_info[i].mutex);
+
+ /* disable elpg if failed to acquire pmu mutex */
+ elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+ if (elpg_off)
+ gk20a_pmu_disable_elpg(g);
+
+ /* issue preempt */
+ gk20a_writel(g, fifo_preempt_r(),
+ fifo_preempt_chid_f(hw_chid) |
+ fifo_preempt_type_channel_f());
+
+ /* wait for preempt */
+ ret = -EBUSY;
+ do {
+ if (!(gk20a_readl(g, fifo_preempt_r()) &
+ fifo_preempt_pending_true_f())) {
+ ret = 0;
+ break;
+ }
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+ } while (time_before(jiffies, end_jiffies) ||
+ !tegra_platform_is_silicon());
+
+ if (ret) {
+ int i;
+ u32 engines = 0;
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[hw_chid];
+
+ gk20a_err(dev_from_gk20a(g), "preempt channel %d timeout\n",
+ hw_chid);
+
+ /* forcefully reset all busy engines using this channel */
+ for (i = 0; i < g->fifo.max_engines; i++) {
+ u32 status = gk20a_readl(g, fifo_engine_status_r(i));
+ u32 ctx_status =
+ fifo_engine_status_ctx_status_v(status);
+ bool type_ch = fifo_pbdma_status_id_type_v(status) ==
+ fifo_pbdma_status_id_type_chid_v();
+ bool busy = fifo_engine_status_engine_v(status) ==
+ fifo_engine_status_engine_busy_v();
+ u32 id = (ctx_status ==
+ fifo_engine_status_ctx_status_ctxsw_load_v()) ?
+ fifo_engine_status_next_id_v(status) :
+ fifo_engine_status_id_v(status);
+
+ if (type_ch && busy && id == hw_chid)
+ engines |= BIT(i);
+ }
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+ gk20a_fifo_recover(g, engines, true);
+ }
+
+ /* re-enable elpg or release pmu mutex */
+ if (elpg_off)
+ gk20a_pmu_enable_elpg(g);
+ else
+ pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+ for (i = 0; i < g->fifo.max_runlists; i++)
+ mutex_unlock(&f->runlist_info[i].mutex);
+
+ return ret;
+}
+
+int gk20a_fifo_enable_engine_activity(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info)
+{
+ u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+ u32 elpg_off;
+ u32 enable;
+
+ gk20a_dbg_fn("");
+
+ /* disable elpg if failed to acquire pmu mutex */
+ elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+ if (elpg_off)
+ gk20a_pmu_disable_elpg(g);
+
+ enable = gk20a_readl(g, fifo_sched_disable_r());
+ enable &= ~(fifo_sched_disable_true_v() >> eng_info->runlist_id);
+ gk20a_writel(g, fifo_sched_disable_r(), enable);
+
+ /* re-enable elpg or release pmu mutex */
+ if (elpg_off)
+ gk20a_pmu_enable_elpg(g);
+ else
+ pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+int gk20a_fifo_disable_engine_activity(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info,
+ bool wait_for_idle)
+{
+ u32 gr_stat, pbdma_stat, chan_stat, eng_stat, ctx_stat;
+ u32 pbdma_chid = ~0, engine_chid = ~0, disable;
+ u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+ u32 elpg_off;
+ u32 err = 0;
+
+ gk20a_dbg_fn("");
+
+ gr_stat =
+ gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
+ if (fifo_engine_status_engine_v(gr_stat) ==
+ fifo_engine_status_engine_busy_v() && !wait_for_idle)
+ return -EBUSY;
+
+ /* disable elpg if failed to acquire pmu mutex */
+ elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+ if (elpg_off)
+ gk20a_pmu_disable_elpg(g);
+
+ disable = gk20a_readl(g, fifo_sched_disable_r());
+ disable = set_field(disable,
+ fifo_sched_disable_runlist_m(eng_info->runlist_id),
+ fifo_sched_disable_runlist_f(fifo_sched_disable_true_v(),
+ eng_info->runlist_id));
+ gk20a_writel(g, fifo_sched_disable_r(), disable);
+
+ /* chid from pbdma status */
+ pbdma_stat = gk20a_readl(g, fifo_pbdma_status_r(eng_info->pbdma_id));
+ chan_stat = fifo_pbdma_status_chan_status_v(pbdma_stat);
+ if (chan_stat == fifo_pbdma_status_chan_status_valid_v() ||
+ chan_stat == fifo_pbdma_status_chan_status_chsw_save_v())
+ pbdma_chid = fifo_pbdma_status_id_v(pbdma_stat);
+ else if (chan_stat == fifo_pbdma_status_chan_status_chsw_load_v() ||
+ chan_stat == fifo_pbdma_status_chan_status_chsw_switch_v())
+ pbdma_chid = fifo_pbdma_status_next_id_v(pbdma_stat);
+
+ if (pbdma_chid != ~0) {
+ err = gk20a_fifo_preempt_channel(g, pbdma_chid);
+ if (err)
+ goto clean_up;
+ }
+
+ /* chid from engine status */
+ eng_stat = gk20a_readl(g, fifo_engine_status_r(eng_info->engine_id));
+ ctx_stat = fifo_engine_status_ctx_status_v(eng_stat);
+ if (ctx_stat == fifo_engine_status_ctx_status_valid_v() ||
+ ctx_stat == fifo_engine_status_ctx_status_ctxsw_save_v())
+ engine_chid = fifo_engine_status_id_v(eng_stat);
+ else if (ctx_stat == fifo_engine_status_ctx_status_ctxsw_load_v() ||
+ ctx_stat == fifo_engine_status_ctx_status_ctxsw_switch_v())
+ engine_chid = fifo_engine_status_next_id_v(eng_stat);
+
+ if (engine_chid != ~0 && engine_chid != pbdma_chid) {
+ err = gk20a_fifo_preempt_channel(g, engine_chid);
+ if (err)
+ goto clean_up;
+ }
+
+clean_up:
+ /* re-enable elpg or release pmu mutex */
+ if (elpg_off)
+ gk20a_pmu_enable_elpg(g);
+ else
+ pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+ if (err) {
+ gk20a_dbg_fn("failed");
+ if (gk20a_fifo_enable_engine_activity(g, eng_info))
+ gk20a_err(dev_from_gk20a(g),
+ "failed to enable gr engine activity\n");
+ } else {
+ gk20a_dbg_fn("done");
+ }
+ return err;
+}
+
+static void gk20a_fifo_runlist_reset_engines(struct gk20a *g, u32 runlist_id)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ u32 engines = 0;
+ int i;
+
+ for (i = 0; i < f->max_engines; i++) {
+ u32 status = gk20a_readl(g, fifo_engine_status_r(i));
+ bool engine_busy = fifo_engine_status_engine_v(status) ==
+ fifo_engine_status_engine_busy_v();
+
+ if (engine_busy &&
+ (f->engine_info[i].runlist_id == runlist_id))
+ engines |= BIT(i);
+ }
+ gk20a_fifo_recover(g, engines, true);
+}
+
+static int gk20a_fifo_runlist_wait_pending(struct gk20a *g, u32 runlist_id)
+{
+ struct fifo_runlist_info_gk20a *runlist;
+ u32 remain;
+ bool pending;
+
+ runlist = &g->fifo.runlist_info[runlist_id];
+ remain = wait_event_timeout(runlist->runlist_wq,
+ ((pending = gk20a_readl(g, fifo_eng_runlist_r(runlist_id)) &
+ fifo_eng_runlist_pending_true_f()) == 0),
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
+
+ if (remain == 0 && pending != 0)
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static int gk20a_fifo_update_runlist_locked(struct gk20a *g, u32 runlist_id,
+ u32 hw_chid, bool add,
+ bool wait_for_finish)
+{
+ u32 ret = 0;
+ struct device *d = dev_from_gk20a(g);
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_runlist_info_gk20a *runlist = NULL;
+ u32 *runlist_entry_base = NULL;
+ u32 *runlist_entry = NULL;
+ phys_addr_t runlist_pa;
+ u32 old_buf, new_buf;
+ u32 chid;
+ u32 count = 0;
+ runlist = &f->runlist_info[runlist_id];
+
+ /* valid channel, add/remove it from active list.
+ Otherwise, keep active list untouched for suspend/resume. */
+ if (hw_chid != ~0) {
+ if (add) {
+ if (test_and_set_bit(hw_chid,
+ runlist->active_channels) == 1)
+ return 0;
+ } else {
+ if (test_and_clear_bit(hw_chid,
+ runlist->active_channels) == 0)
+ return 0;
+ }
+ }
+
+ old_buf = runlist->cur_buffer;
+ new_buf = !runlist->cur_buffer;
+
+ gk20a_dbg_info("runlist_id : %d, switch to new buffer 0x%16llx",
+ runlist_id, runlist->mem[new_buf].iova);
+
+ runlist_pa = gk20a_get_phys_from_iova(d, runlist->mem[new_buf].iova);
+ if (!runlist_pa) {
+ ret = -EINVAL;
+ goto clean_up;
+ }
+
+ runlist_entry_base = runlist->mem[new_buf].cpuva;
+ if (!runlist_entry_base) {
+ ret = -ENOMEM;
+ goto clean_up;
+ }
+
+ if (hw_chid != ~0 || /* add/remove a valid channel */
+ add /* resume to add all channels back */) {
+ runlist_entry = runlist_entry_base;
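+		/* each runlist entry is two u32s: the channel id followed
+		 * by a zero word */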
+ for_each_set_bit(chid,
+ runlist->active_channels, f->num_channels) {
+ gk20a_dbg_info("add channel %d to runlist", chid);
+ runlist_entry[0] = chid;
+ runlist_entry[1] = 0;
+ runlist_entry += 2;
+ count++;
+ }
+ } else /* suspend to remove all channels */
+ count = 0;
+
+ if (count != 0) {
+ gk20a_writel(g, fifo_runlist_base_r(),
+ fifo_runlist_base_ptr_f(u64_lo32(runlist_pa >> 12)) |
+ fifo_runlist_base_target_vid_mem_f());
+ }
+
+ gk20a_writel(g, fifo_runlist_r(),
+ fifo_runlist_engine_f(runlist_id) |
+ fifo_eng_runlist_length_f(count));
+
+ if (wait_for_finish) {
+ ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
+
+ if (ret == -ETIMEDOUT) {
+ gk20a_err(dev_from_gk20a(g),
+ "runlist update timeout");
+
+ gk20a_fifo_runlist_reset_engines(g, runlist_id);
+
+ /* engine reset needs the lock. drop it */
+ mutex_unlock(&runlist->mutex);
+ /* wait until the runlist is active again */
+ ret = gk20a_fifo_runlist_wait_pending(g, runlist_id);
+			/* get the lock back. at this point everything
+			 * should be fine */
+ mutex_lock(&runlist->mutex);
+
+ if (ret)
+ gk20a_err(dev_from_gk20a(g),
+ "runlist update failed: %d", ret);
+ } else if (ret == -EINTR)
+ gk20a_err(dev_from_gk20a(g),
+ "runlist update interrupted");
+ }
+
+ runlist->cur_buffer = new_buf;
+
+clean_up:
+ return ret;
+}
+
+/* add/remove a channel from runlist
+ special cases below: runlist->active_channels will NOT be changed.
+ (hw_chid == ~0 && !add) means remove all active channels from runlist.
+ (hw_chid == ~0 && add) means restore all active channels on runlist. */
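+/* Illustrative call patterns (the runlist id 0 below is used purely for
+ * the example):
+ *   gk20a_fifo_update_runlist(g, 0, ch->hw_chid, true, true);   add a channel
+ *   gk20a_fifo_update_runlist(g, 0, ch->hw_chid, false, true);  remove a channel
+ *   gk20a_fifo_update_runlist(g, 0, ~0, false, false);          clear the runlist
+ *   gk20a_fifo_update_runlist(g, 0, ~0, true, true);            restore all channels */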
+int gk20a_fifo_update_runlist(struct gk20a *g, u32 runlist_id, u32 hw_chid,
+ bool add, bool wait_for_finish)
+{
+ struct fifo_runlist_info_gk20a *runlist = NULL;
+ struct fifo_gk20a *f = &g->fifo;
+ u32 token = PMU_INVALID_MUTEX_OWNER_ID;
+ u32 elpg_off;
+ u32 ret = 0;
+
+ runlist = &f->runlist_info[runlist_id];
+
+ mutex_lock(&runlist->mutex);
+
+ /* disable elpg if failed to acquire pmu mutex */
+ elpg_off = pmu_mutex_acquire(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+ if (elpg_off)
+ gk20a_pmu_disable_elpg(g);
+
+ ret = gk20a_fifo_update_runlist_locked(g, runlist_id, hw_chid, add,
+ wait_for_finish);
+
+ /* re-enable elpg or release pmu mutex */
+ if (elpg_off)
+ gk20a_pmu_enable_elpg(g);
+ else
+ pmu_mutex_release(&g->pmu, PMU_MUTEX_ID_FIFO, &token);
+
+ mutex_unlock(&runlist->mutex);
+ return ret;
+}
+
+int gk20a_fifo_suspend(struct gk20a *g)
+{
+ gk20a_dbg_fn("");
+
+ /* stop bar1 snooping */
+ gk20a_writel(g, fifo_bar1_base_r(),
+ fifo_bar1_base_valid_false_f());
+
+ /* disable fifo intr */
+ gk20a_writel(g, fifo_intr_en_0_r(), 0);
+ gk20a_writel(g, fifo_intr_en_1_r(), 0);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+bool gk20a_fifo_mmu_fault_pending(struct gk20a *g)
+{
+ if (gk20a_readl(g, fifo_intr_0_r()) &
+ fifo_intr_0_mmu_fault_pending_f())
+ return true;
+ else
+ return false;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
new file mode 100644
index 000000000000..051acda23bcb
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/fifo_gk20a.h
@@ -0,0 +1,164 @@
+/*
+ * drivers/video/tegra/host/gk20a/fifo_gk20a.h
+ *
+ * GK20A graphics fifo (gr host)
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __FIFO_GK20A_H__
+#define __FIFO_GK20A_H__
+
+#include "channel_gk20a.h"
+
+#define MAX_RUNLIST_BUFFERS 2
+
+/* generally corresponds to the "pbdma" engine */
+
+struct fifo_runlist_info_gk20a {
+ unsigned long *active_channels;
+ /* Each engine has its own SW and HW runlist buffer.*/
+ struct runlist_mem_desc mem[MAX_RUNLIST_BUFFERS];
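+	/* index of the buffer currently handed to hw, or MAX_RUNLIST_BUFFERS
+	 * if none has been pinned yet */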
+ u32 cur_buffer;
+ u32 total_entries;
+ bool stopped;
+ bool support_tsg;
+	struct mutex mutex; /* protect channel preempt and runlist update */
+ wait_queue_head_t runlist_wq;
+};
+
+/* so far gk20a has two engines: gr and ce2(gr_copy) */
+enum {
+ ENGINE_GR_GK20A = 0,
+ ENGINE_CE2_GK20A = 1,
+ ENGINE_INVAL_GK20A
+};
+
+struct fifo_pbdma_exception_info_gk20a {
+ u32 status_r; /* raw register value from hardware */
+ u32 id, next_id;
+ u32 chan_status_v; /* raw value from hardware */
+ bool id_is_chid, next_id_is_chid;
+ bool chsw_in_progress;
+};
+
+struct fifo_engine_exception_info_gk20a {
+ u32 status_r; /* raw register value from hardware */
+ u32 id, next_id;
+ u32 ctx_status_v; /* raw value from hardware */
+ bool id_is_chid, next_id_is_chid;
+ bool faulted, idle, ctxsw_in_progress;
+};
+
+struct fifo_mmu_fault_info_gk20a {
+ u32 fault_info_v;
+ u32 fault_type_v;
+ u32 engine_subid_v;
+ u32 client_v;
+ u32 fault_hi_v;
+ u32 fault_lo_v;
+ u64 inst_ptr;
+ const char *fault_type_desc;
+ const char *engine_subid_desc;
+ const char *client_desc;
+};
+
+struct fifo_engine_info_gk20a {
+ u32 sw_id;
+ const char *name;
+ u32 dev_info_id;
+ u32 engine_id;
+ u32 runlist_id;
+ u32 pbdma_id;
+ u32 mmu_fault_id;
+ u32 rc_mask;
+ struct fifo_pbdma_exception_info_gk20a pbdma_exception_info;
+ struct fifo_engine_exception_info_gk20a engine_exception_info;
+ struct fifo_mmu_fault_info_gk20a mmu_fault_info;
+
+};
+
+struct fifo_gk20a {
+ struct gk20a *g;
+ int num_channels;
+
+ int num_pbdma;
+ u32 *pbdma_map;
+
+ struct fifo_engine_info_gk20a *engine_info;
+ u32 max_engines;
+ u32 num_engines;
+
+ struct fifo_runlist_info_gk20a *runlist_info;
+ u32 max_runlists;
+
+ struct userd_desc userd;
+ u32 userd_entry_size;
+ u32 userd_total_size;
+
+ struct channel_gk20a *channel;
+ struct mutex ch_inuse_mutex; /* protect unused chid look up */
+
+ void (*remove_support)(struct fifo_gk20a *);
+ bool sw_ready;
+ struct {
+ /* share info between isrs and non-isr code */
+ struct {
+ struct mutex mutex;
+ } isr;
+ struct {
+ u32 device_fatal_0;
+ u32 channel_fatal_0;
+ u32 restartable_0;
+ } pbdma;
+ struct {
+
+ } engine;
+
+
+ } intr;
+
+ u32 mmu_fault_engines;
+ bool deferred_reset_pending;
+ struct mutex deferred_reset_mutex;
+
+ struct work_struct fault_restore_thread;
+};
+
+int gk20a_init_fifo_support(struct gk20a *g);
+
+void gk20a_fifo_isr(struct gk20a *g);
+void gk20a_fifo_nonstall_isr(struct gk20a *g);
+
+int gk20a_fifo_preempt_channel(struct gk20a *g, u32 hw_chid);
+
+int gk20a_fifo_enable_engine_activity(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info);
+int gk20a_fifo_disable_engine_activity(struct gk20a *g,
+ struct fifo_engine_info_gk20a *eng_info,
+ bool wait_for_idle);
+
+int gk20a_fifo_update_runlist(struct gk20a *g, u32 engine_id, u32 hw_chid,
+ bool add, bool wait_for_finish);
+
+int gk20a_fifo_suspend(struct gk20a *g);
+
+bool gk20a_fifo_mmu_fault_pending(struct gk20a *g);
+void gk20a_fifo_recover(struct gk20a *g, u32 engine_ids, bool verbose);
+int gk20a_init_fifo_reset_enable_hw(struct gk20a *g);
+
+void fifo_gk20a_finish_mmu_fault_handling(struct gk20a *g,
+ unsigned long fault_id);
+#endif /*__FIFO_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.c b/drivers/gpu/nvgpu/gk20a/gk20a.c
new file mode 100644
index 000000000000..4cc500dee6f2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.c
@@ -0,0 +1,1681 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gk20a.h>
+
+#include <linux/dma-mapping.h>
+#include <linux/highmem.h>
+#include <linux/string.h>
+#include <linux/cdev.h>
+#include <linux/delay.h>
+#include <linux/firmware.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/file.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#include <linux/pm_runtime.h>
+#include <linux/thermal.h>
+#include <asm/cacheflush.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <linux/tegra-powergate.h>
+
+#include <linux/sched.h>
+#include <linux/input-cfboost.h>
+
+#include <mach/pm_domains.h>
+
+#include "gk20a.h"
+#include "debug_gk20a.h"
+#include "ctrl_gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_timer_gk20a.h"
+#include "hw_bus_gk20a.h"
+#include "hw_sim_gk20a.h"
+#include "hw_top_gk20a.h"
+#include "hw_ltc_gk20a.h"
+#include "gk20a_scale.h"
+#include "dbg_gpu_gk20a.h"
+#include "hal.h"
+
+#ifdef CONFIG_ARM64
+#define __cpuc_flush_dcache_area __flush_dcache_area
+#endif
+
+#define CLASS_NAME "nvidia-gpu"
+/* TODO: Change to e.g. "nvidia-gpu%s" once we have symlinks in place. */
+#define INTERFACE_NAME "nvhost%s-gpu"
+
+#define GK20A_NUM_CDEVS 5
+
+#if defined(GK20A_DEBUG)
+u32 gk20a_dbg_mask = GK20A_DEFAULT_DBG_MASK;
+u32 gk20a_dbg_ftrace;
+#endif
+
+static int gk20a_pm_finalize_poweron(struct device *dev);
+static int gk20a_pm_prepare_poweroff(struct device *dev);
+
+static inline void set_gk20a(struct platform_device *dev, struct gk20a *gk20a)
+{
+ gk20a_get_platform(dev)->g = gk20a;
+}
+
+static const struct file_operations gk20a_channel_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_channel_release,
+ .open = gk20a_channel_open,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_channel_ioctl,
+#endif
+ .unlocked_ioctl = gk20a_channel_ioctl,
+};
+
+static const struct file_operations gk20a_ctrl_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_ctrl_dev_release,
+ .open = gk20a_ctrl_dev_open,
+ .unlocked_ioctl = gk20a_ctrl_dev_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_ctrl_dev_ioctl,
+#endif
+};
+
+static const struct file_operations gk20a_dbg_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_dbg_gpu_dev_release,
+ .open = gk20a_dbg_gpu_dev_open,
+ .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
+ .poll = gk20a_dbg_gpu_dev_poll,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
+#endif
+};
+
+static const struct file_operations gk20a_as_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_as_dev_release,
+ .open = gk20a_as_dev_open,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_as_dev_ioctl,
+#endif
+ .unlocked_ioctl = gk20a_as_dev_ioctl,
+};
+
+/*
+ * Note: We use a different 'open' to trigger handling of the profiler session.
+ * Most of the code is shared between them... Though, at some point if the
+ * code does get too tangled trying to handle each in the same path we can
+ * separate them cleanly.
+ */
+static const struct file_operations gk20a_prof_ops = {
+ .owner = THIS_MODULE,
+ .release = gk20a_dbg_gpu_dev_release,
+ .open = gk20a_prof_gpu_dev_open,
+ .unlocked_ioctl = gk20a_dbg_gpu_dev_ioctl,
+ /* .mmap = gk20a_prof_gpu_dev_mmap,*/
+ /*int (*mmap) (struct file *, struct vm_area_struct *);*/
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = gk20a_dbg_gpu_dev_ioctl,
+#endif
+};
+
+static inline void sim_writel(struct gk20a *g, u32 r, u32 v)
+{
+ writel(v, g->sim.regs+r);
+}
+
+static inline u32 sim_readl(struct gk20a *g, u32 r)
+{
+ return readl(g->sim.regs+r);
+}
+
+static void kunmap_and_free_iopage(void **kvaddr, struct page **page)
+{
+ if (*kvaddr) {
+ kunmap(*kvaddr);
+ *kvaddr = 0;
+ }
+ if (*page) {
+ __free_page(*page);
+ *page = 0;
+ }
+}
+
+static void gk20a_free_sim_support(struct gk20a *g)
+{
+ /* free sim mappings, bfrs */
+ kunmap_and_free_iopage(&g->sim.send_bfr.kvaddr,
+ &g->sim.send_bfr.page);
+
+ kunmap_and_free_iopage(&g->sim.recv_bfr.kvaddr,
+ &g->sim.recv_bfr.page);
+
+ kunmap_and_free_iopage(&g->sim.msg_bfr.kvaddr,
+ &g->sim.msg_bfr.page);
+}
+
+static void gk20a_remove_sim_support(struct sim_gk20a *s)
+{
+ struct gk20a *g = s->g;
+ if (g->sim.regs)
+ sim_writel(g, sim_config_r(), sim_config_mode_disabled_v());
+ gk20a_free_sim_support(g);
+}
+
+static int alloc_and_kmap_iopage(struct device *d,
+ void **kvaddr,
+ phys_addr_t *phys,
+ struct page **page)
+{
+ int err = 0;
+ *page = alloc_page(GFP_KERNEL);
+
+ if (!*page) {
+ err = -ENOMEM;
+ dev_err(d, "couldn't allocate io page\n");
+ goto fail;
+ }
+
+ *kvaddr = kmap(*page);
+ if (!*kvaddr) {
+ err = -ENOMEM;
+ dev_err(d, "couldn't kmap io page\n");
+ goto fail;
+ }
+ *phys = page_to_phys(*page);
+ return 0;
+
+ fail:
+ kunmap_and_free_iopage(kvaddr, page);
+ return err;
+
+}
+
+static void __iomem *gk20a_ioremap_resource(struct platform_device *dev, int i,
+ struct resource **out)
+{
+ struct resource *r = platform_get_resource(dev, IORESOURCE_MEM, i);
+ if (!r)
+ return NULL;
+ if (out)
+ *out = r;
+ return devm_request_and_ioremap(&dev->dev, r);
+}
+
+/* TBD: strip from released */
+static int gk20a_init_sim_support(struct platform_device *dev)
+{
+ int err = 0;
+ struct gk20a *g = get_gk20a(dev);
+ struct device *d = &dev->dev;
+ phys_addr_t phys;
+
+ g->sim.g = g;
+ g->sim.regs = gk20a_ioremap_resource(dev, GK20A_SIM_IORESOURCE_MEM,
+ &g->sim.reg_mem);
+ if (!g->sim.regs) {
+ dev_err(d, "failed to remap gk20a sim regs\n");
+ err = -ENXIO;
+ goto fail;
+ }
+
+ /* allocate sim event/msg buffers */
+ err = alloc_and_kmap_iopage(d, &g->sim.send_bfr.kvaddr,
+ &g->sim.send_bfr.phys,
+ &g->sim.send_bfr.page);
+
+ err = err || alloc_and_kmap_iopage(d, &g->sim.recv_bfr.kvaddr,
+ &g->sim.recv_bfr.phys,
+ &g->sim.recv_bfr.page);
+
+ err = err || alloc_and_kmap_iopage(d, &g->sim.msg_bfr.kvaddr,
+ &g->sim.msg_bfr.phys,
+ &g->sim.msg_bfr.page);
+
+ if (!(g->sim.send_bfr.kvaddr && g->sim.recv_bfr.kvaddr &&
+ g->sim.msg_bfr.kvaddr)) {
+ dev_err(d, "couldn't allocate all sim buffers\n");
+ goto fail;
+ }
+
+ /*mark send ring invalid*/
+ sim_writel(g, sim_send_ring_r(), sim_send_ring_status_invalid_f());
+
+ /*read get pointer and make equal to put*/
+ g->sim.send_ring_put = sim_readl(g, sim_send_get_r());
+ sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
+
+ /*write send ring address and make it valid*/
+ /*TBD: work for >32b physmem*/
+ phys = g->sim.send_bfr.phys;
+ sim_writel(g, sim_send_ring_hi_r(), 0);
+ sim_writel(g, sim_send_ring_r(),
+ sim_send_ring_status_valid_f() |
+ sim_send_ring_target_phys_pci_coherent_f() |
+ sim_send_ring_size_4kb_f() |
+ sim_send_ring_addr_lo_f(phys >> PAGE_SHIFT));
+
+ /*repeat for recv ring (but swap put,get as roles are opposite) */
+ sim_writel(g, sim_recv_ring_r(), sim_recv_ring_status_invalid_f());
+
+ /*read put pointer and make equal to get*/
+ g->sim.recv_ring_get = sim_readl(g, sim_recv_put_r());
+ sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
+
+	/*write recv ring address and make it valid*/
+ /*TBD: work for >32b physmem*/
+ phys = g->sim.recv_bfr.phys;
+ sim_writel(g, sim_recv_ring_hi_r(), 0);
+ sim_writel(g, sim_recv_ring_r(),
+ sim_recv_ring_status_valid_f() |
+ sim_recv_ring_target_phys_pci_coherent_f() |
+ sim_recv_ring_size_4kb_f() |
+ sim_recv_ring_addr_lo_f(phys >> PAGE_SHIFT));
+
+ g->sim.remove_support = gk20a_remove_sim_support;
+ return 0;
+
+ fail:
+ gk20a_free_sim_support(g);
+ return err;
+}
+
+static inline u32 sim_msg_header_size(void)
+{
+	return 24;/*TBD: fix the header to get this from NV_VGPU_MSG_HEADER*/
+}
+
+static inline u32 *sim_msg_bfr(struct gk20a *g, u32 byte_offset)
+{
+ return (u32 *)(g->sim.msg_bfr.kvaddr + byte_offset);
+}
+
+static inline u32 *sim_msg_hdr(struct gk20a *g, u32 byte_offset)
+{
+ return sim_msg_bfr(g, byte_offset); /*starts at 0*/
+}
+
+static inline u32 *sim_msg_param(struct gk20a *g, u32 byte_offset)
+{
+ /*starts after msg header/cmn*/
+ return sim_msg_bfr(g, byte_offset + sim_msg_header_size());
+}
+
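+/* Fill in the common sim message header: signature, pending result, spare,
+ * function id and total length (header plus payload). */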
+static inline void sim_write_hdr(struct gk20a *g, u32 func, u32 size)
+{
+ /*memset(g->sim.msg_bfr.kvaddr,0,min(PAGE_SIZE,size));*/
+ *sim_msg_hdr(g, sim_msg_signature_r()) = sim_msg_signature_valid_v();
+ *sim_msg_hdr(g, sim_msg_result_r()) = sim_msg_result_rpc_pending_v();
+ *sim_msg_hdr(g, sim_msg_spare_r()) = sim_msg_spare__init_v();
+ *sim_msg_hdr(g, sim_msg_function_r()) = func;
+ *sim_msg_hdr(g, sim_msg_length_r()) = size + sim_msg_header_size();
+}
+
+static inline u32 sim_escape_read_hdr_size(void)
+{
+ return 12; /*TBD: fix NV_VGPU_SIM_ESCAPE_READ_HEADER*/
+}
+
+static u32 *sim_send_ring_bfr(struct gk20a *g, u32 byte_offset)
+{
+ return (u32 *)(g->sim.send_bfr.kvaddr + byte_offset);
+}
+
+static int rpc_send_message(struct gk20a *g)
+{
+ /* calculations done in units of u32s */
+ u32 send_base = sim_send_put_pointer_v(g->sim.send_ring_put) * 2;
+ u32 dma_offset = send_base + sim_dma_r()/sizeof(u32);
+ u32 dma_hi_offset = send_base + sim_dma_hi_r()/sizeof(u32);
+
+ *sim_send_ring_bfr(g, dma_offset*sizeof(u32)) =
+ sim_dma_target_phys_pci_coherent_f() |
+ sim_dma_status_valid_f() |
+ sim_dma_size_4kb_f() |
+ sim_dma_addr_lo_f(g->sim.msg_bfr.phys >> PAGE_SHIFT);
+
+ *sim_send_ring_bfr(g, dma_hi_offset*sizeof(u32)) = 0; /*TBD >32b phys*/
+
+ *sim_msg_hdr(g, sim_msg_sequence_r()) = g->sim.sequence_base++;
+
+ g->sim.send_ring_put = (g->sim.send_ring_put + 2 * sizeof(u32)) %
+ PAGE_SIZE;
+
+ __cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
+ __cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
+ __cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);
+
+ /* Update the put pointer. This will trap into the host. */
+ sim_writel(g, sim_send_put_r(), g->sim.send_ring_put);
+
+ return 0;
+}
+
+static inline u32 *sim_recv_ring_bfr(struct gk20a *g, u32 byte_offset)
+{
+ return (u32 *)(g->sim.recv_bfr.kvaddr + byte_offset);
+}
+
+static int rpc_recv_poll(struct gk20a *g)
+{
+ phys_addr_t recv_phys_addr;
+
+ /* XXX This read is not required (?) */
+ /*pVGpu->recv_ring_get = VGPU_REG_RD32(pGpu, NV_VGPU_RECV_GET);*/
+
+ /* Poll the recv ring get pointer in an infinite loop*/
+ do {
+ g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
+ } while (g->sim.recv_ring_put == g->sim.recv_ring_get);
+
+ /* process all replies */
+ while (g->sim.recv_ring_put != g->sim.recv_ring_get) {
+ /* these are in u32 offsets*/
+ u32 dma_lo_offset =
+ sim_recv_put_pointer_v(g->sim.recv_ring_get)*2 + 0;
+ /*u32 dma_hi_offset = dma_lo_offset + 1;*/
+ u32 recv_phys_addr_lo = sim_dma_addr_lo_v(*sim_recv_ring_bfr(g, dma_lo_offset*4));
+
+ /*u32 recv_phys_addr_hi = sim_dma_hi_addr_v(
+ (phys_addr_t)sim_recv_ring_bfr(g,dma_hi_offset*4));*/
+
+ /*TBD >32b phys addr */
+ recv_phys_addr = recv_phys_addr_lo << PAGE_SHIFT;
+
+ if (recv_phys_addr != g->sim.msg_bfr.phys) {
+ dev_err(dev_from_gk20a(g), "%s Error in RPC reply\n",
+ __func__);
+ return -1;
+ }
+
+ /* Update GET pointer */
+ g->sim.recv_ring_get = (g->sim.recv_ring_get + 2*sizeof(u32)) %
+ PAGE_SIZE;
+
+ __cpuc_flush_dcache_area(g->sim.msg_bfr.kvaddr, PAGE_SIZE);
+ __cpuc_flush_dcache_area(g->sim.send_bfr.kvaddr, PAGE_SIZE);
+ __cpuc_flush_dcache_area(g->sim.recv_bfr.kvaddr, PAGE_SIZE);
+
+ sim_writel(g, sim_recv_get_r(), g->sim.recv_ring_get);
+
+ g->sim.recv_ring_put = sim_readl(g, sim_recv_put_r());
+ }
+
+ return 0;
+}
+
+static int issue_rpc_and_wait(struct gk20a *g)
+{
+ int err;
+
+ err = rpc_send_message(g);
+ if (err) {
+ dev_err(dev_from_gk20a(g), "%s failed rpc_send_message\n",
+ __func__);
+ return err;
+ }
+
+ err = rpc_recv_poll(g);
+ if (err) {
+ dev_err(dev_from_gk20a(g), "%s failed rpc_recv_poll\n",
+ __func__);
+ return err;
+ }
+
+ /* Now check if RPC really succeeded */
+ if (*sim_msg_hdr(g, sim_msg_result_r()) != sim_msg_result_success_v()) {
+ dev_err(dev_from_gk20a(g), "%s received failed status!\n",
+ __func__);
+ return -(*sim_msg_hdr(g, sim_msg_result_r()));
+ }
+ return 0;
+}
+
+int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index, u32 count, u32 *data)
+{
+ int err;
+ size_t pathlen = strlen(path);
+ u32 data_offset;
+
+ sim_write_hdr(g, sim_msg_function_sim_escape_read_v(),
+ sim_escape_read_hdr_size());
+ *sim_msg_param(g, 0) = index;
+ *sim_msg_param(g, 4) = count;
+ data_offset = roundup(0xc + pathlen + 1, sizeof(u32));
+ *sim_msg_param(g, 8) = data_offset;
+ strcpy((char *)sim_msg_param(g, 0xc), path);
+
+ err = issue_rpc_and_wait(g);
+
+ if (!err)
+ memcpy(data, sim_msg_param(g, data_offset), count);
+ return err;
+}
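+/* For illustration only (the escape path string below is made up):
+ *
+ *	u32 val;
+ *	if (!gk20a_sim_esc_read(g, "sim/some_escape", 0, sizeof(val), &val))
+ *		... use val ...
+ */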
+
+static irqreturn_t gk20a_intr_isr_stall(int irq, void *dev_id)
+{
+ struct gk20a *g = dev_id;
+ u32 mc_intr_0;
+
+ if (!g->power_on)
+ return IRQ_NONE;
+
+ /* not from gpu when sharing irq with others */
+ mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
+ if (unlikely(!mc_intr_0))
+ return IRQ_NONE;
+
+ gk20a_writel(g, mc_intr_en_0_r(),
+ mc_intr_en_0_inta_disabled_f());
+
+ /* flush previous write */
+ gk20a_readl(g, mc_intr_en_0_r());
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t gk20a_intr_isr_nonstall(int irq, void *dev_id)
+{
+ struct gk20a *g = dev_id;
+ u32 mc_intr_1;
+
+ if (!g->power_on)
+ return IRQ_NONE;
+
+ /* not from gpu when sharing irq with others */
+ mc_intr_1 = gk20a_readl(g, mc_intr_1_r());
+ if (unlikely(!mc_intr_1))
+ return IRQ_NONE;
+
+ gk20a_writel(g, mc_intr_en_1_r(),
+ mc_intr_en_1_inta_disabled_f());
+
+ /* flush previous write */
+ gk20a_readl(g, mc_intr_en_1_r());
+
+ return IRQ_WAKE_THREAD;
+}
+
+static void gk20a_pbus_isr(struct gk20a *g)
+{
+ u32 val;
+ val = gk20a_readl(g, bus_intr_0_r());
+ if (val & (bus_intr_0_pri_squash_m() |
+ bus_intr_0_pri_fecserr_m() |
+ bus_intr_0_pri_timeout_m())) {
+ gk20a_err(dev_from_gk20a(g), "top_fs_status_r : 0x%x",
+ gk20a_readl(g, top_fs_status_r()));
+ gk20a_err(dev_from_gk20a(g), "pmc_enable : 0x%x",
+ gk20a_readl(g, mc_enable_r()));
+ gk20a_err(&g->dev->dev,
+ "NV_PTIMER_PRI_TIMEOUT_SAVE_0: 0x%x\n",
+ gk20a_readl(g, timer_pri_timeout_save_0_r()));
+ gk20a_err(&g->dev->dev,
+ "NV_PTIMER_PRI_TIMEOUT_SAVE_1: 0x%x\n",
+ gk20a_readl(g, timer_pri_timeout_save_1_r()));
+ gk20a_err(&g->dev->dev,
+ "NV_PTIMER_PRI_TIMEOUT_FECS_ERRCODE: 0x%x\n",
+ gk20a_readl(g, timer_pri_timeout_fecs_errcode_r()));
+ }
+
+ if (val)
+ gk20a_err(&g->dev->dev,
+ "Unhandled pending pbus interrupt\n");
+
+ gk20a_writel(g, bus_intr_0_r(), val);
+}
+
+static irqreturn_t gk20a_intr_thread_stall(int irq, void *dev_id)
+{
+ struct gk20a *g = dev_id;
+ u32 mc_intr_0;
+
+ gk20a_dbg(gpu_dbg_intr, "interrupt thread launched");
+
+ mc_intr_0 = gk20a_readl(g, mc_intr_0_r());
+
+ gk20a_dbg(gpu_dbg_intr, "stall intr %08x\n", mc_intr_0);
+
+ if (mc_intr_0 & mc_intr_0_pgraph_pending_f())
+ gr_gk20a_elpg_protected_call(g, gk20a_gr_isr(g));
+ if (mc_intr_0 & mc_intr_0_pfifo_pending_f())
+ gk20a_fifo_isr(g);
+ if (mc_intr_0 & mc_intr_0_pmu_pending_f())
+ gk20a_pmu_isr(g);
+ if (mc_intr_0 & mc_intr_0_priv_ring_pending_f())
+ gk20a_priv_ring_isr(g);
+ if (mc_intr_0 & mc_intr_0_ltc_pending_f())
+ gk20a_mm_ltc_isr(g);
+ if (mc_intr_0 & mc_intr_0_pbus_pending_f())
+ gk20a_pbus_isr(g);
+
+ gk20a_writel(g, mc_intr_en_0_r(),
+ mc_intr_en_0_inta_hardware_f());
+
+ /* flush previous write */
+ gk20a_readl(g, mc_intr_en_0_r());
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t gk20a_intr_thread_nonstall(int irq, void *dev_id)
+{
+ struct gk20a *g = dev_id;
+ u32 mc_intr_1;
+
+ gk20a_dbg(gpu_dbg_intr, "interrupt thread launched");
+
+ mc_intr_1 = gk20a_readl(g, mc_intr_1_r());
+
+ gk20a_dbg(gpu_dbg_intr, "non-stall intr %08x\n", mc_intr_1);
+
+ if (mc_intr_1 & mc_intr_0_pfifo_pending_f())
+ gk20a_fifo_nonstall_isr(g);
+ if (mc_intr_1 & mc_intr_0_pgraph_pending_f())
+ gk20a_gr_nonstall_isr(g);
+
+ gk20a_writel(g, mc_intr_en_1_r(),
+ mc_intr_en_1_inta_hardware_f());
+
+ /* flush previous write */
+ gk20a_readl(g, mc_intr_en_1_r());
+
+ return IRQ_HANDLED;
+}
+
+static void gk20a_remove_support(struct platform_device *dev)
+{
+ struct gk20a *g = get_gk20a(dev);
+
+ /* pmu support should already be removed when the driver turns off
+ the gpu power rail in prepare_poweroff */
+ if (g->gk20a_cdev.gk20a_cooling_dev)
+ thermal_cooling_device_unregister(g->gk20a_cdev.gk20a_cooling_dev);
+
+ if (g->gr.remove_support)
+ g->gr.remove_support(&g->gr);
+
+ if (g->fifo.remove_support)
+ g->fifo.remove_support(&g->fifo);
+
+ if (g->mm.remove_support)
+ g->mm.remove_support(&g->mm);
+
+ if (g->sim.remove_support)
+ g->sim.remove_support(&g->sim);
+
+ release_firmware(g->pmu_fw);
+
+ if (g->irq_requested) {
+ free_irq(g->irq_stall, g);
+ free_irq(g->irq_nonstall, g);
+ g->irq_requested = false;
+ }
+
+ /* free mappings to registers, etc. */
+
+ if (g->regs) {
+ iounmap(g->regs);
+ g->regs = 0;
+ }
+ if (g->bar1) {
+ iounmap(g->bar1);
+ g->bar1 = 0;
+ }
+}
+
+static int gk20a_init_support(struct platform_device *dev)
+{
+ int err = 0;
+ struct gk20a *g = get_gk20a(dev);
+
+ g->regs = gk20a_ioremap_resource(dev, GK20A_BAR0_IORESOURCE_MEM,
+ &g->reg_mem);
+ if (!g->regs) {
+ dev_err(dev_from_gk20a(g), "failed to remap gk20a registers\n");
+ err = -ENXIO;
+ goto fail;
+ }
+
+ g->bar1 = gk20a_ioremap_resource(dev, GK20A_BAR1_IORESOURCE_MEM,
+ &g->bar1_mem);
+ if (!g->bar1) {
+ dev_err(dev_from_gk20a(g), "failed to remap gk20a bar1\n");
+ err = -ENXIO;
+ goto fail;
+ }
+
+ /* Get interrupt numbers */
+ g->irq_stall = platform_get_irq(dev, 0);
+ g->irq_nonstall = platform_get_irq(dev, 1);
+ if (g->irq_stall < 0 || g->irq_nonstall < 0) {
+ err = -ENXIO;
+ goto fail;
+ }
+
+ if (tegra_cpu_is_asim()) {
+ err = gk20a_init_sim_support(dev);
+ if (err)
+ goto fail;
+ }
+
+ mutex_init(&g->dbg_sessions_lock);
+ mutex_init(&g->client_lock);
+
+ g->remove_support = gk20a_remove_support;
+ return 0;
+
+ fail:
+ gk20a_remove_support(dev);
+ return err;
+}
+
+static int gk20a_init_client(struct platform_device *dev)
+{
+ struct gk20a *g = get_gk20a(dev);
+ int err;
+
+ gk20a_dbg_fn("");
+
+#ifndef CONFIG_PM_RUNTIME
+ gk20a_pm_finalize_poweron(&dev->dev);
+#endif
+
+ err = gk20a_init_mm_setup_sw(g);
+ if (err)
+ return err;
+
+ if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
+ gk20a_scale_hw_init(dev);
+ return 0;
+}
+
+static void gk20a_deinit_client(struct platform_device *dev)
+{
+ gk20a_dbg_fn("");
+#ifndef CONFIG_PM_RUNTIME
+ gk20a_pm_prepare_poweroff(&dev->dev);
+#endif
+}
+
+int gk20a_get_client(struct gk20a *g)
+{
+ int err = 0;
+
+ mutex_lock(&g->client_lock);
+ if (g->client_refcount == 0)
+ err = gk20a_init_client(g->dev);
+ if (!err)
+ g->client_refcount++;
+ mutex_unlock(&g->client_lock);
+ return err;
+}
+
+void gk20a_put_client(struct gk20a *g)
+{
+ mutex_lock(&g->client_lock);
+ if (g->client_refcount == 1)
+ gk20a_deinit_client(g->dev);
+ g->client_refcount--;
+ mutex_unlock(&g->client_lock);
+ WARN_ON(g->client_refcount < 0);
+}
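Callers are expected to bracket use of the device with a balanced get/put pair; the first get initialises the client state and the last put tears it down. A hypothetical caller might look like this (sketch only, not part of the patch):

/* Hypothetical caller: gk20a_get_client()/gk20a_put_client() must be
 * balanced; the work in the middle is whatever needs the client state. */
static int example_with_client(struct gk20a *g)
{
	int err = gk20a_get_client(g);

	if (err)
		return err;

	/* ... operate on the now-initialised client ... */

	gk20a_put_client(g);
	return 0;
}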
+
+static int gk20a_pm_prepare_poweroff(struct device *_dev)
+{
+ struct platform_device *dev = to_platform_device(_dev);
+ struct gk20a *g = get_gk20a(dev);
+ int ret = 0;
+
+ gk20a_dbg_fn("");
+
+ if (!g->power_on)
+ return 0;
+
+ ret |= gk20a_channel_suspend(g);
+
+ /* disable elpg before gr or fifo suspend */
+ ret |= gk20a_pmu_destroy(g);
+ ret |= gk20a_gr_suspend(g);
+ ret |= gk20a_mm_suspend(g);
+ ret |= gk20a_fifo_suspend(g);
+
+ /*
+ * After this point, gk20a interrupts should not get
+ * serviced.
+ */
+ if (g->irq_requested) {
+ free_irq(g->irq_stall, g);
+ free_irq(g->irq_nonstall, g);
+ g->irq_requested = false;
+ }
+
+ /* Disable GPCPLL */
+ ret |= gk20a_suspend_clk_support(g);
+ g->power_on = false;
+
+ return ret;
+}
+
+static void gk20a_detect_chip(struct gk20a *g)
+{
+ struct nvhost_gpu_characteristics *gpu = &g->gpu_characteristics;
+
+ u32 mc_boot_0_value = gk20a_readl(g, mc_boot_0_r());
+ gpu->arch = mc_boot_0_architecture_v(mc_boot_0_value) <<
+ NVHOST_GPU_ARCHITECTURE_SHIFT;
+ gpu->impl = mc_boot_0_implementation_v(mc_boot_0_value);
+ gpu->rev =
+ (mc_boot_0_major_revision_v(mc_boot_0_value) << 4) |
+ mc_boot_0_minor_revision_v(mc_boot_0_value);
+
+ gk20a_dbg_info("arch: %x, impl: %x, rev: %x\n",
+ g->gpu_characteristics.arch,
+ g->gpu_characteristics.impl,
+ g->gpu_characteristics.rev);
+}
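The revision byte packs the major revision into the high nibble and the minor into the low nibble. A small restatement of that packing with illustrative values only; the helper is not part of the patch:

/* Same packing as above: major in the high nibble, minor in the low
 * nibble, e.g. (0xA << 4) | 0x1 == 0xA1 (values illustrative). */
static inline u32 example_pack_rev(u32 major, u32 minor)
{
	return (major << 4) | minor;
}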
+
+static int gk20a_pm_finalize_poweron(struct device *_dev)
+{
+ struct platform_device *dev = to_platform_device(_dev);
+ struct gk20a *g = get_gk20a(dev);
+ int err, nice_value;
+
+ gk20a_dbg_fn("");
+
+ if (g->power_on)
+ return 0;
+
+ nice_value = task_nice(current);
+ set_user_nice(current, -20);
+
+ if (!g->irq_requested) {
+ err = request_threaded_irq(g->irq_stall,
+ gk20a_intr_isr_stall,
+ gk20a_intr_thread_stall,
+ 0, "gk20a_stall", g);
+ if (err) {
+ dev_err(dev_from_gk20a(g),
+ "failed to request stall intr irq @ %lld\n",
+ (u64)g->irq_stall);
+ goto done;
+ }
+ err = request_threaded_irq(g->irq_nonstall,
+ gk20a_intr_isr_nonstall,
+ gk20a_intr_thread_nonstall,
+ 0, "gk20a_nonstall", g);
+ if (err) {
+ dev_err(dev_from_gk20a(g),
+ "failed to request non-stall intr irq @ %lld\n",
+ (u64)g->irq_nonstall);
+ goto done;
+ }
+ g->irq_requested = true;
+ }
+
+ g->power_on = true;
+
+ gk20a_writel(g, mc_intr_mask_1_r(),
+ mc_intr_0_pfifo_pending_f()
+ | mc_intr_0_pgraph_pending_f());
+ gk20a_writel(g, mc_intr_en_1_r(),
+ mc_intr_en_1_inta_hardware_f());
+
+ gk20a_writel(g, mc_intr_mask_0_r(),
+ mc_intr_0_pgraph_pending_f()
+ | mc_intr_0_pfifo_pending_f()
+ | mc_intr_0_priv_ring_pending_f()
+ | mc_intr_0_ltc_pending_f()
+ | mc_intr_0_pbus_pending_f());
+ gk20a_writel(g, mc_intr_en_0_r(),
+ mc_intr_en_0_inta_hardware_f());
+
+ if (!tegra_platform_is_silicon())
+ gk20a_writel(g, bus_intr_en_0_r(), 0x0);
+ else
+ gk20a_writel(g, bus_intr_en_0_r(),
+ bus_intr_en_0_pri_squash_m() |
+ bus_intr_en_0_pri_fecserr_m() |
+ bus_intr_en_0_pri_timeout_m());
+ gk20a_reset_priv_ring(g);
+
+ gk20a_detect_chip(g);
+ err = gpu_init_hal(g);
+ if (err)
+ goto done;
+
+ /* TBD: move this after graphics init, in which blcg/slcg is enabled.
+ This function removes SlowdownOnBoot, which applies a 32x divider
+ on the gpcpll bypass path. The purpose of the slowdown is to save
+ power during boot, but it also significantly slows down gk20a init
+ on simulation and emulation. We should remove SOB after the graphics
+ power saving features (blcg/slcg) are enabled. For now, do it here. */
+ err = gk20a_init_clk_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a clk");
+ goto done;
+ }
+
+ /* enable pri timeout only on silicon */
+ if (tegra_platform_is_silicon()) {
+ gk20a_writel(g,
+ timer_pri_timeout_r(),
+ timer_pri_timeout_period_f(0x186A0) |
+ timer_pri_timeout_en_en_enabled_f());
+ } else {
+ gk20a_writel(g,
+ timer_pri_timeout_r(),
+ timer_pri_timeout_period_f(0x186A0) |
+ timer_pri_timeout_en_en_disabled_f());
+ }
+
+ err = gk20a_init_fifo_reset_enable_hw(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to reset gk20a fifo");
+ goto done;
+ }
+
+ err = gk20a_init_mm_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a mm");
+ goto done;
+ }
+
+ err = gk20a_init_pmu_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a pmu");
+ goto done;
+ }
+
+ err = gk20a_init_fifo_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a fifo");
+ goto done;
+ }
+
+ err = gk20a_init_gr_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a gr");
+ goto done;
+ }
+
+ err = gk20a_init_pmu_setup_hw2(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a pmu_hw2");
+ goto done;
+ }
+
+ err = gk20a_init_therm_support(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a therm");
+ goto done;
+ }
+
+ err = gk20a_init_gpu_characteristics(g);
+ if (err) {
+ gk20a_err(&dev->dev, "failed to init gk20a gpu characteristics");
+ goto done;
+ }
+
+ gk20a_channel_resume(g);
+ set_user_nice(current, nice_value);
+
+done:
+ return err;
+}
+
+static struct of_device_id tegra_gk20a_of_match[] = {
+#ifdef CONFIG_TEGRA_GK20A
+ { .compatible = "nvidia,tegra124-gk20a",
+ .data = &gk20a_tegra_platform },
+#endif
+ { .compatible = "nvidia,generic-gk20a",
+ .data = &gk20a_generic_platform },
+ { },
+};
+
+int tegra_gpu_get_max_state(struct thermal_cooling_device *cdev,
+ unsigned long *max_state)
+{
+ struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;
+
+ *max_state = gk20a_gpufreq_device->gk20a_freq_table_size - 1;
+ return 0;
+}
+
+int tegra_gpu_get_cur_state(struct thermal_cooling_device *cdev,
+ unsigned long *cur_state)
+{
+ struct cooling_device_gk20a *gk20a_gpufreq_device = cdev->devdata;
+
+ *cur_state = gk20a_gpufreq_device->gk20a_freq_state;
+ return 0;
+}
+
+int tegra_gpu_set_cur_state(struct thermal_cooling_device *c_dev,
+ unsigned long cur_state)
+{
+ u32 target_freq;
+ struct gk20a *g;
+ struct gpufreq_table_data *gpu_cooling_table;
+ struct cooling_device_gk20a *gk20a_gpufreq_device = c_dev->devdata;
+
+ BUG_ON(cur_state >= gk20a_gpufreq_device->gk20a_freq_table_size);
+
+ g = container_of(gk20a_gpufreq_device, struct gk20a, gk20a_cdev);
+
+ gpu_cooling_table = tegra_gpufreq_table_get();
+ target_freq = gpu_cooling_table[cur_state].frequency;
+
+ /* ensure a query for state will get the proper value */
+ gk20a_gpufreq_device->gk20a_freq_state = cur_state;
+
+ gk20a_clk_set_rate(g, target_freq);
+
+ return 0;
+}
+
+static struct thermal_cooling_device_ops tegra_gpu_cooling_ops = {
+ .get_max_state = tegra_gpu_get_max_state,
+ .get_cur_state = tegra_gpu_get_cur_state,
+ .set_cur_state = tegra_gpu_set_cur_state,
+};
+
+static int gk20a_create_device(
+ struct platform_device *pdev, int devno, const char *cdev_name,
+ struct cdev *cdev, struct device **out,
+ const struct file_operations *ops)
+{
+ struct device *dev;
+ int err;
+ struct gk20a *g = get_gk20a(pdev);
+
+ gk20a_dbg_fn("");
+
+ cdev_init(cdev, ops);
+ cdev->owner = THIS_MODULE;
+
+ err = cdev_add(cdev, devno, 1);
+ if (err) {
+ dev_err(&pdev->dev,
+ "failed to add %s cdev\n", cdev_name);
+ return err;
+ }
+
+ dev = device_create(g->class, NULL, devno, NULL,
+ (pdev->id <= 0) ? INTERFACE_NAME : INTERFACE_NAME ".%d",
+ cdev_name, pdev->id);
+
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ cdev_del(cdev);
+ dev_err(&pdev->dev,
+ "failed to create %s device for %s\n",
+ cdev_name, pdev->name);
+ return err;
+ }
+
+ *out = dev;
+ return 0;
+}
+
+static void gk20a_user_deinit(struct platform_device *dev)
+{
+ struct gk20a *g = get_gk20a(dev);
+
+ if (g->channel.node) {
+ device_destroy(g->class, g->channel.cdev.dev);
+ cdev_del(&g->channel.cdev);
+ }
+
+ if (g->as.node) {
+ device_destroy(g->class, g->as.cdev.dev);
+ cdev_del(&g->as.cdev);
+ }
+
+ if (g->ctrl.node) {
+ device_destroy(g->class, g->ctrl.cdev.dev);
+ cdev_del(&g->ctrl.cdev);
+ }
+
+ if (g->dbg.node) {
+ device_destroy(g->class, g->dbg.cdev.dev);
+ cdev_del(&g->dbg.cdev);
+ }
+
+ if (g->prof.node) {
+ device_destroy(g->class, g->prof.cdev.dev);
+ cdev_del(&g->prof.cdev);
+ }
+
+ if (g->cdev_region)
+ unregister_chrdev_region(g->cdev_region, GK20A_NUM_CDEVS);
+
+ if (g->class)
+ class_destroy(g->class);
+}
+
+static int gk20a_user_init(struct platform_device *dev)
+{
+ int err;
+ dev_t devno;
+ struct gk20a *g = get_gk20a(dev);
+
+ g->class = class_create(THIS_MODULE, CLASS_NAME);
+ if (IS_ERR(g->class)) {
+ err = PTR_ERR(g->class);
+ g->class = NULL;
+ dev_err(&dev->dev,
+ "failed to create " CLASS_NAME " class\n");
+ goto fail;
+ }
+
+ err = alloc_chrdev_region(&devno, 0, GK20A_NUM_CDEVS, CLASS_NAME);
+ if (err) {
+ dev_err(&dev->dev, "failed to allocate devno\n");
+ goto fail;
+ }
+ g->cdev_region = devno;
+
+ err = gk20a_create_device(dev, devno++, "",
+ &g->channel.cdev, &g->channel.node,
+ &gk20a_channel_ops);
+ if (err)
+ goto fail;
+
+ err = gk20a_create_device(dev, devno++, "-as",
+ &g->as.cdev, &g->as.node,
+ &gk20a_as_ops);
+ if (err)
+ goto fail;
+
+ err = gk20a_create_device(dev, devno++, "-ctrl",
+ &g->ctrl.cdev, &g->ctrl.node,
+ &gk20a_ctrl_ops);
+ if (err)
+ goto fail;
+
+ err = gk20a_create_device(dev, devno++, "-dbg",
+ &g->dbg.cdev, &g->dbg.node,
+ &gk20a_dbg_ops);
+ if (err)
+ goto fail;
+
+ err = gk20a_create_device(dev, devno++, "-prof",
+ &g->prof.cdev, &g->prof.node,
+ &gk20a_prof_ops);
+ if (err)
+ goto fail;
+
+ return 0;
+fail:
+ gk20a_user_deinit(dev);
+ return err;
+}
+
+struct channel_gk20a *gk20a_get_channel_from_file(int fd)
+{
+ struct channel_gk20a *ch;
+ struct file *f = fget(fd);
+ if (!f)
+ return 0;
+
+ if (f->f_op != &gk20a_channel_ops) {
+ fput(f);
+ return 0;
+ }
+
+ ch = (struct channel_gk20a *)f->private_data;
+ fput(f);
+ return ch;
+}
+
+static int gk20a_pm_enable_clk(struct device *dev)
+{
+ int index = 0;
+ struct gk20a_platform *platform;
+
+ platform = dev_get_drvdata(dev);
+ if (!platform)
+ return -EINVAL;
+
+ for (index = 0; index < platform->num_clks; index++) {
+ int err = clk_prepare_enable(platform->clk[index]);
+ if (err)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int gk20a_pm_disable_clk(struct device *dev)
+{
+ int index = 0;
+ struct gk20a_platform *platform;
+
+ platform = dev_get_drvdata(dev);
+ if (!platform)
+ return -EINVAL;
+
+ for (index = 0; index < platform->num_clks; index++)
+ clk_disable_unprepare(platform->clk[index]);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM
+const struct dev_pm_ops gk20a_pm_ops = {
+#if defined(CONFIG_PM_RUNTIME) && !defined(CONFIG_PM_GENERIC_DOMAINS)
+ .runtime_resume = gk20a_pm_enable_clk,
+ .runtime_suspend = gk20a_pm_disable_clk,
+#endif
+};
+#endif
+
+static int gk20a_pm_railgate(struct generic_pm_domain *domain)
+{
+ struct gk20a *g = container_of(domain, struct gk20a, pd);
+ struct gk20a_platform *platform = platform_get_drvdata(g->dev);
+ int ret = 0;
+
+ if (platform->railgate)
+ ret = platform->railgate(platform->g->dev);
+
+ return ret;
+}
+
+static int gk20a_pm_unrailgate(struct generic_pm_domain *domain)
+{
+ struct gk20a *g = container_of(domain, struct gk20a, pd);
+ struct gk20a_platform *platform = platform_get_drvdata(g->dev);
+ int ret = 0;
+
+ if (platform->unrailgate)
+ ret = platform->unrailgate(platform->g->dev);
+
+ return ret;
+}
+
+static int gk20a_pm_suspend(struct device *dev)
+{
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ int ret = 0;
+
+ if (atomic_read(&dev->power.usage_count) > 1)
+ return -EBUSY;
+
+ ret = gk20a_pm_prepare_poweroff(dev);
+ if (ret)
+ return ret;
+
+ gk20a_scale_suspend(to_platform_device(dev));
+
+ if (platform->suspend)
+ platform->suspend(dev);
+
+ return 0;
+}
+
+static int gk20a_pm_resume(struct device *dev)
+{
+ int ret = 0;
+
+ ret = gk20a_pm_finalize_poweron(dev);
+ if (ret)
+ return ret;
+
+ gk20a_scale_resume(to_platform_device(dev));
+
+ return 0;
+}
+
+static int gk20a_pm_initialise_domain(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ struct dev_power_governor *pm_domain_gov = NULL;
+ struct generic_pm_domain *domain = &platform->g->pd;
+ int ret = 0;
+
+ domain->name = kstrdup(pdev->name, GFP_KERNEL);
+
+ if (!platform->can_railgate)
+ pm_domain_gov = &pm_domain_always_on_gov;
+
+ pm_genpd_init(domain, pm_domain_gov, true);
+
+ domain->power_off = gk20a_pm_railgate;
+ domain->power_on = gk20a_pm_unrailgate;
+ domain->dev_ops.start = gk20a_pm_enable_clk;
+ domain->dev_ops.stop = gk20a_pm_disable_clk;
+ domain->dev_ops.save_state = gk20a_pm_prepare_poweroff;
+ domain->dev_ops.restore_state = gk20a_pm_finalize_poweron;
+ domain->dev_ops.suspend = gk20a_pm_suspend;
+ domain->dev_ops.resume = gk20a_pm_resume;
+
+ device_set_wakeup_capable(&pdev->dev, 0);
+ ret = pm_genpd_add_device(domain, &pdev->dev);
+
+ if (platform->railgate_delay)
+ pm_genpd_set_poweroff_delay(domain, platform->railgate_delay);
+
+ return ret;
+}
+
+static int gk20a_pm_init(struct platform_device *dev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(dev);
+ int err = 0;
+
+ /* Initialise pm runtime */
+ if (platform->clockgate_delay) {
+ pm_runtime_set_autosuspend_delay(&dev->dev,
+ platform->clockgate_delay);
+ pm_runtime_use_autosuspend(&dev->dev);
+ }
+
+ pm_runtime_enable(&dev->dev);
+ if (!pm_runtime_enabled(&dev->dev))
+ gk20a_pm_enable_clk(&dev->dev);
+
+ /* Enable runtime railgating if possible. If not,
+ * turn on the rail now. */
+ if (platform->can_railgate && IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
+ platform->railgate(dev);
+ else
+ platform->unrailgate(dev);
+
+ /* genpd will take care of runtime power management if it is enabled */
+ if (IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
+ err = gk20a_pm_initialise_domain(dev);
+
+ return err;
+}
+
+static int gk20a_probe(struct platform_device *dev)
+{
+ struct gk20a *gk20a;
+ int err;
+ struct gk20a_platform *platform = NULL;
+ struct cooling_device_gk20a *gpu_cdev = NULL;
+
+ if (dev->dev.of_node) {
+ const struct of_device_id *match;
+
+ match = of_match_device(tegra_gk20a_of_match, &dev->dev);
+ if (match)
+ platform = (struct gk20a_platform *)match->data;
+ } else
+ platform = (struct gk20a_platform *)dev->dev.platform_data;
+
+ if (!platform) {
+ dev_err(&dev->dev, "no platform data\n");
+ return -ENODATA;
+ }
+
+ gk20a_dbg_fn("");
+
+ platform_set_drvdata(dev, platform);
+
+ gk20a = kzalloc(sizeof(struct gk20a), GFP_KERNEL);
+ if (!gk20a) {
+ dev_err(&dev->dev, "couldn't allocate gk20a support");
+ return -ENOMEM;
+ }
+
+ set_gk20a(dev, gk20a);
+ gk20a->dev = dev;
+
+ err = gk20a_user_init(dev);
+ if (err)
+ return err;
+
+ gk20a_init_support(dev);
+
+ spin_lock_init(&gk20a->mc_enable_lock);
+
+ /* Initialize the platform interface. */
+ err = platform->probe(dev);
+ if (err) {
+ dev_err(&dev->dev, "platform probe failed");
+ return err;
+ }
+
+ err = gk20a_pm_init(dev);
+ if (err) {
+ dev_err(&dev->dev, "pm init failed");
+ return err;
+ }
+
+ /* Initialise scaling */
+ if (IS_ENABLED(CONFIG_GK20A_DEVFREQ))
+ gk20a_scale_init(dev);
+
+ if (platform->late_probe) {
+ err = platform->late_probe(dev);
+ if (err) {
+ dev_err(&dev->dev, "late probe failed");
+ return err;
+ }
+ }
+
+ gk20a_debug_init(dev);
+
+ /* Set DMA parameters to allow larger sgt lists */
+ dev->dev.dma_parms = &gk20a->dma_parms;
+ dma_set_max_seg_size(&dev->dev, UINT_MAX);
+
+ gpu_cdev = &gk20a->gk20a_cdev;
+ gpu_cdev->gk20a_freq_table_size = tegra_gpufreq_table_size_get();
+ gpu_cdev->gk20a_freq_state = 0;
+ gpu_cdev->g = gk20a;
+ gpu_cdev->gk20a_cooling_dev = thermal_cooling_device_register("gk20a_cdev", gpu_cdev,
+ &tegra_gpu_cooling_ops);
+
+ gk20a->gr_idle_timeout_default =
+ CONFIG_GK20A_DEFAULT_TIMEOUT;
+ gk20a->timeouts_enabled = true;
+
+ /* Set up initial clock gating settings */
+ if (tegra_platform_is_silicon()) {
+ gk20a->slcg_enabled = true;
+ gk20a->blcg_enabled = true;
+ gk20a->elcg_enabled = true;
+ gk20a->elpg_enabled = true;
+ gk20a->aelpg_enabled = true;
+ }
+
+ gk20a_create_sysfs(dev);
+
+#ifdef CONFIG_DEBUG_FS
+ clk_gk20a_debugfs_init(dev);
+
+ spin_lock_init(&gk20a->debugfs_lock);
+ gk20a->mm.ltc_enabled = true;
+ gk20a->mm.ltc_enabled_debug = true;
+ gk20a->debugfs_ltc_enabled =
+ debugfs_create_bool("ltc_enabled", S_IRUGO|S_IWUSR,
+ platform->debugfs,
+ &gk20a->mm.ltc_enabled_debug);
+ gk20a->mm.ltc_enabled_debug = true;
+ gk20a->debugfs_gr_idle_timeout_default =
+ debugfs_create_u32("gr_idle_timeout_default_us",
+ S_IRUGO|S_IWUSR, platform->debugfs,
+ &gk20a->gr_idle_timeout_default);
+ gk20a->debugfs_timeouts_enabled =
+ debugfs_create_bool("timeouts_enabled",
+ S_IRUGO|S_IWUSR,
+ platform->debugfs,
+ &gk20a->timeouts_enabled);
+ gk20a_pmu_debugfs_init(dev);
+#endif
+
+#ifdef CONFIG_INPUT_CFBOOST
+ cfb_add_device(&dev->dev);
+#endif
+
+ return 0;
+}
+
+static int __exit gk20a_remove(struct platform_device *dev)
+{
+ struct gk20a *g = get_gk20a(dev);
+ gk20a_dbg_fn("");
+
+#ifdef CONFIG_INPUT_CFBOOST
+ cfb_remove_device(&dev->dev);
+#endif
+
+ if (g->remove_support)
+ g->remove_support(dev);
+
+ gk20a_user_deinit(dev);
+
+ set_gk20a(dev, 0);
+#ifdef CONFIG_DEBUG_FS
+ debugfs_remove(g->debugfs_ltc_enabled);
+ debugfs_remove(g->debugfs_gr_idle_timeout_default);
+ debugfs_remove(g->debugfs_timeouts_enabled);
+#endif
+
+ kfree(g);
+
+#ifdef CONFIG_PM_RUNTIME
+ pm_runtime_put(&dev->dev);
+ pm_runtime_disable(&dev->dev);
+#else
+ nvhost_module_disable_clk(&dev->dev);
+#endif
+
+ return 0;
+}
+
+static struct platform_driver gk20a_driver = {
+ .probe = gk20a_probe,
+ .remove = __exit_p(gk20a_remove),
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "gk20a",
+#ifdef CONFIG_OF
+ .of_match_table = tegra_gk20a_of_match,
+#endif
+#ifdef CONFIG_PM
+ .pm = &gk20a_pm_ops,
+#endif
+ }
+};
+
+static int __init gk20a_init(void)
+{
+ return platform_driver_register(&gk20a_driver);
+}
+
+static void __exit gk20a_exit(void)
+{
+ platform_driver_unregister(&gk20a_driver);
+}
+
+bool is_gk20a_module(struct platform_device *dev)
+{
+ return &gk20a_driver.driver == dev->dev.driver;
+}
+
+void gk20a_busy_noresume(struct platform_device *pdev)
+{
+ pm_runtime_get_noresume(&pdev->dev);
+}
+
+int gk20a_channel_busy(struct platform_device *pdev)
+{
+ int ret = 0;
+
+ ret = gk20a_platform_channel_busy(pdev);
+ if (ret)
+ return ret;
+
+ ret = gk20a_busy(pdev);
+ if (ret)
+ gk20a_platform_channel_idle(pdev);
+
+ return ret;
+}
+
+void gk20a_channel_idle(struct platform_device *pdev)
+{
+ gk20a_idle(pdev);
+ gk20a_platform_channel_idle(pdev);
+}
+
+int gk20a_busy(struct platform_device *pdev)
+{
+ int ret = 0;
+
+#ifdef CONFIG_PM_RUNTIME
+ ret = pm_runtime_get_sync(&pdev->dev);
+#endif
+ gk20a_scale_notify_busy(pdev);
+
+ return ret < 0 ? ret : 0;
+}
+
+void gk20a_idle(struct platform_device *pdev)
+{
+#ifdef CONFIG_PM_RUNTIME
+ if (atomic_read(&pdev->dev.power.usage_count) == 1)
+ gk20a_scale_notify_idle(pdev);
+ pm_runtime_mark_last_busy(&pdev->dev);
+ pm_runtime_put_sync_autosuspend(&pdev->dev);
+#else
+ gk20a_scale_notify_idle(pdev);
+#endif
+}
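gk20a_busy()/gk20a_idle() bracket any access that needs the GPU powered: they take and drop the runtime PM reference and notify the scaling code. A hypothetical caller, sketch only (the register read is just an example):

/* Hypothetical caller: keep the GPU powered across a register access.
 * gk20a_busy() and gk20a_idle() must always be paired. */
static int example_read_boot_reg(struct gk20a *g, u32 *val)
{
	int err = gk20a_busy(g->dev);

	if (err)
		return err;

	*val = gk20a_readl(g, mc_boot_0_r());	/* any BAR0 access works here */

	gk20a_idle(g->dev);
	return 0;
}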
+
+void gk20a_disable(struct gk20a *g, u32 units)
+{
+ u32 pmc;
+
+ gk20a_dbg(gpu_dbg_info, "pmc disable: %08x\n", units);
+
+ spin_lock(&g->mc_enable_lock);
+ pmc = gk20a_readl(g, mc_enable_r());
+ pmc &= ~units;
+ gk20a_writel(g, mc_enable_r(), pmc);
+ spin_unlock(&g->mc_enable_lock);
+}
+
+void gk20a_enable(struct gk20a *g, u32 units)
+{
+ u32 pmc;
+
+ gk20a_dbg(gpu_dbg_info, "pmc enable: %08x\n", units);
+
+ spin_lock(&g->mc_enable_lock);
+ pmc = gk20a_readl(g, mc_enable_r());
+ pmc |= units;
+ gk20a_writel(g, mc_enable_r(), pmc);
+ spin_unlock(&g->mc_enable_lock);
+ gk20a_readl(g, mc_enable_r());
+
+ udelay(20);
+}
+
+void gk20a_reset(struct gk20a *g, u32 units)
+{
+ gk20a_disable(g, units);
+ udelay(20);
+ gk20a_enable(g, units);
+}
+
+int gk20a_init_gpu_characteristics(struct gk20a *g)
+{
+ struct nvhost_gpu_characteristics *gpu = &g->gpu_characteristics;
+
+ gpu->L2_cache_size = g->ops.ltc.determine_L2_size_bytes(g);
+ gpu->on_board_video_memory_size = 0; /* integrated GPU */
+
+ gpu->num_gpc = g->gr.gpc_count;
+ gpu->num_tpc_per_gpc = g->gr.max_tpc_per_gpc_count;
+
+ gpu->bus_type = NVHOST_GPU_BUS_TYPE_AXI; /* always AXI for now */
+
+ gpu->big_page_size = g->mm.big_page_size;
+ gpu->compression_page_size = g->mm.compression_page_size;
+
+ return 0;
+}
+
+int nvhost_vpr_info_fetch(void)
+{
+ struct gk20a *g = get_gk20a(to_platform_device(
+ bus_find_device_by_name(&platform_bus_type,
+ NULL, "gk20a.0")));
+
+ if (!g) {
+ pr_info("gk20a ins't ready yet\n");
+ return 0;
+ }
+
+ return gk20a_mm_mmu_vpr_info_fetch(g);
+}
+
+static const struct firmware *
+do_request_firmware(struct device *dev, const char *prefix, const char *fw_name)
+{
+ const struct firmware *fw;
+ char *fw_path = NULL;
+ int path_len, err;
+
+ if (prefix) {
+ path_len = strlen(prefix) + strlen(fw_name);
+ path_len += 2; /* for the path separator and the zero terminator */
+
+ fw_path = kzalloc(sizeof(*fw_path) * path_len, GFP_KERNEL);
+ if (!fw_path)
+ return NULL;
+
+ sprintf(fw_path, "%s/%s", prefix, fw_name);
+ fw_name = fw_path;
+ }
+
+ err = request_firmware(&fw, fw_name, dev);
+ kfree(fw_path);
+ if (err)
+ return NULL;
+ return fw;
+}
+
+/* This is a simple wrapper around request_firmware that takes 'fw_name' and
+ * applies an IP-specific relative path prefix to it. The caller is
+ * responsible for calling release_firmware later. */
+const struct firmware *
+gk20a_request_firmware(struct gk20a *g, const char *fw_name)
+{
+ struct device *dev = &g->dev->dev;
+ const struct firmware *fw;
+
+ /* current->fs is NULL when called from SYS_EXIT.
+ Check for that here to prevent a crash in request_firmware. */
+ if (!current->fs || !fw_name)
+ return NULL;
+
+ BUG_ON(!g->ops.name);
+ fw = do_request_firmware(dev, g->ops.name, fw_name);
+
+#ifdef CONFIG_TEGRA_GK20A
+ /* TO BE REMOVED - Support loading from legacy SOC specific path. */
+ if (!fw)
+ fw = nvhost_client_request_firmware(g->dev, fw_name);
+#endif
+
+ if (!fw) {
+ dev_err(dev, "failed to get firmware\n");
+ return NULL;
+ }
+
+ return fw;
+}
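The caller owns the returned blob and must pair the request with release_firmware(). A sketch of a hypothetical caller; the firmware file name is illustrative, not a real blob shipped with this patch:

/* Hypothetical caller; "example_ucode.bin" is an illustrative name. */
static int example_load_blob(struct gk20a *g)
{
	const struct firmware *fw;

	fw = gk20a_request_firmware(g, "example_ucode.bin");
	if (!fw)
		return -ENOENT;

	/* ... consume fw->data (fw->size bytes) ... */

	release_firmware(fw);
	return 0;
}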
+
+module_init(gk20a_init);
+module_exit(gk20a_exit);
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a.h b/drivers/gpu/nvgpu/gk20a/gk20a.h
new file mode 100644
index 000000000000..a9081a9dfb1c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a.h
@@ -0,0 +1,559 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a.h
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_GK20A_H_
+#define _NVHOST_GK20A_H_
+
+
+struct gk20a;
+struct fifo_gk20a;
+struct channel_gk20a;
+struct gr_gk20a;
+struct sim_gk20a;
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/nvhost_gpu_ioctl.h>
+#include <linux/tegra-soc.h>
+
+#include "../../../arch/arm/mach-tegra/iomap.h"
+
+#include "as_gk20a.h"
+#include "clk_gk20a.h"
+#include "fifo_gk20a.h"
+#include "gr_gk20a.h"
+#include "sim_gk20a.h"
+#include "pmu_gk20a.h"
+#include "priv_ring_gk20a.h"
+#include "therm_gk20a.h"
+#include "platform_gk20a.h"
+
+extern struct platform_device tegra_gk20a_device;
+
+bool is_gk20a_module(struct platform_device *dev);
+
+struct cooling_device_gk20a {
+ struct thermal_cooling_device *gk20a_cooling_dev;
+ unsigned int gk20a_freq_state;
+ unsigned int gk20a_freq_table_size;
+ struct gk20a *g;
+};
+
+struct gpu_ops {
+ struct {
+ int (*determine_L2_size_bytes)(struct gk20a *gk20a);
+ void (*set_max_ways_evict_last)(struct gk20a *g, u32 max_ways);
+ int (*init_comptags)(struct gk20a *g, struct gr_gk20a *gr);
+ int (*clear_comptags)(struct gk20a *g, u32 min, u32 max);
+ void (*set_zbc_color_entry)(struct gk20a *g,
+ struct zbc_entry *color_val,
+ u32 index);
+ void (*set_zbc_depth_entry)(struct gk20a *g,
+ struct zbc_entry *depth_val,
+ u32 index);
+ void (*clear_zbc_color_entry)(struct gk20a *g, u32 index);
+ void (*clear_zbc_depth_entry)(struct gk20a *g, u32 index);
+ int (*init_zbc)(struct gk20a *g, struct gr_gk20a *gr);
+ void (*init_cbc)(struct gk20a *g, struct gr_gk20a *gr);
+ void (*sync_debugfs)(struct gk20a *g);
+ void (*elpg_flush)(struct gk20a *g);
+ } ltc;
+ struct {
+ int (*init_fs_state)(struct gk20a *g);
+ void (*access_smpc_reg)(struct gk20a *g, u32 quad, u32 offset);
+ void (*bundle_cb_defaults)(struct gk20a *g);
+ void (*cb_size_default)(struct gk20a *g);
+ int (*calc_global_ctx_buffer_size)(struct gk20a *g);
+ void (*commit_global_attrib_cb)(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, bool patch);
+ void (*commit_global_bundle_cb)(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, u64 size, bool patch);
+ int (*commit_global_cb_manager)(struct gk20a *g,
+ struct channel_gk20a *ch,
+ bool patch);
+ void (*commit_global_pagepool)(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, u32 size, bool patch);
+ void (*init_gpc_mmu)(struct gk20a *g);
+ int (*handle_sw_method)(struct gk20a *g, u32 addr,
+ u32 class_num, u32 offset, u32 data);
+ void (*set_alpha_circular_buffer_size)(struct gk20a *g,
+ u32 data);
+ void (*set_circular_buffer_size)(struct gk20a *g, u32 data);
+ void (*enable_hww_exceptions)(struct gk20a *g);
+ bool (*is_valid_class)(struct gk20a *g, u32 class_num);
+ void (*get_sm_dsm_perf_regs)(struct gk20a *g,
+ u32 *num_sm_dsm_perf_regs,
+ u32 **sm_dsm_perf_regs,
+ u32 *perf_register_stride);
+ void (*get_sm_dsm_perf_ctrl_regs)(struct gk20a *g,
+ u32 *num_sm_dsm_perf_regs,
+ u32 **sm_dsm_perf_regs,
+ u32 *perf_register_stride);
+ void (*set_hww_esr_report_mask)(struct gk20a *g);
+ int (*setup_alpha_beta_tables)(struct gk20a *g,
+ struct gr_gk20a *gr);
+ } gr;
+ const char *name;
+ struct {
+ void (*init_fs_state)(struct gk20a *g);
+ void (*reset)(struct gk20a *g);
+ void (*init_uncompressed_kind_map)(struct gk20a *g);
+ void (*init_kind_attr)(struct gk20a *g);
+ } fb;
+ struct {
+ void (*slcg_gr_load_gating_prod)(struct gk20a *g, bool prod);
+ void (*slcg_perf_load_gating_prod)(struct gk20a *g, bool prod);
+ void (*blcg_gr_load_gating_prod)(struct gk20a *g, bool prod);
+ void (*pg_gr_load_gating_prod)(struct gk20a *g, bool prod);
+ void (*slcg_therm_load_gating_prod)(struct gk20a *g, bool prod);
+ } clock_gating;
+ struct {
+ void (*bind_channel)(struct channel_gk20a *ch_gk20a);
+ } fifo;
+ struct pmu_v {
+ /* used when the zbc table update cmd id changed from ver 0 to ver 1 */
+ u32 cmd_id_zbc_table_update;
+ u32 (*get_pmu_cmdline_args_size)(struct pmu_gk20a *pmu);
+ void (*set_pmu_cmdline_args_cpu_freq)(struct pmu_gk20a *pmu,
+ u32 freq);
+ void * (*get_pmu_cmdline_args_ptr)(struct pmu_gk20a *pmu);
+ u32 (*get_pmu_allocation_struct_size)(struct pmu_gk20a *pmu);
+ void (*set_pmu_allocation_ptr)(struct pmu_gk20a *pmu,
+ void **pmu_alloc_ptr, void *assign_ptr);
+ void (*pmu_allocation_set_dmem_size)(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u16 size);
+ u16 (*pmu_allocation_get_dmem_size)(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr);
+ u32 (*pmu_allocation_get_dmem_offset)(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr);
+ u32 * (*pmu_allocation_get_dmem_offset_addr)(
+ struct pmu_gk20a *pmu, void *pmu_alloc_ptr);
+ void (*pmu_allocation_set_dmem_offset)(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u32 offset);
+ void (*get_pmu_init_msg_pmu_queue_params)(
+ struct pmu_queue *queue, u32 id,
+ void *pmu_init_msg);
+ void *(*get_pmu_msg_pmu_init_msg_ptr)(
+ struct pmu_init_msg *init);
+ u16 (*get_pmu_init_msg_pmu_sw_mg_off)(
+ union pmu_init_msg_pmu *init_msg);
+ u16 (*get_pmu_init_msg_pmu_sw_mg_size)(
+ union pmu_init_msg_pmu *init_msg);
+ u32 (*get_pmu_perfmon_cmd_start_size)(void);
+ int (*get_perfmon_cmd_start_offsetofvar)(
+ enum pmu_perfmon_cmd_start_fields field);
+ void (*perfmon_start_set_cmd_type)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ void (*perfmon_start_set_group_id)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ void (*perfmon_start_set_state_id)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ void (*perfmon_start_set_flags)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ u8 (*perfmon_start_get_flags)(struct pmu_perfmon_cmd *pc);
+ u32 (*get_pmu_perfmon_cmd_init_size)(void);
+ int (*get_perfmon_cmd_init_offsetofvar)(
+ enum pmu_perfmon_cmd_start_fields field);
+ void (*perfmon_cmd_init_set_sample_buffer)(
+ struct pmu_perfmon_cmd *pc, u16 value);
+ void (*perfmon_cmd_init_set_dec_cnt)(
+ struct pmu_perfmon_cmd *pc, u8 value);
+ void (*perfmon_cmd_init_set_base_cnt_id)(
+ struct pmu_perfmon_cmd *pc, u8 value);
+ void (*perfmon_cmd_init_set_samp_period_us)(
+ struct pmu_perfmon_cmd *pc, u32 value);
+ void (*perfmon_cmd_init_set_num_cnt)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ void (*perfmon_cmd_init_set_mov_avg)(struct pmu_perfmon_cmd *pc,
+ u8 value);
+ void *(*get_pmu_seq_in_a_ptr)(
+ struct pmu_sequence *seq);
+ void *(*get_pmu_seq_out_a_ptr)(
+ struct pmu_sequence *seq);
+ } pmu_ver;
+};
+
+struct gk20a {
+ struct platform_device *dev;
+
+ struct resource *reg_mem;
+ void __iomem *regs;
+
+ struct resource *bar1_mem;
+ void __iomem *bar1;
+
+ bool power_on;
+ bool irq_requested;
+
+ struct clk_gk20a clk;
+ struct fifo_gk20a fifo;
+ struct gr_gk20a gr;
+ struct sim_gk20a sim;
+ struct mm_gk20a mm;
+ struct pmu_gk20a pmu;
+ struct cooling_device_gk20a gk20a_cdev;
+
+ /* Save the pmu fw here so that it lives across suspend/resume;
+ pmu suspend destroys all pmu sw/hw state. Loading the pmu
+ fw on resume crashes when the resume is from sys_exit. */
+ const struct firmware *pmu_fw;
+
+ u32 gr_idle_timeout_default;
+ u32 timeouts_enabled;
+
+ bool slcg_enabled;
+ bool blcg_enabled;
+ bool elcg_enabled;
+ bool elpg_enabled;
+ bool aelpg_enabled;
+
+#ifdef CONFIG_DEBUG_FS
+ spinlock_t debugfs_lock;
+ struct dentry *debugfs_ltc_enabled;
+ struct dentry *debugfs_timeouts_enabled;
+ struct dentry *debugfs_gr_idle_timeout_default;
+#endif
+ struct gk20a_ctxsw_ucode_info ctxsw_ucode_info;
+
+ /* held while manipulating # of debug/profiler sessions present */
+ /* also prevents debug sessions from attaching until released */
+ struct mutex dbg_sessions_lock;
+ int dbg_sessions; /* number attached */
+ int dbg_powergating_disabled_refcount; /* refcount for pg disable */
+
+ void (*remove_support)(struct platform_device *);
+
+ u64 pg_ingating_time_us;
+ u64 pg_ungating_time_us;
+ u32 pg_gating_cnt;
+
+ spinlock_t mc_enable_lock;
+
+ struct nvhost_gpu_characteristics gpu_characteristics;
+
+ struct {
+ struct cdev cdev;
+ struct device *node;
+ } channel;
+
+ struct gk20a_as as;
+
+ struct {
+ struct cdev cdev;
+ struct device *node;
+ } ctrl;
+
+ struct {
+ struct cdev cdev;
+ struct device *node;
+ } dbg;
+
+ struct {
+ struct cdev cdev;
+ struct device *node;
+ } prof;
+
+ struct mutex client_lock;
+ int client_refcount; /* open channels and ctrl nodes */
+
+ dev_t cdev_region;
+ struct class *class;
+
+ struct gpu_ops ops;
+
+ int irq_stall;
+ int irq_nonstall;
+
+ struct generic_pm_domain pd;
+
+ struct devfreq *devfreq;
+
+ struct gk20a_scale_profile *scale_profile;
+
+ struct device_dma_parameters dma_parms;
+};
+
+static inline unsigned long gk20a_get_gr_idle_timeout(struct gk20a *g)
+{
+ return g->timeouts_enabled ?
+ g->gr_idle_timeout_default : MAX_SCHEDULE_TIMEOUT;
+}
+
+static inline struct gk20a *get_gk20a(struct platform_device *dev)
+{
+ return gk20a_get_platform(dev)->g;
+}
+
+enum BAR0_DEBUG_OPERATION {
+ BARO_ZERO_NOP = 0,
+ OP_END = 'DONE',
+ BAR0_READ32 = '0R32',
+ BAR0_WRITE32 = '0W32',
+};
+
+struct share_buffer_head {
+ enum BAR0_DEBUG_OPERATION operation;
+/* size of the operation item */
+ u32 size;
+ u32 completed;
+ u32 failed;
+ u64 context;
+ u64 completion_callback;
+};
+
+struct gk20a_cyclestate_buffer_elem {
+ struct share_buffer_head head;
+/* in */
+ u64 p_data;
+ u64 p_done;
+ u32 offset_bar0;
+ u16 first_bit;
+ u16 last_bit;
+/* out */
+/* keep 64 bits to be consistent */
+ u64 data;
+};
+
+/* debug accessories */
+
+#ifdef CONFIG_DEBUG_FS
+ /* debug info, default is compiled-in but effectively disabled (0 mask) */
+ #define GK20A_DEBUG
+ /* e.g.: echo 1 > /d/tegra_host/dbg_mask */
+ #define GK20A_DEFAULT_DBG_MASK 0
+#else
+ /* to enable, manually define the macro below and turn on the mask */
+ /*#define NVHOST_DEBUG*/
+ #define GK20A_DEFAULT_DBG_MASK (dbg_info)
+#endif
+
+enum gk20a_dbg_categories {
+ gpu_dbg_info = BIT(0), /* lightly verbose info */
+ gpu_dbg_fn = BIT(2), /* fn name tracing */
+ gpu_dbg_reg = BIT(3), /* register accesses, very verbose */
+ gpu_dbg_pte = BIT(4), /* gmmu ptes */
+ gpu_dbg_intr = BIT(5), /* interrupts */
+ gpu_dbg_pmu = BIT(6), /* gk20a pmu */
+ gpu_dbg_clk = BIT(7), /* gk20a clk */
+ gpu_dbg_map = BIT(8), /* mem mappings */
+ gpu_dbg_gpu_dbg = BIT(9), /* gpu debugger/profiler */
+ gpu_dbg_mem = BIT(31), /* memory accesses, very verbose */
+};
+
+#if defined(GK20A_DEBUG)
+extern u32 gk20a_dbg_mask;
+extern u32 gk20a_dbg_ftrace;
+#define gk20a_dbg(dbg_mask, format, arg...) \
+do { \
+ if (unlikely((dbg_mask) & gk20a_dbg_mask)) { \
+ if (gk20a_dbg_ftrace) \
+ trace_printk(format "\n", ##arg); \
+ else \
+ pr_info("gk20a %s: " format "\n", \
+ __func__, ##arg); \
+ } \
+} while (0)
+
+#else /* GK20A_DEBUG */
+#define gk20a_dbg(dbg_mask, format, arg...) \
+do { \
+ if (0) \
+ pr_info("gk20a %s: " format "\n", __func__, ##arg);\
+} while (0)
+
+#endif
+
+#define gk20a_err(d, fmt, arg...) \
+ dev_err(d, "%s: " fmt "\n", __func__, ##arg)
+
+#define gk20a_warn(d, fmt, arg...) \
+ dev_warn(d, "%s: " fmt "\n", __func__, ##arg)
+
+#define gk20a_dbg_fn(fmt, arg...) \
+ gk20a_dbg(gpu_dbg_fn, fmt, ##arg)
+
+#define gk20a_dbg_info(fmt, arg...) \
+ gk20a_dbg(gpu_dbg_info, fmt, ##arg)
+
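The categories above are plain bit flags, so the runtime mask is just the OR of the BIT() values; for instance gpu_dbg_intr is BIT(5) = 32, which can be written to the debugfs mask noted above. A tiny usage sketch (the helper is illustrative, not part of the patch):

/* Sketch: writing 32 (0x20) to the debugfs mask enables only the
 * gpu_dbg_intr category, making traces like this one visible. */
static inline void example_trace_intr(u32 mc_intr)
{
	gk20a_dbg(gpu_dbg_intr, "pending intr 0x%08x", mc_intr);
}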
+/* mem access with dbg_mem logging */
+static inline u8 gk20a_mem_rd08(void *ptr, int b)
+{
+ u8 _b = ((const u8 *)ptr)[b];
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, _b);
+#endif
+ return _b;
+}
+static inline u16 gk20a_mem_rd16(void *ptr, int s)
+{
+ u16 _s = ((const u16 *)ptr)[s];
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, _s);
+#endif
+ return _s;
+}
+static inline u32 gk20a_mem_rd32(void *ptr, int w)
+{
+ u32 _w = ((const u32 *)ptr)[w];
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr + sizeof(u32)*w, _w);
+#endif
+ return _w;
+}
+static inline void gk20a_mem_wr08(void *ptr, int b, u8 data)
+{
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u8)*b, data);
+#endif
+ ((u8 *)ptr)[b] = data;
+}
+static inline void gk20a_mem_wr16(void *ptr, int s, u16 data)
+{
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u16)*s, data);
+#endif
+ ((u16 *)ptr)[s] = data;
+}
+static inline void gk20a_mem_wr32(void *ptr, int w, u32 data)
+{
+#ifdef CONFIG_TEGRA_SIMULATION_PLATFORM
+ gk20a_dbg(gpu_dbg_mem, " %p = 0x%x", ptr+sizeof(u32)*w, data);
+#endif
+ ((u32 *)ptr)[w] = data;
+}
+
+/* register accessors */
+static inline void gk20a_writel(struct gk20a *g, u32 r, u32 v)
+{
+ gk20a_dbg(gpu_dbg_reg, " r=0x%x v=0x%x", r, v);
+ writel(v, g->regs + r);
+}
+static inline u32 gk20a_readl(struct gk20a *g, u32 r)
+{
+ u32 v = readl(g->regs + r);
+ gk20a_dbg(gpu_dbg_reg, " r=0x%x v=0x%x", r, v);
+ return v;
+}
+
+static inline void gk20a_bar1_writel(struct gk20a *g, u32 b, u32 v)
+{
+ gk20a_dbg(gpu_dbg_reg, " b=0x%x v=0x%x", b, v);
+ writel(v, g->bar1 + b);
+}
+
+static inline u32 gk20a_bar1_readl(struct gk20a *g, u32 b)
+{
+ u32 v = readl(g->bar1 + b);
+ gk20a_dbg(gpu_dbg_reg, " b=0x%x v=0x%x", b, v);
+ return v;
+}
+
+/* convenience */
+static inline struct device *dev_from_gk20a(struct gk20a *g)
+{
+ return &g->dev->dev;
+}
+static inline struct gk20a *gk20a_from_as(struct gk20a_as *as)
+{
+ return container_of(as, struct gk20a, as);
+}
+static inline u32 u64_hi32(u64 n)
+{
+ return (u32)((n >> 32) & ~(u32)0);
+}
+
+static inline u32 u64_lo32(u64 n)
+{
+ return (u32)(n & ~(u32)0);
+}
+
+static inline u32 set_field(u32 val, u32 mask, u32 field)
+{
+ return ((val & ~mask) | field);
+}
+
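set_field() is meant for read-modify-write of registers together with the generated _m() masks. A sketch, assuming the hw_bus accessors used earlier in this patch are in scope; setting a field to its full mask value enables it:

/* Sketch: enable the pri timeout interrupt with a read-modify-write.
 * Assumes bus_intr_en_0_r()/bus_intr_en_0_pri_timeout_m() are in scope. */
static void example_enable_pri_timeout_intr(struct gk20a *g)
{
	u32 v = gk20a_readl(g, bus_intr_en_0_r());

	/* setting the field to its full mask value turns it on */
	v = set_field(v, bus_intr_en_0_pri_timeout_m(),
		      bus_intr_en_0_pri_timeout_m());
	gk20a_writel(g, bus_intr_en_0_r(), v);
}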
+/* invalidate channel lookup tlb */
+static inline void gk20a_gr_flush_channel_tlb(struct gr_gk20a *gr)
+{
+ spin_lock(&gr->ch_tlb_lock);
+ memset(gr->chid_tlb, 0,
+ sizeof(struct gr_channel_map_tlb_entry) *
+ GR_CHANNEL_MAP_TLB_SIZE);
+ spin_unlock(&gr->ch_tlb_lock);
+}
+
+/* classes that the device supports */
+/* TBD: get these from an open-sourced SDK? */
+enum {
+ KEPLER_C = 0xA297,
+ FERMI_TWOD_A = 0x902D,
+ KEPLER_COMPUTE_A = 0xA0C0,
+ KEPLER_INLINE_TO_MEMORY_A = 0xA040,
+ KEPLER_DMA_COPY_A = 0xA0B5, /* not sure about this one */
+};
+
+#if defined(CONFIG_GK20A_PMU)
+static inline int support_gk20a_pmu(void)
+{
+ return 1;
+}
+#else
+static inline int support_gk20a_pmu(void){return 0;}
+#endif
+
+void gk20a_create_sysfs(struct platform_device *dev);
+
+#ifdef CONFIG_DEBUG_FS
+int clk_gk20a_debugfs_init(struct platform_device *dev);
+#endif
+
+#define GK20A_BAR0_IORESOURCE_MEM 0
+#define GK20A_BAR1_IORESOURCE_MEM 1
+#define GK20A_SIM_IORESOURCE_MEM 2
+
+void gk20a_busy_noresume(struct platform_device *pdev);
+int gk20a_busy(struct platform_device *pdev);
+void gk20a_idle(struct platform_device *pdev);
+int gk20a_channel_busy(struct platform_device *pdev);
+void gk20a_channel_idle(struct platform_device *pdev);
+void gk20a_disable(struct gk20a *g, u32 units);
+void gk20a_enable(struct gk20a *g, u32 units);
+void gk20a_reset(struct gk20a *g, u32 units);
+int gk20a_get_client(struct gk20a *g);
+void gk20a_put_client(struct gk20a *g);
+
+const struct firmware *
+gk20a_request_firmware(struct gk20a *g, const char *fw_name);
+
+#define NVHOST_GPU_ARCHITECTURE_SHIFT 4
+
+/* constructs unique and compact GPUID from nvhost_gpu_characteristics
+ * arch/impl fields */
+#define GK20A_GPUID(arch, impl) ((u32) ((arch) | (impl)))
+
+#define GK20A_GPUID_GK20A \
+ GK20A_GPUID(NVHOST_GPU_ARCH_GK100, NVHOST_GPU_IMPL_GK20A)
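GK20A_GPUID() simply ORs the already-shifted architecture field with the implementation field, matching how gk20a_detect_chip() fills gpu_characteristics. A small illustrative check; the helper name and the concrete 0xE0/0xA values are assumptions, not taken from this patch:

/* Illustrative only: if gpu->arch were 0xE0 (already shifted left by
 * NVHOST_GPU_ARCHITECTURE_SHIFT) and gpu->impl were 0xA, the composed
 * id would be 0xEA. */
static inline bool example_is_gk20a(struct nvhost_gpu_characteristics *gpu)
{
	return GK20A_GPUID(gpu->arch, gpu->impl) == GK20A_GPUID_GK20A;
}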
+
+int gk20a_init_gpu_characteristics(struct gk20a *g);
+
+#endif /* _NVHOST_GK20A_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
new file mode 100644
index 000000000000..32c003b655a6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.c
@@ -0,0 +1,1247 @@
+/*
+ * gk20a allocator
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "gk20a_allocator.h"
+
+static inline void link_block_list(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev,
+ struct rb_node *rb_parent);
+static inline void link_block_rb(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct rb_node **rb_link,
+ struct rb_node *rb_parent);
+static void link_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev, struct rb_node **rb_link,
+ struct rb_node *rb_parent);
+static void insert_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block);
+
+static void unlink_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev);
+static struct gk20a_alloc_block *unlink_blocks(
+ struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev, u32 end);
+
+static struct gk20a_alloc_block *find_block(
+ struct gk20a_allocator *allocator, u32 addr);
+static struct gk20a_alloc_block *find_block_prev(
+ struct gk20a_allocator *allocator, u32 addr,
+ struct gk20a_alloc_block **pprev);
+static struct gk20a_alloc_block *find_block_prepare(
+ struct gk20a_allocator *allocator, u32 addr,
+ struct gk20a_alloc_block **pprev, struct rb_node ***rb_link,
+ struct rb_node **rb_parent);
+
+static u32 check_free_space(u32 addr, u32 limit, u32 len, u32 align);
+static void update_free_addr_cache(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ u32 addr, u32 len, bool free);
+static int find_free_area(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len);
+static int find_free_area_nc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 *len);
+
+static void adjust_block(struct gk20a_alloc_block *block,
+ u32 start, u32 end,
+ struct gk20a_alloc_block *insert);
+static struct gk20a_alloc_block *merge_block(
+ struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block, u32 addr, u32 end);
+static int split_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ u32 addr, int new_below);
+
+static int block_alloc_single_locked(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len);
+static int block_alloc_list_locked(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len,
+ struct gk20a_alloc_block **pblock);
+static int block_free_locked(struct gk20a_allocator *allocator,
+ u32 addr, u32 len);
+static void block_free_list_locked(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *list);
+
+/* link a block into allocator block list */
+static inline void link_block_list(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev,
+ struct rb_node *rb_parent)
+{
+ struct gk20a_alloc_block *next;
+
+ block->prev = prev;
+ if (prev) {
+ next = prev->next;
+ prev->next = block;
+ } else {
+ allocator->block_first = block;
+ if (rb_parent)
+ next = rb_entry(rb_parent,
+ struct gk20a_alloc_block, rb);
+ else
+ next = NULL;
+ }
+ block->next = next;
+ if (next)
+ next->prev = block;
+}
+
+/* link a block into allocator rb tree */
+static inline void link_block_rb(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block, struct rb_node **rb_link,
+ struct rb_node *rb_parent)
+{
+ rb_link_node(&block->rb, rb_parent, rb_link);
+ rb_insert_color(&block->rb, &allocator->rb_root);
+}
+
+/* add a block to allocator with known location */
+static void link_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev, struct rb_node **rb_link,
+ struct rb_node *rb_parent)
+{
+ struct gk20a_alloc_block *next;
+
+ link_block_list(allocator, block, prev, rb_parent);
+ link_block_rb(allocator, block, rb_link, rb_parent);
+ allocator->block_count++;
+
+ next = block->next;
+ allocator_dbg(allocator, "link new block %d:%d between block %d:%d and block %d:%d",
+ block->start, block->end,
+ prev ? prev->start : -1, prev ? prev->end : -1,
+ next ? next->start : -1, next ? next->end : -1);
+}
+
+/* add a block to allocator */
+static void insert_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block)
+{
+ struct gk20a_alloc_block *prev;
+ struct rb_node **rb_link, *rb_parent;
+
+ find_block_prepare(allocator, block->start,
+ &prev, &rb_link, &rb_parent);
+ link_block(allocator, block, prev, rb_link, rb_parent);
+}
+
+/* remove a block from allocator */
+static void unlink_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev)
+{
+ struct gk20a_alloc_block *next = block->next;
+
+ allocator_dbg(allocator, "unlink block %d:%d between block %d:%d and block %d:%d",
+ block->start, block->end,
+ prev ? prev->start : -1, prev ? prev->end : -1,
+ next ? next->start : -1, next ? next->end : -1);
+
+ BUG_ON(block->start < allocator->base);
+ BUG_ON(block->end > allocator->limit);
+
+ if (prev)
+ prev->next = next;
+ else
+ allocator->block_first = next;
+
+ if (next)
+ next->prev = prev;
+ rb_erase(&block->rb, &allocator->rb_root);
+ if (allocator->block_recent == block)
+ allocator->block_recent = prev;
+
+ allocator->block_count--;
+}
+
+/* remove a list of blocks from the allocator. The list can contain both
+ regular blocks and non-contiguous blocks. Non-contiguous blocks are
+ skipped; regular blocks are moved onto a separate list whose head is
+ returned */
+static struct gk20a_alloc_block *
+unlink_blocks(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block,
+ struct gk20a_alloc_block *prev,
+ u32 end)
+{
+ struct gk20a_alloc_block **insertion_point;
+ struct gk20a_alloc_block *last_unfreed_block = prev;
+ struct gk20a_alloc_block *last_freed_block = NULL;
+ struct gk20a_alloc_block *first_freed_block = NULL;
+
+ insertion_point = (prev ? &prev->next : &allocator->block_first);
+ *insertion_point = NULL;
+
+ do {
+ if (!block->nc_block) {
+ allocator_dbg(allocator, "unlink block %d:%d",
+ block->start, block->end);
+ if (last_freed_block)
+ last_freed_block->next = block;
+ block->prev = last_freed_block;
+ rb_erase(&block->rb, &allocator->rb_root);
+ last_freed_block = block;
+ allocator->block_count--;
+ if (!first_freed_block)
+ first_freed_block = block;
+ } else {
+ allocator_dbg(allocator, "skip nc block %d:%d",
+ block->start, block->end);
+ if (!*insertion_point)
+ *insertion_point = block;
+ if (last_unfreed_block)
+ last_unfreed_block->next = block;
+ block->prev = last_unfreed_block;
+ last_unfreed_block = block;
+ }
+ block = block->next;
+ } while (block && block->start < end);
+
+ if (!*insertion_point)
+ *insertion_point = block;
+
+ if (block)
+ block->prev = last_unfreed_block;
+ if (last_unfreed_block)
+ last_unfreed_block->next = block;
+ if (last_freed_block)
+ last_freed_block->next = NULL;
+
+ allocator->block_recent = NULL;
+
+ return first_freed_block;
+}
+
+/* Look up the first block that satisfies addr < block->end;
+ return NULL if none */
+static struct gk20a_alloc_block *
+find_block(struct gk20a_allocator *allocator, u32 addr)
+{
+ struct gk20a_alloc_block *block = allocator->block_recent;
+
+ if (!(block && block->end > addr && block->start <= addr)) {
+ struct rb_node *rb_node;
+
+ rb_node = allocator->rb_root.rb_node;
+ block = NULL;
+
+ while (rb_node) {
+ struct gk20a_alloc_block *block_tmp;
+
+ block_tmp = rb_entry(rb_node,
+ struct gk20a_alloc_block, rb);
+
+ if (block_tmp->end > addr) {
+ block = block_tmp;
+ if (block_tmp->start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
+ if (block)
+ allocator->block_recent = block;
+ }
+ }
+ return block;
+}
+
+/* Same as find_block, but also return a pointer to the previous block */
+static struct gk20a_alloc_block *
+find_block_prev(struct gk20a_allocator *allocator, u32 addr,
+ struct gk20a_alloc_block **pprev)
+{
+ struct gk20a_alloc_block *block = NULL, *prev = NULL;
+ struct rb_node *rb_node;
+ if (!allocator)
+ goto out;
+
+ block = allocator->block_first;
+
+ rb_node = allocator->rb_root.rb_node;
+
+ while (rb_node) {
+ struct gk20a_alloc_block *block_tmp;
+ block_tmp = rb_entry(rb_node, struct gk20a_alloc_block, rb);
+
+ if (addr < block_tmp->end)
+ rb_node = rb_node->rb_left;
+ else {
+ prev = block_tmp;
+ if (!prev->next || addr < prev->next->end)
+ break;
+ rb_node = rb_node->rb_right;
+ }
+ }
+
+out:
+ *pprev = prev;
+ return prev ? prev->next : block;
+}
+
+/* Same as find_block, but also return a pointer to the previous block
+ and return rb_node to prepare for rbtree insertion */
+static struct gk20a_alloc_block *
+find_block_prepare(struct gk20a_allocator *allocator, u32 addr,
+ struct gk20a_alloc_block **pprev, struct rb_node ***rb_link,
+ struct rb_node **rb_parent)
+{
+ struct gk20a_alloc_block *block;
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+
+ __rb_link = &allocator->rb_root.rb_node;
+ rb_prev = __rb_parent = NULL;
+ block = NULL;
+
+ while (*__rb_link) {
+ struct gk20a_alloc_block *block_tmp;
+
+ __rb_parent = *__rb_link;
+ block_tmp = rb_entry(__rb_parent,
+ struct gk20a_alloc_block, rb);
+
+ if (block_tmp->end > addr) {
+ block = block_tmp;
+ if (block_tmp->start <= addr)
+ break;
+ __rb_link = &__rb_parent->rb_left;
+ } else {
+ rb_prev = __rb_parent;
+ __rb_link = &__rb_parent->rb_right;
+ }
+ }
+
+ *pprev = NULL;
+ if (rb_prev)
+ *pprev = rb_entry(rb_prev, struct gk20a_alloc_block, rb);
+ *rb_link = __rb_link;
+ *rb_parent = __rb_parent;
+ return block;
+}
+
+/* return available space */
+static u32 check_free_space(u32 addr, u32 limit, u32 len, u32 align)
+{
+ if (addr >= limit)
+ return 0;
+ if (addr + len <= limit)
+ return len;
+ return (limit - addr) & ~(align - 1);
+}
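A worked example of the three cases, restated as a small userspace program; the addresses and the alignment are illustrative values only:

#include <assert.h>

/* userspace restatement of check_free_space() above, for the example */
static unsigned int free_space(unsigned int addr, unsigned int limit,
			       unsigned int len, unsigned int align)
{
	if (addr >= limit)
		return 0;
	if (addr + len <= limit)
		return len;
	return (limit - addr) & ~(align - 1);
}

int main(void)
{
	assert(free_space(0x1000, 0x3000, 0x1000, 0x100) == 0x1000); /* fits entirely */
	assert(free_space(0x1000, 0x1800, 0x1000, 0x100) == 0x800);  /* clipped to the hole, aligned down */
	assert(free_space(0x3000, 0x3000, 0x1000, 0x100) == 0);      /* no room at all */
	return 0;
}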
+
+/* update first_free_addr/last_free_addr based on a newly freed or
+ allocated address range; called both when freeing and when
+ allocating block(s) */
+static void update_free_addr_cache(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *next,
+ u32 addr, u32 len, bool free)
+{
+ /* update from block free */
+ if (free) {
+ if (allocator->first_free_addr > addr)
+ allocator->first_free_addr = addr;
+ } else { /* update from block alloc */
+ if (allocator->last_free_addr < addr + len)
+ allocator->last_free_addr = addr + len;
+ if (allocator->first_free_addr == addr) {
+ if (!next || next->start > addr + len)
+ allocator->first_free_addr = addr + len;
+ else
+ allocator->first_free_addr = next->end;
+ }
+ }
+
+ if (allocator->first_free_addr > allocator->last_free_addr)
+ allocator->first_free_addr = allocator->last_free_addr;
+}
+
+/* find a free address range for a fixed len */
+static int find_free_area(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len)
+{
+ struct gk20a_alloc_block *block;
+ u32 start_addr, search_base, search_limit;
+
+ /* fixed addr allocation */
+ /* note: constraints for fixed are handled by caller */
+ if (*addr) {
+ block = find_block(allocator, *addr);
+ if (allocator->limit - len >= *addr &&
+ (!block || *addr + len <= block->start)) {
+ update_free_addr_cache(allocator, block,
+ *addr, len, false);
+ return 0;
+ } else
+ return -ENOMEM;
+ }
+
+ if (!allocator->constraint.enable) {
+ search_base = allocator->base;
+ search_limit = allocator->limit;
+ } else {
+ start_addr = *addr = allocator->constraint.base;
+ search_base = allocator->constraint.base;
+ search_limit = allocator->constraint.limit;
+ }
+
+ /* cached_hole_size has max free space up to last_free_addr */
+ if (len > allocator->cached_hole_size)
+ start_addr = *addr = allocator->last_free_addr;
+ else {
+ start_addr = *addr = allocator->base;
+ allocator->cached_hole_size = 0;
+ }
+
+ allocator_dbg(allocator, "start search addr : %d", start_addr);
+
+full_search:
+ for (block = find_block(allocator, *addr);; block = block->next) {
+ if (search_limit - len < *addr) {
+ /* start a new search in case we missed any hole */
+ if (start_addr != search_base) {
+ start_addr = *addr = search_base;
+ allocator->cached_hole_size = 0;
+ allocator_dbg(allocator, "start a new search from base");
+ goto full_search;
+ }
+ return -ENOMEM;
+ }
+ if (!block || *addr + len <= block->start) {
+ update_free_addr_cache(allocator, block,
+ *addr, len, false);
+ allocator_dbg(allocator, "free space from %d, len %d",
+ *addr, len);
+ allocator_dbg(allocator, "next free addr: %d",
+ allocator->last_free_addr);
+ return 0;
+ }
+ if (*addr + allocator->cached_hole_size < block->start)
+ allocator->cached_hole_size = block->start - *addr;
+ *addr = block->end;
+ }
+}
+
+/* find a free address range up to len; the returned length is aligned and
+   may be shorter than len if only a smaller hole is available */
+static int find_free_area_nc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 *len)
+{
+ struct gk20a_alloc_block *block;
+ u32 start_addr;
+ u32 avail_len;
+
+ /* fixed addr allocation */
+ if (*addr) {
+ block = find_block(allocator, *addr);
+ if (allocator->limit - *len >= *addr) {
+ if (!block)
+ return 0;
+
+ avail_len = check_free_space(*addr, block->start,
+ *len, allocator->align);
+ if (avail_len != 0) {
+ update_free_addr_cache(allocator, block,
+ *addr, avail_len, false);
+ allocator_dbg(allocator,
+ "free space between %d, %d, len %d",
+ *addr, block->start, avail_len);
+ allocator_dbg(allocator, "next free addr: %d",
+ allocator->last_free_addr);
+ *len = avail_len;
+ return 0;
+ } else
+ return -ENOMEM;
+ } else
+ return -ENOMEM;
+ }
+
+ start_addr = *addr = allocator->first_free_addr;
+
+ allocator_dbg(allocator, "start search addr : %d", start_addr);
+
+ for (block = find_block(allocator, *addr);; block = block->next) {
+ if (allocator->limit - *len < *addr)
+ return -ENOMEM;
+ if (!block) {
+ update_free_addr_cache(allocator, block,
+ *addr, *len, false);
+ allocator_dbg(allocator, "free space from %d, len %d",
+ *addr, *len);
+ allocator_dbg(allocator, "next free addr: %d",
+ allocator->first_free_addr);
+ return 0;
+ }
+
+ avail_len = check_free_space(*addr, block->start,
+ *len, allocator->align);
+ if (avail_len != 0) {
+ update_free_addr_cache(allocator, block,
+ *addr, avail_len, false);
+ allocator_dbg(allocator, "free space between %d, %d, len %d",
+ *addr, block->start, avail_len);
+ allocator_dbg(allocator, "next free addr: %d",
+ allocator->first_free_addr);
+ *len = avail_len;
+ return 0;
+ }
+ if (*addr + allocator->cached_hole_size < block->start)
+ allocator->cached_hole_size = block->start - *addr;
+ *addr = block->end;
+ }
+}
+
+/* expand/shrink a block to a new start and new end;
+   for the shrink case, split_block provides the block to insert */
+static void adjust_block(struct gk20a_alloc_block *block,
+ u32 start, u32 end, struct gk20a_alloc_block *insert)
+{
+ struct gk20a_allocator *allocator = block->allocator;
+
+ allocator_dbg(allocator, "curr block %d:%d, new start %d, new end %d",
+ block->start, block->end, start, end);
+
+ /* expand */
+ if (!insert) {
+ if (start == block->end) {
+ struct gk20a_alloc_block *next = block->next;
+
+ if (next && end == next->start) {
+ /* ....AAAA.... */
+ /* PPPP....NNNN */
+ /* PPPPPPPPPPPP */
+ unlink_block(allocator, next, block);
+ block->end = next->end;
+ kmem_cache_free(allocator->block_cache, next);
+ } else {
+ /* ....AAAA.... */
+ /* PPPP........ */
+ /* PPPPPPPP.... */
+ block->end = end;
+ }
+ }
+
+ if (end == block->start) {
+ /* ....AAAA.... */
+ /* ........NNNN */
+ /* PP..NNNNNNNN ....NNNNNNNN */
+ block->start = start;
+ }
+ } else { /* shrink */
+ /* BBBBBBBB -> BBBBIIII OR BBBBBBBB -> IIIIBBBB */
+ block->start = start;
+ block->end = end;
+ insert_block(allocator, insert);
+ }
+}
+
+/* given a range [addr, end], merge it with blocks before or after or both
+ if they can be combined into a contiguous block */
+static struct gk20a_alloc_block *
+merge_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *prev, u32 addr, u32 end)
+{
+ struct gk20a_alloc_block *next;
+
+ if (prev)
+ next = prev->next;
+ else
+ next = allocator->block_first;
+
+ allocator_dbg(allocator, "curr block %d:%d", addr, end);
+ if (prev)
+ allocator_dbg(allocator, "prev block %d:%d",
+ prev->start, prev->end);
+ if (next)
+ allocator_dbg(allocator, "next block %d:%d",
+ next->start, next->end);
+
+ /* don't merge with non-contiguous allocation block */
+ if (prev && prev->end == addr && !prev->nc_block) {
+ adjust_block(prev, addr, end, NULL);
+ return prev;
+ }
+
+ /* don't merge with non-contiguous allocation block */
+ if (next && end == next->start && !next->nc_block) {
+ adjust_block(next, addr, end, NULL);
+ return next;
+ }
+
+ return NULL;
+}
+
+/* split a block based on addr. addr must be within (start, end).
+ if new_below == 1, link new block before adjusted current block */
+static int split_block(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block, u32 addr, int new_below)
+{
+ struct gk20a_alloc_block *new_block;
+
+ allocator_dbg(allocator, "start %d, split %d, end %d, new_below %d",
+ block->start, addr, block->end, new_below);
+
+ BUG_ON(!(addr > block->start && addr < block->end));
+
+ new_block = kmem_cache_alloc(allocator->block_cache, GFP_KERNEL);
+ if (!new_block)
+ return -ENOMEM;
+
+ *new_block = *block;
+
+ if (new_below)
+ new_block->end = addr;
+ else
+ new_block->start = addr;
+
+ if (new_below)
+ adjust_block(block, addr, block->end, new_block);
+ else
+ adjust_block(block, block->start, addr, new_block);
+
+ return 0;
+}
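+
+/*
+ * Illustrative example: splitting a block [0, 10) at addr 4 with
+ * new_below == 1 inserts a new block [0, 4) before the adjusted block
+ * [4, 10); with new_below == 0 the new block becomes [4, 10) and the
+ * original block shrinks to [0, 4).
+ */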
+
+/* free a list of blocks */
+static void free_blocks(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block)
+{
+ struct gk20a_alloc_block *curr_block;
+ while (block) {
+ curr_block = block;
+ block = block->next;
+ kmem_cache_free(allocator->block_cache, curr_block);
+ }
+}
+
+/* called with rw_sema acquired */
+static int block_alloc_single_locked(struct gk20a_allocator *allocator,
+ u32 *addr_req, u32 len)
+{
+ struct gk20a_alloc_block *block, *prev;
+ struct rb_node **rb_link, *rb_parent;
+ u32 addr = *addr_req;
+ int err;
+
+ *addr_req = ~0;
+
+ err = find_free_area(allocator, &addr, len);
+ if (err)
+ return err;
+
+ find_block_prepare(allocator, addr, &prev, &rb_link, &rb_parent);
+
+ /* merge requested free space with existing block(s)
+ if they can be combined into one contiguous block */
+ block = merge_block(allocator, prev, addr, addr + len);
+ if (block) {
+ *addr_req = addr;
+ return 0;
+ }
+
+ /* create a new block if cannot merge */
+ block = kmem_cache_zalloc(allocator->block_cache, GFP_KERNEL);
+ if (!block)
+ return -ENOMEM;
+
+ block->allocator = allocator;
+ block->start = addr;
+ block->end = addr + len;
+
+ link_block(allocator, block, prev, rb_link, rb_parent);
+
+ *addr_req = addr;
+
+ return 0;
+}
+
+static int block_alloc_list_locked(struct gk20a_allocator *allocator,
+ u32 *addr_req, u32 nc_len, struct gk20a_alloc_block **pblock)
+{
+ struct gk20a_alloc_block *block;
+ struct gk20a_alloc_block *nc_head = NULL, *nc_prev = NULL;
+ u32 addr = *addr_req, len = nc_len;
+ int err = 0;
+
+ *addr_req = ~0;
+
+ while (nc_len > 0) {
+ err = find_free_area_nc(allocator, &addr, &len);
+ if (err) {
+ allocator_dbg(allocator, "not enough free space");
+ goto clean_up;
+ }
+
+ /* never merge non-contiguous allocation block,
+ just create a new block */
+ block = kmem_cache_zalloc(allocator->block_cache,
+ GFP_KERNEL);
+ if (!block) {
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ block->allocator = allocator;
+ block->start = addr;
+ block->end = addr + len;
+
+ insert_block(allocator, block);
+
+ block->nc_prev = nc_prev;
+ if (nc_prev)
+ nc_prev->nc_next = block;
+ nc_prev = block;
+ block->nc_block = true;
+
+ if (!nc_head)
+ nc_head = block;
+
+ if (*addr_req == ~0)
+ *addr_req = addr;
+
+ addr = 0;
+ nc_len -= len;
+ len = nc_len;
+ allocator_dbg(allocator, "remaining length %d", nc_len);
+ }
+
+clean_up:
+ if (err) {
+ while (nc_head) {
+ unlink_block(allocator, nc_head, nc_head->prev);
+ nc_prev = nc_head;
+ nc_head = nc_head->nc_next;
+ kmem_cache_free(allocator->block_cache, nc_prev);
+ }
+ *pblock = NULL;
+ *addr_req = ~0;
+ } else {
+ *pblock = nc_head;
+ }
+
+ return err;
+}
+
+/* called with rw_sema acquired */
+static int block_free_locked(struct gk20a_allocator *allocator,
+ u32 addr, u32 len)
+{
+ struct gk20a_alloc_block *block, *prev, *last;
+ u32 end;
+ int err;
+
+ /* no block has block->end > addr, already free */
+ block = find_block_prev(allocator, addr, &prev);
+ if (!block)
+ return 0;
+
+ allocator_dbg(allocator, "first block in free range %d:%d",
+ block->start, block->end);
+
+ end = addr + len;
+ /* not in any block, already free */
+ if (block->start >= end)
+ return 0;
+
+ /* don't touch nc_block in range free */
+ if (addr > block->start && !block->nc_block) {
+ int err = split_block(allocator, block, addr, 0);
+ if (err)
+ return err;
+ prev = block;
+ }
+
+ last = find_block(allocator, end);
+ if (last && end > last->start && !last->nc_block) {
+
+ allocator_dbg(allocator, "last block in free range %d:%d",
+ last->start, last->end);
+
+ err = split_block(allocator, last, end, 1);
+ if (err)
+ return err;
+ }
+
+ block = prev ? prev->next : allocator->block_first;
+
+ allocator_dbg(allocator, "first block for free %d:%d",
+ block->start, block->end);
+
+ /* remove blocks between [addr, addr + len) from rb tree
+ and put them in a list */
+ block = unlink_blocks(allocator, block, prev, end);
+ free_blocks(allocator, block);
+
+ update_free_addr_cache(allocator, NULL, addr, len, true);
+
+ return 0;
+}
+
+/* called with rw_sema acquired */
+static void block_free_list_locked(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *list)
+{
+ struct gk20a_alloc_block *block;
+ u32 len;
+
+ update_free_addr_cache(allocator, NULL,
+ list->start, list->end - list->start, true);
+
+ while (list) {
+ block = list;
+ unlink_block(allocator, block, block->prev);
+
+ len = block->end - block->start;
+ if (allocator->cached_hole_size < len)
+ allocator->cached_hole_size = len;
+
+ list = block->nc_next;
+ kmem_cache_free(allocator->block_cache, block);
+ }
+}
+
+static int
+gk20a_allocator_constrain(struct gk20a_allocator *a,
+ bool enable, u32 base, u32 limit)
+{
+ if (enable) {
+ a->constraint.enable = (base >= a->base &&
+ limit <= a->limit);
+ if (!a->constraint.enable)
+ return -EINVAL;
+ a->constraint.base = base;
+ a->constraint.limit = limit;
+ a->first_free_addr = a->last_free_addr = base;
+
+ } else {
+ a->constraint.enable = false;
+ a->first_free_addr = a->last_free_addr = a->base;
+ }
+
+ a->cached_hole_size = 0;
+
+ return 0;
+}
+
+/* init allocator struct */
+int gk20a_allocator_init(struct gk20a_allocator *allocator,
+ const char *name, u32 start, u32 len, u32 align)
+{
+ memset(allocator, 0, sizeof(struct gk20a_allocator));
+
+ strncpy(allocator->name, name, 32);
+
+ allocator->block_cache =
+ kmem_cache_create(allocator->name,
+ sizeof(struct gk20a_alloc_block), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!allocator->block_cache)
+ return -ENOMEM;
+
+ allocator->rb_root = RB_ROOT;
+
+ allocator->base = start;
+ allocator->limit = start + len - 1;
+ allocator->align = align;
+
+ allocator_dbg(allocator, "%s : base %d, limit %d, align %d",
+ allocator->name, allocator->base,
+ allocator->limit, allocator->align);
+
+ allocator->first_free_addr = allocator->last_free_addr = start;
+ allocator->cached_hole_size = len;
+
+ init_rwsem(&allocator->rw_sema);
+
+ allocator->alloc = gk20a_allocator_block_alloc;
+ allocator->alloc_nc = gk20a_allocator_block_alloc_nc;
+ allocator->free = gk20a_allocator_block_free;
+ allocator->free_nc = gk20a_allocator_block_free_nc;
+ allocator->constrain = gk20a_allocator_constrain;
+
+ return 0;
+}
+
+/* destroy allocator, free all remaining blocks if any */
+void gk20a_allocator_destroy(struct gk20a_allocator *allocator)
+{
+ struct gk20a_alloc_block *block, *next;
+ u32 free_count = 0;
+
+ down_write(&allocator->rw_sema);
+
+ for (block = allocator->block_first; block; ) {
+ allocator_dbg(allocator, "free remaining block %d:%d",
+ block->start, block->end);
+ next = block->next;
+ kmem_cache_free(allocator->block_cache, block);
+ free_count++;
+ block = next;
+ }
+
+ up_write(&allocator->rw_sema);
+
+	/* BUG if block_count doesn't match the real number of blocks */
+ BUG_ON(free_count != allocator->block_count);
+
+ kmem_cache_destroy(allocator->block_cache);
+
+ memset(allocator, 0, sizeof(struct gk20a_allocator));
+}
+
+/*
+ * A non-zero *addr requests a fixed address allocation. If *addr == 0, a
+ * suitable address is chosen and returned to the caller in *addr; on
+ * failure *addr is set to ~0.
+ *
+ * Contiguous allocation: allocates one block of contiguous addresses.
+ */
+int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len)
+{
+ int ret;
+#if defined(ALLOCATOR_DEBUG)
+ struct gk20a_alloc_block *block;
+ bool should_fail = false;
+#endif
+
+ allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len);
+
+ if (*addr + len > allocator->limit || /* check addr range */
+ *addr & (allocator->align - 1) || /* check addr alignment */
+ len == 0) /* check len */
+ return -EINVAL;
+
+ if (allocator->constraint.enable &&
+ (*addr + len > allocator->constraint.limit ||
+ *addr > allocator->constraint.base))
+ return -EINVAL;
+
+ len = ALIGN(len, allocator->align);
+ if (!len)
+ return -ENOMEM;
+
+ down_write(&allocator->rw_sema);
+
+#if defined(ALLOCATOR_DEBUG)
+ if (*addr) {
+ for (block = allocator->block_first;
+ block; block = block->next) {
+ if (block->end > *addr && block->start < *addr + len) {
+ should_fail = true;
+ break;
+ }
+ }
+ }
+#endif
+
+ ret = block_alloc_single_locked(allocator, addr, len);
+
+#if defined(ALLOCATOR_DEBUG)
+ if (!ret) {
+ bool allocated = false;
+ BUG_ON(should_fail);
+ BUG_ON(*addr < allocator->base);
+ BUG_ON(*addr + len > allocator->limit);
+ for (block = allocator->block_first;
+ block; block = block->next) {
+ if (!block->nc_block &&
+ block->start <= *addr &&
+ block->end >= *addr + len) {
+ allocated = true;
+ break;
+ }
+ }
+ BUG_ON(!allocated);
+ }
+#endif
+
+ up_write(&allocator->rw_sema);
+
+ allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len);
+
+ return ret;
+}
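+
+/*
+ * Usage sketch (illustrative only; "a" stands for some already initialised
+ * allocator and the length is arbitrary). Passing *addr == 0 asks for a
+ * dynamic allocation, a non-zero aligned *addr asks for that fixed range:
+ *
+ *	u32 addr = 0;
+ *
+ *	if (!gk20a_allocator_block_alloc(a, &addr, 128)) {
+ *		... use [addr, addr + 128) ...
+ *		gk20a_allocator_block_free(a, addr, 128);
+ *	}
+ */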
+
+/*
+ * A non-zero *addr requests a fixed address allocation. If *addr == 0, the
+ * start address of the first block is returned to the caller in *addr; on
+ * failure *addr is set to ~0.
+ *
+ * Non-contiguous allocation: returns a list of blocks whose aggregated
+ * size == len. Each individual block size meets the alignment requirement.
+ */
+int gk20a_allocator_block_alloc_nc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len, struct gk20a_alloc_block **pblock)
+{
+ int ret;
+
+ allocator_dbg(allocator, "[in] addr %d, len %d", *addr, len);
+
+ BUG_ON(pblock == NULL);
+ *pblock = NULL;
+
+ if (*addr + len > allocator->limit || /* check addr range */
+ *addr & (allocator->align - 1) || /* check addr alignment */
+ len == 0) /* check len */
+ return -EINVAL;
+
+ len = ALIGN(len, allocator->align);
+ if (!len)
+ return -ENOMEM;
+
+ down_write(&allocator->rw_sema);
+
+ ret = block_alloc_list_locked(allocator, addr, len, pblock);
+
+#if defined(ALLOCATOR_DEBUG)
+ if (!ret) {
+ struct gk20a_alloc_block *block = *pblock;
+ BUG_ON(!block);
+ BUG_ON(block->start < allocator->base);
+ while (block->nc_next) {
+ BUG_ON(block->end > block->nc_next->start);
+ block = block->nc_next;
+ }
+ BUG_ON(block->end > allocator->limit);
+ }
+#endif
+
+ up_write(&allocator->rw_sema);
+
+ allocator_dbg(allocator, "[out] addr %d, len %d", *addr, len);
+
+ return ret;
+}
+
+/* free all blocks between start and end */
+int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
+ u32 addr, u32 len)
+{
+ int ret;
+
+ allocator_dbg(allocator, "[in] addr %d, len %d", addr, len);
+
+ if (addr + len > allocator->limit || /* check addr range */
+ addr < allocator->base ||
+ addr & (allocator->align - 1)) /* check addr alignment */
+ return -EINVAL;
+
+ len = ALIGN(len, allocator->align);
+ if (!len)
+ return -EINVAL;
+
+ down_write(&allocator->rw_sema);
+
+ ret = block_free_locked(allocator, addr, len);
+
+#if defined(ALLOCATOR_DEBUG)
+ if (!ret) {
+ struct gk20a_alloc_block *block;
+ for (block = allocator->block_first;
+ block; block = block->next) {
+ if (!block->nc_block)
+ BUG_ON(block->start >= addr &&
+ block->end <= addr + len);
+ }
+ }
+#endif
+ up_write(&allocator->rw_sema);
+
+ allocator_dbg(allocator, "[out] addr %d, len %d", addr, len);
+
+ return ret;
+}
+
+/* free non-contiguous allocation block list */
+void gk20a_allocator_block_free_nc(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block)
+{
+ /* nothing to free */
+ if (!block)
+ return;
+
+ down_write(&allocator->rw_sema);
+ block_free_list_locked(allocator, block);
+ up_write(&allocator->rw_sema);
+}
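+
+/*
+ * Usage sketch for the non-contiguous path (illustrative only; "a" is a
+ * hypothetical, already initialised allocator). The caller keeps the
+ * returned block list and passes it back here rather than freeing by
+ * address range:
+ *
+ *	struct gk20a_alloc_block *list;
+ *	u32 addr = 0;
+ *
+ *	if (!gk20a_allocator_block_alloc_nc(a, &addr, 4096, &list)) {
+ *		... walk each [block->start, block->end) via nc_next ...
+ *		gk20a_allocator_block_free_nc(a, list);
+ *	}
+ */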
+
+#if defined(ALLOCATOR_DEBUG)
+
+#include <linux/random.h>
+
+/* test suite */
+void gk20a_allocator_test(void)
+{
+ struct gk20a_allocator allocator;
+ struct gk20a_alloc_block *list[5];
+ u32 addr, len;
+ u32 count;
+ int n;
+
+ gk20a_allocator_init(&allocator, "test", 0, 10, 1);
+
+ /* alloc/free a single block in the beginning */
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_block_free(&allocator, addr, 2);
+ gk20a_allocator_dump(&allocator);
+ /* alloc/free a single block in the middle */
+ addr = 4;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_block_free(&allocator, addr, 2);
+ gk20a_allocator_dump(&allocator);
+ /* alloc/free a single block in the end */
+ addr = 8;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_block_free(&allocator, addr, 2);
+ gk20a_allocator_dump(&allocator);
+
+ /* allocate contiguous blocks */
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 4);
+ gk20a_allocator_dump(&allocator);
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 4);
+ gk20a_allocator_dump(&allocator);
+
+ /* no free space */
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+
+ /* free in the end */
+ gk20a_allocator_block_free(&allocator, 8, 2);
+ gk20a_allocator_dump(&allocator);
+ /* free in the beginning */
+ gk20a_allocator_block_free(&allocator, 0, 2);
+ gk20a_allocator_dump(&allocator);
+ /* free in the middle */
+ gk20a_allocator_block_free(&allocator, 4, 2);
+ gk20a_allocator_dump(&allocator);
+
+ /* merge case PPPPAAAANNNN */
+ addr = 4;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ /* merge case ....AAAANNNN */
+ addr = 0;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ /* merge case PPPPAAAA.... */
+ addr = 8;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+
+ /* test free across multiple blocks and split */
+ gk20a_allocator_block_free(&allocator, 2, 2);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_block_free(&allocator, 6, 2);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_block_free(&allocator, 1, 8);
+ gk20a_allocator_dump(&allocator);
+
+ /* test non-contiguous allocation */
+ addr = 4;
+ gk20a_allocator_block_alloc(&allocator, &addr, 2);
+ gk20a_allocator_dump(&allocator);
+ addr = 0;
+ gk20a_allocator_block_alloc_nc(&allocator, &addr, 5, &list[0]);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_dump_nc_list(&allocator, list[0]);
+
+	/* test free a range overlapping non-contiguous blocks */
+ gk20a_allocator_block_free(&allocator, 2, 6);
+ gk20a_allocator_dump(&allocator);
+
+ /* test non-contiguous free */
+ gk20a_allocator_block_free_nc(&allocator, list[0]);
+ gk20a_allocator_dump(&allocator);
+
+ gk20a_allocator_destroy(&allocator);
+
+ /* random stress test */
+ gk20a_allocator_init(&allocator, "test", 4096, 4096 * 1024, 4096);
+ for (;;) {
+ pr_debug("alloc tests...\n");
+ for (count = 0; count < 50; count++) {
+ addr = 0;
+ len = random32() % (4096 * 1024 / 16);
+ gk20a_allocator_block_alloc(&allocator, &addr, len);
+ gk20a_allocator_dump(&allocator);
+ }
+
+ pr_debug("free tests...\n");
+ for (count = 0; count < 30; count++) {
+ addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
+ len = random32() % (4096 * 1024 / 16);
+ gk20a_allocator_block_free(&allocator, addr, len);
+ gk20a_allocator_dump(&allocator);
+ }
+
+ pr_debug("non-contiguous alloc tests...\n");
+ for (n = 0; n < 5; n++) {
+ addr = 0;
+ len = random32() % (4096 * 1024 / 8);
+ gk20a_allocator_block_alloc_nc(&allocator, &addr,
+ len, &list[n]);
+ gk20a_allocator_dump(&allocator);
+ gk20a_allocator_dump_nc_list(&allocator, list[n]);
+ }
+
+ pr_debug("free tests...\n");
+ for (count = 0; count < 10; count++) {
+ addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
+ len = random32() % (4096 * 1024 / 16);
+ gk20a_allocator_block_free(&allocator, addr, len);
+ gk20a_allocator_dump(&allocator);
+ }
+
+ pr_debug("non-contiguous free tests...\n");
+ for (n = 4; n >= 0; n--) {
+ gk20a_allocator_dump_nc_list(&allocator, list[n]);
+ gk20a_allocator_block_free_nc(&allocator, list[n]);
+ gk20a_allocator_dump(&allocator);
+ }
+
+ pr_debug("fixed addr alloc tests...\n");
+ for (count = 0; count < 10; count++) {
+ addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
+ len = random32() % (4096 * 1024 / 32);
+ gk20a_allocator_block_alloc(&allocator, &addr, len);
+ gk20a_allocator_dump(&allocator);
+ }
+
+ pr_debug("free tests...\n");
+ for (count = 0; count < 10; count++) {
+ addr = (random32() % (4096 * 1024)) & ~(4096 - 1);
+ len = random32() % (4096 * 1024 / 16);
+ gk20a_allocator_block_free(&allocator, addr, len);
+ gk20a_allocator_dump(&allocator);
+ }
+ }
+ gk20a_allocator_destroy(&allocator);
+}
+
+#endif /* ALLOCATOR_DEBUG */
+
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
new file mode 100644
index 000000000000..dba397e2481c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_allocator.h
@@ -0,0 +1,177 @@
+/*
+ * gk20a allocator
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __NVHOST_ALLOCATOR_H__
+#define __NVHOST_ALLOCATOR_H__
+
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+
+/* #define ALLOCATOR_DEBUG */
+
+struct allocator_block;
+
+/* main struct */
+struct gk20a_allocator {
+
+ char name[32]; /* name for allocator */
+ struct rb_root rb_root; /* rb tree root for blocks */
+
+ u32 base; /* min value of this linear space */
+ u32 limit; /* max value = limit - 1 */
+ u32 align; /* alignment size, power of 2 */
+
+ struct gk20a_alloc_block *block_first; /* first block in list */
+ struct gk20a_alloc_block *block_recent; /* last visited block */
+
+	u32 first_free_addr;		/* first free addr, non-contiguous
+ allocation preferred start,
+ in order to pick up small holes */
+ u32 last_free_addr; /* last free addr, contiguous
+ allocation preferred start */
+ u32 cached_hole_size; /* max free hole size up to
+ last_free_addr */
+ u32 block_count; /* number of blocks */
+
+ struct rw_semaphore rw_sema; /* lock */
+ struct kmem_cache *block_cache; /* slab cache */
+
+ /* if enabled, constrain to [base, limit) */
+ struct {
+ bool enable;
+ u32 base;
+ u32 limit;
+ } constraint;
+
+ int (*alloc)(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len);
+ int (*alloc_nc)(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len,
+ struct gk20a_alloc_block **pblock);
+ int (*free)(struct gk20a_allocator *allocator,
+ u32 addr, u32 len);
+ void (*free_nc)(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block);
+
+ int (*constrain)(struct gk20a_allocator *a,
+ bool enable,
+ u32 base, u32 limit);
+};
+
+/* a block of linear space range [start, end) */
+struct gk20a_alloc_block {
+ struct gk20a_allocator *allocator; /* parent allocator */
+ struct rb_node rb; /* rb tree node */
+
+ u32 start; /* linear space range
+ [start, end) */
+ u32 end;
+
+ void *priv; /* backing structure for this
+ linear space block
+ page table, comp tag, etc */
+
+ struct gk20a_alloc_block *prev; /* prev block with lower address */
+ struct gk20a_alloc_block *next; /* next block with higher address */
+
+ bool nc_block;
+ struct gk20a_alloc_block *nc_prev; /* prev block for
+ non-contiguous allocation */
+ struct gk20a_alloc_block *nc_next; /* next block for
+ non-contiguous allocation */
+};
+
+int gk20a_allocator_init(struct gk20a_allocator *allocator,
+ const char *name, u32 base, u32 size, u32 align);
+void gk20a_allocator_destroy(struct gk20a_allocator *allocator);
+
+int gk20a_allocator_block_alloc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len);
+int gk20a_allocator_block_alloc_nc(struct gk20a_allocator *allocator,
+ u32 *addr, u32 len,
+ struct gk20a_alloc_block **pblock);
+
+int gk20a_allocator_block_free(struct gk20a_allocator *allocator,
+ u32 addr, u32 len);
+void gk20a_allocator_block_free_nc(struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block);
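+
+/*
+ * Example (sketch only; the window values below are made up, and addr/len
+ * stand for some u32 variables). The same operations are also reachable
+ * through the function pointers stored in the allocator, e.g. to
+ * temporarily constrain allocations to a sub-range:
+ *
+ *	allocator->constrain(allocator, true, 0x1000, 0x2000);
+ *	allocator->alloc(allocator, &addr, len);
+ *	allocator->constrain(allocator, false, 0, 0);
+ */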
+
+#if defined(ALLOCATOR_DEBUG)
+
+#define allocator_dbg(allocator, format, arg...) \
+do {								\
+	if (1)							\
+		pr_debug("gk20a_allocator (%s) %s: " format "\n",\
+			allocator->name, __func__, ##arg);\
+} while (0)
+
+static inline void
+gk20a_allocator_dump(struct gk20a_allocator *allocator) {
+ struct gk20a_alloc_block *block;
+ u32 count = 0;
+
+ down_read(&allocator->rw_sema);
+ for (block = allocator->block_first; block; block = block->next) {
+ allocator_dbg(allocator, "block %d - %d:%d, nc %d",
+ count++, block->start, block->end, block->nc_block);
+
+ if (block->prev)
+ BUG_ON(block->prev->end > block->start);
+ if (block->next)
+ BUG_ON(block->next->start < block->end);
+ }
+ allocator_dbg(allocator, "tracked count %d, actual count %d",
+ allocator->block_count, count);
+ allocator_dbg(allocator, "first block %d:%d",
+ allocator->block_first ? allocator->block_first->start : -1,
+ allocator->block_first ? allocator->block_first->end : -1);
+ allocator_dbg(allocator, "first free addr %d",
+ allocator->first_free_addr);
+ allocator_dbg(allocator, "last free addr %d",
+ allocator->last_free_addr);
+ allocator_dbg(allocator, "cached hole size %d",
+ allocator->cached_hole_size);
+ up_read(&allocator->rw_sema);
+
+ BUG_ON(count != allocator->block_count);
+}
+
+static inline void
+gk20a_allocator_dump_nc_list(
+ struct gk20a_allocator *allocator,
+ struct gk20a_alloc_block *block)
+{
+ down_read(&allocator->rw_sema);
+ while (block) {
+ pr_debug("non-contiguous block %d:%d\n",
+ block->start, block->end);
+ block = block->nc_next;
+ }
+ up_read(&allocator->rw_sema);
+}
+
+void gk20a_allocator_test(void);
+
+#else /* ALLOCATOR_DEBUG */
+
+#define allocator_dbg(format, arg...)
+
+#endif /* ALLOCATOR_DEBUG */
+
+#endif /*__NVHOST_ALLOCATOR_H__ */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c
new file mode 100644
index 000000000000..c6478a5e1328
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * This file is autogenerated. Do not edit.
+ */
+
+#ifndef __gk20a_gating_reglist_h__
+#define __gk20a_gating_reglist_h__
+
+#include <linux/types.h>
+#include "gk20a_gating_reglist.h"
+
+struct gating_desc {
+ u32 addr;
+ u32 prod;
+ u32 disable;
+};
+/* slcg gr */
+const struct gating_desc gk20a_slcg_gr[] = {
+ {.addr = 0x004041f4, .prod = 0x00000000, .disable = 0x03fffffe},
+ {.addr = 0x00409894, .prod = 0x00000040, .disable = 0x0003fffe},
+ {.addr = 0x004078c4, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00406004, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00405864, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00405910, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00408044, .prod = 0x00000000, .disable = 0x000007fe},
+ {.addr = 0x00407004, .prod = 0x00000000, .disable = 0x0000001e},
+ {.addr = 0x0041a894, .prod = 0x00000000, .disable = 0x0003fffe},
+ {.addr = 0x00418504, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x0041860c, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x0041868c, .prod = 0x00000000, .disable = 0x0000001e},
+ {.addr = 0x0041871c, .prod = 0x00000000, .disable = 0x0000003e},
+ {.addr = 0x00418388, .prod = 0x00000000, .disable = 0x00000001},
+ {.addr = 0x0041882c, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00418bc0, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00418974, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00418c74, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00418cf4, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00418d74, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00418f10, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00418e10, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00419024, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00419a44, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419a4c, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00419a54, .prod = 0x00000000, .disable = 0x0000003e},
+ {.addr = 0x00419a5c, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419a64, .prod = 0x00000000, .disable = 0x000001fe},
+ {.addr = 0x00419a6c, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419a74, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419a7c, .prod = 0x00000000, .disable = 0x0000003e},
+ {.addr = 0x00419a84, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419ad0, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x0041986c, .prod = 0x0000dfc0, .disable = 0x00fffffe},
+ {.addr = 0x00419cd8, .prod = 0x00000000, .disable = 0x001ffffe},
+ {.addr = 0x00419ce0, .prod = 0x00000000, .disable = 0x001ffffe},
+ {.addr = 0x00419c74, .prod = 0x00000000, .disable = 0x0000001e},
+ {.addr = 0x00419fd4, .prod = 0x00000000, .disable = 0x0003fffe},
+ {.addr = 0x00419fdc, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00419fe4, .prod = 0x00000000, .disable = 0x0000000e},
+ {.addr = 0x00419ff4, .prod = 0x00000000, .disable = 0x00003ffe},
+ {.addr = 0x00419ffc, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x0041be2c, .prod = 0x020bbfc0, .disable = 0xfffffffe},
+ {.addr = 0x0041bfec, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x0041bed4, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00408814, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x0040881c, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408a84, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408a8c, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408a94, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408a9c, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408aa4, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408aac, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x004089ac, .prod = 0x00000000, .disable = 0x0001fffe},
+ {.addr = 0x00408a24, .prod = 0x00000000, .disable = 0x000001ff},
+ {.addr = 0x0017e050, .prod = 0x00000000, .disable = 0x00fffffe},
+ {.addr = 0x001200a8, .prod = 0x00000000, .disable = 0x00000001},
+ {.addr = 0x0010e48c, .prod = 0x00000000, .disable = 0x0000003e},
+ {.addr = 0x00001c04, .prod = 0x00000000, .disable = 0x000000fe},
+ {.addr = 0x00106f28, .prod = 0x00000040, .disable = 0x000007fe},
+ {.addr = 0x000206b8, .prod = 0x00000000, .disable = 0x0000000f},
+ {.addr = 0x0017ea98, .prod = 0x00000000, .disable = 0xfffffffe},
+ {.addr = 0x00106f28, .prod = 0x00000040, .disable = 0x000007fe},
+ {.addr = 0x00120048, .prod = 0x00000000, .disable = 0x00000049},
+};
+
+/* slcg perf */
+const struct gating_desc gk20a_slcg_perf[] = {
+ {.addr = 0x001be018, .prod = 0x000001ff, .disable = 0x00000000},
+ {.addr = 0x001bc018, .prod = 0x000001ff, .disable = 0x00000000},
+ {.addr = 0x001b8018, .prod = 0x000001ff, .disable = 0x00000000},
+ {.addr = 0x001b4124, .prod = 0x00000001, .disable = 0x00000000},
+};
+
+/* blcg gr */
+const struct gating_desc gk20a_blcg_gr[] = {
+ {.addr = 0x004041f0, .prod = 0x00004046, .disable = 0x00000000},
+ {.addr = 0x00409890, .prod = 0x0000007f, .disable = 0x00000000},
+ {.addr = 0x004098b0, .prod = 0x0000007f, .disable = 0x00000000},
+ {.addr = 0x004078c0, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x00406000, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00405860, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x0040590c, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00408040, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00407000, .prod = 0x00004041, .disable = 0x00000000},
+ {.addr = 0x00405bf0, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x0041a890, .prod = 0x0000007f, .disable = 0x00000000},
+ {.addr = 0x0041a8b0, .prod = 0x0000007f, .disable = 0x00000000},
+ {.addr = 0x00418500, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00418608, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00418688, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00418718, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x00418828, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x00418bbc, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00418970, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00418c70, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00418cf0, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00418d70, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00418f0c, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00418e0c, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00419020, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419038, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x00419a40, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a48, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a50, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a58, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a60, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a68, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a70, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a78, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419a80, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419acc, .prod = 0x00004047, .disable = 0x00000000},
+ {.addr = 0x00419868, .prod = 0x00000043, .disable = 0x00000000},
+ {.addr = 0x00419cd4, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419cdc, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419c70, .prod = 0x00004045, .disable = 0x00000000},
+ {.addr = 0x00419fd0, .prod = 0x00004043, .disable = 0x00000000},
+ {.addr = 0x00419fd8, .prod = 0x00004045, .disable = 0x00000000},
+ {.addr = 0x00419fe0, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419fe8, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419ff0, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00419ff8, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00419f90, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x0041be28, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x0041bfe8, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x0041bed0, .prod = 0x00004044, .disable = 0x00000000},
+ {.addr = 0x00408810, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408818, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408a80, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408a88, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408a90, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408a98, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408aa0, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x00408aa8, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x004089a8, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x004089b0, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x004089b8, .prod = 0x00004042, .disable = 0x00000000},
+ {.addr = 0x0017ea60, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x0017ea68, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x00100d30, .prod = 0x0000c242, .disable = 0x00000000},
+ {.addr = 0x00100d48, .prod = 0x0000c242, .disable = 0x00000000},
+ {.addr = 0x00100d3c, .prod = 0x00000242, .disable = 0x00000000},
+ {.addr = 0x0017ea78, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x0017e040, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x00100d1c, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x00106f24, .prod = 0x0000c242, .disable = 0x00000000},
+ {.addr = 0x0041be00, .prod = 0x00000004, .disable = 0x00000007},
+ {.addr = 0x00100d10, .prod = 0x0000c242, .disable = 0x00000000},
+ {.addr = 0x0017ea70, .prod = 0x00000044, .disable = 0x00000000},
+ {.addr = 0x00001c00, .prod = 0x00000042, .disable = 0x00000000},
+ {.addr = 0x00100c98, .prod = 0x00000242, .disable = 0x00000000},
+ {.addr = 0x0017e030, .prod = 0x00000044, .disable = 0x00000000},
+};
+
+/* pg gr */
+const struct gating_desc gk20a_pg_gr[] = {
+ {.addr = 0x004041f8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x004041fc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00409898, .prod = 0x10140000, .disable = 0x00000000},
+ {.addr = 0x0040989c, .prod = 0xff00000a, .disable = 0x00000000},
+ {.addr = 0x004078c8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x004078cc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00406008, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0040600c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00405868, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0040586c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00405914, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00405924, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408048, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0040804c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00407008, .prod = 0x10140000, .disable = 0x00000000},
+ {.addr = 0x0040700c, .prod = 0xff00000a, .disable = 0x00000000},
+ {.addr = 0x00405bf8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00405bfc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x0041a898, .prod = 0x10140000, .disable = 0x00000000},
+ {.addr = 0x0041a89c, .prod = 0xff00000a, .disable = 0x00000000},
+ {.addr = 0x00418510, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418514, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418610, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418614, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418690, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418694, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418720, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418724, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418840, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418844, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418bc4, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418bc8, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418978, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0041897c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418c78, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418c7c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418cf8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418cfc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418d78, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418d7c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418f14, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418f18, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00418e14, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00418e18, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419030, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419050, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419a88, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419a8c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419a90, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419a94, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419a98, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419a9c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419aa0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419aa4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419ad4, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419ad8, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419870, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419874, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419ce4, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419cf0, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419c78, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419c7c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fa0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fa4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fa8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fac, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fb0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fb4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fb8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fbc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fc0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fc4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00419fc8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00419fcc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x0041be30, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0041be34, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x0041bff0, .prod = 0x10747c00, .disable = 0x00000000},
+ {.addr = 0x0041bff4, .prod = 0xff00000a, .disable = 0x00000000},
+ {.addr = 0x0041bed8, .prod = 0x10240a00, .disable = 0x00000000},
+ {.addr = 0x0041bee0, .prod = 0xff00000a, .disable = 0x00000000},
+ {.addr = 0x00408820, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408824, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408828, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x0040882c, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ac0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408ac4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ac8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408acc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ad0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408ad4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ad8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408adc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ae0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408ae4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x00408ae8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x00408aec, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x004089c0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x004089c4, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x004089c8, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x004089cc, .prod = 0xff00a725, .disable = 0x00000000},
+ {.addr = 0x004089d0, .prod = 0x10940000, .disable = 0x00000000},
+ {.addr = 0x004089d4, .prod = 0xff00a725, .disable = 0x00000000},
+};
+
+/* therm gr */
+const struct gating_desc gk20a_slcg_therm[] = {
+ {.addr = 0x000206b8, .prod = 0x00000000, .disable = 0x0000000f},
+};
+
+/* gating register load functions */
+void gr_gk20a_slcg_gr_load_gating_prod(struct gk20a *g,
+ bool prod)
+{
+ u32 i;
+ u32 size = sizeof(gk20a_slcg_gr) / sizeof(struct gating_desc);
+ for (i = 0; i < size; i++) {
+ if (prod)
+ gk20a_writel(g, gk20a_slcg_gr[i].addr,
+ gk20a_slcg_gr[i].prod);
+ else
+ gk20a_writel(g, gk20a_slcg_gr[i].addr,
+ gk20a_slcg_gr[i].disable);
+ }
+}
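+
+/*
+ * Sketch of a typical call site (hypothetical here; the real callers live
+ * elsewhere in the driver): passing prod == true writes the production
+ * values, false writes the disable values, e.g.
+ *
+ *	gr_gk20a_slcg_gr_load_gating_prod(g, true);
+ */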
+
+void gr_gk20a_slcg_perf_load_gating_prod(struct gk20a *g,
+ bool prod)
+{
+ u32 i;
+ u32 size = sizeof(gk20a_slcg_perf) / sizeof(struct gating_desc);
+ for (i = 0; i < size; i++) {
+ if (prod)
+ gk20a_writel(g, gk20a_slcg_perf[i].addr,
+ gk20a_slcg_perf[i].prod);
+ else
+ gk20a_writel(g, gk20a_slcg_perf[i].addr,
+ gk20a_slcg_perf[i].disable);
+ }
+}
+
+void gr_gk20a_blcg_gr_load_gating_prod(struct gk20a *g,
+ bool prod)
+{
+ u32 i;
+ u32 size = sizeof(gk20a_blcg_gr) / sizeof(struct gating_desc);
+ for (i = 0; i < size; i++) {
+ if (prod)
+ gk20a_writel(g, gk20a_blcg_gr[i].addr,
+ gk20a_blcg_gr[i].prod);
+ else
+ gk20a_writel(g, gk20a_blcg_gr[i].addr,
+ gk20a_blcg_gr[i].disable);
+ }
+}
+
+void gr_gk20a_pg_gr_load_gating_prod(struct gk20a *g,
+ bool prod)
+{
+ u32 i;
+ u32 size = sizeof(gk20a_pg_gr) / sizeof(struct gating_desc);
+ for (i = 0; i < size; i++) {
+ if (prod)
+ gk20a_writel(g, gk20a_pg_gr[i].addr,
+ gk20a_pg_gr[i].prod);
+ else
+ gk20a_writel(g, gk20a_pg_gr[i].addr,
+ gk20a_pg_gr[i].disable);
+ }
+}
+
+void gr_gk20a_slcg_therm_load_gating_prod(struct gk20a *g,
+ bool prod)
+{
+ u32 i;
+ u32 size = sizeof(gk20a_slcg_therm) / sizeof(struct gating_desc);
+ for (i = 0; i < size; i++) {
+ if (prod)
+ gk20a_writel(g, gk20a_slcg_therm[i].addr,
+ gk20a_slcg_therm[i].prod);
+ else
+ gk20a_writel(g, gk20a_slcg_therm[i].addr,
+ gk20a_slcg_therm[i].disable);
+ }
+}
+
+#endif /* __gk20a_gating_reglist_h__ */
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h
new file mode 100644
index 000000000000..40a6c545cf39
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_gating_reglist.h
@@ -0,0 +1,39 @@
+/*
+ * drivers/video/tegra/host/gk20a/gk20a_gating_reglist.h
+ *
+ * Copyright (c) 2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * This file is autogenerated. Do not edit.
+ */
+
+#include "gk20a.h"
+
+void gr_gk20a_slcg_gr_load_gating_prod(struct gk20a *g,
+ bool prod);
+
+void gr_gk20a_slcg_perf_load_gating_prod(struct gk20a *g,
+ bool prod);
+
+void gr_gk20a_blcg_gr_load_gating_prod(struct gk20a *g,
+ bool prod);
+
+void gr_gk20a_pg_gr_load_gating_prod(struct gk20a *g,
+ bool prod);
+
+void gr_gk20a_slcg_therm_load_gating_prod(struct gk20a *g,
+ bool prod);
+
+
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_scale.c b/drivers/gpu/nvgpu/gk20a/gk20a_scale.c
new file mode 100644
index 000000000000..d1fd71fe4e36
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_scale.c
@@ -0,0 +1,358 @@
+/*
+ * gk20a clock scaling profile
+ *
+ * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/devfreq.h>
+#include <linux/debugfs.h>
+#include <linux/types.h>
+#include <linux/clk.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/clk/tegra.h>
+#include <linux/tegra-soc.h>
+#include <linux/platform_data/tegra_edp.h>
+#include <linux/pm_qos.h>
+
+#include <governor.h>
+
+#include "gk20a.h"
+#include "pmu_gk20a.h"
+#include "clk_gk20a.h"
+#include "gk20a_scale.h"
+
+static ssize_t gk20a_scale_load_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct gk20a *g = get_gk20a(pdev);
+ u32 busy_time;
+ ssize_t res;
+
+ if (!g->power_on) {
+ busy_time = 0;
+ } else {
+ gk20a_busy(g->dev);
+ gk20a_pmu_load_norm(g, &busy_time);
+ gk20a_idle(g->dev);
+ }
+
+ res = snprintf(buf, PAGE_SIZE, "%u\n", busy_time);
+
+ return res;
+}
+
+static DEVICE_ATTR(load, S_IRUGO, gk20a_scale_load_show, NULL);
+
+/*
+ * gk20a_scale_qos_notify()
+ *
+ * This function is called when the minimum QoS requirement for the device
+ * has changed. The function calls the postscale callback if one is defined.
+ */
+
+static int gk20a_scale_qos_notify(struct notifier_block *nb,
+ unsigned long n, void *p)
+{
+ struct gk20a_scale_profile *profile =
+ container_of(nb, struct gk20a_scale_profile,
+ qos_notify_block);
+ struct gk20a_platform *platform = platform_get_drvdata(profile->pdev);
+ struct gk20a *g = get_gk20a(profile->pdev);
+ unsigned long freq;
+
+ if (!platform->postscale)
+ return NOTIFY_OK;
+
+ /* get the frequency requirement. if devfreq is enabled, check if it
+ * has higher demand than qos */
+ freq = gk20a_clk_round_rate(g, pm_qos_request(platform->qos_id));
+ if (g->devfreq)
+ freq = max(g->devfreq->previous_freq, freq);
+
+ platform->postscale(profile->pdev, freq);
+
+ return NOTIFY_OK;
+}
+
+/*
+ * gk20a_scale_make_freq_table(profile)
+ *
+ * This function initialises the frequency table for the given device profile
+ */
+
+static int gk20a_scale_make_freq_table(struct gk20a_scale_profile *profile)
+{
+ struct gk20a *g = get_gk20a(profile->pdev);
+ unsigned long *freqs;
+ int num_freqs, err;
+
+ /* make sure the clock is available */
+ if (!gk20a_clk_get(g))
+ return -ENOSYS;
+
+ /* get gpu dvfs table */
+ err = tegra_dvfs_get_freqs(clk_get_parent(g->clk.tegra_clk),
+ &freqs, &num_freqs);
+ if (err)
+ return -ENOSYS;
+
+ profile->devfreq_profile.freq_table = (unsigned long *)freqs;
+ profile->devfreq_profile.max_state = num_freqs;
+
+ return 0;
+}
+
+/*
+ * gk20a_scale_target(dev, *freq, flags)
+ *
+ * This function scales the clock
+ */
+
+static int gk20a_scale_target(struct device *dev, unsigned long *freq,
+ u32 flags)
+{
+ struct gk20a *g = get_gk20a(to_platform_device(dev));
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ struct gk20a_scale_profile *profile = g->scale_profile;
+ unsigned long rounded_rate = gk20a_clk_round_rate(g, *freq);
+
+ if (gk20a_clk_get_rate(g) == rounded_rate) {
+ *freq = rounded_rate;
+ return 0;
+ }
+
+ gk20a_clk_set_rate(g, rounded_rate);
+ if (platform->postscale)
+ platform->postscale(profile->pdev, rounded_rate);
+ *freq = gk20a_clk_get_rate(g);
+
+ return 0;
+}
+
+/*
+ * update_load_estimate_gpmu(pdev)
+ *
+ * Update the load estimate using the gpmu. The gpmu value is normalised
+ * against the time elapsed since it was last queried.
+ */
+
+static void update_load_estimate_gpmu(struct platform_device *pdev)
+{
+ struct gk20a *g = get_gk20a(pdev);
+ struct gk20a_scale_profile *profile = g->scale_profile;
+ unsigned long dt;
+ u32 busy_time;
+ ktime_t t;
+
+ t = ktime_get();
+ dt = ktime_us_delta(t, profile->last_event_time);
+
+ profile->dev_stat.total_time = dt;
+ profile->last_event_time = t;
+ gk20a_pmu_load_norm(g, &busy_time);
+ profile->dev_stat.busy_time = (busy_time * dt) / 1000;
+}
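+
+/*
+ * Illustrative numbers (not measured; assumes the pmu reports a per-mille
+ * load, as the division by 1000 above suggests): with dt = 10000 us since
+ * the last event and busy_time = 500 from the pmu, the estimate becomes
+ * total_time = 10000 and busy_time = 500 * 10000 / 1000 = 5000, i.e. the
+ * device is treated as 50% busy over that window.
+ */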
+
+/*
+ * gk20a_scale_suspend(pdev)
+ *
+ * This function informs devfreq of suspend
+ */
+
+void gk20a_scale_suspend(struct platform_device *pdev)
+{
+ struct gk20a *g = get_gk20a(pdev);
+ struct devfreq *devfreq = g->devfreq;
+
+ if (!devfreq)
+ return;
+
+ devfreq_suspend_device(devfreq);
+}
+
+/*
+ * gk20a_scale_resume(pdev)
+ *
+ * This function informs devfreq of resume
+ */
+
+void gk20a_scale_resume(struct platform_device *pdev)
+{
+ struct gk20a *g = get_gk20a(pdev);
+ struct devfreq *devfreq = g->devfreq;
+
+ if (!devfreq)
+ return;
+
+ devfreq_resume_device(devfreq);
+}
+
+/*
+ * gk20a_scale_notify(pdev, busy)
+ *
+ * Calling this function signals that the device is idling (or busy). This
+ * data is used to estimate the current load.
+ */
+
+static void gk20a_scale_notify(struct platform_device *pdev, bool busy)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ struct gk20a *g = get_gk20a(pdev);
+ struct gk20a_scale_profile *profile = g->scale_profile;
+ struct devfreq *devfreq = g->devfreq;
+
+ /* inform edp about new constraint */
+ if (platform->prescale)
+ platform->prescale(pdev);
+
+ /* Is the device profile initialised? */
+ if (!(profile && devfreq))
+ return;
+
+ mutex_lock(&devfreq->lock);
+ profile->dev_stat.busy = busy;
+ update_devfreq(devfreq);
+ mutex_unlock(&devfreq->lock);
+}
+
+void gk20a_scale_notify_idle(struct platform_device *pdev)
+{
+ gk20a_scale_notify(pdev, false);
+}
+
+void gk20a_scale_notify_busy(struct platform_device *pdev)
+{
+ gk20a_scale_notify(pdev, true);
+}
+
+/*
+ * gk20a_scale_get_dev_status(dev, *stat)
+ *
+ * This function queries the current device status.
+ */
+
+static int gk20a_scale_get_dev_status(struct device *dev,
+ struct devfreq_dev_status *stat)
+{
+ struct gk20a *g = get_gk20a(to_platform_device(dev));
+ struct gk20a_scale_profile *profile = g->scale_profile;
+
+ /* Make sure there are correct values for the current frequency */
+ profile->dev_stat.current_frequency = gk20a_clk_get_rate(g);
+
+ /* Update load estimate */
+ update_load_estimate_gpmu(to_platform_device(dev));
+
+ /* Copy the contents of the current device status */
+ *stat = profile->dev_stat;
+
+ /* Finally, clear out the local values */
+ profile->dev_stat.total_time = 0;
+ profile->dev_stat.busy_time = 0;
+
+ return 0;
+}
+
+/*
+ * gk20a_scale_init(pdev)
+ */
+
+void gk20a_scale_init(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ struct gk20a *g = platform->g;
+ struct gk20a_scale_profile *profile;
+ int err;
+
+ if (g->scale_profile)
+ return;
+
+	profile = kzalloc(sizeof(*profile), GFP_KERNEL);
+	if (!profile)
+		return;
+
+ profile->pdev = pdev;
+ profile->dev_stat.busy = false;
+
+ /* Create frequency table */
+ err = gk20a_scale_make_freq_table(profile);
+ if (err || !profile->devfreq_profile.max_state)
+ goto err_get_freqs;
+
+ if (device_create_file(&pdev->dev, &dev_attr_load))
+ goto err_create_sysfs_entry;
+
+ /* Store device profile so we can access it if devfreq governor
+ * init needs that */
+ g->scale_profile = profile;
+
+ if (platform->devfreq_governor) {
+ struct devfreq *devfreq;
+
+ profile->devfreq_profile.initial_freq =
+ profile->devfreq_profile.freq_table[0];
+ profile->devfreq_profile.target = gk20a_scale_target;
+ profile->devfreq_profile.get_dev_status =
+ gk20a_scale_get_dev_status;
+
+ devfreq = devfreq_add_device(&pdev->dev,
+ &profile->devfreq_profile,
+ platform->devfreq_governor, NULL);
+
+ if (IS_ERR(devfreq))
+ devfreq = NULL;
+
+ g->devfreq = devfreq;
+ }
+
+ /* Should we register QoS callback for this device? */
+ if (platform->qos_id < PM_QOS_NUM_CLASSES &&
+ platform->qos_id != PM_QOS_RESERVED &&
+ platform->postscale) {
+ profile->qos_notify_block.notifier_call =
+ &gk20a_scale_qos_notify;
+ pm_qos_add_notifier(platform->qos_id,
+ &profile->qos_notify_block);
+ }
+
+ return;
+
+err_create_sysfs_entry:
+err_get_freqs:
+	kfree(profile);
+}
+
+/*
+ * gk20a_scale_hw_init(dev)
+ *
+ * Initialize hardware portion of the device
+ */
+
+void gk20a_scale_hw_init(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ struct gk20a_scale_profile *profile = platform->g->scale_profile;
+
+	/* make sure that scaling has been initialised */
+ if (!profile)
+ return;
+
+ profile->dev_stat.total_time = 0;
+ profile->last_event_time = ktime_get();
+}
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_scale.h b/drivers/gpu/nvgpu/gk20a/gk20a_scale.h
new file mode 100644
index 000000000000..e76b16627105
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_scale.h
@@ -0,0 +1,51 @@
+/*
+ * gk20a clock scaling profile
+ *
+ * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GK20A_SCALE_H
+#define GK20A_SCALE_H
+
+#include <linux/nvhost.h>
+#include <linux/devfreq.h>
+
+struct platform_device;
+struct clk;
+
+struct gk20a_scale_profile {
+ struct platform_device *pdev;
+ ktime_t last_event_time;
+ struct devfreq_dev_profile devfreq_profile;
+ struct devfreq_dev_status dev_stat;
+ struct notifier_block qos_notify_block;
+ void *private_data;
+};
+
+/* Initialization and de-initialization for module */
+void gk20a_scale_init(struct platform_device *);
+void gk20a_scale_hw_init(struct platform_device *pdev);
+
+/*
+ * call when performing submit to notify scaling mechanism that the module is
+ * in use
+ */
+void gk20a_scale_notify_busy(struct platform_device *);
+void gk20a_scale_notify_idle(struct platform_device *);
+
+void gk20a_scale_suspend(struct platform_device *);
+void gk20a_scale_resume(struct platform_device *);
+
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
new file mode 100644
index 000000000000..f6b43f506bd0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
@@ -0,0 +1,335 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/gk20a_sysfs.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/kernel.h>
+#include <linux/fb.h>
+
+#include <mach/clk.h>
+
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "fifo_gk20a.h"
+
+
+#define PTIMER_FP_FACTOR 1000000
+/* PTIMER_REF_FREQ_HZ corresponds to a period of 32 nanoseconds. 32 ns is
+ the resolution of ptimer. */
+#define PTIMER_REF_FREQ_HZ 31250000
+
+
+static ssize_t elcg_enable_store(struct device *device,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+ unsigned long val = 0;
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ gk20a_busy(g->dev);
+ if (val) {
+ g->elcg_enabled = true;
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
+ } else {
+ g->elcg_enabled = false;
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
+ }
+ gk20a_idle(g->dev);
+
+ dev_info(device, "ELCG is %s.\n", g->elcg_enabled ? "enabled" :
+ "disabled");
+
+ return count;
+}
+
+static ssize_t elcg_enable_read(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+
+ return sprintf(buf, "%d\n", g->elcg_enabled ? 1 : 0);
+}
+
+static DEVICE_ATTR(elcg_enable, S_IRWXUGO, elcg_enable_read, elcg_enable_store);
+
+static ssize_t blcg_enable_store(struct device *device,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+ unsigned long val = 0;
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ if (val)
+ g->blcg_enabled = true;
+ else
+ g->blcg_enabled = false;
+
+ gk20a_busy(g->dev);
+ g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
+ gk20a_idle(g->dev);
+
+ dev_info(device, "BLCG is %s.\n", g->blcg_enabled ? "enabled" :
+ "disabled");
+
+ return count;
+}
+
+static ssize_t blcg_enable_read(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+
+ return sprintf(buf, "%d\n", g->blcg_enabled ? 1 : 0);
+}
+
+static DEVICE_ATTR(blcg_enable, S_IRWXUGO, blcg_enable_read, blcg_enable_store);
+
+static ssize_t slcg_enable_store(struct device *device,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+ unsigned long val = 0;
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ if (val)
+ g->slcg_enabled = true;
+ else
+ g->slcg_enabled = false;
+
+ /*
+ * TODO: slcg_therm_load_gating is not enabled anywhere during
+ * init. Therefore, it would be incongruous to add it here. Once
+ * it is added to init, we should add it here too.
+ */
+ gk20a_busy(g->dev);
+ g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
+ g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
+ gk20a_idle(g->dev);
+
+ dev_info(device, "SLCG is %s.\n", g->slcg_enabled ? "enabled" :
+ "disabled");
+
+ return count;
+}
+
+static ssize_t slcg_enable_read(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+
+ return sprintf(buf, "%d\n", g->slcg_enabled ? 1 : 0);
+}
+
+static DEVICE_ATTR(slcg_enable, S_IRWXUGO, slcg_enable_read, slcg_enable_store);
+
+static ssize_t ptimer_scale_factor_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ u32 tsc_freq_hz = clk_get_rate(clk_get_sys(NULL, "clk_m"));
+ u32 scaling_factor_fp = (u32)(PTIMER_REF_FREQ_HZ) /
+ ((u32)(tsc_freq_hz) /
+ (u32)(PTIMER_FP_FACTOR));
+ ssize_t res = snprintf(buf,
+ PAGE_SIZE,
+ "%u.%u\n",
+ scaling_factor_fp / PTIMER_FP_FACTOR,
+ scaling_factor_fp % PTIMER_FP_FACTOR);
+
+ return res;
+}
+
+static DEVICE_ATTR(ptimer_scale_factor,
+ S_IRUGO,
+ ptimer_scale_factor_show,
+ NULL);
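+
+/*
+ * Worked example (illustrative, assuming a 19.2 MHz clk_m): the fixed-point
+ * factor above evaluates to
+ *
+ *	PTIMER_REF_FREQ_HZ / (tsc_freq_hz / PTIMER_FP_FACTOR)
+ *	  = 31250000 / (19200000 / 1000000)
+ *	  = 31250000 / 19 = 1644736
+ *
+ * which is shown to userspace as "1.644736".
+ */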
+
+static ssize_t railgate_delay_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ int railgate_delay = 0, ret = 0;
+
+ if (!platform->can_railgate) {
+ dev_info(dev, "does not support power-gating\n");
+ return count;
+ }
+
+ ret = sscanf(buf, "%d", &railgate_delay);
+ if (ret == 1 && railgate_delay >= 0) {
+ struct generic_pm_domain *genpd = pd_to_genpd(dev->pm_domain);
+ platform->railgate_delay = railgate_delay;
+ pm_genpd_set_poweroff_delay(genpd, platform->railgate_delay);
+ } else
+ dev_err(dev, "Invalid powergate delay\n");
+
+ return count;
+}
+static ssize_t railgate_delay_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ return snprintf(buf, PAGE_SIZE, "%d\n", platform->railgate_delay);
+}
+static DEVICE_ATTR(railgate_delay, S_IRWXUGO, railgate_delay_show,
+ railgate_delay_store);
+
+static ssize_t clockgate_delay_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ int clockgate_delay = 0, ret = 0;
+
+ ret = sscanf(buf, "%d", &clockgate_delay);
+ if (ret == 1 && clockgate_delay >= 0) {
+ platform->clockgate_delay = clockgate_delay;
+ pm_runtime_set_autosuspend_delay(dev,
+ platform->clockgate_delay);
+ } else
+ dev_err(dev, "Invalid clockgate delay\n");
+
+ return count;
+}
+static ssize_t clockgate_delay_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gk20a_platform *platform = dev_get_drvdata(dev);
+ return snprintf(buf, PAGE_SIZE, "%d\n", platform->clockgate_delay);
+}
+static DEVICE_ATTR(clockgate_delay, S_IRWXUGO, clockgate_delay_show,
+ clockgate_delay_store);
+
+static ssize_t counters_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *pdev = to_platform_device(dev);
+ struct gk20a *g = get_gk20a(pdev);
+ u32 busy_cycles, total_cycles;
+ ssize_t res;
+
+ gk20a_pmu_get_load_counters(g, &busy_cycles, &total_cycles);
+
+ res = snprintf(buf, PAGE_SIZE, "%u %u\n", busy_cycles, total_cycles);
+
+ return res;
+}
+
+static DEVICE_ATTR(counters, S_IRUGO, counters_show, NULL);
+static ssize_t counters_show_reset(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ ssize_t res = counters_show(dev, attr, buf);
+ struct platform_device *pdev = to_platform_device(dev);
+ struct gk20a *g = get_gk20a(pdev);
+
+ gk20a_pmu_reset_load_counters(g);
+
+ return res;
+}
+
+static DEVICE_ATTR(counters_reset, S_IRUGO, counters_show_reset, NULL);
+
+static ssize_t elpg_enable_store(struct device *device,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+ unsigned long val = 0;
+
+ if (kstrtoul(buf, 10, &val) < 0)
+ return -EINVAL;
+
+ /*
+	 * Since ELPG is refcounted, avoid redundant enable/disable calls
+	 * when the requested state is already in effect.
+ */
+ gk20a_channel_busy(g->dev);
+ if (val && !g->elpg_enabled) {
+ g->elpg_enabled = true;
+ gk20a_pmu_enable_elpg(g);
+ } else if (!val && g->elpg_enabled) {
+ g->elpg_enabled = false;
+ gk20a_pmu_disable_elpg(g);
+ }
+ gk20a_channel_idle(g->dev);
+
+ dev_info(device, "ELPG is %s.\n", g->elpg_enabled ? "enabled" :
+ "disabled");
+
+ return count;
+}
+
+static ssize_t elpg_enable_read(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *ndev = to_platform_device(device);
+ struct gk20a *g = get_gk20a(ndev);
+
+ return sprintf(buf, "%d\n", g->elpg_enabled ? 1 : 0);
+}
+
+static DEVICE_ATTR(elpg_enable, S_IRWXUGO, elpg_enable_read, elpg_enable_store);
+
+void gk20a_remove_sysfs(struct device *dev)
+{
+ device_remove_file(dev, &dev_attr_elcg_enable);
+ device_remove_file(dev, &dev_attr_blcg_enable);
+ device_remove_file(dev, &dev_attr_slcg_enable);
+ device_remove_file(dev, &dev_attr_ptimer_scale_factor);
+ device_remove_file(dev, &dev_attr_elpg_enable);
+ device_remove_file(dev, &dev_attr_counters);
+ device_remove_file(dev, &dev_attr_counters_reset);
+ device_remove_file(dev, &dev_attr_railgate_delay);
+ device_remove_file(dev, &dev_attr_clockgate_delay);
+}
+
+void gk20a_create_sysfs(struct platform_device *dev)
+{
+ int error = 0;
+
+ error |= device_create_file(&dev->dev, &dev_attr_elcg_enable);
+ error |= device_create_file(&dev->dev, &dev_attr_blcg_enable);
+ error |= device_create_file(&dev->dev, &dev_attr_slcg_enable);
+ error |= device_create_file(&dev->dev, &dev_attr_ptimer_scale_factor);
+ error |= device_create_file(&dev->dev, &dev_attr_elpg_enable);
+ error |= device_create_file(&dev->dev, &dev_attr_counters);
+ error |= device_create_file(&dev->dev, &dev_attr_counters_reset);
+ error |= device_create_file(&dev->dev, &dev_attr_railgate_delay);
+ error |= device_create_file(&dev->dev, &dev_attr_clockgate_delay);
+
+ if (error)
+ dev_err(&dev->dev, "Failed to create sysfs attributes!\n");
+}
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c
new file mode 100644
index 000000000000..59404f1d8868
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c
@@ -0,0 +1,333 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.c
+ *
+ * GK20A Graphics Context
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/firmware.h>
+
+#include "gk20a.h"
+#include "gr_ctx_gk20a.h"
+#include "hw_gr_gk20a.h"
+
+static int gr_gk20a_alloc_load_netlist_u32(u32 *src, u32 len,
+ struct u32_list_gk20a *u32_list)
+{
+ u32_list->count = (len + sizeof(u32) - 1) / sizeof(u32);
+ if (!alloc_u32_list_gk20a(u32_list))
+ return -ENOMEM;
+
+ memcpy(u32_list->l, src, len);
+
+ return 0;
+}
+
+static int gr_gk20a_alloc_load_netlist_av(u32 *src, u32 len,
+ struct av_list_gk20a *av_list)
+{
+ av_list->count = len / sizeof(struct av_gk20a);
+ if (!alloc_av_list_gk20a(av_list))
+ return -ENOMEM;
+
+ memcpy(av_list->l, src, len);
+
+ return 0;
+}
+
+static int gr_gk20a_alloc_load_netlist_aiv(u32 *src, u32 len,
+ struct aiv_list_gk20a *aiv_list)
+{
+ aiv_list->count = len / sizeof(struct aiv_gk20a);
+ if (!alloc_aiv_list_gk20a(aiv_list))
+ return -ENOMEM;
+
+ memcpy(aiv_list->l, src, len);
+
+ return 0;
+}
+
+static int gr_gk20a_get_netlist_name(int index, char *name)
+{
+ switch (index) {
+#ifdef GK20A_NETLIST_IMAGE_FW_NAME
+ case NETLIST_FINAL:
+ sprintf(name, GK20A_NETLIST_IMAGE_FW_NAME);
+ return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_A
+ case NETLIST_SLOT_A:
+ sprintf(name, GK20A_NETLIST_IMAGE_A);
+ return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_B
+ case NETLIST_SLOT_B:
+ sprintf(name, GK20A_NETLIST_IMAGE_B);
+ return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_C
+ case NETLIST_SLOT_C:
+ sprintf(name, GK20A_NETLIST_IMAGE_C);
+ return 0;
+#endif
+#ifdef GK20A_NETLIST_IMAGE_D
+ case NETLIST_SLOT_D:
+ sprintf(name, GK20A_NETLIST_IMAGE_D);
+ return 0;
+#endif
+ default:
+ return -1;
+ }
+
+ return -1;
+}
+
+static int gr_gk20a_init_ctx_vars_fw(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct device *d = dev_from_gk20a(g);
+ const struct firmware *netlist_fw;
+ struct netlist_image *netlist = NULL;
+ char name[MAX_NETLIST_NAME];
+ u32 i, major_v = ~0, major_v_hw, netlist_num;
+ int net, max, err = -ENOENT;
+
+ gk20a_dbg_fn("");
+
+#ifdef GK20A_NETLIST_IMAGE_FW_NAME
+ net = NETLIST_FINAL;
+ max = 0;
+ major_v_hw = ~0;
+ g->gr.ctx_vars.dynamic = false;
+#else
+ net = NETLIST_SLOT_A;
+ max = MAX_NETLIST;
+ major_v_hw = gk20a_readl(g, gr_fecs_ctx_state_store_major_rev_id_r());
+ g->gr.ctx_vars.dynamic = true;
+#endif
+
+ for (; net < max; net++) {
+
+ if (gr_gk20a_get_netlist_name(net, name) != 0) {
+ gk20a_warn(d, "invalid netlist index %d", net);
+ continue;
+ }
+
+ netlist_fw = gk20a_request_firmware(g, name);
+ if (!netlist_fw) {
+ gk20a_warn(d, "failed to load netlist %s", name);
+ continue;
+ }
+
+ netlist = (struct netlist_image *)netlist_fw->data;
+
+ for (i = 0; i < netlist->header.regions; i++) {
+ u32 *src = (u32 *)((u8 *)netlist + netlist->regions[i].data_offset);
+ u32 size = netlist->regions[i].data_size;
+
+ switch (netlist->regions[i].region_id) {
+ case NETLIST_REGIONID_FECS_UCODE_DATA:
+ gk20a_dbg_info("NETLIST_REGIONID_FECS_UCODE_DATA");
+ err = gr_gk20a_alloc_load_netlist_u32(
+ src, size, &g->gr.ctx_vars.ucode.fecs.data);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_FECS_UCODE_INST:
+ gk20a_dbg_info("NETLIST_REGIONID_FECS_UCODE_INST");
+ err = gr_gk20a_alloc_load_netlist_u32(
+ src, size, &g->gr.ctx_vars.ucode.fecs.inst);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_GPCCS_UCODE_DATA:
+ gk20a_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_DATA");
+ err = gr_gk20a_alloc_load_netlist_u32(
+ src, size, &g->gr.ctx_vars.ucode.gpccs.data);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_GPCCS_UCODE_INST:
+ gk20a_dbg_info("NETLIST_REGIONID_GPCCS_UCODE_INST");
+ err = gr_gk20a_alloc_load_netlist_u32(
+ src, size, &g->gr.ctx_vars.ucode.gpccs.inst);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_SW_BUNDLE_INIT:
+ gk20a_dbg_info("NETLIST_REGIONID_SW_BUNDLE_INIT");
+ err = gr_gk20a_alloc_load_netlist_av(
+ src, size, &g->gr.ctx_vars.sw_bundle_init);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_SW_METHOD_INIT:
+ gk20a_dbg_info("NETLIST_REGIONID_SW_METHOD_INIT");
+ err = gr_gk20a_alloc_load_netlist_av(
+ src, size, &g->gr.ctx_vars.sw_method_init);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_SW_CTX_LOAD:
+ gk20a_dbg_info("NETLIST_REGIONID_SW_CTX_LOAD");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.sw_ctx_load);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_SW_NON_CTX_LOAD:
+ gk20a_dbg_info("NETLIST_REGIONID_SW_NON_CTX_LOAD");
+ err = gr_gk20a_alloc_load_netlist_av(
+ src, size, &g->gr.ctx_vars.sw_non_ctx_load);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_SYS:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_SYS");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.sys);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_GPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_GPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.gpc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_TPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_TPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.tpc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_ZCULL_GPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_ZCULL_GPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_PPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.ppc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_PM_SYS:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_SYS");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.pm_sys);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_PM_GPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_GPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.pm_gpc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_CTXREG_PM_TPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PM_TPC");
+ err = gr_gk20a_alloc_load_netlist_aiv(
+ src, size, &g->gr.ctx_vars.ctxsw_regs.pm_tpc);
+ if (err)
+ goto clean_up;
+ break;
+ case NETLIST_REGIONID_BUFFER_SIZE:
+ g->gr.ctx_vars.buffer_size = *src;
+ gk20a_dbg_info("NETLIST_REGIONID_BUFFER_SIZE : %d",
+ g->gr.ctx_vars.buffer_size);
+ break;
+ case NETLIST_REGIONID_CTXSW_REG_BASE_INDEX:
+ g->gr.ctx_vars.regs_base_index = *src;
+ gk20a_dbg_info("NETLIST_REGIONID_CTXSW_REG_BASE_INDEX : %d",
+ g->gr.ctx_vars.regs_base_index);
+ break;
+ case NETLIST_REGIONID_MAJORV:
+ major_v = *src;
+ gk20a_dbg_info("NETLIST_REGIONID_MAJORV : %d",
+ major_v);
+ break;
+ case NETLIST_REGIONID_NETLIST_NUM:
+ netlist_num = *src;
+ gk20a_dbg_info("NETLIST_REGIONID_NETLIST_NUM : %d",
+ netlist_num);
+ break;
+ case NETLIST_REGIONID_CTXREG_PMPPC:
+ gk20a_dbg_info("NETLIST_REGIONID_CTXREG_PMPPC skipped");
+ break;
+ default:
+ gk20a_warn(d, "unrecognized region %d skipped", i);
+ break;
+ }
+ }
+
+ if (net != NETLIST_FINAL && major_v != major_v_hw) {
+ gk20a_dbg_info("skip %s: major_v 0x%08x doesn't match hw 0x%08x",
+ name, major_v, major_v_hw);
+ goto clean_up;
+ }
+
+ g->gr.ctx_vars.valid = true;
+ g->gr.netlist = net;
+
+ release_firmware(netlist_fw);
+ gk20a_dbg_fn("done");
+ goto done;
+
+clean_up:
+ kfree(g->gr.ctx_vars.ucode.fecs.inst.l);
+ kfree(g->gr.ctx_vars.ucode.fecs.data.l);
+ kfree(g->gr.ctx_vars.ucode.gpccs.inst.l);
+ kfree(g->gr.ctx_vars.ucode.gpccs.data.l);
+ kfree(g->gr.ctx_vars.sw_bundle_init.l);
+ kfree(g->gr.ctx_vars.sw_method_init.l);
+ kfree(g->gr.ctx_vars.sw_ctx_load.l);
+ kfree(g->gr.ctx_vars.sw_non_ctx_load.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.sys.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.gpc.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.tpc.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.ppc.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.pm_sys.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.pm_gpc.l);
+ kfree(g->gr.ctx_vars.ctxsw_regs.pm_tpc.l);
+ release_firmware(netlist_fw);
+ err = -ENOENT;
+ }
+
+done:
+ if (g->gr.ctx_vars.valid) {
+ gk20a_dbg_info("netlist image %s loaded", name);
+ return 0;
+ } else {
+ gk20a_err(d, "failed to load netlist image!!");
+ return err;
+ }
+}
+
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr)
+{
+ if (tegra_platform_is_linsim())
+ return gr_gk20a_init_ctx_vars_sim(g, gr);
+ else
+ return gr_gk20a_init_ctx_vars_fw(g, gr);
+}
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h
new file mode 100644
index 000000000000..909a166ae9c3
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a.h
@@ -0,0 +1,149 @@
+/*
+ * GK20A Graphics Context
+ *
+ * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __GR_CTX_GK20A_H__
+#define __GR_CTX_GK20A_H__
+
+
+/* production netlist, one and only one from below */
+/*#undef GK20A_NETLIST_IMAGE_FW_NAME*/
+#define GK20A_NETLIST_IMAGE_FW_NAME GK20A_NETLIST_IMAGE_B
+/* emulation netlists, match majorV with HW */
+#define GK20A_NETLIST_IMAGE_A "NETA_img.bin"
+#define GK20A_NETLIST_IMAGE_B "NETB_img.bin"
+#define GK20A_NETLIST_IMAGE_C "NETC_img.bin"
+#define GK20A_NETLIST_IMAGE_D "NETD_img.bin"
+
+union __max_name {
+#ifdef GK20A_NETLIST_IMAGE_A
+ char __name_a[sizeof(GK20A_NETLIST_IMAGE_A)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_B
+ char __name_b[sizeof(GK20A_NETLIST_IMAGE_B)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_C
+ char __name_c[sizeof(GK20A_NETLIST_IMAGE_C)];
+#endif
+#ifdef GK20A_NETLIST_IMAGE_D
+ char __name_d[sizeof(GK20A_NETLIST_IMAGE_D)];
+#endif
+};
+
+#define MAX_NETLIST_NAME sizeof(union __max_name)
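+
+/*
+ * MAX_NETLIST_NAME resolves to the size of the longest configured image
+ * name (including the terminating NUL): a union is as large as its largest
+ * member, so with the names above it evaluates to sizeof("NETA_img.bin"),
+ * i.e. 13 bytes, since all four strings are the same length.
+ */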
+
+/* index for emulation netlists */
+#define NETLIST_FINAL -1
+#define NETLIST_SLOT_A 0
+#define NETLIST_SLOT_B 1
+#define NETLIST_SLOT_C 2
+#define NETLIST_SLOT_D 3
+#define MAX_NETLIST 4
+
+/* netlist regions */
+#define NETLIST_REGIONID_FECS_UCODE_DATA 0
+#define NETLIST_REGIONID_FECS_UCODE_INST 1
+#define NETLIST_REGIONID_GPCCS_UCODE_DATA 2
+#define NETLIST_REGIONID_GPCCS_UCODE_INST 3
+#define NETLIST_REGIONID_SW_BUNDLE_INIT 4
+#define NETLIST_REGIONID_SW_CTX_LOAD 5
+#define NETLIST_REGIONID_SW_NON_CTX_LOAD 6
+#define NETLIST_REGIONID_SW_METHOD_INIT 7
+#define NETLIST_REGIONID_CTXREG_SYS 8
+#define NETLIST_REGIONID_CTXREG_GPC 9
+#define NETLIST_REGIONID_CTXREG_TPC 10
+#define NETLIST_REGIONID_CTXREG_ZCULL_GPC 11
+#define NETLIST_REGIONID_CTXREG_PM_SYS 12
+#define NETLIST_REGIONID_CTXREG_PM_GPC 13
+#define NETLIST_REGIONID_CTXREG_PM_TPC 14
+#define NETLIST_REGIONID_MAJORV 15
+#define NETLIST_REGIONID_BUFFER_SIZE 16
+#define NETLIST_REGIONID_CTXSW_REG_BASE_INDEX 17
+#define NETLIST_REGIONID_NETLIST_NUM 18
+#define NETLIST_REGIONID_CTXREG_PPC 19
+#define NETLIST_REGIONID_CTXREG_PMPPC 20
+
+struct netlist_region {
+ u32 region_id;
+ u32 data_size;
+ u32 data_offset;
+};
+
+struct netlist_image_header {
+ u32 version;
+ u32 regions;
+};
+
+struct netlist_image {
+ struct netlist_image_header header;
+ struct netlist_region regions[1];
+};
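+
+/*
+ * Layout note (illustrative, mirroring the parsing code in gr_ctx_gk20a.c):
+ * each region's data_offset is a byte offset from the start of the image,
+ * so a region payload is located as
+ *
+ *	struct netlist_image *img = (struct netlist_image *)fw->data;
+ *	u32 *payload = (u32 *)((u8 *)img + img->regions[i].data_offset);
+ */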
+
+struct av_gk20a {
+ u32 addr;
+ u32 value;
+};
+struct aiv_gk20a {
+ u32 addr;
+ u32 index;
+ u32 value;
+};
+struct aiv_list_gk20a {
+ struct aiv_gk20a *l;
+ u32 count;
+};
+struct av_list_gk20a {
+ struct av_gk20a *l;
+ u32 count;
+};
+struct u32_list_gk20a {
+ u32 *l;
+ u32 count;
+};
+
+static inline
+struct av_gk20a *alloc_av_list_gk20a(struct av_list_gk20a *avl)
+{
+ avl->l = kzalloc(avl->count * sizeof(*avl->l), GFP_KERNEL);
+ return avl->l;
+}
+
+static inline
+struct aiv_gk20a *alloc_aiv_list_gk20a(struct aiv_list_gk20a *aivl)
+{
+ aivl->l = kzalloc(aivl->count * sizeof(*aivl->l), GFP_KERNEL);
+ return aivl->l;
+}
+
+static inline
+u32 *alloc_u32_list_gk20a(struct u32_list_gk20a *u32l)
+{
+ u32l->l = kzalloc(u32l->count * sizeof(*u32l->l), GFP_KERNEL);
+ return u32l->l;
+}
+
+struct gr_ucode_gk20a {
+ struct {
+ struct u32_list_gk20a inst;
+ struct u32_list_gk20a data;
+ } gpccs, fecs;
+};
+
+/* main entry for grctx loading */
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
+int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr);
+
+#endif /*__GR_CTX_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c
new file mode 100644
index 000000000000..12bba1fd7249
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c
@@ -0,0 +1,256 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/gr_ctx_gk20a_sim.c
+ *
+ * GK20A Graphics Context for Simulation
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gk20a.h"
+#include "gr_ctx_gk20a.h"
+
+int gr_gk20a_init_ctx_vars_sim(struct gk20a *g, struct gr_gk20a *gr)
+{
+ int err = 0;
+ u32 i, temp;
+ char *size_path = NULL;
+ char *reg_path = NULL;
+ char *value_path = NULL;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_info,
+ "querying grctx info from chiplib");
+
+ g->gr.ctx_vars.dynamic = true;
+ g->gr.netlist = GR_NETLIST_DYNAMIC;
+
+ /* query sizes and counts */
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS_COUNT", 0,
+ &g->gr.ctx_vars.ucode.fecs.inst.count);
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS_COUNT", 0,
+ &g->gr.ctx_vars.ucode.fecs.data.count);
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS_COUNT", 0,
+ &g->gr.ctx_vars.ucode.gpccs.inst.count);
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS_COUNT", 0,
+ &g->gr.ctx_vars.ucode.gpccs.data.count);
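+	/* total context size is reported in 32-bit words; convert to bytes */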
+ gk20a_sim_esc_readl(g, "GRCTX_ALL_CTX_TOTAL_WORDS", 0, &temp);
+ g->gr.ctx_vars.buffer_size = temp << 2;
+ gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT_SIZE", 0,
+ &g->gr.ctx_vars.sw_bundle_init.count);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT_SIZE", 0,
+ &g->gr.ctx_vars.sw_method_init.count);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD_SIZE", 0,
+ &g->gr.ctx_vars.sw_ctx_load.count);
+
+	switch (0) { /* g->gr.ctx_vars.reg_init_override */
+#if 0
+ case NV_REG_STR_RM_GR_REG_INIT_OVERRIDE_PROD_DIFF:
+ sizePath = "GRCTX_NONCTXSW_PROD_DIFF_REG_SIZE";
+ regPath = "GRCTX_NONCTXSW_PROD_DIFF_REG:REG";
+ valuePath = "GRCTX_NONCTXSW_PROD_DIFF_REG:VALUE";
+ break;
+#endif
+ default:
+ size_path = "GRCTX_NONCTXSW_REG_SIZE";
+ reg_path = "GRCTX_NONCTXSW_REG:REG";
+ value_path = "GRCTX_NONCTXSW_REG:VALUE";
+ break;
+ }
+
+ gk20a_sim_esc_readl(g, size_path, 0,
+ &g->gr.ctx_vars.sw_non_ctx_load.count);
+
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.sys.count);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.gpc.count);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.tpc.count);
+#if 0
+ /* looks to be unused, actually chokes the sim */
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.ppc.count);
+#endif
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.pm_sys.count);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.pm_gpc.count);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC_COUNT", 0,
+ &g->gr.ctx_vars.ctxsw_regs.pm_tpc.count);
+
+ err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.inst);
+ err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.fecs.data);
+ err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.inst);
+ err |= !alloc_u32_list_gk20a(&g->gr.ctx_vars.ucode.gpccs.data);
+ err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_bundle_init);
+ err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_method_init);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.sw_ctx_load);
+ err |= !alloc_av_list_gk20a(&g->gr.ctx_vars.sw_non_ctx_load);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.sys);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.gpc);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.tpc);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.zcull_gpc);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.ppc);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_sys);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_gpc);
+ err |= !alloc_aiv_list_gk20a(&g->gr.ctx_vars.ctxsw_regs.pm_tpc);
+
+ if (err)
+ goto fail;
+
+ for (i = 0; i < g->gr.ctx_vars.ucode.fecs.inst.count; i++)
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_FECS",
+ i, &g->gr.ctx_vars.ucode.fecs.inst.l[i]);
+
+ for (i = 0; i < g->gr.ctx_vars.ucode.fecs.data.count; i++)
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_FECS",
+ i, &g->gr.ctx_vars.ucode.fecs.data.l[i]);
+
+ for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.inst.count; i++)
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_INST_GPCCS",
+ i, &g->gr.ctx_vars.ucode.gpccs.inst.l[i]);
+
+ for (i = 0; i < g->gr.ctx_vars.ucode.gpccs.data.count; i++)
+ gk20a_sim_esc_readl(g, "GRCTX_UCODE_DATA_GPCCS",
+ i, &g->gr.ctx_vars.ucode.gpccs.data.l[i]);
+
+ for (i = 0; i < g->gr.ctx_vars.sw_bundle_init.count; i++) {
+ struct av_gk20a *l = g->gr.ctx_vars.sw_bundle_init.l;
+ gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_BUNDLE_INIT:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.sw_method_init.count; i++) {
+ struct av_gk20a *l = g->gr.ctx_vars.sw_method_init.l;
+ gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_METHOD_INIT:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.sw_ctx_load.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.sw_ctx_load.l;
+ gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_SW_CTX_LOAD:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.sw_non_ctx_load.count; i++) {
+ struct av_gk20a *l = g->gr.ctx_vars.sw_non_ctx_load.l;
+ gk20a_sim_esc_readl(g, reg_path, i, &l[i].addr);
+ gk20a_sim_esc_readl(g, value_path, i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.sys.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_SYS:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.gpc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_GPC:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.tpc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_TPC:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.ppc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PPC:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.zcull_gpc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.zcull_gpc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_ZCULL_GPC:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_sys.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_sys.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_SYS:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_gpc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_gpc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_GPC:VALUE",
+ i, &l[i].value);
+ }
+
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.pm_tpc.count; i++) {
+ struct aiv_gk20a *l = g->gr.ctx_vars.ctxsw_regs.pm_tpc.l;
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:ADDR",
+ i, &l[i].addr);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:INDEX",
+ i, &l[i].index);
+ gk20a_sim_esc_readl(g, "GRCTX_REG_LIST_PM_TPC:VALUE",
+ i, &l[i].value);
+ }
+
+ g->gr.ctx_vars.valid = true;
+
+ gk20a_sim_esc_readl(g, "GRCTX_GEN_CTX_REGS_BASE_INDEX", 0,
+ &g->gr.ctx_vars.regs_base_index);
+
+ gk20a_dbg(gpu_dbg_info | gpu_dbg_fn, "finished querying grctx info from chiplib");
+ return 0;
+fail:
+ gk20a_err(dev_from_gk20a(g),
+ "failed querying grctx info from chiplib");
+ return err;
+
+}
+
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.c b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
new file mode 100644
index 000000000000..0f93940b402f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.c
@@ -0,0 +1,6747 @@
+/*
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/delay.h> /* for udelay */
+#include <linux/mm.h> /* for totalram_pages */
+#include <linux/scatterlist.h>
+#include <linux/tegra-soc.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
+#include <linux/vmalloc.h>
+#include <linux/dma-mapping.h>
+#include <linux/firmware.h>
+#include <linux/nvhost.h>
+
+#include "gk20a.h"
+#include "kind_gk20a.h"
+#include "gr_ctx_gk20a.h"
+
+#include "hw_ccsr_gk20a.h"
+#include "hw_ctxsw_prog_gk20a.h"
+#include "hw_fifo_gk20a.h"
+#include "hw_gr_gk20a.h"
+#include "hw_gmmu_gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_ram_gk20a.h"
+#include "hw_pri_ringmaster_gk20a.h"
+#include "hw_pri_ringstation_sys_gk20a.h"
+#include "hw_pri_ringstation_gpc_gk20a.h"
+#include "hw_pri_ringstation_fbp_gk20a.h"
+#include "hw_proj_gk20a.h"
+#include "hw_top_gk20a.h"
+#include "hw_ltc_gk20a.h"
+#include "hw_fb_gk20a.h"
+#include "hw_therm_gk20a.h"
+#include "hw_pbdma_gk20a.h"
+#include "gr_pri_gk20a.h"
+#include "regops_gk20a.h"
+#include "dbg_gpu_gk20a.h"
+
+#define BLK_SIZE (256)
+
+static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va);
+
+/* global ctx buffer */
+static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g);
+static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g);
+static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
+ struct channel_gk20a *c);
+static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c);
+
+/* channel gr ctx buffer */
+static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
+ struct channel_gk20a *c);
+static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c);
+
+/* channel patch ctx buffer */
+static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
+ struct channel_gk20a *c);
+static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c);
+
+/* golden ctx image */
+static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
+ struct channel_gk20a *c);
+static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
+ struct channel_gk20a *c);
+
+void gk20a_fecs_dump_falcon_stats(struct gk20a *g)
+{
+ int i;
+
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_os_r : %d",
+ gk20a_readl(g, gr_fecs_os_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_cpuctl_r : 0x%x",
+ gk20a_readl(g, gr_fecs_cpuctl_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_idlestate_r : 0x%x",
+ gk20a_readl(g, gr_fecs_idlestate_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox0_r : 0x%x",
+ gk20a_readl(g, gr_fecs_mailbox0_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_mailbox1_r : 0x%x",
+ gk20a_readl(g, gr_fecs_mailbox1_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_irqstat_r : 0x%x",
+ gk20a_readl(g, gr_fecs_irqstat_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmode_r : 0x%x",
+ gk20a_readl(g, gr_fecs_irqmode_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_irqmask_r : 0x%x",
+ gk20a_readl(g, gr_fecs_irqmask_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_irqdest_r : 0x%x",
+ gk20a_readl(g, gr_fecs_irqdest_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_debug1_r : 0x%x",
+ gk20a_readl(g, gr_fecs_debug1_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_debuginfo_r : 0x%x",
+ gk20a_readl(g, gr_fecs_debuginfo_r()));
+
+ for (i = 0; i < gr_fecs_ctxsw_mailbox__size_1_v(); i++)
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_ctxsw_mailbox_r(%d) : 0x%x",
+ i, gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(i)));
+
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_engctl_r : 0x%x",
+ gk20a_readl(g, gr_fecs_engctl_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_curctx_r : 0x%x",
+ gk20a_readl(g, gr_fecs_curctx_r()));
+ gk20a_err(dev_from_gk20a(g), "gr_fecs_nxtctx_r : 0x%x",
+ gk20a_readl(g, gr_fecs_nxtctx_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_IMB : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_DMB : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CSW : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_CTX : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_EXCI : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ for (i = 0; i < 4; i++) {
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_PC));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_PC : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+
+ gk20a_writel(g, gr_fecs_icd_cmd_r(),
+ gr_fecs_icd_cmd_opc_rreg_f() |
+ gr_fecs_icd_cmd_idx_f(PMU_FALCON_REG_SP));
+ gk20a_err(dev_from_gk20a(g), "FECS_FALCON_REG_SP : 0x%x",
+ gk20a_readl(g, gr_fecs_icd_rdata_r()));
+ }
+}
+
+static void gr_gk20a_load_falcon_dmem(struct gk20a *g)
+{
+ u32 i, ucode_u32_size;
+ const u32 *ucode_u32_data;
+ u32 checksum;
+
+ gk20a_dbg_fn("");
+
+ gk20a_writel(g, gr_gpccs_dmemc_r(0), (gr_gpccs_dmemc_offs_f(0) |
+ gr_gpccs_dmemc_blk_f(0) |
+ gr_gpccs_dmemc_aincw_f(1)));
+
+ ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.data.count;
+ ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.data.l;
+
+ for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+ gk20a_writel(g, gr_gpccs_dmemd_r(0), ucode_u32_data[i]);
+ checksum += ucode_u32_data[i];
+ }
+
+ gk20a_writel(g, gr_fecs_dmemc_r(0), (gr_fecs_dmemc_offs_f(0) |
+ gr_fecs_dmemc_blk_f(0) |
+ gr_fecs_dmemc_aincw_f(1)));
+
+ ucode_u32_size = g->gr.ctx_vars.ucode.fecs.data.count;
+ ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.data.l;
+
+ for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+ gk20a_writel(g, gr_fecs_dmemd_r(0), ucode_u32_data[i]);
+ checksum += ucode_u32_data[i];
+ }
+ gk20a_dbg_fn("done");
+}
+
+static void gr_gk20a_load_falcon_imem(struct gk20a *g)
+{
+ u32 cfg, fecs_imem_size, gpccs_imem_size, ucode_u32_size;
+ const u32 *ucode_u32_data;
+ u32 tag, i, pad_start, pad_end;
+ u32 checksum;
+
+ gk20a_dbg_fn("");
+
+ cfg = gk20a_readl(g, gr_fecs_cfg_r());
+ fecs_imem_size = gr_fecs_cfg_imem_sz_v(cfg);
+
+ cfg = gk20a_readl(g, gr_gpc0_cfg_r());
+ gpccs_imem_size = gr_gpc0_cfg_imem_sz_v(cfg);
+
+ /* Use the broadcast address to access all of the GPCCS units. */
+ gk20a_writel(g, gr_gpccs_imemc_r(0), (gr_gpccs_imemc_offs_f(0) |
+ gr_gpccs_imemc_blk_f(0) |
+ gr_gpccs_imemc_aincw_f(1)));
+
+ /* Setup the tags for the instruction memory. */
+ tag = 0;
+ gk20a_writel(g, gr_gpccs_imemt_r(0), gr_gpccs_imemt_tag_f(tag));
+
+ ucode_u32_size = g->gr.ctx_vars.ucode.gpccs.inst.count;
+ ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.gpccs.inst.l;
+
+ for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+ if (i && ((i % (256/sizeof(u32))) == 0)) {
+ tag++;
+ gk20a_writel(g, gr_gpccs_imemt_r(0),
+ gr_gpccs_imemt_tag_f(tag));
+ }
+ gk20a_writel(g, gr_gpccs_imemd_r(0), ucode_u32_data[i]);
+ checksum += ucode_u32_data[i];
+ }
+
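+	/* zero-fill the remainder of the current 256-byte IMEM block plus one
+	 * extra block, bumping the tag at each block boundary */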
+ pad_start = i*4;
+ pad_end = pad_start+(256-pad_start%256)+256;
+ for (i = pad_start;
+ (i < gpccs_imem_size * 256) && (i < pad_end);
+ i += 4) {
+ if (i && ((i % 256) == 0)) {
+ tag++;
+ gk20a_writel(g, gr_gpccs_imemt_r(0),
+ gr_gpccs_imemt_tag_f(tag));
+ }
+ gk20a_writel(g, gr_gpccs_imemd_r(0), 0);
+ }
+
+ gk20a_writel(g, gr_fecs_imemc_r(0), (gr_fecs_imemc_offs_f(0) |
+ gr_fecs_imemc_blk_f(0) |
+ gr_fecs_imemc_aincw_f(1)));
+
+ /* Setup the tags for the instruction memory. */
+ tag = 0;
+ gk20a_writel(g, gr_fecs_imemt_r(0), gr_fecs_imemt_tag_f(tag));
+
+ ucode_u32_size = g->gr.ctx_vars.ucode.fecs.inst.count;
+ ucode_u32_data = (const u32 *)g->gr.ctx_vars.ucode.fecs.inst.l;
+
+ for (i = 0, checksum = 0; i < ucode_u32_size; i++) {
+ if (i && ((i % (256/sizeof(u32))) == 0)) {
+ tag++;
+ gk20a_writel(g, gr_fecs_imemt_r(0),
+ gr_fecs_imemt_tag_f(tag));
+ }
+ gk20a_writel(g, gr_fecs_imemd_r(0), ucode_u32_data[i]);
+ checksum += ucode_u32_data[i];
+ }
+
+ pad_start = i*4;
+ pad_end = pad_start+(256-pad_start%256)+256;
+ for (i = pad_start; (i < fecs_imem_size * 256) && i < pad_end; i += 4) {
+ if (i && ((i % 256) == 0)) {
+ tag++;
+ gk20a_writel(g, gr_fecs_imemt_r(0),
+ gr_fecs_imemt_tag_f(tag));
+ }
+ gk20a_writel(g, gr_fecs_imemd_r(0), 0);
+ }
+}
+
+static int gr_gk20a_wait_idle(struct gk20a *g, unsigned long end_jiffies,
+ u32 expect_delay)
+{
+ u32 delay = expect_delay;
+ bool gr_enabled;
+ bool ctxsw_active;
+ bool gr_busy;
+
+ gk20a_dbg_fn("");
+
+ do {
+ /* fmodel: host gets fifo_engine_status(gr) from gr
+ only when gr_status is read */
+ gk20a_readl(g, gr_status_r());
+
+ gr_enabled = gk20a_readl(g, mc_enable_r()) &
+ mc_enable_pgraph_enabled_f();
+
+ ctxsw_active = gk20a_readl(g,
+ fifo_engine_status_r(ENGINE_GR_GK20A)) &
+ fifo_engine_status_ctxsw_in_progress_f();
+
+ gr_busy = gk20a_readl(g, gr_engine_status_r()) &
+ gr_engine_status_value_busy_f();
+
+ if (!gr_enabled || (!gr_busy && !ctxsw_active)) {
+ gk20a_dbg_fn("done");
+ return 0;
+ }
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+
+ } while (time_before(jiffies, end_jiffies)
+ || !tegra_platform_is_silicon());
+
+ gk20a_err(dev_from_gk20a(g),
+ "timeout, ctxsw busy : %d, gr busy : %d",
+ ctxsw_active, gr_busy);
+
+ return -EAGAIN;
+}
+
+static int gr_gk20a_ctx_reset(struct gk20a *g, u32 rst_mask)
+{
+ u32 delay = GR_IDLE_CHECK_DEFAULT;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 reg;
+
+ gk20a_dbg_fn("");
+
+ if (!tegra_platform_is_linsim()) {
+ /* Force clocks on */
+ gk20a_writel(g, gr_fe_pwr_mode_r(),
+ gr_fe_pwr_mode_req_send_f() |
+ gr_fe_pwr_mode_mode_force_on_f());
+
+ /* Wait for the clocks to indicate that they are on */
+ do {
+ reg = gk20a_readl(g, gr_fe_pwr_mode_r());
+
+ if (gr_fe_pwr_mode_req_v(reg) ==
+ gr_fe_pwr_mode_req_done_v())
+ break;
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+
+ } while (time_before(jiffies, end_jiffies));
+
+ if (!time_before(jiffies, end_jiffies)) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to force the clocks on\n");
+ WARN_ON(1);
+ }
+ }
+ if (rst_mask) {
+ gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(), rst_mask);
+ } else {
+ gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
+ gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f());
+ }
+
+ /* we need to read the reset register *and* wait for a moment to ensure
+ * reset propagation */
+
+ gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
+ udelay(20);
+
+ gk20a_writel(g, gr_fecs_ctxsw_reset_ctl_r(),
+ gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f() |
+ gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f());
+
+	/* read back the reset register and wait briefly for it to propagate */
+ gk20a_readl(g, gr_fecs_ctxsw_reset_ctl_r());
+ udelay(20);
+
+ if (!tegra_platform_is_linsim()) {
+ /* Set power mode back to auto */
+ gk20a_writel(g, gr_fe_pwr_mode_r(),
+ gr_fe_pwr_mode_req_send_f() |
+ gr_fe_pwr_mode_mode_auto_f());
+
+ /* Wait for the request to complete */
+ end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ do {
+ reg = gk20a_readl(g, gr_fe_pwr_mode_r());
+
+ if (gr_fe_pwr_mode_req_v(reg) ==
+ gr_fe_pwr_mode_req_done_v())
+ break;
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+
+ } while (time_before(jiffies, end_jiffies));
+
+ if (!time_before(jiffies, end_jiffies))
+ gk20a_warn(dev_from_gk20a(g),
+ "failed to set power mode to auto\n");
+ }
+
+ return 0;
+}
+
+static int gr_gk20a_ctx_wait_ucode(struct gk20a *g, u32 mailbox_id,
+ u32 *mailbox_ret, u32 opc_success,
+ u32 mailbox_ok, u32 opc_fail,
+ u32 mailbox_fail)
+{
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 delay = GR_IDLE_CHECK_DEFAULT;
+ u32 check = WAIT_UCODE_LOOP;
+ u32 reg;
+
+ gk20a_dbg_fn("");
+
+ while (check == WAIT_UCODE_LOOP) {
+ if (!time_before(jiffies, end_jiffies) &&
+ tegra_platform_is_silicon())
+ check = WAIT_UCODE_TIMEOUT;
+
+ reg = gk20a_readl(g, gr_fecs_ctxsw_mailbox_r(mailbox_id));
+
+ if (mailbox_ret)
+ *mailbox_ret = reg;
+
+ switch (opc_success) {
+ case GR_IS_UCODE_OP_EQUAL:
+ if (reg == mailbox_ok)
+ check = WAIT_UCODE_OK;
+ break;
+ case GR_IS_UCODE_OP_NOT_EQUAL:
+ if (reg != mailbox_ok)
+ check = WAIT_UCODE_OK;
+ break;
+ case GR_IS_UCODE_OP_AND:
+ if (reg & mailbox_ok)
+ check = WAIT_UCODE_OK;
+ break;
+ case GR_IS_UCODE_OP_LESSER:
+ if (reg < mailbox_ok)
+ check = WAIT_UCODE_OK;
+ break;
+ case GR_IS_UCODE_OP_LESSER_EQUAL:
+ if (reg <= mailbox_ok)
+ check = WAIT_UCODE_OK;
+ break;
+ case GR_IS_UCODE_OP_SKIP:
+ /* do no success check */
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid success opcode 0x%x", opc_success);
+
+ check = WAIT_UCODE_ERROR;
+ break;
+ }
+
+ switch (opc_fail) {
+ case GR_IS_UCODE_OP_EQUAL:
+ if (reg == mailbox_fail)
+ check = WAIT_UCODE_ERROR;
+ break;
+ case GR_IS_UCODE_OP_NOT_EQUAL:
+ if (reg != mailbox_fail)
+ check = WAIT_UCODE_ERROR;
+ break;
+ case GR_IS_UCODE_OP_AND:
+ if (reg & mailbox_fail)
+ check = WAIT_UCODE_ERROR;
+ break;
+ case GR_IS_UCODE_OP_LESSER:
+ if (reg < mailbox_fail)
+ check = WAIT_UCODE_ERROR;
+ break;
+ case GR_IS_UCODE_OP_LESSER_EQUAL:
+ if (reg <= mailbox_fail)
+ check = WAIT_UCODE_ERROR;
+ break;
+ case GR_IS_UCODE_OP_SKIP:
+ /* do no check on fail*/
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid fail opcode 0x%x", opc_fail);
+ check = WAIT_UCODE_ERROR;
+ break;
+ }
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+ }
+
+ if (check == WAIT_UCODE_TIMEOUT) {
+ gk20a_err(dev_from_gk20a(g),
+ "timeout waiting on ucode response");
+ gk20a_fecs_dump_falcon_stats(g);
+ return -1;
+ } else if (check == WAIT_UCODE_ERROR) {
+ gk20a_err(dev_from_gk20a(g),
+ "ucode method failed on mailbox=%d value=0x%08x",
+ mailbox_id, reg);
+ gk20a_fecs_dump_falcon_stats(g);
+ return -1;
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+/* The following provides a less brittle way to submit FECS methods.
+ * Most, if not all, FECS method calls should be converted to use it. */
+struct fecs_method_op_gk20a {
+ struct {
+ u32 addr;
+ u32 data;
+ } method;
+
+ struct {
+ u32 id;
+ u32 data;
+ u32 clr;
+ u32 *ret;
+ u32 ok;
+ u32 fail;
+ } mailbox;
+
+ struct {
+ u32 ok;
+ u32 fail;
+ } cond;
+
+};
+
+int gr_gk20a_submit_fecs_method_op(struct gk20a *g,
+ struct fecs_method_op_gk20a op)
+{
+ struct gr_gk20a *gr = &g->gr;
+ int ret;
+
+ mutex_lock(&gr->fecs_mutex);
+
+ if (op.mailbox.id != 0)
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(op.mailbox.id),
+ op.mailbox.data);
+
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
+ gr_fecs_ctxsw_mailbox_clear_value_f(op.mailbox.clr));
+
+ gk20a_writel(g, gr_fecs_method_data_r(), op.method.data);
+ gk20a_writel(g, gr_fecs_method_push_r(),
+ gr_fecs_method_push_adr_f(op.method.addr));
+
+	/* op.mailbox.id == 4 cases require waiting for completion on
+	 * mailbox 0 */
+ if (op.mailbox.id == 4)
+ op.mailbox.id = 0;
+
+ ret = gr_gk20a_ctx_wait_ucode(g, op.mailbox.id, op.mailbox.ret,
+ op.cond.ok, op.mailbox.ok,
+ op.cond.fail, op.mailbox.fail);
+
+ mutex_unlock(&gr->fecs_mutex);
+
+ return ret;
+}
+
+int gr_gk20a_ctrl_ctxsw(struct gk20a *g, u32 fecs_method, u32 *ret)
+{
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = fecs_method,
+ .method.data = ~0,
+ .mailbox = { .id = 1, /*sideband?*/
+ .data = ~0, .clr = ~0, .ret = ret,
+ .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
+ .fail = gr_fecs_ctxsw_mailbox_value_fail_v(), },
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_EQUAL });
+}
+
+/* Stop processing (stall) context switches at FECS.
+ * The caller must hold the dbg_sessions_lock; otherwise, if multiple stop
+ * methods are sent to the ucode in sequence, it can get into an undefined
+ * state. */
+int gr_gk20a_disable_ctxsw(struct gk20a *g)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+ return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_stop_ctxsw_v(), 0);
+}
+
+/* Start processing (continue) context switches at FECS */
+int gr_gk20a_enable_ctxsw(struct gk20a *g)
+{
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+ return gr_gk20a_ctrl_ctxsw(g, gr_fecs_method_push_adr_start_ctxsw_v(), 0);
+}
+
+
+static int gr_gk20a_commit_inst(struct channel_gk20a *c, u64 gpu_va)
+{
+ u32 addr_lo;
+ u32 addr_hi;
+ void *inst_ptr = NULL;
+
+ gk20a_dbg_fn("");
+
+ /* flush gpu_va before commit */
+ gk20a_mm_fb_flush(c->g);
+ gk20a_mm_l2_flush(c->g, true);
+
+ inst_ptr = c->inst_block.cpuva;
+ if (!inst_ptr)
+ return -ENOMEM;
+
+ addr_lo = u64_lo32(gpu_va) >> 12;
+ addr_hi = u64_hi32(gpu_va);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_target_w(),
+ ram_in_gr_cs_wfi_f() | ram_in_gr_wfi_mode_virtual_f() |
+ ram_in_gr_wfi_ptr_lo_f(addr_lo));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_gr_wfi_ptr_hi_w(),
+ ram_in_gr_wfi_ptr_hi_f(addr_hi));
+
+ gk20a_mm_l2_invalidate(c->g);
+
+ return 0;
+}
+
+/*
+ * Context state can be written directly, or "patched" at times. So that
+ * code can be used in either situation it is written using a series of
+ * _ctx_patch_write(..., patch) statements. However, any necessary cpu
+ * map/unmap and gpu l2 invalidates should be minimized (i.e. not done
+ * once per patch write). Bracket a sequence of these writes with
+ * "_ctx_patch_write_begin" and "_ctx_patch_write_end" (see the usage
+ * sketch after gr_gk20a_ctx_patch_write() below).
+ */
+int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx)
+{
+ /* being defensive still... */
+ if (ch_ctx->patch_ctx.cpu_va) {
+ gk20a_err(dev_from_gk20a(g), "nested ctx patch begin?");
+ return -EBUSY;
+ }
+
+ ch_ctx->patch_ctx.cpu_va = vmap(ch_ctx->patch_ctx.pages,
+ PAGE_ALIGN(ch_ctx->patch_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+
+ if (!ch_ctx->patch_ctx.cpu_va)
+ return -ENOMEM;
+
+ return 0;
+}
+
+int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx)
+{
+ /* being defensive still... */
+ if (!ch_ctx->patch_ctx.cpu_va) {
+ gk20a_err(dev_from_gk20a(g), "dangling ctx patch end?");
+ return -EINVAL;
+ }
+
+ vunmap(ch_ctx->patch_ctx.cpu_va);
+ ch_ctx->patch_ctx.cpu_va = NULL;
+
+ gk20a_mm_l2_invalidate(g);
+ return 0;
+}
+
+int gr_gk20a_ctx_patch_write(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data, bool patch)
+{
+ u32 patch_slot = 0;
+ void *patch_ptr = NULL;
+ bool mapped_here = false;
+
+ BUG_ON(patch != 0 && ch_ctx == NULL);
+
+ if (patch) {
+ if (!ch_ctx)
+ return -EINVAL;
+		/* an optimization prolog/epilog (the begin/end helpers above)
+		 * avoids unnecessary maps and l2 invalidates, but be
+		 * defensive in case a caller skipped it... */
+ if (!ch_ctx->patch_ctx.cpu_va) {
+ int err;
+ gk20a_err(dev_from_gk20a(g),
+ "per-write ctx patch begin?");
+ /* yes, gr_gk20a_ctx_patch_smpc causes this one */
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ mapped_here = true;
+ } else
+ mapped_here = false;
+
+ patch_ptr = ch_ctx->patch_ctx.cpu_va;
+ patch_slot = ch_ctx->patch_ctx.data_count * 2;
+
+ gk20a_mem_wr32(patch_ptr, patch_slot++, addr);
+ gk20a_mem_wr32(patch_ptr, patch_slot++, data);
+
+ ch_ctx->patch_ctx.data_count++;
+
+ if (mapped_here)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
+ } else
+ gk20a_writel(g, addr, data);
+
+ return 0;
+}
+
+static int gr_gk20a_fecs_ctx_bind_channel(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ u32 inst_base_ptr = u64_lo32(c->inst_block.cpu_pa
+ >> ram_in_base_shift_v());
+ u32 ret;
+
+ gk20a_dbg_info("bind channel %d inst ptr 0x%08x",
+ c->hw_chid, inst_base_ptr);
+
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = gr_fecs_method_push_adr_bind_pointer_v(),
+ .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .mailbox = { .id = 0, .data = 0,
+ .clr = 0x30,
+ .ret = NULL,
+ .ok = 0x10,
+ .fail = 0x20, },
+ .cond.ok = GR_IS_UCODE_OP_AND,
+ .cond.fail = GR_IS_UCODE_OP_AND});
+ if (ret)
+ gk20a_err(dev_from_gk20a(g),
+ "bind channel instance failed");
+
+ return ret;
+}
+
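+/* Write the zcull ctxsw mode and the packed zcull buffer pointer into the
+ * channel's gr ctx image, optionally stalling GR engine activity while the
+ * gpu-cacheable image is updated. */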
+static int gr_gk20a_ctx_zcull_setup(struct gk20a *g, struct channel_gk20a *c,
+ bool disable_fifo)
+{
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+ u32 va_lo, va_hi, va;
+ int ret = 0;
+ void *ctx_ptr = NULL;
+
+ gk20a_dbg_fn("");
+
+ ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
+ PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!ctx_ptr)
+ return -ENOMEM;
+
+ if (ch_ctx->zcull_ctx.gpu_va == 0 &&
+ ch_ctx->zcull_ctx.ctx_sw_mode ==
+ ctxsw_prog_main_image_zcull_mode_separate_buffer_v()) {
+ ret = -EINVAL;
+ goto clean_up;
+ }
+
+ va_lo = u64_lo32(ch_ctx->zcull_ctx.gpu_va);
+ va_hi = u64_hi32(ch_ctx->zcull_ctx.gpu_va);
+ va = ((va_lo >> 8) & 0x00FFFFFF) | ((va_hi << 24) & 0xFF000000);
+
+ if (disable_fifo) {
+ ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to disable gr engine activity\n");
+ goto clean_up;
+ }
+ }
+
+ /* Channel gr_ctx buffer is gpu cacheable.
+ Flush and invalidate before cpu update. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_o(), 0,
+ ch_ctx->zcull_ctx.ctx_sw_mode);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, va);
+
+ if (disable_fifo) {
+ ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to enable gr engine activity\n");
+ goto clean_up;
+ }
+ }
+ gk20a_mm_l2_invalidate(g);
+
+clean_up:
+ vunmap(ctx_ptr);
+
+ return ret;
+}
+
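+/* Commit the circular-buffer manager state: beta (attribute) and alpha CB
+ * default sizes, plus per-PPC start offsets and sizes within the global
+ * attribute buffer, written directly or into the patch context depending on
+ * 'patch'. */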
+static int gr_gk20a_commit_global_cb_manager(struct gk20a *g,
+ struct channel_gk20a *c, bool patch)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = NULL;
+ u32 attrib_offset_in_chunk = 0;
+ u32 alpha_offset_in_chunk = 0;
+ u32 pd_ab_max_output;
+ u32 gpc_index, ppc_index;
+ u32 temp;
+ u32 cbm_cfg_size1, cbm_cfg_size2;
+
+ gk20a_dbg_fn("");
+
+ if (patch) {
+ int err;
+ ch_ctx = &c->ch_ctx;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_tga_constraintlogic_r(),
+ gr_ds_tga_constraintlogic_beta_cbsize_f(gr->attrib_cb_default_size) |
+ gr_ds_tga_constraintlogic_alpha_cbsize_f(gr->alpha_cb_default_size),
+ patch);
+
+ pd_ab_max_output = (gr->alpha_cb_default_size *
+ gr_gpc0_ppc0_cbm_cfg_size_granularity_v()) /
+ gr_pd_ab_dist_cfg1_max_output_granularity_v();
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg1_r(),
+ gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output) |
+ gr_pd_ab_dist_cfg1_max_batches_init_f(), patch);
+
+ alpha_offset_in_chunk = attrib_offset_in_chunk +
+ gr->tpc_count * gr->attrib_cb_size;
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ temp = proj_gpc_stride_v() * gpc_index;
+ for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
+ ppc_index++) {
+ cbm_cfg_size1 = gr->attrib_cb_default_size *
+ gr->pes_tpc_count[ppc_index][gpc_index];
+ cbm_cfg_size2 = gr->alpha_cb_default_size *
+ gr->pes_tpc_count[ppc_index][gpc_index];
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
+ gr_gpc0_ppc0_cbm_cfg_r() + temp +
+ proj_ppc_in_gpc_stride_v() * ppc_index,
+ gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(gr->timeslice_mode) |
+ gr_gpc0_ppc0_cbm_cfg_start_offset_f(attrib_offset_in_chunk) |
+ gr_gpc0_ppc0_cbm_cfg_size_f(cbm_cfg_size1), patch);
+
+ attrib_offset_in_chunk += gr->attrib_cb_size *
+ gr->pes_tpc_count[ppc_index][gpc_index];
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
+ gr_gpc0_ppc0_cbm_cfg2_r() + temp +
+ proj_ppc_in_gpc_stride_v() * ppc_index,
+ gr_gpc0_ppc0_cbm_cfg2_start_offset_f(alpha_offset_in_chunk) |
+ gr_gpc0_ppc0_cbm_cfg2_size_f(cbm_cfg_size2), patch);
+
+ alpha_offset_in_chunk += gr->alpha_cb_size *
+ gr->pes_tpc_count[ppc_index][gpc_index];
+ }
+ }
+
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
+ return 0;
+}
+
+static int gr_gk20a_commit_global_ctx_buffers(struct gk20a *g,
+ struct channel_gk20a *c, bool patch)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ u64 addr;
+ u32 size;
+
+ gk20a_dbg_fn("");
+ if (patch) {
+ int err;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
+
+ /* global pagepool buffer */
+ addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) >>
+ gr_scc_pagepool_base_addr_39_8_align_bits_v()) |
+ (u64_hi32(ch_ctx->global_ctx_buffer_va[PAGEPOOL_VA]) <<
+ (32 - gr_scc_pagepool_base_addr_39_8_align_bits_v()));
+
+ size = gr->global_ctx_buffer[PAGEPOOL].size /
+ gr_scc_pagepool_total_pages_byte_granularity_v();
+
+ if (size == gr_scc_pagepool_total_pages_hwmax_value_v())
+ size = gr_scc_pagepool_total_pages_hwmax_v();
+
+ gk20a_dbg_info("pagepool buffer addr : 0x%016llx, size : %d",
+ addr, size);
+
+ g->ops.gr.commit_global_pagepool(g, ch_ctx, addr, size, patch);
+
+ /* global bundle cb */
+ addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) >>
+ gr_scc_bundle_cb_base_addr_39_8_align_bits_v()) |
+ (u64_hi32(ch_ctx->global_ctx_buffer_va[CIRCULAR_VA]) <<
+ (32 - gr_scc_bundle_cb_base_addr_39_8_align_bits_v()));
+
+ size = gr->bundle_cb_default_size;
+
+ gk20a_dbg_info("bundle cb addr : 0x%016llx, size : %d",
+ addr, size);
+
+ g->ops.gr.commit_global_bundle_cb(g, ch_ctx, addr, size, patch);
+
+ /* global attrib cb */
+ addr = (u64_lo32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) >>
+ gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()) |
+ (u64_hi32(ch_ctx->global_ctx_buffer_va[ATTRIBUTE_VA]) <<
+ (32 - gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v()));
+
+ gk20a_dbg_info("attrib cb addr : 0x%016llx", addr);
+ g->ops.gr.commit_global_attrib_cb(g, ch_ctx, addr, patch);
+
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
+ return 0;
+}
+
+static void gr_gk20a_commit_global_attrib_cb(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, bool patch)
+{
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_attrib_cb_base_r(),
+ gr_gpcs_setup_attrib_cb_base_addr_39_12_f(addr) |
+ gr_gpcs_setup_attrib_cb_base_valid_true_f(), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(),
+ gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(addr) |
+ gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(), patch);
+}
+
+static void gr_gk20a_commit_global_bundle_cb(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, u64 size, bool patch)
+{
+ u32 data;
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_base_r(),
+ gr_scc_bundle_cb_base_addr_39_8_f(addr), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_bundle_cb_size_r(),
+ gr_scc_bundle_cb_size_div_256b_f(size) |
+ gr_scc_bundle_cb_size_valid_true_f(), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_base_r(),
+ gr_gpcs_setup_bundle_cb_base_addr_39_8_f(addr), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_setup_bundle_cb_size_r(),
+ gr_gpcs_setup_bundle_cb_size_div_256b_f(size) |
+ gr_gpcs_setup_bundle_cb_size_valid_true_f(), patch);
+
+ /* data for state_limit */
+ data = (g->gr.bundle_cb_default_size *
+ gr_scc_bundle_cb_size_div_256b_byte_granularity_v()) /
+ gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v();
+
+ data = min_t(u32, data, g->gr.min_gpm_fifo_depth);
+
+ gk20a_dbg_info("bundle cb token limit : %d, state limit : %d",
+ g->gr.bundle_cb_token_limit, data);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg2_r(),
+ gr_pd_ab_dist_cfg2_token_limit_f(g->gr.bundle_cb_token_limit) |
+ gr_pd_ab_dist_cfg2_state_limit_f(data), patch);
+
+}
+
+static int gr_gk20a_commit_global_timeslice(struct gk20a *g, struct channel_gk20a *c, bool patch)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = NULL;
+ u32 gpm_pd_cfg;
+ u32 pd_ab_dist_cfg0;
+ u32 ds_debug;
+ u32 mpc_vtg_debug;
+ u32 pe_vaf;
+ u32 pe_vsc_vpc;
+
+ gk20a_dbg_fn("");
+
+ gpm_pd_cfg = gk20a_readl(g, gr_gpcs_gpm_pd_cfg_r());
+ pd_ab_dist_cfg0 = gk20a_readl(g, gr_pd_ab_dist_cfg0_r());
+ ds_debug = gk20a_readl(g, gr_ds_debug_r());
+ mpc_vtg_debug = gk20a_readl(g, gr_gpcs_tpcs_mpc_vtg_debug_r());
+
+ if (patch) {
+ int err;
+ ch_ctx = &c->ch_ctx;
+ err = gr_gk20a_ctx_patch_write_begin(g, ch_ctx);
+ if (err)
+ return err;
+ }
+
+ if (gr->timeslice_mode == gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v()) {
+ pe_vaf = gk20a_readl(g, gr_gpcs_tpcs_pe_vaf_r());
+ pe_vsc_vpc = gk20a_readl(g, gr_gpcs_tpcs_pes_vsc_vpc_r());
+
+ gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f() | gpm_pd_cfg;
+ pe_vaf = gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f() | pe_vaf;
+ pe_vsc_vpc = gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f() | pe_vsc_vpc;
+ pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_en_f() | pd_ab_dist_cfg0;
+ ds_debug = gr_ds_debug_timeslice_mode_enable_f() | ds_debug;
+ mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f() | mpc_vtg_debug;
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pe_vaf_r(), pe_vaf, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_pes_vsc_vpc_r(), pe_vsc_vpc, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+ } else {
+ gpm_pd_cfg = gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f() | gpm_pd_cfg;
+ pd_ab_dist_cfg0 = gr_pd_ab_dist_cfg0_timeslice_enable_dis_f() | pd_ab_dist_cfg0;
+ ds_debug = gr_ds_debug_timeslice_mode_disable_f() | ds_debug;
+ mpc_vtg_debug = gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f() | mpc_vtg_debug;
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gpm_pd_cfg_r(), gpm_pd_cfg, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_ab_dist_cfg0_r(), pd_ab_dist_cfg0, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_ds_debug_r(), ds_debug, patch);
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_tpcs_mpc_vtg_debug_r(), mpc_vtg_debug, patch);
+ }
+
+ if (patch)
+ gr_gk20a_ctx_patch_write_end(g, ch_ctx);
+
+ return 0;
+}
+
+int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 norm_entries, norm_shift;
+ u32 coeff5_mod, coeff6_mod, coeff7_mod, coeff8_mod, coeff9_mod, coeff10_mod, coeff11_mod;
+ u32 map0, map1, map2, map3, map4, map5;
+
+ if (!gr->map_tiles)
+ return -1;
+
+ gk20a_dbg_fn("");
+
+ gk20a_writel(g, gr_crstr_map_table_cfg_r(),
+ gr_crstr_map_table_cfg_row_offset_f(gr->map_row_offset) |
+ gr_crstr_map_table_cfg_num_entries_f(gr->tpc_count));
+
+ map0 = gr_crstr_gpc_map0_tile0_f(gr->map_tiles[0]) |
+ gr_crstr_gpc_map0_tile1_f(gr->map_tiles[1]) |
+ gr_crstr_gpc_map0_tile2_f(gr->map_tiles[2]) |
+ gr_crstr_gpc_map0_tile3_f(gr->map_tiles[3]) |
+ gr_crstr_gpc_map0_tile4_f(gr->map_tiles[4]) |
+ gr_crstr_gpc_map0_tile5_f(gr->map_tiles[5]);
+
+ map1 = gr_crstr_gpc_map1_tile6_f(gr->map_tiles[6]) |
+ gr_crstr_gpc_map1_tile7_f(gr->map_tiles[7]) |
+ gr_crstr_gpc_map1_tile8_f(gr->map_tiles[8]) |
+ gr_crstr_gpc_map1_tile9_f(gr->map_tiles[9]) |
+ gr_crstr_gpc_map1_tile10_f(gr->map_tiles[10]) |
+ gr_crstr_gpc_map1_tile11_f(gr->map_tiles[11]);
+
+ map2 = gr_crstr_gpc_map2_tile12_f(gr->map_tiles[12]) |
+ gr_crstr_gpc_map2_tile13_f(gr->map_tiles[13]) |
+ gr_crstr_gpc_map2_tile14_f(gr->map_tiles[14]) |
+ gr_crstr_gpc_map2_tile15_f(gr->map_tiles[15]) |
+ gr_crstr_gpc_map2_tile16_f(gr->map_tiles[16]) |
+ gr_crstr_gpc_map2_tile17_f(gr->map_tiles[17]);
+
+ map3 = gr_crstr_gpc_map3_tile18_f(gr->map_tiles[18]) |
+ gr_crstr_gpc_map3_tile19_f(gr->map_tiles[19]) |
+ gr_crstr_gpc_map3_tile20_f(gr->map_tiles[20]) |
+ gr_crstr_gpc_map3_tile21_f(gr->map_tiles[21]) |
+ gr_crstr_gpc_map3_tile22_f(gr->map_tiles[22]) |
+ gr_crstr_gpc_map3_tile23_f(gr->map_tiles[23]);
+
+ map4 = gr_crstr_gpc_map4_tile24_f(gr->map_tiles[24]) |
+ gr_crstr_gpc_map4_tile25_f(gr->map_tiles[25]) |
+ gr_crstr_gpc_map4_tile26_f(gr->map_tiles[26]) |
+ gr_crstr_gpc_map4_tile27_f(gr->map_tiles[27]) |
+ gr_crstr_gpc_map4_tile28_f(gr->map_tiles[28]) |
+ gr_crstr_gpc_map4_tile29_f(gr->map_tiles[29]);
+
+ map5 = gr_crstr_gpc_map5_tile30_f(gr->map_tiles[30]) |
+ gr_crstr_gpc_map5_tile31_f(gr->map_tiles[31]) |
+ gr_crstr_gpc_map5_tile32_f(0) |
+ gr_crstr_gpc_map5_tile33_f(0) |
+ gr_crstr_gpc_map5_tile34_f(0) |
+ gr_crstr_gpc_map5_tile35_f(0);
+
+ gk20a_writel(g, gr_crstr_gpc_map0_r(), map0);
+ gk20a_writel(g, gr_crstr_gpc_map1_r(), map1);
+ gk20a_writel(g, gr_crstr_gpc_map2_r(), map2);
+ gk20a_writel(g, gr_crstr_gpc_map3_r(), map3);
+ gk20a_writel(g, gr_crstr_gpc_map4_r(), map4);
+ gk20a_writel(g, gr_crstr_gpc_map5_r(), map5);
+
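+	/* norm_shift scales the entry count so that norm_entries =
+	 * tpc_count << norm_shift lands in a normalized 16..31 range for the
+	 * TPC counts handled here; the coeffN_mod values below are then
+	 * (1 << N) % norm_entries. */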
+ switch (gr->tpc_count) {
+ case 1:
+ norm_shift = 4;
+ break;
+ case 2:
+ case 3:
+ norm_shift = 3;
+ break;
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ norm_shift = 2;
+ break;
+ case 8:
+ case 9:
+ case 10:
+ case 11:
+ case 12:
+ case 13:
+ case 14:
+ case 15:
+ norm_shift = 1;
+ break;
+ default:
+ norm_shift = 0;
+ break;
+ }
+
+ norm_entries = gr->tpc_count << norm_shift;
+ coeff5_mod = (1 << 5) % norm_entries;
+ coeff6_mod = (1 << 6) % norm_entries;
+ coeff7_mod = (1 << 7) % norm_entries;
+ coeff8_mod = (1 << 8) % norm_entries;
+ coeff9_mod = (1 << 9) % norm_entries;
+ coeff10_mod = (1 << 10) % norm_entries;
+ coeff11_mod = (1 << 11) % norm_entries;
+
+ gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg_r(),
+ gr_ppcs_wwdx_map_table_cfg_row_offset_f(gr->map_row_offset) |
+ gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(norm_entries) |
+ gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(norm_shift) |
+ gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(coeff5_mod) |
+ gr_ppcs_wwdx_map_table_cfg_num_entries_f(gr->tpc_count));
+
+ gk20a_writel(g, gr_ppcs_wwdx_map_table_cfg2_r(),
+ gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(coeff6_mod) |
+ gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(coeff7_mod) |
+ gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(coeff8_mod) |
+ gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(coeff9_mod) |
+ gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(coeff10_mod) |
+ gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(coeff11_mod));
+
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map0_r(), map0);
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map1_r(), map1);
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map2_r(), map2);
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map3_r(), map3);
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map4_r(), map4);
+ gk20a_writel(g, gr_ppcs_wwdx_map_gpc_map5_r(), map5);
+
+ gk20a_writel(g, gr_rstr2d_map_table_cfg_r(),
+ gr_rstr2d_map_table_cfg_row_offset_f(gr->map_row_offset) |
+ gr_rstr2d_map_table_cfg_num_entries_f(gr->tpc_count));
+
+ gk20a_writel(g, gr_rstr2d_gpc_map0_r(), map0);
+ gk20a_writel(g, gr_rstr2d_gpc_map1_r(), map1);
+ gk20a_writel(g, gr_rstr2d_gpc_map2_r(), map2);
+ gk20a_writel(g, gr_rstr2d_gpc_map3_r(), map3);
+ gk20a_writel(g, gr_rstr2d_gpc_map4_r(), map4);
+ gk20a_writel(g, gr_rstr2d_gpc_map5_r(), map5);
+
+ return 0;
+}
+
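+/* count_bits(): population count via Kernighan's trick (each iteration
+ * clears the lowest set bit).  clear_count_bits() below uses the same trick
+ * to clear the lowest 'clear_count' set bits of 'num'. */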
+static inline u32 count_bits(u32 mask)
+{
+ u32 temp = mask;
+ u32 count;
+ for (count = 0; temp != 0; count++)
+ temp &= temp - 1;
+
+ return count;
+}
+
+static inline u32 clear_count_bits(u32 num, u32 clear_count)
+{
+ u32 count = clear_count;
+ for (; (num != 0) && (count != 0); count--)
+ num &= num - 1;
+
+ return num;
+}
+
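+/* Build the PD alpha/beta ratio tables: for each of the 32 table rows the
+ * TPCs are split between an "alpha" set and a "beta" set (the alpha share
+ * grows with the row index), and the resulting per-GPC TPC masks are packed
+ * four GPCs per register into gr_pd_alpha/beta_ratio_table. */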
+static int gr_gk20a_setup_alpha_beta_tables(struct gk20a *g,
+ struct gr_gk20a *gr)
+{
+ u32 table_index_bits = 5;
+ u32 rows = (1 << table_index_bits);
+ u32 row_stride = gr_pd_alpha_ratio_table__size_1_v() / rows;
+
+ u32 row;
+ u32 index;
+ u32 gpc_index;
+ u32 gpcs_per_reg = 4;
+ u32 pes_index;
+ u32 tpc_count_pes;
+ u32 num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+
+ u32 alpha_target, beta_target;
+ u32 alpha_bits, beta_bits;
+ u32 alpha_mask, beta_mask, partial_mask;
+ u32 reg_offset;
+ bool assign_alpha;
+
+ u32 map_alpha[gr_pd_alpha_ratio_table__size_1_v()];
+ u32 map_beta[gr_pd_alpha_ratio_table__size_1_v()];
+ u32 map_reg_used[gr_pd_alpha_ratio_table__size_1_v()];
+
+ gk20a_dbg_fn("");
+
+ memset(map_alpha, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+ memset(map_beta, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+ memset(map_reg_used, 0, gr_pd_alpha_ratio_table__size_1_v() * sizeof(u32));
+
+ for (row = 0; row < rows; ++row) {
+ alpha_target = max_t(u32, gr->tpc_count * row / rows, 1);
+ beta_target = gr->tpc_count - alpha_target;
+
+ assign_alpha = (alpha_target < beta_target);
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ reg_offset = (row * row_stride) + (gpc_index / gpcs_per_reg);
+ alpha_mask = beta_mask = 0;
+
+ for (pes_index = 0; pes_index < num_pes_per_gpc; pes_index++) {
+ tpc_count_pes = gr->pes_tpc_count[pes_index][gpc_index];
+
+ if (assign_alpha) {
+ alpha_bits = (alpha_target == 0) ? 0 : tpc_count_pes;
+ beta_bits = tpc_count_pes - alpha_bits;
+ } else {
+ beta_bits = (beta_target == 0) ? 0 : tpc_count_pes;
+ alpha_bits = tpc_count_pes - beta_bits;
+ }
+
+ partial_mask = gr->pes_tpc_mask[pes_index][gpc_index];
+ partial_mask = clear_count_bits(partial_mask, tpc_count_pes - alpha_bits);
+ alpha_mask |= partial_mask;
+
+ partial_mask = gr->pes_tpc_mask[pes_index][gpc_index] ^ partial_mask;
+ beta_mask |= partial_mask;
+
+ alpha_target -= min(alpha_bits, alpha_target);
+ beta_target -= min(beta_bits, beta_target);
+
+ if ((alpha_bits > 0) || (beta_bits > 0))
+ assign_alpha = !assign_alpha;
+ }
+
+ switch (gpc_index % gpcs_per_reg) {
+ case 0:
+ map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n0_mask_f(alpha_mask);
+ map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n0_mask_f(beta_mask);
+ break;
+ case 1:
+ map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n1_mask_f(alpha_mask);
+ map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n1_mask_f(beta_mask);
+ break;
+ case 2:
+ map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n2_mask_f(alpha_mask);
+ map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n2_mask_f(beta_mask);
+ break;
+ case 3:
+ map_alpha[reg_offset] |= gr_pd_alpha_ratio_table_gpc_4n3_mask_f(alpha_mask);
+ map_beta[reg_offset] |= gr_pd_beta_ratio_table_gpc_4n3_mask_f(beta_mask);
+ break;
+ }
+ map_reg_used[reg_offset] = true;
+ }
+ }
+
+ for (index = 0; index < gr_pd_alpha_ratio_table__size_1_v(); index++) {
+ if (map_reg_used[index]) {
+ gk20a_writel(g, gr_pd_alpha_ratio_table_r(index), map_alpha[index]);
+ gk20a_writel(g, gr_pd_beta_ratio_table_r(index), map_beta[index]);
+ }
+ }
+
+ return 0;
+}
+
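+/* Program floorsweeping-dependent state: per-TPC SM ids, active TPC counts
+ * per GPC, the ROP/crossbar mapping tables, the PD skip table, active FBP
+ * counts, and an L2 max-ways-evict limit for single-FBP configurations. */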
+static int gr_gk20a_ctx_state_floorsweep(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 tpc_index, gpc_index;
+ u32 tpc_offset, gpc_offset;
+ u32 sm_id = 0, gpc_id = 0;
+ u32 sm_id_to_gpc_id[proj_scal_max_gpcs_v() * proj_scal_max_tpc_per_gpc_v()];
+ u32 tpc_per_gpc;
+ u32 max_ways_evict = INVALID_MAX_WAYS;
+ u32 l1c_dbg_reg_val;
+
+ gk20a_dbg_fn("");
+
+ for (tpc_index = 0; tpc_index < gr->max_tpc_per_gpc_count; tpc_index++) {
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ gpc_offset = proj_gpc_stride_v() * gpc_index;
+ if (tpc_index < gr->gpc_tpc_count[gpc_index]) {
+ tpc_offset = proj_tpc_in_gpc_stride_v() * tpc_index;
+
+ gk20a_writel(g, gr_gpc0_tpc0_sm_cfg_r() + gpc_offset + tpc_offset,
+ gr_gpc0_tpc0_sm_cfg_sm_id_f(sm_id));
+ gk20a_writel(g, gr_gpc0_tpc0_l1c_cfg_smid_r() + gpc_offset + tpc_offset,
+ gr_gpc0_tpc0_l1c_cfg_smid_value_f(sm_id));
+ gk20a_writel(g, gr_gpc0_gpm_pd_sm_id_r(tpc_index) + gpc_offset,
+ gr_gpc0_gpm_pd_sm_id_id_f(sm_id));
+ gk20a_writel(g, gr_gpc0_tpc0_pe_cfg_smid_r() + gpc_offset + tpc_offset,
+ gr_gpc0_tpc0_pe_cfg_smid_value_f(sm_id));
+
+ sm_id_to_gpc_id[sm_id] = gpc_index;
+ sm_id++;
+ }
+
+ gk20a_writel(g, gr_gpc0_gpm_pd_active_tpcs_r() + gpc_offset,
+ gr_gpc0_gpm_pd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
+ gk20a_writel(g, gr_gpc0_gpm_sd_active_tpcs_r() + gpc_offset,
+ gr_gpc0_gpm_sd_active_tpcs_num_f(gr->gpc_tpc_count[gpc_index]));
+ }
+ }
+
+ for (tpc_index = 0, gpc_id = 0;
+ tpc_index < gr_pd_num_tpc_per_gpc__size_1_v();
+ tpc_index++, gpc_id += 8) {
+
+ if (gpc_id >= gr->gpc_count)
+ gpc_id = 0;
+
+ tpc_per_gpc =
+ gr_pd_num_tpc_per_gpc_count0_f(gr->gpc_tpc_count[gpc_id + 0]) |
+ gr_pd_num_tpc_per_gpc_count1_f(gr->gpc_tpc_count[gpc_id + 1]) |
+ gr_pd_num_tpc_per_gpc_count2_f(gr->gpc_tpc_count[gpc_id + 2]) |
+ gr_pd_num_tpc_per_gpc_count3_f(gr->gpc_tpc_count[gpc_id + 3]) |
+ gr_pd_num_tpc_per_gpc_count4_f(gr->gpc_tpc_count[gpc_id + 4]) |
+ gr_pd_num_tpc_per_gpc_count5_f(gr->gpc_tpc_count[gpc_id + 5]) |
+ gr_pd_num_tpc_per_gpc_count6_f(gr->gpc_tpc_count[gpc_id + 6]) |
+ gr_pd_num_tpc_per_gpc_count7_f(gr->gpc_tpc_count[gpc_id + 7]);
+
+ gk20a_writel(g, gr_pd_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
+ gk20a_writel(g, gr_ds_num_tpc_per_gpc_r(tpc_index), tpc_per_gpc);
+ }
+
+ /* gr__setup_pd_mapping stubbed for gk20a */
+ gr_gk20a_setup_rop_mapping(g, gr);
+ if (g->ops.gr.setup_alpha_beta_tables)
+ g->ops.gr.setup_alpha_beta_tables(g, gr);
+
+ if (gr->num_fbps == 1)
+ max_ways_evict = 9;
+
+ if (max_ways_evict != INVALID_MAX_WAYS)
+ g->ops.ltc.set_max_ways_evict_last(g, max_ways_evict);
+
+ for (gpc_index = 0;
+ gpc_index < gr_pd_dist_skip_table__size_1_v() * 4;
+ gpc_index += 4) {
+
+ gk20a_writel(g, gr_pd_dist_skip_table_r(gpc_index/4),
+			gr_pd_dist_skip_table_gpc_4n0_mask_f(gr->gpc_skip_mask[gpc_index]) |
+			gr_pd_dist_skip_table_gpc_4n1_mask_f(gr->gpc_skip_mask[gpc_index + 1]) |
+			gr_pd_dist_skip_table_gpc_4n2_mask_f(gr->gpc_skip_mask[gpc_index + 2]) |
+			gr_pd_dist_skip_table_gpc_4n3_mask_f(gr->gpc_skip_mask[gpc_index + 3]));
+ }
+
+ gk20a_writel(g, gr_cwd_fs_r(),
+ gr_cwd_fs_num_gpcs_f(gr->gpc_count) |
+ gr_cwd_fs_num_tpcs_f(gr->tpc_count));
+
+ gk20a_writel(g, gr_bes_zrop_settings_r(),
+ gr_bes_zrop_settings_num_active_fbps_f(gr->num_fbps));
+ gk20a_writel(g, gr_bes_crop_settings_r(),
+ gr_bes_crop_settings_num_active_fbps_f(gr->num_fbps));
+
+ /* turn on cya15 bit for a default val that missed the cut */
+ l1c_dbg_reg_val = gk20a_readl(g, gr_gpc0_tpc0_l1c_dbg_r());
+ l1c_dbg_reg_val |= gr_gpc0_tpc0_l1c_dbg_cya15_en_f();
+ gk20a_writel(g, gr_gpc0_tpc0_l1c_dbg_r(), l1c_dbg_reg_val);
+
+ return 0;
+}
+
+static int gr_gk20a_fecs_ctx_image_save(struct channel_gk20a *c, u32 save_type)
+{
+ struct gk20a *g = c->g;
+ int ret;
+
+ u32 inst_base_ptr =
+ u64_lo32(c->inst_block.cpu_pa
+ >> ram_in_base_shift_v());
+
+
+ gk20a_dbg_fn("");
+
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.addr = save_type,
+ .method.data = (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .mailbox = {.id = 0, .data = 0, .clr = 3, .ret = NULL,
+ .ok = 1, .fail = 2,
+ },
+ .cond.ok = GR_IS_UCODE_OP_AND,
+ .cond.fail = GR_IS_UCODE_OP_AND,
+ });
+
+ if (ret)
+ gk20a_err(dev_from_gk20a(g), "save context image failed");
+
+ return ret;
+}
+
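+/* Feed the software bundle init list through the pipe with pipe-mode
+ * override enabled, skipping redundant data writes and waiting for idle
+ * whenever a GO_IDLE bundle is issued. */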
+static u32 gk20a_init_sw_bundle(struct gk20a *g)
+{
+ struct av_list_gk20a *sw_bundle_init = &g->gr.ctx_vars.sw_bundle_init;
+ u32 last_bundle_data = 0;
+ u32 err = 0;
+ int i;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+
+ /* enable pipe mode override */
+ gk20a_writel(g, gr_pipe_bundle_config_r(),
+ gr_pipe_bundle_config_override_pipe_mode_enabled_f());
+
+ /* load bundle init */
+ for (i = 0; i < sw_bundle_init->count; i++) {
+
+ if (i == 0 || last_bundle_data != sw_bundle_init->l[i].value) {
+ gk20a_writel(g, gr_pipe_bundle_data_r(),
+ sw_bundle_init->l[i].value);
+ last_bundle_data = sw_bundle_init->l[i].value;
+ }
+
+ gk20a_writel(g, gr_pipe_bundle_address_r(),
+ sw_bundle_init->l[i].addr);
+
+ if (gr_pipe_bundle_address_value_v(sw_bundle_init->l[i].addr) ==
+ GR_GO_IDLE_BUNDLE)
+ err |= gr_gk20a_wait_idle(g, end_jiffies,
+ GR_IDLE_CHECK_DEFAULT);
+ }
+
+ /* disable pipe mode override */
+ gk20a_writel(g, gr_pipe_bundle_config_r(),
+ gr_pipe_bundle_config_override_pipe_mode_disabled_f());
+
+ return err;
+}
+
+/* init global golden image from a fresh gr_ctx in channel ctx.
+ save a copy in local_golden_image in ctx_vars */
+static int gr_gk20a_init_golden_ctx_image(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ u32 ctx_header_bytes = ctxsw_prog_fecs_header_v();
+ u32 ctx_header_words;
+ u32 i;
+ u32 data;
+ void *ctx_ptr = NULL;
+ void *gold_ptr = NULL;
+ u32 err = 0;
+
+ gk20a_dbg_fn("");
+
+	/* golden ctx is global to all channels. Although only the first
+	   channel initializes the golden image, the driver needs to prevent
+	   multiple channels from initializing the golden ctx at the same time */
+ mutex_lock(&gr->ctx_mutex);
+
+ if (gr->ctx_vars.golden_image_initialized)
+ goto clean_up;
+
+ err = gr_gk20a_fecs_ctx_bind_channel(g, c);
+ if (err)
+ goto clean_up;
+
+ err = gk20a_init_sw_bundle(g);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_elpg_protected_call(g,
+ gr_gk20a_commit_global_ctx_buffers(g, c, false));
+ if (err)
+ goto clean_up;
+
+ gold_ptr = vmap(gr->global_ctx_buffer[GOLDEN_CTX].pages,
+ PAGE_ALIGN(gr->global_ctx_buffer[GOLDEN_CTX].size) >>
+ PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!gold_ptr)
+ goto clean_up;
+
+ ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
+ PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!ctx_ptr)
+ goto clean_up;
+
+ ctx_header_words = roundup(ctx_header_bytes, sizeof(u32));
+ ctx_header_words >>= 2;
+
+ /* Channel gr_ctx buffer is gpu cacheable.
+ Flush before cpu read. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, false);
+
+ for (i = 0; i < ctx_header_words; i++) {
+ data = gk20a_mem_rd32(ctx_ptr, i);
+ gk20a_mem_wr32(gold_ptr, i, data);
+ }
+
+ gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_o(), 0,
+ ctxsw_prog_main_image_zcull_mode_no_ctxsw_v());
+
+ gk20a_mem_wr32(gold_ptr + ctxsw_prog_main_image_zcull_ptr_o(), 0, 0);
+
+ gr_gk20a_commit_inst(c, ch_ctx->global_ctx_buffer_va[GOLDEN_CTX_VA]);
+
+ gr_gk20a_fecs_ctx_image_save(c, gr_fecs_method_push_adr_wfi_golden_save_v());
+
+ if (gr->ctx_vars.local_golden_image == NULL) {
+
+ gr->ctx_vars.local_golden_image =
+ kzalloc(gr->ctx_vars.golden_image_size, GFP_KERNEL);
+
+ if (gr->ctx_vars.local_golden_image == NULL) {
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
+ gr->ctx_vars.local_golden_image[i] =
+ gk20a_mem_rd32(gold_ptr, i);
+ }
+
+ gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
+
+ gr->ctx_vars.golden_image_initialized = true;
+
+ gk20a_mm_l2_invalidate(g);
+
+ gk20a_writel(g, gr_fecs_current_ctx_r(),
+ gr_fecs_current_ctx_valid_false_f());
+
+clean_up:
+ if (err)
+ gk20a_err(dev_from_gk20a(g), "fail");
+ else
+ gk20a_dbg_fn("done");
+
+ if (gold_ptr)
+ vunmap(gold_ptr);
+ if (ctx_ptr)
+ vunmap(ctx_ptr);
+
+ mutex_unlock(&gr->ctx_mutex);
+ return err;
+}
+
+int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
+ struct channel_gk20a *c,
+ bool enable_smpc_ctxsw)
+{
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ void *ctx_ptr = NULL;
+ u32 data;
+
+ /*XXX caller responsible for making sure the channel is quiesced? */
+
+ /* Channel gr_ctx buffer is gpu cacheable.
+ Flush and invalidate before cpu update. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
+ PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!ctx_ptr)
+ return -ENOMEM;
+
+ data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
+ data = data & ~ctxsw_prog_main_image_pm_smpc_mode_m();
+ data |= enable_smpc_ctxsw ?
+ ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f() :
+ ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f();
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
+ data);
+
+ vunmap(ctx_ptr);
+
+ gk20a_mm_l2_invalidate(g);
+
+ return 0;
+}
+
+/* load the saved copy of the golden image into the channel gr_ctx */
+static int gr_gk20a_load_golden_ctx_image(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ u32 virt_addr_lo;
+ u32 virt_addr_hi;
+ u32 i, v, data;
+ int ret = 0;
+ void *ctx_ptr = NULL;
+
+ gk20a_dbg_fn("");
+
+ if (gr->ctx_vars.local_golden_image == NULL)
+ return -1;
+
+ /* Channel gr_ctx buffer is gpu cacheable.
+ Flush and invalidate before cpu update. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
+ PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!ctx_ptr)
+ return -ENOMEM;
+
+ for (i = 0; i < gr->ctx_vars.golden_image_size / 4; i++)
+ gk20a_mem_wr32(ctx_ptr, i, gr->ctx_vars.local_golden_image[i]);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_save_ops_o(), 0, 0);
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_num_restore_ops_o(), 0, 0);
+
+ virt_addr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
+ virt_addr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_count_o(), 0,
+ ch_ctx->patch_ctx.data_count);
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_lo_o(), 0,
+ virt_addr_lo);
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_patch_adr_hi_o(), 0,
+ virt_addr_hi);
+
+ /* no user for client managed performance counter ctx */
+ ch_ctx->pm_ctx.ctx_sw_mode =
+ ctxsw_prog_main_image_pm_mode_no_ctxsw_f();
+ data = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0);
+ data = data & ~ctxsw_prog_main_image_pm_mode_m();
+ data |= ch_ctx->pm_ctx.ctx_sw_mode;
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_o(), 0,
+ data);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_pm_ptr_o(), 0, 0);
+
+ /* set priv access map */
+ virt_addr_lo =
+ u64_lo32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
+ virt_addr_hi =
+ u64_hi32(ch_ctx->global_ctx_buffer_va[PRIV_ACCESS_MAP_VA]);
+
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_config_o(), 0,
+ ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f());
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_lo_o(), 0,
+ virt_addr_lo);
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_priv_access_map_addr_hi_o(), 0,
+ virt_addr_hi);
+ /* disable verif features */
+ v = gk20a_mem_rd32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0);
+ v = v & ~(ctxsw_prog_main_image_misc_options_verif_features_m());
+ v = v | ctxsw_prog_main_image_misc_options_verif_features_disabled_f();
+ gk20a_mem_wr32(ctx_ptr + ctxsw_prog_main_image_misc_options_o(), 0, v);
+
+
+ vunmap(ctx_ptr);
+
+ gk20a_mm_l2_invalidate(g);
+
+ if (tegra_platform_is_linsim()) {
+ u32 inst_base_ptr =
+ u64_lo32(c->inst_block.cpu_pa
+ >> ram_in_base_shift_v());
+
+ ret = gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .method.data =
+ (gr_fecs_current_ctx_ptr_f(inst_base_ptr) |
+ gr_fecs_current_ctx_target_vid_mem_f() |
+ gr_fecs_current_ctx_valid_f(1)),
+ .method.addr =
+ gr_fecs_method_push_adr_restore_golden_v(),
+ .mailbox = {
+ .id = 0, .data = 0,
+ .clr = ~0, .ret = NULL,
+ .ok = gr_fecs_ctxsw_mailbox_value_pass_v(),
+ .fail = 0},
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_SKIP});
+
+ if (ret)
+ gk20a_err(dev_from_gk20a(g),
+ "restore context image failed");
+ }
+
+ return ret;
+}
+
+static void gr_gk20a_start_falcon_ucode(struct gk20a *g)
+{
+ gk20a_dbg_fn("");
+
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0),
+ gr_fecs_ctxsw_mailbox_clear_value_f(~0));
+
+ gk20a_writel(g, gr_gpccs_dmactl_r(), gr_gpccs_dmactl_require_ctx_f(0));
+ gk20a_writel(g, gr_fecs_dmactl_r(), gr_fecs_dmactl_require_ctx_f(0));
+
+ gk20a_writel(g, gr_gpccs_cpuctl_r(), gr_gpccs_cpuctl_startcpu_f(1));
+ gk20a_writel(g, gr_fecs_cpuctl_r(), gr_fecs_cpuctl_startcpu_f(1));
+
+ gk20a_dbg_fn("done");
+}
+
+static int gr_gk20a_init_ctxsw_ucode_vaspace(struct gk20a *g)
+{
+ struct mm_gk20a *mm = &g->mm;
+ struct vm_gk20a *vm = &mm->pmu.vm;
+ struct device *d = dev_from_gk20a(g);
+ struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
+ void *inst_ptr;
+ u32 pde_addr_lo;
+ u32 pde_addr_hi;
+ u64 pde_addr;
+ dma_addr_t iova;
+
+ /* Alloc mem of inst block */
+ ucode_info->inst_blk_desc.size = ram_in_alloc_size_v();
+ ucode_info->inst_blk_desc.cpuva = dma_alloc_coherent(d,
+ ucode_info->inst_blk_desc.size,
+ &iova,
+ GFP_KERNEL);
+ if (!ucode_info->inst_blk_desc.cpuva) {
+ gk20a_err(d, "failed to allocate memory\n");
+ return -ENOMEM;
+ }
+
+ ucode_info->inst_blk_desc.iova = iova;
+ ucode_info->inst_blk_desc.cpu_pa = gk20a_get_phys_from_iova(d,
+ ucode_info->inst_blk_desc.iova);
+
+ inst_ptr = ucode_info->inst_blk_desc.cpuva;
+
+ /* Set inst block */
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
+ u64_lo32(vm->va_limit) | 0xFFF);
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
+ ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
+
+ pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
+ pde_addr_lo = u64_lo32(pde_addr >> 12);
+ pde_addr_hi = u64_hi32(pde_addr);
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
+ ram_in_page_dir_base_target_vid_mem_f() |
+ ram_in_page_dir_base_vol_true_f() |
+ ram_in_page_dir_base_lo_f(pde_addr_lo));
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
+ ram_in_page_dir_base_hi_f(pde_addr_hi));
+
+ /* Map ucode surface to GMMU */
+ ucode_info->ucode_gpuva = gk20a_gmmu_map(vm,
+ &ucode_info->surface_desc.sgt,
+ ucode_info->surface_desc.size,
+ 0, /* flags */
+ gk20a_mem_flag_read_only);
+ if (!ucode_info->ucode_gpuva) {
+ gk20a_err(d, "failed to update gmmu ptes\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void gr_gk20a_init_ctxsw_ucode_segment(
+ struct gk20a_ctxsw_ucode_segment *p_seg, u32 *offset, u32 size)
+{
+ p_seg->offset = *offset;
+ p_seg->size = size;
+ *offset = ALIGN(*offset + size, BLK_SIZE);
+}
+
+static void gr_gk20a_init_ctxsw_ucode_segments(
+ struct gk20a_ctxsw_ucode_segments *segments, u32 *offset,
+ struct gk20a_ctxsw_bootloader_desc *bootdesc,
+ u32 code_size, u32 data_size)
+{
+ u32 boot_size = ALIGN(bootdesc->size, sizeof(u32));
+ segments->boot_entry = bootdesc->entry_point;
+ segments->boot_imem_offset = bootdesc->imem_offset;
+ gr_gk20a_init_ctxsw_ucode_segment(&segments->boot, offset, boot_size);
+ gr_gk20a_init_ctxsw_ucode_segment(&segments->code, offset, code_size);
+ gr_gk20a_init_ctxsw_ucode_segment(&segments->data, offset, data_size);
+}
+
+static int gr_gk20a_copy_ctxsw_ucode_segments(
+ u8 *buf,
+ struct gk20a_ctxsw_ucode_segments *segments,
+ u32 *bootimage,
+ u32 *code, u32 *data)
+{
+ memcpy(buf + segments->boot.offset, bootimage, segments->boot.size);
+ memcpy(buf + segments->code.offset, code, segments->code.size);
+ memcpy(buf + segments->data.offset, data, segments->data.size);
+ return 0;
+}
+
+static int gr_gk20a_init_ctxsw_ucode(struct gk20a *g)
+{
+ struct device *d = dev_from_gk20a(g);
+ struct mm_gk20a *mm = &g->mm;
+ struct vm_gk20a *vm = &mm->pmu.vm;
+ struct gk20a_ctxsw_bootloader_desc *fecs_boot_desc;
+ struct gk20a_ctxsw_bootloader_desc *gpccs_boot_desc;
+ const struct firmware *fecs_fw;
+ const struct firmware *gpccs_fw;
+ u32 *fecs_boot_image;
+ u32 *gpccs_boot_image;
+ struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
+ u8 *buf;
+ u32 ucode_size;
+ int err = 0;
+ dma_addr_t iova;
+ DEFINE_DMA_ATTRS(attrs);
+
+ fecs_fw = gk20a_request_firmware(g, GK20A_FECS_UCODE_IMAGE);
+ if (!fecs_fw) {
+ gk20a_err(d, "failed to load fecs ucode!!");
+ return -ENOENT;
+ }
+
+ fecs_boot_desc = (void *)fecs_fw->data;
+ fecs_boot_image = (void *)(fecs_fw->data +
+ sizeof(struct gk20a_ctxsw_bootloader_desc));
+
+ gpccs_fw = gk20a_request_firmware(g, GK20A_GPCCS_UCODE_IMAGE);
+ if (!gpccs_fw) {
+ release_firmware(fecs_fw);
+ gk20a_err(d, "failed to load gpccs ucode!!");
+ return -ENOENT;
+ }
+
+ gpccs_boot_desc = (void *)gpccs_fw->data;
+ gpccs_boot_image = (void *)(gpccs_fw->data +
+ sizeof(struct gk20a_ctxsw_bootloader_desc));
+
+ ucode_size = 0;
+ gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->fecs, &ucode_size,
+ fecs_boot_desc,
+ g->gr.ctx_vars.ucode.fecs.inst.count * sizeof(u32),
+ g->gr.ctx_vars.ucode.fecs.data.count * sizeof(u32));
+ gr_gk20a_init_ctxsw_ucode_segments(&ucode_info->gpccs, &ucode_size,
+ gpccs_boot_desc,
+ g->gr.ctx_vars.ucode.gpccs.inst.count * sizeof(u32),
+ g->gr.ctx_vars.ucode.gpccs.data.count * sizeof(u32));
+
+ ucode_info->surface_desc.size = ucode_size;
+ dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
+ ucode_info->surface_desc.cpuva = dma_alloc_attrs(d,
+ ucode_info->surface_desc.size,
+ &iova,
+ GFP_KERNEL,
+ &attrs);
+ if (!ucode_info->surface_desc.cpuva) {
+ gk20a_err(d, "memory allocation failed\n");
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ ucode_info->surface_desc.iova = iova;
+ err = gk20a_get_sgtable(d, &ucode_info->surface_desc.sgt,
+ ucode_info->surface_desc.cpuva,
+ ucode_info->surface_desc.iova,
+ ucode_info->surface_desc.size);
+ if (err) {
+ gk20a_err(d, "failed to create sg table\n");
+ goto clean_up;
+ }
+
+ buf = (u8 *)ucode_info->surface_desc.cpuva;
+ if (!buf) {
+ gk20a_err(d, "failed to map surface desc buffer");
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->fecs,
+ fecs_boot_image,
+ g->gr.ctx_vars.ucode.fecs.inst.l,
+ g->gr.ctx_vars.ucode.fecs.data.l);
+
+ release_firmware(fecs_fw);
+ fecs_fw = NULL;
+
+ gr_gk20a_copy_ctxsw_ucode_segments(buf, &ucode_info->gpccs,
+ gpccs_boot_image,
+ g->gr.ctx_vars.ucode.gpccs.inst.l,
+ g->gr.ctx_vars.ucode.gpccs.data.l);
+
+ release_firmware(gpccs_fw);
+ gpccs_fw = NULL;
+
+ err = gr_gk20a_init_ctxsw_ucode_vaspace(g);
+ if (err)
+ goto clean_up;
+
+ gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
+
+ return 0;
+
+ clean_up:
+ if (ucode_info->ucode_gpuva)
+ gk20a_gmmu_unmap(vm, ucode_info->ucode_gpuva,
+ ucode_info->surface_desc.size, gk20a_mem_flag_none);
+ if (ucode_info->surface_desc.sgt)
+ gk20a_free_sgtable(&ucode_info->surface_desc.sgt);
+ if (ucode_info->surface_desc.cpuva)
+ dma_free_attrs(d, ucode_info->surface_desc.size,
+ ucode_info->surface_desc.cpuva,
+ ucode_info->surface_desc.iova,
+ &attrs);
+ ucode_info->surface_desc.cpuva = NULL;
+ ucode_info->surface_desc.iova = 0;
+
+ release_firmware(gpccs_fw);
+ gpccs_fw = NULL;
+ release_firmware(fecs_fw);
+ fecs_fw = NULL;
+
+ return err;
+}
+
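+/* Bind the ctxsw ucode instance block to the FECS arbiter: wait for the
+ * arbiter to go idle, program the new/current context pointers, and issue
+ * arbiter commands, polling for completion after each one. */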
+static void gr_gk20a_load_falcon_bind_instblk(struct gk20a *g)
+{
+ struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
+ int retries = 20;
+ phys_addr_t inst_ptr;
+ u32 val;
+
+ while ((gk20a_readl(g, gr_fecs_ctxsw_status_1_r()) &
+ gr_fecs_ctxsw_status_1_arb_busy_m()) && retries) {
+ udelay(2);
+ retries--;
+ }
+ if (!retries)
+ gk20a_err(dev_from_gk20a(g), "arbiter idle timeout");
+
+ gk20a_writel(g, gr_fecs_arb_ctx_adr_r(), 0x0);
+
+ inst_ptr = ucode_info->inst_blk_desc.cpu_pa;
+ gk20a_writel(g, gr_fecs_new_ctx_r(),
+ gr_fecs_new_ctx_ptr_f(inst_ptr >> 12) |
+ gr_fecs_new_ctx_target_m() |
+ gr_fecs_new_ctx_valid_m());
+
+ gk20a_writel(g, gr_fecs_arb_ctx_ptr_r(),
+ gr_fecs_arb_ctx_ptr_ptr_f(inst_ptr >> 12) |
+ gr_fecs_arb_ctx_ptr_target_m());
+
+ gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), 0x7);
+
+ /* Wait for arbiter command to complete */
+ retries = 20;
+ val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
+ while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
+ udelay(2);
+ retries--;
+ val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
+ }
+ if (!retries)
+ gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
+
+ gk20a_writel(g, gr_fecs_current_ctx_r(),
+ gr_fecs_current_ctx_ptr_f(inst_ptr >> 12) |
+ gr_fecs_current_ctx_target_m() |
+ gr_fecs_current_ctx_valid_m());
+ /* Send command to arbiter to flush */
+ gk20a_writel(g, gr_fecs_arb_ctx_cmd_r(), gr_fecs_arb_ctx_cmd_cmd_s());
+
+ retries = 20;
+ val = (gk20a_readl(g, gr_fecs_arb_ctx_cmd_r()));
+ while (gr_fecs_arb_ctx_cmd_cmd_v(val) && retries) {
+ udelay(2);
+ retries--;
+ val = gk20a_readl(g, gr_fecs_arb_ctx_cmd_r());
+ }
+ if (!retries)
+ gk20a_err(dev_from_gk20a(g), "arbiter complete timeout");
+}
+
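+/* Load one falcon (FECS, or GPCCS when reg_offset selects the gpccs
+ * aperture): write the bootloader header into DMEM, DMA the boot image into
+ * IMEM in 256-byte blocks, then set the boot vector and start the CPU. */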
+static int gr_gk20a_load_ctxsw_ucode_segments(struct gk20a *g, u64 addr_base,
+ struct gk20a_ctxsw_ucode_segments *segments, u32 reg_offset)
+{
+ u32 addr_code32;
+ u32 addr_data32;
+ u32 addr_load32;
+ u32 dst = 0;
+ u32 blocks;
+ u32 b;
+
+ addr_code32 = u64_lo32((addr_base + segments->code.offset) >> 8);
+ addr_data32 = u64_lo32((addr_base + segments->data.offset) >> 8);
+ addr_load32 = u64_lo32((addr_base + segments->boot.offset) >> 8);
+
+ gk20a_writel(g, reg_offset + gr_fecs_dmactl_r(),
+ gr_fecs_dmactl_require_ctx_f(0));
+
+ /*
+ * Copy falcon bootloader header into dmem at offset 0.
+ * Configure dmem port 0 for auto-incrementing writes starting at dmem
+ * offset 0.
+ */
+ gk20a_writel(g, reg_offset + gr_fecs_dmemc_r(0),
+ gr_fecs_dmemc_offs_f(0) |
+ gr_fecs_dmemc_blk_f(0) |
+ gr_fecs_dmemc_aincw_f(1));
+
+ /* Write out the actual data */
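+	/* The ten words written below form the DMEM header consumed by the
+	 * bootloader: the code DMA address and size, the data DMA address and
+	 * size, plus zero padding.  The exact field layout is dictated by the
+	 * boot image, not by this driver. */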
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->code.size);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_data32);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), segments->data.size);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), addr_code32);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
+ gk20a_writel(g, reg_offset + gr_fecs_dmemd_r(0), 0);
+
+ blocks = ((segments->boot.size + 0xFF) & ~0xFF) >> 8;
+
+ /*
+ * Set the base FB address for the DMA transfer. Subtract off the 256
+ * byte IMEM block offset such that the relative FB and IMEM offsets
+ * match, allowing the IMEM tags to be properly created.
+ */
+
+ dst = segments->boot_imem_offset;
+ gk20a_writel(g, reg_offset + gr_fecs_dmatrfbase_r(),
+ (addr_load32 - (dst >> 8)));
+
+ for (b = 0; b < blocks; b++) {
+ /* Setup destination IMEM offset */
+ gk20a_writel(g, reg_offset + gr_fecs_dmatrfmoffs_r(),
+ dst + (b << 8));
+
+ /* Setup source offset (relative to BASE) */
+ gk20a_writel(g, reg_offset + gr_fecs_dmatrffboffs_r(),
+ dst + (b << 8));
+
+ gk20a_writel(g, reg_offset + gr_fecs_dmatrfcmd_r(),
+ gr_fecs_dmatrfcmd_imem_f(0x01) |
+ gr_fecs_dmatrfcmd_write_f(0x00) |
+ gr_fecs_dmatrfcmd_size_f(0x06) |
+ gr_fecs_dmatrfcmd_ctxdma_f(0));
+ }
+
+ /* Specify the falcon boot vector */
+ gk20a_writel(g, reg_offset + gr_fecs_bootvec_r(),
+ gr_fecs_bootvec_vec_f(segments->boot_entry));
+
+ /* Write to CPUCTL to start the falcon */
+ gk20a_writel(g, reg_offset + gr_fecs_cpuctl_r(),
+ gr_fecs_cpuctl_startcpu_f(0x01));
+
+ return 0;
+}
+
+static void gr_gk20a_load_falcon_with_bootloader(struct gk20a *g)
+{
+ struct gk20a_ctxsw_ucode_info *ucode_info = &g->ctxsw_ucode_info;
+ u64 addr_base = ucode_info->ucode_gpuva;
+
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0x0);
+
+ gr_gk20a_load_falcon_bind_instblk(g);
+
+ gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
+ &g->ctxsw_ucode_info.fecs, 0);
+
+ gr_gk20a_load_ctxsw_ucode_segments(g, addr_base,
+ &g->ctxsw_ucode_info.gpccs,
+ gr_gpcs_gpccs_falcon_hwcfg_r() -
+ gr_fecs_falcon_hwcfg_r());
+}
+
+static int gr_gk20a_load_ctxsw_ucode(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 ret;
+
+ gk20a_dbg_fn("");
+
+ if (tegra_platform_is_linsim()) {
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_r(7),
+ gr_fecs_ctxsw_mailbox_value_f(0xc0de7777));
+ gk20a_writel(g, gr_gpccs_ctxsw_mailbox_r(7),
+ gr_gpccs_ctxsw_mailbox_value_f(0xc0de7777));
+ }
+
+	/*
+	 * In case the PMU falcon is not being used, revert to the old way of
+	 * loading the gr ucode, without the faster bootstrap routine.
+	 */
+ if (!support_gk20a_pmu()) {
+ gr_gk20a_load_falcon_dmem(g);
+ gr_gk20a_load_falcon_imem(g);
+ gr_gk20a_start_falcon_ucode(g);
+ } else {
+ if (!gr->skip_ucode_init)
+ gr_gk20a_init_ctxsw_ucode(g);
+ gr_gk20a_load_falcon_with_bootloader(g);
+ gr->skip_ucode_init = true;
+ }
+
+ ret = gr_gk20a_ctx_wait_ucode(g, 0, 0,
+ GR_IS_UCODE_OP_EQUAL,
+ eUcodeHandshakeInitComplete,
+ GR_IS_UCODE_OP_SKIP, 0);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g), "falcon ucode init timeout");
+ return ret;
+ }
+
+ if (support_gk20a_pmu())
+ gk20a_writel(g, gr_fecs_current_ctx_r(),
+ gr_fecs_current_ctx_valid_false_f());
+
+ gk20a_writel(g, gr_fecs_ctxsw_mailbox_clear_r(0), 0xffffffff);
+ gk20a_writel(g, gr_fecs_method_data_r(), 0x7fffffff);
+ gk20a_writel(g, gr_fecs_method_push_r(),
+ gr_fecs_method_push_adr_set_watchdog_timeout_f());
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+static int gr_gk20a_init_ctx_state(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 golden_ctx_image_size = 0;
+ u32 zcull_ctx_image_size = 0;
+ u32 pm_ctx_image_size = 0;
+ u32 ret;
+ struct fecs_method_op_gk20a op = {
+ .mailbox = { .id = 0, .data = 0,
+ .clr = ~0, .ok = 0, .fail = 0},
+ .method.data = 0,
+ .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ };
+
+ gk20a_dbg_fn("");
+ op.method.addr = gr_fecs_method_push_adr_discover_image_size_v();
+ op.mailbox.ret = &golden_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "query golden image size failed");
+ return ret;
+ }
+ op.method.addr = gr_fecs_method_push_adr_discover_zcull_image_size_v();
+ op.mailbox.ret = &zcull_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "query zcull ctx image size failed");
+ return ret;
+ }
+ op.method.addr = gr_fecs_method_push_adr_discover_pm_image_size_v();
+ op.mailbox.ret = &pm_ctx_image_size;
+ ret = gr_gk20a_submit_fecs_method_op(g, op);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "query pm ctx image size failed");
+ return ret;
+ }
+
+ if (!g->gr.ctx_vars.golden_image_size &&
+ !g->gr.ctx_vars.zcull_ctxsw_image_size) {
+ g->gr.ctx_vars.golden_image_size = golden_ctx_image_size;
+ g->gr.ctx_vars.zcull_ctxsw_image_size = zcull_ctx_image_size;
+ } else {
+ /* hw is different after railgating? */
+ BUG_ON(g->gr.ctx_vars.golden_image_size != golden_ctx_image_size);
+ BUG_ON(g->gr.ctx_vars.zcull_ctxsw_image_size != zcull_ctx_image_size);
+ }
+
+ g->gr.ctx_vars.priv_access_map_size = 512 * 1024;
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+static void gk20a_gr_destroy_ctx_buffer(struct platform_device *pdev,
+ struct gr_ctx_buffer_desc *desc)
+{
+ struct device *dev = &pdev->dev;
+ gk20a_free_sgtable(&desc->sgt);
+ dma_free_attrs(dev, desc->size, desc->pages,
+ desc->iova, &desc->attrs);
+}
+
+static int gk20a_gr_alloc_ctx_buffer(struct platform_device *pdev,
+ struct gr_ctx_buffer_desc *desc,
+ size_t size)
+{
+ struct device *dev = &pdev->dev;
+ DEFINE_DMA_ATTRS(attrs);
+ dma_addr_t iova;
+ int err = 0;
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+
+ desc->pages = dma_alloc_attrs(&pdev->dev, size, &iova,
+ GFP_KERNEL, &attrs);
+ if (!desc->pages)
+ return -ENOMEM;
+
+ desc->iova = iova;
+ desc->size = size;
+ desc->attrs = attrs;
+ desc->destroy = gk20a_gr_destroy_ctx_buffer;
+ err = gk20a_get_sgtable_from_pages(&pdev->dev, &desc->sgt, desc->pages,
+ desc->iova, desc->size);
+ if (err) {
+ dma_free_attrs(dev, desc->size, desc->pages,
+ desc->iova, &desc->attrs);
+ memset(desc, 0, sizeof(*desc));
+ }
+
+ return err;
+}
+
+static int gr_gk20a_alloc_global_ctx_buffers(struct gk20a *g)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(g->dev);
+ struct gr_gk20a *gr = &g->gr;
+ int i, attr_buffer_size, err;
+ struct platform_device *pdev = g->dev;
+
+ u32 cb_buffer_size = gr->bundle_cb_default_size *
+ gr_scc_bundle_cb_size_div_256b_byte_granularity_v();
+
+ u32 pagepool_buffer_size = gr_scc_pagepool_total_pages_hwmax_value_v() *
+ gr_scc_pagepool_total_pages_byte_granularity_v();
+
+ gk20a_dbg_fn("");
+
+ attr_buffer_size = g->ops.gr.calc_global_ctx_buffer_size(g);
+
+ gk20a_dbg_info("cb_buffer_size : %d", cb_buffer_size);
+
+ err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[CIRCULAR],
+ cb_buffer_size);
+ if (err)
+ goto clean_up;
+
+ if (platform->secure_alloc)
+ platform->secure_alloc(pdev,
+ &gr->global_ctx_buffer[CIRCULAR_VPR],
+ cb_buffer_size);
+
+ gk20a_dbg_info("pagepool_buffer_size : %d", pagepool_buffer_size);
+
+ err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[PAGEPOOL],
+ pagepool_buffer_size);
+ if (err)
+ goto clean_up;
+
+ if (platform->secure_alloc)
+ platform->secure_alloc(pdev,
+ &gr->global_ctx_buffer[PAGEPOOL_VPR],
+ pagepool_buffer_size);
+
+ gk20a_dbg_info("attr_buffer_size : %d", attr_buffer_size);
+
+ err = gk20a_gr_alloc_ctx_buffer(pdev, &gr->global_ctx_buffer[ATTRIBUTE],
+ attr_buffer_size);
+ if (err)
+ goto clean_up;
+
+ if (platform->secure_alloc)
+ platform->secure_alloc(pdev,
+ &gr->global_ctx_buffer[ATTRIBUTE_VPR],
+ attr_buffer_size);
+
+ gk20a_dbg_info("golden_image_size : %d",
+ gr->ctx_vars.golden_image_size);
+
+ err = gk20a_gr_alloc_ctx_buffer(pdev,
+ &gr->global_ctx_buffer[GOLDEN_CTX],
+ gr->ctx_vars.golden_image_size);
+ if (err)
+ goto clean_up;
+
+ gk20a_dbg_info("priv_access_map_size : %d",
+ gr->ctx_vars.priv_access_map_size);
+
+ err = gk20a_gr_alloc_ctx_buffer(pdev,
+ &gr->global_ctx_buffer[PRIV_ACCESS_MAP],
+ gr->ctx_vars.priv_access_map_size);
+
+ if (err)
+ goto clean_up;
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+ clean_up:
+ gk20a_err(dev_from_gk20a(g), "fail");
+ for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
+ if (gr->global_ctx_buffer[i].destroy) {
+ gr->global_ctx_buffer[i].destroy(pdev,
+ &gr->global_ctx_buffer[i]);
+ }
+ }
+ return -ENOMEM;
+}
+
+static void gr_gk20a_free_global_ctx_buffers(struct gk20a *g)
+{
+ struct platform_device *pdev = g->dev;
+ struct gr_gk20a *gr = &g->gr;
+ DEFINE_DMA_ATTRS(attrs);
+ u32 i;
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+
+ for (i = 0; i < NR_GLOBAL_CTX_BUF; i++) {
+ gr->global_ctx_buffer[i].destroy(pdev,
+ &gr->global_ctx_buffer[i]);
+ }
+
+ gk20a_dbg_fn("done");
+}
+
+static int gr_gk20a_map_global_ctx_buffers(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ struct vm_gk20a *ch_vm = c->vm;
+ u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
+ u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
+ struct gr_gk20a *gr = &g->gr;
+ struct sg_table *sgt;
+ u64 size;
+ u64 gpu_va;
+ u32 i;
+ gk20a_dbg_fn("");
+
+ /* Circular Buffer */
+ if (!c->vpr || (gr->global_ctx_buffer[CIRCULAR_VPR].sgt == NULL)) {
+ sgt = gr->global_ctx_buffer[CIRCULAR].sgt;
+ size = gr->global_ctx_buffer[CIRCULAR].size;
+ } else {
+ sgt = gr->global_ctx_buffer[CIRCULAR_VPR].sgt;
+ size = gr->global_ctx_buffer[CIRCULAR_VPR].size;
+ }
+
+ gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+ NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+ gk20a_mem_flag_none);
+ if (!gpu_va)
+ goto clean_up;
+ g_bfr_va[CIRCULAR_VA] = gpu_va;
+ g_bfr_size[CIRCULAR_VA] = size;
+
+ /* Attribute Buffer */
+ if (!c->vpr || (gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt == NULL)) {
+ sgt = gr->global_ctx_buffer[ATTRIBUTE].sgt;
+ size = gr->global_ctx_buffer[ATTRIBUTE].size;
+ } else {
+ sgt = gr->global_ctx_buffer[ATTRIBUTE_VPR].sgt;
+ size = gr->global_ctx_buffer[ATTRIBUTE_VPR].size;
+ }
+
+ gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+ NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+ gk20a_mem_flag_none);
+ if (!gpu_va)
+ goto clean_up;
+ g_bfr_va[ATTRIBUTE_VA] = gpu_va;
+ g_bfr_size[ATTRIBUTE_VA] = size;
+
+ /* Page Pool */
+ if (!c->vpr || (gr->global_ctx_buffer[PAGEPOOL_VPR].sgt == NULL)) {
+ sgt = gr->global_ctx_buffer[PAGEPOOL].sgt;
+ size = gr->global_ctx_buffer[PAGEPOOL].size;
+ } else {
+ sgt = gr->global_ctx_buffer[PAGEPOOL_VPR].sgt;
+ size = gr->global_ctx_buffer[PAGEPOOL_VPR].size;
+ }
+
+ gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size,
+ NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+ gk20a_mem_flag_none);
+ if (!gpu_va)
+ goto clean_up;
+ g_bfr_va[PAGEPOOL_VA] = gpu_va;
+ g_bfr_size[PAGEPOOL_VA] = size;
+
+ /* Golden Image */
+ sgt = gr->global_ctx_buffer[GOLDEN_CTX].sgt;
+ size = gr->global_ctx_buffer[GOLDEN_CTX].size;
+ gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
+ gk20a_mem_flag_none);
+ if (!gpu_va)
+ goto clean_up;
+ g_bfr_va[GOLDEN_CTX_VA] = gpu_va;
+ g_bfr_size[GOLDEN_CTX_VA] = size;
+
+ /* Priv register Access Map */
+ sgt = gr->global_ctx_buffer[PRIV_ACCESS_MAP].sgt;
+ size = gr->global_ctx_buffer[PRIV_ACCESS_MAP].size;
+ gpu_va = gk20a_gmmu_map(ch_vm, &sgt, size, 0,
+ gk20a_mem_flag_none);
+ if (!gpu_va)
+ goto clean_up;
+ g_bfr_va[PRIV_ACCESS_MAP_VA] = gpu_va;
+ g_bfr_size[PRIV_ACCESS_MAP_VA] = size;
+
+ c->ch_ctx.global_ctx_buffer_mapped = true;
+ return 0;
+
+ clean_up:
+ for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
+ if (g_bfr_va[i]) {
+ gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
+ gr->global_ctx_buffer[i].size,
+ gk20a_mem_flag_none);
+ g_bfr_va[i] = 0;
+ }
+ }
+ return -ENOMEM;
+}
+
+static void gr_gk20a_unmap_global_ctx_buffers(struct channel_gk20a *c)
+{
+ struct vm_gk20a *ch_vm = c->vm;
+ u64 *g_bfr_va = c->ch_ctx.global_ctx_buffer_va;
+ u64 *g_bfr_size = c->ch_ctx.global_ctx_buffer_size;
+ u32 i;
+
+ gk20a_dbg_fn("");
+
+ for (i = 0; i < NR_GLOBAL_CTX_BUF_VA; i++) {
+ if (g_bfr_va[i]) {
+ gk20a_gmmu_unmap(ch_vm, g_bfr_va[i],
+ g_bfr_size[i],
+ gk20a_mem_flag_none);
+ g_bfr_va[i] = 0;
+ g_bfr_size[i] = 0;
+ }
+ }
+ c->ch_ctx.global_ctx_buffer_mapped = false;
+}
+
+static int gr_gk20a_alloc_channel_gr_ctx(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct gr_ctx_desc *gr_ctx = &c->ch_ctx.gr_ctx;
+ struct vm_gk20a *ch_vm = c->vm;
+ struct device *d = dev_from_gk20a(g);
+ struct sg_table *sgt;
+ DEFINE_DMA_ATTRS(attrs);
+ int err = 0;
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ if (gr->ctx_vars.buffer_size == 0)
+ return 0;
+
+ /* alloc channel gr ctx buffer */
+ gr->ctx_vars.buffer_size = gr->ctx_vars.golden_image_size;
+ gr->ctx_vars.buffer_total_size = gr->ctx_vars.golden_image_size;
+
+ gr_ctx->size = gr->ctx_vars.buffer_total_size;
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ gr_ctx->pages = dma_alloc_attrs(d, gr_ctx->size,
+ &iova, GFP_KERNEL, &attrs);
+ if (!gr_ctx->pages)
+ return -ENOMEM;
+
+ gr_ctx->iova = iova;
+ err = gk20a_get_sgtable_from_pages(d, &sgt, gr_ctx->pages,
+ gr_ctx->iova, gr_ctx->size);
+ if (err)
+ goto err_free;
+
+ gr_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, gr_ctx->size,
+ NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+ gk20a_mem_flag_none);
+ if (!gr_ctx->gpu_va)
+ goto err_free_sgt;
+
+ gk20a_free_sgtable(&sgt);
+
+ return 0;
+
+ err_free_sgt:
+ gk20a_free_sgtable(&sgt);
+ err_free:
+ dma_free_attrs(d, gr_ctx->size,
+ gr_ctx->pages, gr_ctx->iova, &attrs);
+ gr_ctx->pages = NULL;
+ gr_ctx->iova = 0;
+
+ return err;
+}
+
+static void gr_gk20a_free_channel_gr_ctx(struct channel_gk20a *c)
+{
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ struct vm_gk20a *ch_vm = c->vm;
+ struct gk20a *g = c->g;
+ struct device *d = dev_from_gk20a(g);
+ DEFINE_DMA_ATTRS(attrs);
+
+ gk20a_dbg_fn("");
+
+ if (!ch_ctx->gr_ctx.gpu_va)
+ return;
+
+ gk20a_gmmu_unmap(ch_vm, ch_ctx->gr_ctx.gpu_va,
+ ch_ctx->gr_ctx.size, gk20a_mem_flag_none);
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ dma_free_attrs(d, ch_ctx->gr_ctx.size,
+ ch_ctx->gr_ctx.pages, ch_ctx->gr_ctx.iova, &attrs);
+ ch_ctx->gr_ctx.pages = NULL;
+ ch_ctx->gr_ctx.iova = 0;
+}
+
+static int gr_gk20a_alloc_channel_patch_ctx(struct gk20a *g,
+ struct channel_gk20a *c)
+{
+ struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+ struct device *d = dev_from_gk20a(g);
+ struct vm_gk20a *ch_vm = c->vm;
+ DEFINE_DMA_ATTRS(attrs);
+ struct sg_table *sgt;
+ int err = 0;
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ patch_ctx->size = 128 * sizeof(u32);
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ patch_ctx->pages = dma_alloc_attrs(d, patch_ctx->size,
+ &iova, GFP_KERNEL,
+ &attrs);
+ if (!patch_ctx->pages)
+ return -ENOMEM;
+
+ patch_ctx->iova = iova;
+ err = gk20a_get_sgtable_from_pages(d, &sgt, patch_ctx->pages,
+ patch_ctx->iova, patch_ctx->size);
+ if (err)
+ goto err_free;
+
+ patch_ctx->gpu_va = gk20a_gmmu_map(ch_vm, &sgt, patch_ctx->size,
+ 0, gk20a_mem_flag_none);
+ if (!patch_ctx->gpu_va)
+ goto err_free_sgtable;
+
+ gk20a_free_sgtable(&sgt);
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+ err_free_sgtable:
+ gk20a_free_sgtable(&sgt);
+ err_free:
+ dma_free_attrs(d, patch_ctx->size,
+ patch_ctx->pages, patch_ctx->iova, &attrs);
+ patch_ctx->pages = NULL;
+ patch_ctx->iova = 0;
+ gk20a_err(dev_from_gk20a(g), "fail");
+ return err;
+}
+
+static void gr_gk20a_unmap_channel_patch_ctx(struct channel_gk20a *c)
+{
+ struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+ struct vm_gk20a *ch_vm = c->vm;
+
+ gk20a_dbg_fn("");
+
+ if (patch_ctx->gpu_va)
+ gk20a_gmmu_unmap(ch_vm, patch_ctx->gpu_va,
+ patch_ctx->size, gk20a_mem_flag_none);
+ patch_ctx->gpu_va = 0;
+ patch_ctx->data_count = 0;
+}
+
+static void gr_gk20a_free_channel_patch_ctx(struct channel_gk20a *c)
+{
+ struct patch_desc *patch_ctx = &c->ch_ctx.patch_ctx;
+ struct gk20a *g = c->g;
+ struct device *d = dev_from_gk20a(g);
+ DEFINE_DMA_ATTRS(attrs);
+
+ gk20a_dbg_fn("");
+
+ gr_gk20a_unmap_channel_patch_ctx(c);
+
+ if (patch_ctx->pages) {
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ dma_free_attrs(d, patch_ctx->size,
+ patch_ctx->pages, patch_ctx->iova, &attrs);
+ patch_ctx->pages = NULL;
+ patch_ctx->iova = 0;
+ }
+}
+
+void gk20a_free_channel_ctx(struct channel_gk20a *c)
+{
+ gr_gk20a_unmap_global_ctx_buffers(c);
+ gr_gk20a_free_channel_patch_ctx(c);
+ gr_gk20a_free_channel_gr_ctx(c);
+
+ /* zcull_ctx, pm_ctx */
+
+ memset(&c->ch_ctx, 0, sizeof(struct channel_ctx_gk20a));
+
+ c->num_objects = 0;
+ c->first_init = false;
+}
+
+static bool gr_gk20a_is_valid_class(struct gk20a *g, u32 class_num)
+{
+ bool valid = false;
+
+ switch (class_num) {
+ case KEPLER_COMPUTE_A:
+ case KEPLER_C:
+ case FERMI_TWOD_A:
+ case KEPLER_DMA_COPY_A:
+ valid = true;
+ break;
+
+ default:
+ break;
+ }
+
+ return valid;
+}
+
+int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
+ struct nvhost_alloc_obj_ctx_args *args)
+{
+ struct gk20a *g = c->g;
+ struct channel_ctx_gk20a *ch_ctx = &c->ch_ctx;
+ int err = 0;
+
+ gk20a_dbg_fn("");
+
+ /* an address space needs to have been bound at this point.*/
+ if (!gk20a_channel_as_bound(c)) {
+ gk20a_err(dev_from_gk20a(g),
+ "not bound to address space at time"
+ " of grctx allocation");
+ return -EINVAL;
+ }
+
+ if (!g->ops.gr.is_valid_class(g, args->class_num)) {
+ gk20a_err(dev_from_gk20a(g),
+ "invalid obj class 0x%x", args->class_num);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* allocate gr ctx buffer */
+ if (ch_ctx->gr_ctx.pages == NULL) {
+ err = gr_gk20a_alloc_channel_gr_ctx(g, c);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to allocate gr ctx buffer");
+ goto out;
+ }
+ c->obj_class = args->class_num;
+ } else {
+		/* TBD: needs to be more subtle about which class is being
+		 * allocated, as some are allowed to be allocated along the
+		 * same channel */
+ gk20a_err(dev_from_gk20a(g),
+ "too many classes alloc'd on same channel");
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* commit gr ctx buffer */
+ err = gr_gk20a_commit_inst(c, ch_ctx->gr_ctx.gpu_va);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to commit gr ctx buffer");
+ goto out;
+ }
+
+ /* allocate patch buffer */
+ if (ch_ctx->patch_ctx.pages == NULL) {
+ err = gr_gk20a_alloc_channel_patch_ctx(g, c);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to allocate patch buffer");
+ goto out;
+ }
+ }
+
+ /* map global buffer to channel gpu_va and commit */
+ if (!ch_ctx->global_ctx_buffer_mapped) {
+ err = gr_gk20a_map_global_ctx_buffers(g, c);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to map global ctx buffer");
+ goto out;
+ }
+ gr_gk20a_elpg_protected_call(g,
+ gr_gk20a_commit_global_ctx_buffers(g, c, true));
+ }
+
+ /* init golden image, ELPG enabled after this is done */
+ err = gr_gk20a_init_golden_ctx_image(g, c);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to init golden ctx image");
+ goto out;
+ }
+
+ /* load golden image */
+ if (!c->first_init) {
+ err = gr_gk20a_elpg_protected_call(g,
+ gr_gk20a_load_golden_ctx_image(g, c));
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to load golden ctx image");
+ goto out;
+ }
+ c->first_init = true;
+ }
+ gk20a_mm_l2_invalidate(g);
+
+ c->num_objects++;
+
+ gk20a_dbg_fn("done");
+ return 0;
+out:
+ /* 1. gr_ctx, patch_ctx and global ctx buffer mapping
+ can be reused so no need to release them.
+ 2. golden image init and load is a one time thing so if
+ they pass, no need to undo. */
+ gk20a_err(dev_from_gk20a(g), "fail");
+ return err;
+}
+
+int gk20a_free_obj_ctx(struct channel_gk20a *c,
+ struct nvhost_free_obj_ctx_args *args)
+{
+ unsigned long timeout = gk20a_get_gr_idle_timeout(c->g);
+
+ gk20a_dbg_fn("");
+
+ if (c->num_objects == 0)
+ return 0;
+
+ c->num_objects--;
+
+ if (c->num_objects == 0) {
+ c->first_init = false;
+ gk20a_disable_channel(c,
+ !c->has_timedout,
+ timeout);
+ gr_gk20a_unmap_channel_patch_ctx(c);
+ }
+
+ return 0;
+}
+
+static void gk20a_remove_gr_support(struct gr_gk20a *gr)
+{
+ struct gk20a *g = gr->g;
+ struct device *d = dev_from_gk20a(g);
+ DEFINE_DMA_ATTRS(attrs);
+
+ gk20a_dbg_fn("");
+
+ gr_gk20a_free_global_ctx_buffers(g);
+
+ dma_free_coherent(d, gr->mmu_wr_mem.size,
+ gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
+ gr->mmu_wr_mem.cpuva = NULL;
+ gr->mmu_wr_mem.iova = 0;
+ dma_free_coherent(d, gr->mmu_rd_mem.size,
+ gr->mmu_rd_mem.cpuva, gr->mmu_rd_mem.iova);
+ gr->mmu_rd_mem.cpuva = NULL;
+ gr->mmu_rd_mem.iova = 0;
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ dma_free_attrs(d, gr->compbit_store.size, gr->compbit_store.pages,
+ gr->compbit_store.base_iova, &attrs);
+
+ memset(&gr->mmu_wr_mem, 0, sizeof(struct mmu_desc));
+ memset(&gr->mmu_rd_mem, 0, sizeof(struct mmu_desc));
+ memset(&gr->compbit_store, 0, sizeof(struct compbit_store_desc));
+
+ kfree(gr->gpc_tpc_count);
+ kfree(gr->gpc_zcb_count);
+ kfree(gr->gpc_ppc_count);
+ kfree(gr->pes_tpc_count[0]);
+ kfree(gr->pes_tpc_count[1]);
+ kfree(gr->pes_tpc_mask[0]);
+ kfree(gr->pes_tpc_mask[1]);
+ kfree(gr->gpc_skip_mask);
+ kfree(gr->map_tiles);
+ gr->gpc_tpc_count = NULL;
+ gr->gpc_zcb_count = NULL;
+ gr->gpc_ppc_count = NULL;
+ gr->pes_tpc_count[0] = NULL;
+ gr->pes_tpc_count[1] = NULL;
+ gr->pes_tpc_mask[0] = NULL;
+ gr->pes_tpc_mask[1] = NULL;
+ gr->gpc_skip_mask = NULL;
+ gr->map_tiles = NULL;
+
+ kfree(gr->ctx_vars.ucode.fecs.inst.l);
+ kfree(gr->ctx_vars.ucode.fecs.data.l);
+ kfree(gr->ctx_vars.ucode.gpccs.inst.l);
+ kfree(gr->ctx_vars.ucode.gpccs.data.l);
+ kfree(gr->ctx_vars.sw_bundle_init.l);
+ kfree(gr->ctx_vars.sw_method_init.l);
+ kfree(gr->ctx_vars.sw_ctx_load.l);
+ kfree(gr->ctx_vars.sw_non_ctx_load.l);
+ kfree(gr->ctx_vars.ctxsw_regs.sys.l);
+ kfree(gr->ctx_vars.ctxsw_regs.gpc.l);
+ kfree(gr->ctx_vars.ctxsw_regs.tpc.l);
+ kfree(gr->ctx_vars.ctxsw_regs.zcull_gpc.l);
+ kfree(gr->ctx_vars.ctxsw_regs.ppc.l);
+ kfree(gr->ctx_vars.ctxsw_regs.pm_sys.l);
+ kfree(gr->ctx_vars.ctxsw_regs.pm_gpc.l);
+ kfree(gr->ctx_vars.ctxsw_regs.pm_tpc.l);
+
+ kfree(gr->ctx_vars.local_golden_image);
+ gr->ctx_vars.local_golden_image = NULL;
+
+ gk20a_allocator_destroy(&gr->comp_tags);
+}
+
+static void gr_gk20a_bundle_cb_defaults(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+
+ gr->bundle_cb_default_size =
+ gr_scc_bundle_cb_size_div_256b__prod_v();
+ gr->min_gpm_fifo_depth =
+ gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v();
+ gr->bundle_cb_token_limit =
+ gr_pd_ab_dist_cfg2_token_limit_init_v();
+}
+
+static int gr_gk20a_init_gr_config(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 gpc_index, pes_index;
+ u32 pes_tpc_mask;
+ u32 pes_tpc_count;
+ u32 pes_heavy_index;
+ u32 gpc_new_skip_mask;
+ u32 tmp;
+
+ tmp = gk20a_readl(g, pri_ringmaster_enum_fbp_r());
+ gr->num_fbps = pri_ringmaster_enum_fbp_count_v(tmp);
+
+ tmp = gk20a_readl(g, top_num_gpcs_r());
+ gr->max_gpc_count = top_num_gpcs_value_v(tmp);
+
+ tmp = gk20a_readl(g, top_num_fbps_r());
+ gr->max_fbps_count = top_num_fbps_value_v(tmp);
+
+ tmp = gk20a_readl(g, top_tpc_per_gpc_r());
+ gr->max_tpc_per_gpc_count = top_tpc_per_gpc_value_v(tmp);
+
+ gr->max_tpc_count = gr->max_gpc_count * gr->max_tpc_per_gpc_count;
+
+ tmp = gk20a_readl(g, top_num_fbps_r());
+ gr->sys_count = top_num_fbps_value_v(tmp);
+
+ tmp = gk20a_readl(g, pri_ringmaster_enum_gpc_r());
+ gr->gpc_count = pri_ringmaster_enum_gpc_count_v(tmp);
+
+ gr->pe_count_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+ gr->max_zcull_per_gpc_count = proj_scal_litter_num_zcull_banks_v();
+
+ if (!gr->gpc_count) {
+ gk20a_err(dev_from_gk20a(g), "gpc_count==0!");
+ goto clean_up;
+ }
+
+ gr->gpc_tpc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->gpc_zcb_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->gpc_ppc_count = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->pes_tpc_count[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->pes_tpc_count[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->pes_tpc_mask[0] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->pes_tpc_mask[1] = kzalloc(gr->gpc_count * sizeof(u32), GFP_KERNEL);
+ gr->gpc_skip_mask =
+ kzalloc(gr_pd_dist_skip_table__size_1_v() * 4 * sizeof(u32),
+ GFP_KERNEL);
+
+ if (!gr->gpc_tpc_count || !gr->gpc_zcb_count || !gr->gpc_ppc_count ||
+ !gr->pes_tpc_count[0] || !gr->pes_tpc_count[1] ||
+ !gr->pes_tpc_mask[0] || !gr->pes_tpc_mask[1] || !gr->gpc_skip_mask)
+ goto clean_up;
+
+ gr->ppc_count = 0;
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ tmp = gk20a_readl(g, gr_gpc0_fs_gpc_r());
+
+ gr->gpc_tpc_count[gpc_index] =
+ gr_gpc0_fs_gpc_num_available_tpcs_v(tmp);
+ gr->tpc_count += gr->gpc_tpc_count[gpc_index];
+
+ gr->gpc_zcb_count[gpc_index] =
+ gr_gpc0_fs_gpc_num_available_zculls_v(tmp);
+ gr->zcb_count += gr->gpc_zcb_count[gpc_index];
+
+ gr->gpc_ppc_count[gpc_index] = gr->pe_count_per_gpc;
+ gr->ppc_count += gr->gpc_ppc_count[gpc_index];
+ for (pes_index = 0; pes_index < gr->pe_count_per_gpc; pes_index++) {
+
+ tmp = gk20a_readl(g,
+ gr_gpc0_gpm_pd_pes_tpc_id_mask_r(pes_index) +
+ gpc_index * proj_gpc_stride_v());
+
+ pes_tpc_mask = gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(tmp);
+ pes_tpc_count = count_bits(pes_tpc_mask);
+
+ gr->pes_tpc_count[pes_index][gpc_index] = pes_tpc_count;
+ gr->pes_tpc_mask[pes_index][gpc_index] = pes_tpc_mask;
+ }
+
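+		/* If a GPC carries 5 TPCs, or 4 TPCs split unevenly across its
+		 * two PESs, skip one TPC from the heavier PES.  m & (m - 1)
+		 * clears the lowest set bit of the mask, so XORing the result
+		 * with the original mask isolates that bit. */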
+ gpc_new_skip_mask = 0;
+ if (gr->pes_tpc_count[0][gpc_index] +
+ gr->pes_tpc_count[1][gpc_index] == 5) {
+ pes_heavy_index =
+ gr->pes_tpc_count[0][gpc_index] >
+ gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
+
+ gpc_new_skip_mask =
+ gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
+ (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
+ (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
+
+ } else if ((gr->pes_tpc_count[0][gpc_index] +
+ gr->pes_tpc_count[1][gpc_index] == 4) &&
+ (gr->pes_tpc_count[0][gpc_index] !=
+ gr->pes_tpc_count[1][gpc_index])) {
+ pes_heavy_index =
+ gr->pes_tpc_count[0][gpc_index] >
+ gr->pes_tpc_count[1][gpc_index] ? 0 : 1;
+
+ gpc_new_skip_mask =
+ gr->pes_tpc_mask[pes_heavy_index][gpc_index] ^
+ (gr->pes_tpc_mask[pes_heavy_index][gpc_index] &
+ (gr->pes_tpc_mask[pes_heavy_index][gpc_index] - 1));
+ }
+ gr->gpc_skip_mask[gpc_index] = gpc_new_skip_mask;
+ }
+
+ gk20a_dbg_info("fbps: %d", gr->num_fbps);
+ gk20a_dbg_info("max_gpc_count: %d", gr->max_gpc_count);
+ gk20a_dbg_info("max_fbps_count: %d", gr->max_fbps_count);
+ gk20a_dbg_info("max_tpc_per_gpc_count: %d", gr->max_tpc_per_gpc_count);
+ gk20a_dbg_info("max_zcull_per_gpc_count: %d", gr->max_zcull_per_gpc_count);
+ gk20a_dbg_info("max_tpc_count: %d", gr->max_tpc_count);
+ gk20a_dbg_info("sys_count: %d", gr->sys_count);
+ gk20a_dbg_info("gpc_count: %d", gr->gpc_count);
+ gk20a_dbg_info("pe_count_per_gpc: %d", gr->pe_count_per_gpc);
+ gk20a_dbg_info("tpc_count: %d", gr->tpc_count);
+ gk20a_dbg_info("ppc_count: %d", gr->ppc_count);
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ gk20a_dbg_info("gpc_tpc_count[%d] : %d",
+ gpc_index, gr->gpc_tpc_count[gpc_index]);
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ gk20a_dbg_info("gpc_zcb_count[%d] : %d",
+ gpc_index, gr->gpc_zcb_count[gpc_index]);
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ gk20a_dbg_info("gpc_ppc_count[%d] : %d",
+ gpc_index, gr->gpc_ppc_count[gpc_index]);
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ gk20a_dbg_info("gpc_skip_mask[%d] : %d",
+ gpc_index, gr->gpc_skip_mask[gpc_index]);
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ for (pes_index = 0;
+ pes_index < gr->pe_count_per_gpc;
+ pes_index++)
+ gk20a_dbg_info("pes_tpc_count[%d][%d] : %d",
+ pes_index, gpc_index,
+ gr->pes_tpc_count[pes_index][gpc_index]);
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ for (pes_index = 0;
+ pes_index < gr->pe_count_per_gpc;
+ pes_index++)
+ gk20a_dbg_info("pes_tpc_mask[%d][%d] : %d",
+ pes_index, gpc_index,
+ gr->pes_tpc_mask[pes_index][gpc_index]);
+
+ g->ops.gr.bundle_cb_defaults(g);
+ g->ops.gr.cb_size_default(g);
+ g->ops.gr.calc_global_ctx_buffer_size(g);
+ gr->timeslice_mode = gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v();
+
+ gk20a_dbg_info("bundle_cb_default_size: %d",
+ gr->bundle_cb_default_size);
+ gk20a_dbg_info("min_gpm_fifo_depth: %d", gr->min_gpm_fifo_depth);
+ gk20a_dbg_info("bundle_cb_token_limit: %d", gr->bundle_cb_token_limit);
+ gk20a_dbg_info("attrib_cb_default_size: %d",
+ gr->attrib_cb_default_size);
+ gk20a_dbg_info("attrib_cb_size: %d", gr->attrib_cb_size);
+ gk20a_dbg_info("alpha_cb_default_size: %d", gr->alpha_cb_default_size);
+ gk20a_dbg_info("alpha_cb_size: %d", gr->alpha_cb_size);
+ gk20a_dbg_info("timeslice_mode: %d", gr->timeslice_mode);
+
+ return 0;
+
+clean_up:
+ return -ENOMEM;
+}
+
+static int gr_gk20a_init_mmu_sw(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct device *d = dev_from_gk20a(g);
+ dma_addr_t iova;
+
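+	/* one page each for the MMU debug write and read buffers that are
+	 * programmed into fb_mmu_debug_wr/rd during gr hw setup */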
+ gr->mmu_wr_mem_size = gr->mmu_rd_mem_size = 0x1000;
+
+ gr->mmu_wr_mem.size = gr->mmu_wr_mem_size;
+ gr->mmu_wr_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_wr_mem_size,
+ &iova, GFP_KERNEL);
+ if (!gr->mmu_wr_mem.cpuva)
+ goto err;
+
+ gr->mmu_wr_mem.iova = iova;
+
+ gr->mmu_rd_mem.size = gr->mmu_rd_mem_size;
+ gr->mmu_rd_mem.cpuva = dma_zalloc_coherent(d, gr->mmu_rd_mem_size,
+ &iova, GFP_KERNEL);
+ if (!gr->mmu_rd_mem.cpuva)
+ goto err_free_wr_mem;
+
+ gr->mmu_rd_mem.iova = iova;
+ return 0;
+
+ err_free_wr_mem:
+ dma_free_coherent(d, gr->mmu_wr_mem.size,
+ gr->mmu_wr_mem.cpuva, gr->mmu_wr_mem.iova);
+ gr->mmu_wr_mem.cpuva = NULL;
+ gr->mmu_wr_mem.iova = 0;
+ err:
+ return -ENOMEM;
+}
+
+static u32 prime_set[18] = {
+ 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61 };
+
+static int gr_gk20a_init_map_tiles(struct gk20a *g, struct gr_gk20a *gr)
+{
+ s32 comm_denom;
+ s32 mul_factor;
+ s32 *init_frac = NULL;
+ s32 *init_err = NULL;
+ s32 *run_err = NULL;
+ s32 *sorted_num_tpcs = NULL;
+ s32 *sorted_to_unsorted_gpc_map = NULL;
+ u32 gpc_index;
+ u32 gpc_mark = 0;
+ u32 num_tpc;
+ u32 max_tpc_count = 0;
+ u32 swap;
+ u32 tile_count;
+ u32 index;
+ bool delete_map = false;
+ bool gpc_sorted;
+ int ret = 0;
+
+ init_frac = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+ init_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+ run_err = kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+ sorted_num_tpcs =
+ kzalloc(proj_scal_max_gpcs_v() *
+ proj_scal_max_tpc_per_gpc_v() * sizeof(s32),
+ GFP_KERNEL);
+ sorted_to_unsorted_gpc_map =
+ kzalloc(proj_scal_max_gpcs_v() * sizeof(s32), GFP_KERNEL);
+
+ if (!(init_frac && init_err && run_err && sorted_num_tpcs &&
+ sorted_to_unsorted_gpc_map)) {
+ ret = -ENOMEM;
+ goto clean_up;
+ }
+
+ gr->map_row_offset = INVALID_SCREEN_TILE_ROW_OFFSET;
+
+ if (gr->tpc_count == 3)
+ gr->map_row_offset = 2;
+ else if (gr->tpc_count < 3)
+ gr->map_row_offset = 1;
+ else {
+ gr->map_row_offset = 3;
+
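+		/* pick the smallest prime (>= 3) that does not evenly divide
+		 * the tpc count */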
+ for (index = 1; index < 18; index++) {
+ u32 prime = prime_set[index];
+ if ((gr->tpc_count % prime) != 0) {
+ gr->map_row_offset = prime;
+ break;
+ }
+ }
+ }
+
+ switch (gr->tpc_count) {
+ case 15:
+ gr->map_row_offset = 6;
+ break;
+ case 14:
+ gr->map_row_offset = 5;
+ break;
+ case 13:
+ gr->map_row_offset = 2;
+ break;
+ case 11:
+ gr->map_row_offset = 7;
+ break;
+ case 10:
+ gr->map_row_offset = 6;
+ break;
+ case 7:
+ case 5:
+ gr->map_row_offset = 1;
+ break;
+ default:
+ break;
+ }
+
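+	/* discard a previously built map if it no longer matches the current
+	 * tpc configuration */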
+ if (gr->map_tiles) {
+ if (gr->map_tile_count != gr->tpc_count)
+ delete_map = true;
+
+ for (tile_count = 0; tile_count < gr->map_tile_count; tile_count++) {
+ if ((u32)gr->map_tiles[tile_count] >= gr->tpc_count)
+ delete_map = true;
+ }
+
+ if (delete_map) {
+ kfree(gr->map_tiles);
+ gr->map_tiles = NULL;
+ gr->map_tile_count = 0;
+ }
+ }
+
+ if (gr->map_tiles == NULL) {
+ gr->map_tile_count = proj_scal_max_gpcs_v();
+
+ gr->map_tiles = kzalloc(proj_scal_max_gpcs_v() * sizeof(u8), GFP_KERNEL);
+ if (gr->map_tiles == NULL) {
+ ret = -ENOMEM;
+ goto clean_up;
+ }
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ sorted_num_tpcs[gpc_index] = gr->gpc_tpc_count[gpc_index];
+ sorted_to_unsorted_gpc_map[gpc_index] = gpc_index;
+ }
+
+ gpc_sorted = false;
+ while (!gpc_sorted) {
+ gpc_sorted = true;
+ for (gpc_index = 0; gpc_index < gr->gpc_count - 1; gpc_index++) {
+ if (sorted_num_tpcs[gpc_index + 1] > sorted_num_tpcs[gpc_index]) {
+ gpc_sorted = false;
+ swap = sorted_num_tpcs[gpc_index];
+ sorted_num_tpcs[gpc_index] = sorted_num_tpcs[gpc_index + 1];
+ sorted_num_tpcs[gpc_index + 1] = swap;
+ swap = sorted_to_unsorted_gpc_map[gpc_index];
+ sorted_to_unsorted_gpc_map[gpc_index] =
+ sorted_to_unsorted_gpc_map[gpc_index + 1];
+ sorted_to_unsorted_gpc_map[gpc_index + 1] = swap;
+ }
+ }
+ }
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++)
+ if (gr->gpc_tpc_count[gpc_index] > max_tpc_count)
+ max_tpc_count = gr->gpc_tpc_count[gpc_index];
+
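+		/* Distribute screen tiles across GPCs in proportion to their
+		 * TPC counts by integer error diffusion: every pass each GPC
+		 * accumulates init_frac and emits a tile once the running
+		 * error reaches comm_denom/2.  mul_factor keeps comm_denom
+		 * even so that comm_denom/2 is exact. */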
+ mul_factor = gr->gpc_count * max_tpc_count;
+ if (mul_factor & 0x1)
+ mul_factor = 2;
+ else
+ mul_factor = 1;
+
+ comm_denom = gr->gpc_count * max_tpc_count * mul_factor;
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ num_tpc = sorted_num_tpcs[gpc_index];
+
+ init_frac[gpc_index] = num_tpc * gr->gpc_count * mul_factor;
+
+ if (num_tpc != 0)
+ init_err[gpc_index] = gpc_index * max_tpc_count * mul_factor - comm_denom/2;
+ else
+ init_err[gpc_index] = 0;
+
+ run_err[gpc_index] = init_frac[gpc_index] + init_err[gpc_index];
+ }
+
+ while (gpc_mark < gr->tpc_count) {
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ if ((run_err[gpc_index] * 2) >= comm_denom) {
+ gr->map_tiles[gpc_mark++] = (u8)sorted_to_unsorted_gpc_map[gpc_index];
+ run_err[gpc_index] += init_frac[gpc_index] - comm_denom;
+ } else
+ run_err[gpc_index] += init_frac[gpc_index];
+ }
+ }
+ }
+
+clean_up:
+ kfree(init_frac);
+ kfree(init_err);
+ kfree(run_err);
+ kfree(sorted_num_tpcs);
+ kfree(sorted_to_unsorted_gpc_map);
+
+ if (ret)
+ gk20a_err(dev_from_gk20a(g), "fail");
+ else
+ gk20a_dbg_fn("done");
+
+ return ret;
+}
+
+static int gr_gk20a_init_zcull(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct gr_zcull_gk20a *zcull = &gr->zcull;
+
+ zcull->aliquot_width = gr->tpc_count * 16;
+ zcull->aliquot_height = 16;
+
+ zcull->width_align_pixels = gr->tpc_count * 16;
+ zcull->height_align_pixels = 32;
+
+ zcull->aliquot_size =
+ zcull->aliquot_width * zcull->aliquot_height;
+
+ /* assume no floor sweeping since we only have 1 tpc in 1 gpc */
+ zcull->pixel_squares_by_aliquots =
+ gr->zcb_count * 16 * 16 * gr->tpc_count /
+ (gr->gpc_count * gr->gpc_tpc_count[0]);
+
+ zcull->total_aliquots =
+ gr_gpc0_zcull_total_ram_size_num_aliquots_f(
+ gk20a_readl(g, gr_gpc0_zcull_total_ram_size_r()));
+
+ return 0;
+}
+
+u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr)
+{
+ /* assuming gr has already been initialized */
+ return gr->ctx_vars.zcull_ctxsw_image_size;
+}
+
+int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
+ struct channel_gk20a *c, u64 zcull_va, u32 mode)
+{
+ struct zcull_ctx_desc *zcull_ctx = &c->ch_ctx.zcull_ctx;
+
+ zcull_ctx->ctx_sw_mode = mode;
+ zcull_ctx->gpu_va = zcull_va;
+
+ /* TBD: don't disable channel in sw method processing */
+ return gr_gk20a_ctx_zcull_setup(g, c, true);
+}
+
+int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
+ struct gr_zcull_info *zcull_params)
+{
+ struct gr_zcull_gk20a *zcull = &gr->zcull;
+
+ zcull_params->width_align_pixels = zcull->width_align_pixels;
+ zcull_params->height_align_pixels = zcull->height_align_pixels;
+ zcull_params->pixel_squares_by_aliquots =
+ zcull->pixel_squares_by_aliquots;
+ zcull_params->aliquot_total = zcull->total_aliquots;
+
+ zcull_params->region_byte_multiplier =
+ gr->gpc_count * gr_zcull_bytes_per_aliquot_per_gpu_v();
+ zcull_params->region_header_size =
+ proj_scal_litter_num_gpcs_v() *
+ gr_zcull_save_restore_header_bytes_per_gpc_v();
+
+ zcull_params->subregion_header_size =
+ proj_scal_litter_num_gpcs_v() *
+ gr_zcull_save_restore_subregion_header_bytes_per_gpc_v();
+
+ zcull_params->subregion_width_align_pixels =
+ gr->tpc_count * gr_gpc0_zcull_zcsize_width_subregion__multiple_v();
+ zcull_params->subregion_height_align_pixels =
+ gr_gpc0_zcull_zcsize_height_subregion__multiple_v();
+ zcull_params->subregion_count = gr_zcull_subregion_qty_v();
+
+ return 0;
+}
+
+static int gr_gk20a_add_zbc_color(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *color_val, u32 index)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+ u32 i;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 ret;
+
+ ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to disable gr engine activity\n");
+ return ret;
+ }
+
+ ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to idle graphics\n");
+ goto clean_up;
+ }
+
+ /* update l2 table */
+ g->ops.ltc.set_zbc_color_entry(g, color_val, index);
+
+ /* update ds table */
+ gk20a_writel(g, gr_ds_zbc_color_r_r(),
+ gr_ds_zbc_color_r_val_f(color_val->color_ds[0]));
+ gk20a_writel(g, gr_ds_zbc_color_g_r(),
+ gr_ds_zbc_color_g_val_f(color_val->color_ds[1]));
+ gk20a_writel(g, gr_ds_zbc_color_b_r(),
+ gr_ds_zbc_color_b_val_f(color_val->color_ds[2]));
+ gk20a_writel(g, gr_ds_zbc_color_a_r(),
+ gr_ds_zbc_color_a_val_f(color_val->color_ds[3]));
+
+ gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
+ gr_ds_zbc_color_fmt_val_f(color_val->format));
+
+ gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+ gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
+
+ /* trigger the write */
+ gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+ gr_ds_zbc_tbl_ld_select_c_f() |
+ gr_ds_zbc_tbl_ld_action_write_f() |
+ gr_ds_zbc_tbl_ld_trigger_active_f());
+
+ /* update local copy */
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ gr->zbc_col_tbl[index].color_l2[i] = color_val->color_l2[i];
+ gr->zbc_col_tbl[index].color_ds[i] = color_val->color_ds[i];
+ }
+ gr->zbc_col_tbl[index].format = color_val->format;
+ gr->zbc_col_tbl[index].ref_cnt++;
+
+clean_up:
+ ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to enable gr engine activity\n");
+ }
+
+ return ret;
+}
+
+static int gr_gk20a_add_zbc_depth(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *depth_val, u32 index)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 ret;
+
+ ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to disable gr engine activity\n");
+ return ret;
+ }
+
+ ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to idle graphics\n");
+ goto clean_up;
+ }
+
+ /* update l2 table */
+ g->ops.ltc.set_zbc_depth_entry(g, depth_val, index);
+
+ /* update ds table */
+ gk20a_writel(g, gr_ds_zbc_z_r(),
+ gr_ds_zbc_z_val_f(depth_val->depth));
+
+ gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
+ gr_ds_zbc_z_fmt_val_f(depth_val->format));
+
+ gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+ gr_ds_zbc_tbl_index_val_f(index + GK20A_STARTOF_ZBC_TABLE));
+
+ /* trigger the write */
+ gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+ gr_ds_zbc_tbl_ld_select_z_f() |
+ gr_ds_zbc_tbl_ld_action_write_f() |
+ gr_ds_zbc_tbl_ld_trigger_active_f());
+
+ /* update local copy */
+ gr->zbc_dep_tbl[index].depth = depth_val->depth;
+ gr->zbc_dep_tbl[index].format = depth_val->format;
+ gr->zbc_dep_tbl[index].ref_cnt++;
+
+clean_up:
+ ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to enable gr engine activity\n");
+ }
+
+ return ret;
+}
+
+int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *zbc_val)
+{
+ struct zbc_color_table *c_tbl;
+ struct zbc_depth_table *d_tbl;
+	u32 i;
+	int ret = -ENOMEM;
+ bool added = false;
+ u32 entries;
+
+ /* no endian swap ? */
+
+ switch (zbc_val->type) {
+ case GK20A_ZBC_TYPE_COLOR:
+ /* search existing tables */
+ for (i = 0; i < gr->max_used_color_index; i++) {
+
+ c_tbl = &gr->zbc_col_tbl[i];
+
+ if (c_tbl->ref_cnt && c_tbl->format == zbc_val->format &&
+ memcmp(c_tbl->color_ds, zbc_val->color_ds,
+ sizeof(zbc_val->color_ds)) == 0) {
+
+ if (memcmp(c_tbl->color_l2, zbc_val->color_l2,
+ sizeof(zbc_val->color_l2))) {
+ gk20a_err(dev_from_gk20a(g),
+ "zbc l2 and ds color don't match with existing entries");
+ return -EINVAL;
+ }
+ added = true;
+ c_tbl->ref_cnt++;
+ ret = 0;
+ break;
+ }
+ }
+ /* add new table */
+ if (!added &&
+ gr->max_used_color_index < GK20A_ZBC_TABLE_SIZE) {
+
+ c_tbl =
+ &gr->zbc_col_tbl[gr->max_used_color_index];
+ WARN_ON(c_tbl->ref_cnt != 0);
+
+ ret = gr_gk20a_add_zbc_color(g, gr,
+ zbc_val, gr->max_used_color_index);
+
+ if (!ret)
+ gr->max_used_color_index++;
+ }
+ break;
+ case GK20A_ZBC_TYPE_DEPTH:
+ /* search existing tables */
+ for (i = 0; i < gr->max_used_depth_index; i++) {
+
+ d_tbl = &gr->zbc_dep_tbl[i];
+
+ if (d_tbl->ref_cnt &&
+ d_tbl->depth == zbc_val->depth &&
+ d_tbl->format == zbc_val->format) {
+ added = true;
+ d_tbl->ref_cnt++;
+ ret = 0;
+ break;
+ }
+ }
+ /* add new table */
+ if (!added &&
+ gr->max_used_depth_index < GK20A_ZBC_TABLE_SIZE) {
+
+ d_tbl =
+ &gr->zbc_dep_tbl[gr->max_used_depth_index];
+ WARN_ON(d_tbl->ref_cnt != 0);
+
+ ret = gr_gk20a_add_zbc_depth(g, gr,
+ zbc_val, gr->max_used_depth_index);
+
+ if (!ret)
+ gr->max_used_depth_index++;
+ }
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid zbc table type %d", zbc_val->type);
+ return -EINVAL;
+ }
+
+ if (!added && ret == 0) {
+ /* update zbc for elpg only when new entry is added */
+ entries = max(gr->max_used_color_index,
+ gr->max_used_depth_index);
+ gk20a_pmu_save_zbc(g, entries);
+ }
+
+ return ret;
+}
+
+int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct fifo_engine_info_gk20a *gr_info = f->engine_info + ENGINE_GR_GK20A;
+ u32 i, j;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 ret;
+
+ ret = gk20a_fifo_disable_engine_activity(g, gr_info, true);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to disable gr engine activity\n");
+ return ret;
+ }
+
+ ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to idle graphics\n");
+ goto clean_up;
+ }
+
+ for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
+ gr->zbc_col_tbl[i].format = 0;
+ gr->zbc_col_tbl[i].ref_cnt = 0;
+
+ gk20a_writel(g, gr_ds_zbc_color_fmt_r(),
+ gr_ds_zbc_color_fmt_val_invalid_f());
+ gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+ gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
+
+ /* trigger the write */
+ gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+ gr_ds_zbc_tbl_ld_select_c_f() |
+ gr_ds_zbc_tbl_ld_action_write_f() |
+ gr_ds_zbc_tbl_ld_trigger_active_f());
+
+ /* clear l2 table */
+ g->ops.ltc.clear_zbc_color_entry(g, i);
+
+ for (j = 0; j < GK20A_ZBC_COLOR_VALUE_SIZE; j++) {
+ gr->zbc_col_tbl[i].color_l2[j] = 0;
+ gr->zbc_col_tbl[i].color_ds[j] = 0;
+ }
+ }
+ gr->max_used_color_index = 0;
+ gr->max_default_color_index = 0;
+
+ for (i = 0; i < GK20A_ZBC_TABLE_SIZE; i++) {
+ gr->zbc_dep_tbl[i].depth = 0;
+ gr->zbc_dep_tbl[i].format = 0;
+ gr->zbc_dep_tbl[i].ref_cnt = 0;
+
+ gk20a_writel(g, gr_ds_zbc_z_fmt_r(),
+ gr_ds_zbc_z_fmt_val_invalid_f());
+ gk20a_writel(g, gr_ds_zbc_tbl_index_r(),
+ gr_ds_zbc_tbl_index_val_f(i + GK20A_STARTOF_ZBC_TABLE));
+
+ /* trigger the write */
+ gk20a_writel(g, gr_ds_zbc_tbl_ld_r(),
+ gr_ds_zbc_tbl_ld_select_z_f() |
+ gr_ds_zbc_tbl_ld_action_write_f() |
+ gr_ds_zbc_tbl_ld_trigger_active_f());
+
+ /* clear l2 table */
+ g->ops.ltc.clear_zbc_depth_entry(g, i);
+ }
+ gr->max_used_depth_index = 0;
+ gr->max_default_depth_index = 0;
+
+clean_up:
+ ret = gk20a_fifo_enable_engine_activity(g, gr_info);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to enable gr engine activity\n");
+ }
+
+ /* elpg stuff */
+
+ return ret;
+}
+
+/* get a zbc table entry specified by index
+ * return table size when type is invalid */
+int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_query_params *query_params)
+{
+ u32 index = query_params->index_size;
+ u32 i;
+
+ switch (query_params->type) {
+ case GK20A_ZBC_TYPE_INVALID:
+ query_params->index_size = GK20A_ZBC_TABLE_SIZE;
+ break;
+ case GK20A_ZBC_TYPE_COLOR:
+ if (index >= GK20A_ZBC_TABLE_SIZE) {
+ gk20a_err(dev_from_gk20a(g),
+ "invalid zbc color table index\n");
+ return -EINVAL;
+ }
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ query_params->color_l2[i] =
+ gr->zbc_col_tbl[index].color_l2[i];
+ query_params->color_ds[i] =
+ gr->zbc_col_tbl[index].color_ds[i];
+ }
+ query_params->format = gr->zbc_col_tbl[index].format;
+ query_params->ref_cnt = gr->zbc_col_tbl[index].ref_cnt;
+ break;
+ case GK20A_ZBC_TYPE_DEPTH:
+ if (index >= GK20A_ZBC_TABLE_SIZE) {
+ gk20a_err(dev_from_gk20a(g),
+ "invalid zbc depth table index\n");
+ return -EINVAL;
+ }
+ query_params->depth = gr->zbc_dep_tbl[index].depth;
+ query_params->format = gr->zbc_dep_tbl[index].format;
+ query_params->ref_cnt = gr->zbc_dep_tbl[index].ref_cnt;
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid zbc table type\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct zbc_entry zbc_val;
+ u32 i, err;
+
+ /* load default color table */
+ zbc_val.type = GK20A_ZBC_TYPE_COLOR;
+
+ zbc_val.format = gr_ds_zbc_color_fmt_val_zero_v();
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ zbc_val.color_ds[i] = 0;
+ zbc_val.color_l2[i] = 0;
+ }
+ err = gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+ zbc_val.format = gr_ds_zbc_color_fmt_val_unorm_one_v();
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ zbc_val.color_ds[i] = 0xffffffff;
+ zbc_val.color_l2[i] = 0x3f800000;
+ }
+ err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+ zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ zbc_val.color_ds[i] = 0;
+ zbc_val.color_l2[i] = 0;
+ }
+ err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+ zbc_val.format = gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v();
+ for (i = 0; i < GK20A_ZBC_COLOR_VALUE_SIZE; i++) {
+ zbc_val.color_ds[i] = 0x3f800000;
+ zbc_val.color_l2[i] = 0x3f800000;
+ }
+ err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+	if (!err) {
+		gr->max_default_color_index = 4;
+	} else {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to load default zbc color table\n");
+ return err;
+ }
+
+ /* load default depth table */
+ zbc_val.type = GK20A_ZBC_TYPE_DEPTH;
+
+ zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
+ zbc_val.depth = 0;
+ err = gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+ zbc_val.format = gr_ds_zbc_z_fmt_val_fp32_v();
+ zbc_val.depth = 0x3f800000;
+ err |= gr_gk20a_add_zbc(g, gr, &zbc_val);
+
+	if (!err) {
+		gr->max_default_depth_index = 2;
+	} else {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to load default zbc depth table\n");
+ return err;
+ }
+
+ return 0;
+}
+
+int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *zbc_val)
+{
+ gk20a_dbg_fn("");
+
+ return gr_gk20a_elpg_protected_call(g,
+ gr_gk20a_add_zbc(g, gr, zbc_val));
+}
+
+void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine)
+{
+ u32 gate_ctrl;
+
+ gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
+
+ switch (mode) {
+ case BLCG_RUN:
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_blk_clk_m(),
+ therm_gate_ctrl_blk_clk_run_f());
+ break;
+ case BLCG_AUTO:
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_blk_clk_m(),
+ therm_gate_ctrl_blk_clk_auto_f());
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid blcg mode %d", mode);
+ return;
+ }
+
+ gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
+}
+
+void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine)
+{
+ u32 gate_ctrl, idle_filter;
+
+ gate_ctrl = gk20a_readl(g, therm_gate_ctrl_r(engine));
+
+ switch (mode) {
+ case ELCG_RUN:
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_clk_m(),
+ therm_gate_ctrl_eng_clk_run_f());
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_pwr_m(),
+ /* set elpg to auto to meet hw expectation */
+ therm_gate_ctrl_eng_pwr_auto_f());
+ break;
+ case ELCG_STOP:
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_clk_m(),
+ therm_gate_ctrl_eng_clk_stop_f());
+ break;
+ case ELCG_AUTO:
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_clk_m(),
+ therm_gate_ctrl_eng_clk_auto_f());
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "invalid elcg mode %d", mode);
+ }
+
+ if (tegra_platform_is_linsim()) {
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_delay_after_m(),
+ therm_gate_ctrl_eng_delay_after_f(4));
+ }
+
+ /* 2 * (1 << 9) = 1024 clks */
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_idle_filt_exp_m(),
+ therm_gate_ctrl_eng_idle_filt_exp_f(9));
+ gate_ctrl = set_field(gate_ctrl,
+ therm_gate_ctrl_eng_idle_filt_mant_m(),
+ therm_gate_ctrl_eng_idle_filt_mant_f(2));
+ gk20a_writel(g, therm_gate_ctrl_r(engine), gate_ctrl);
+
+ /* default fecs_idle_filter to 0 */
+ idle_filter = gk20a_readl(g, therm_fecs_idle_filter_r());
+ idle_filter &= ~therm_fecs_idle_filter_value_m();
+ gk20a_writel(g, therm_fecs_idle_filter_r(), idle_filter);
+ /* default hubmmu_idle_filter to 0 */
+ idle_filter = gk20a_readl(g, therm_hubmmu_idle_filter_r());
+ idle_filter &= ~therm_hubmmu_idle_filter_value_m();
+ gk20a_writel(g, therm_hubmmu_idle_filter_r(), idle_filter);
+}
+
+static int gr_gk20a_zcull_init_hw(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 gpc_index, gpc_tpc_count, gpc_zcull_count;
+ u32 *zcull_map_tiles, *zcull_bank_counters;
+ u32 map_counter;
+ u32 rcp_conserv;
+ u32 offset;
+ bool floorsweep = false;
+
+ if (!gr->map_tiles)
+		return -EINVAL;
+
+ zcull_map_tiles = kzalloc(proj_scal_max_gpcs_v() *
+ proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
+ if (!zcull_map_tiles) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to allocate zcull temp buffers");
+ return -ENOMEM;
+ }
+ zcull_bank_counters = kzalloc(proj_scal_max_gpcs_v() *
+ proj_scal_max_tpc_per_gpc_v() * sizeof(u32), GFP_KERNEL);
+
+ if (!zcull_bank_counters) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to allocate zcull temp buffers");
+ kfree(zcull_map_tiles);
+ return -ENOMEM;
+ }
+
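+	/* for each tile in map order, record its sequence number within its
+	 * GPC; zcull_bank_counters[] counts how many tiles each GPC has been
+	 * assigned so far */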
+ for (map_counter = 0; map_counter < gr->tpc_count; map_counter++) {
+ zcull_map_tiles[map_counter] =
+ zcull_bank_counters[gr->map_tiles[map_counter]];
+ zcull_bank_counters[gr->map_tiles[map_counter]]++;
+ }
+
+ gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map0_r(),
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(zcull_map_tiles[0]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(zcull_map_tiles[1]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(zcull_map_tiles[2]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(zcull_map_tiles[3]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(zcull_map_tiles[4]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(zcull_map_tiles[5]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(zcull_map_tiles[6]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(zcull_map_tiles[7]));
+
+ gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map1_r(),
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(zcull_map_tiles[8]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(zcull_map_tiles[9]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(zcull_map_tiles[10]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(zcull_map_tiles[11]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(zcull_map_tiles[12]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(zcull_map_tiles[13]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(zcull_map_tiles[14]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(zcull_map_tiles[15]));
+
+ gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map2_r(),
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(zcull_map_tiles[16]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(zcull_map_tiles[17]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(zcull_map_tiles[18]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(zcull_map_tiles[19]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(zcull_map_tiles[20]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(zcull_map_tiles[21]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(zcull_map_tiles[22]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(zcull_map_tiles[23]));
+
+ gk20a_writel(g, gr_gpcs_zcull_sm_in_gpc_number_map3_r(),
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(zcull_map_tiles[24]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(zcull_map_tiles[25]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(zcull_map_tiles[26]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(zcull_map_tiles[27]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(zcull_map_tiles[28]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(zcull_map_tiles[29]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(zcull_map_tiles[30]) |
+ gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(zcull_map_tiles[31]));
+
+ kfree(zcull_map_tiles);
+ kfree(zcull_bank_counters);
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ gpc_tpc_count = gr->gpc_tpc_count[gpc_index];
+ gpc_zcull_count = gr->gpc_zcb_count[gpc_index];
+
+ if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
+ gpc_zcull_count < gpc_tpc_count) {
+ gk20a_err(dev_from_gk20a(g),
+ "zcull_banks (%d) less than tpcs (%d) for gpc (%d)",
+ gpc_zcull_count, gpc_tpc_count, gpc_index);
+ return -EINVAL;
+ }
+ if (gpc_zcull_count != gr->max_zcull_per_gpc_count &&
+ gpc_zcull_count != 0)
+ floorsweep = true;
+ }
+
+ /* 1.0f / 1.0f * gr_gpc0_zcull_sm_num_rcp_conservative__max_v() */
+ rcp_conserv = gr_gpc0_zcull_sm_num_rcp_conservative__max_v();
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ offset = gpc_index * proj_gpc_stride_v();
+
+ if (floorsweep) {
+ gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
+ gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
+ gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
+ gr->max_zcull_per_gpc_count));
+ } else {
+ gk20a_writel(g, gr_gpc0_zcull_ram_addr_r() + offset,
+ gr_gpc0_zcull_ram_addr_row_offset_f(gr->map_row_offset) |
+ gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(
+ gr->gpc_tpc_count[gpc_index]));
+ }
+
+ gk20a_writel(g, gr_gpc0_zcull_fs_r() + offset,
+ gr_gpc0_zcull_fs_num_active_banks_f(gr->gpc_zcb_count[gpc_index]) |
+ gr_gpc0_zcull_fs_num_sms_f(gr->tpc_count));
+
+ gk20a_writel(g, gr_gpc0_zcull_sm_num_rcp_r() + offset,
+ gr_gpc0_zcull_sm_num_rcp_conservative_f(rcp_conserv));
+ }
+
+ gk20a_writel(g, gr_gpcs_ppcs_wwdx_sm_num_rcp_r(),
+ gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(rcp_conserv));
+
+ return 0;
+}
+
+static void gk20a_gr_enable_gpc_exceptions(struct gk20a *g)
+{
+ /* enable tpc exception forwarding */
+ gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(),
+ gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f());
+
+ /* enable gpc exception forwarding */
+ gk20a_writel(g, gr_gpc0_gpccs_gpc_exception_en_r(),
+ gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f());
+}
+
+void gr_gk20a_enable_hww_exceptions(struct gk20a *g)
+{
+ /* enable exceptions */
+ gk20a_writel(g, gr_fe_hww_esr_r(),
+ gr_fe_hww_esr_en_enable_f() |
+ gr_fe_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_memfmt_hww_esr_r(),
+ gr_memfmt_hww_esr_en_enable_f() |
+ gr_memfmt_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_scc_hww_esr_r(),
+ gr_scc_hww_esr_en_enable_f() |
+ gr_scc_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_mme_hww_esr_r(),
+ gr_mme_hww_esr_en_enable_f() |
+ gr_mme_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_pd_hww_esr_r(),
+ gr_pd_hww_esr_en_enable_f() |
+ gr_pd_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_sked_hww_esr_r(), /* enabled by default */
+ gr_sked_hww_esr_reset_active_f());
+ gk20a_writel(g, gr_ds_hww_esr_r(),
+ gr_ds_hww_esr_en_enabled_f() |
+ gr_ds_hww_esr_reset_task_f());
+ gk20a_writel(g, gr_ds_hww_report_mask_r(),
+ gr_ds_hww_report_mask_sph0_err_report_f() |
+ gr_ds_hww_report_mask_sph1_err_report_f() |
+ gr_ds_hww_report_mask_sph2_err_report_f() |
+ gr_ds_hww_report_mask_sph3_err_report_f() |
+ gr_ds_hww_report_mask_sph4_err_report_f() |
+ gr_ds_hww_report_mask_sph5_err_report_f() |
+ gr_ds_hww_report_mask_sph6_err_report_f() |
+ gr_ds_hww_report_mask_sph7_err_report_f() |
+ gr_ds_hww_report_mask_sph8_err_report_f() |
+ gr_ds_hww_report_mask_sph9_err_report_f() |
+ gr_ds_hww_report_mask_sph10_err_report_f() |
+ gr_ds_hww_report_mask_sph11_err_report_f() |
+ gr_ds_hww_report_mask_sph12_err_report_f() |
+ gr_ds_hww_report_mask_sph13_err_report_f() |
+ gr_ds_hww_report_mask_sph14_err_report_f() |
+ gr_ds_hww_report_mask_sph15_err_report_f() |
+ gr_ds_hww_report_mask_sph16_err_report_f() |
+ gr_ds_hww_report_mask_sph17_err_report_f() |
+ gr_ds_hww_report_mask_sph18_err_report_f() |
+ gr_ds_hww_report_mask_sph19_err_report_f() |
+ gr_ds_hww_report_mask_sph20_err_report_f() |
+ gr_ds_hww_report_mask_sph21_err_report_f() |
+ gr_ds_hww_report_mask_sph22_err_report_f() |
+ gr_ds_hww_report_mask_sph23_err_report_f());
+}
+
+static void gr_gk20a_set_hww_esr_report_mask(struct gk20a *g)
+{
+ /* setup sm warp esr report masks */
+ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
+
+ /* setup sm global esr report mask */
+ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
+}
+
+static int gk20a_init_gr_setup_hw(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct aiv_list_gk20a *sw_ctx_load = &g->gr.ctx_vars.sw_ctx_load;
+ struct av_list_gk20a *sw_method_init = &g->gr.ctx_vars.sw_method_init;
+ u32 data;
+ u32 addr_lo, addr_hi;
+ u64 addr;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 fe_go_idle_timeout_save;
+ u32 last_method_data = 0;
+ u32 i, err;
+
+ gk20a_dbg_fn("");
+
+ /* slcg prod values */
+ g->ops.clock_gating.slcg_gr_load_gating_prod(g, g->slcg_enabled);
+ g->ops.clock_gating.slcg_perf_load_gating_prod(g, g->slcg_enabled);
+
+ /* init mmu debug buffer */
+ addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_wr_mem.iova);
+ addr_lo = u64_lo32(addr);
+ addr_hi = u64_hi32(addr);
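+	/* the debug buffer registers take the buffer address right-shifted by
+	 * the alignment, packed into a 32-bit field */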
+ addr = (addr_lo >> fb_mmu_debug_wr_addr_alignment_v()) |
+ (addr_hi << (32 - fb_mmu_debug_wr_addr_alignment_v()));
+
+ gk20a_writel(g, fb_mmu_debug_wr_r(),
+ fb_mmu_debug_wr_aperture_vid_mem_f() |
+ fb_mmu_debug_wr_vol_false_f() |
+ fb_mmu_debug_wr_addr_v(addr));
+
+ addr = NV_MC_SMMU_VADDR_TRANSLATE(gr->mmu_rd_mem.iova);
+ addr_lo = u64_lo32(addr);
+ addr_hi = u64_hi32(addr);
+ addr = (addr_lo >> fb_mmu_debug_rd_addr_alignment_v()) |
+ (addr_hi << (32 - fb_mmu_debug_rd_addr_alignment_v()));
+
+ gk20a_writel(g, fb_mmu_debug_rd_r(),
+ fb_mmu_debug_rd_aperture_vid_mem_f() |
+ fb_mmu_debug_rd_vol_false_f() |
+ fb_mmu_debug_rd_addr_v(addr));
+
+ /* load gr floorsweeping registers */
+ data = gk20a_readl(g, gr_gpc0_ppc0_pes_vsc_strem_r());
+ data = set_field(data, gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(),
+ gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f());
+ gk20a_writel(g, gr_gpc0_ppc0_pes_vsc_strem_r(), data);
+
+ gr_gk20a_zcull_init_hw(g, gr);
+
+ g->ops.clock_gating.blcg_gr_load_gating_prod(g, g->blcg_enabled);
+ g->ops.clock_gating.pg_gr_load_gating_prod(g, true);
+
+ if (g->elcg_enabled) {
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_AUTO, ENGINE_CE2_GK20A);
+ } else {
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_GR_GK20A);
+ gr_gk20a_init_elcg_mode(g, ELCG_RUN, ENGINE_CE2_GK20A);
+ }
+
+ /* Bug 1340570: increase the clock timeout to avoid potential
+ * operation failure at high gpcclk rate. Default values are 0x400.
+ */
+ gk20a_writel(g, pri_ringstation_sys_master_config_r(0x15), 0x800);
+ gk20a_writel(g, pri_ringstation_gpc_master_config_r(0xa), 0x800);
+ gk20a_writel(g, pri_ringstation_fbp_master_config_r(0x8), 0x800);
+
+ /* enable fifo access */
+ gk20a_writel(g, gr_gpfifo_ctl_r(),
+ gr_gpfifo_ctl_access_enabled_f() |
+ gr_gpfifo_ctl_semaphore_access_enabled_f());
+
+ /* TBD: reload gr ucode when needed */
+
+ /* enable interrupts */
+ gk20a_writel(g, gr_intr_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_intr_en_r(), 0xFFFFFFFF);
+
+ /* enable fecs error interrupts */
+ gk20a_writel(g, gr_fecs_host_int_enable_r(),
+ gr_fecs_host_int_enable_fault_during_ctxsw_enable_f() |
+ gr_fecs_host_int_enable_umimp_firmware_method_enable_f() |
+ gr_fecs_host_int_enable_umimp_illegal_method_enable_f() |
+ gr_fecs_host_int_enable_watchdog_enable_f());
+
+ g->ops.gr.enable_hww_exceptions(g);
+ g->ops.gr.set_hww_esr_report_mask(g);
+
+ /* enable per GPC exceptions */
+ gk20a_gr_enable_gpc_exceptions(g);
+
+ /* TBD: ECC for L1/SM */
+ /* TBD: enable per BE exceptions */
+
+ /* reset and enable all exceptions */
+ gk20a_writel(g, gr_exception_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_exception_en_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_exception1_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_exception1_en_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_exception2_r(), 0xFFFFFFFF);
+ gk20a_writel(g, gr_exception2_en_r(), 0xFFFFFFFF);
+
+ /* ignore status from some units */
+ data = gk20a_readl(g, gr_status_mask_r());
+ gk20a_writel(g, gr_status_mask_r(), data & gr->status_disable_mask);
+
+ g->ops.ltc.init_zbc(g, gr);
+ g->ops.ltc.init_cbc(g, gr);
+
+ /* load ctx init */
+ for (i = 0; i < sw_ctx_load->count; i++)
+ gk20a_writel(g, sw_ctx_load->l[i].addr,
+ sw_ctx_load->l[i].value);
+
+ err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (err)
+ goto out;
+
+ /* save and disable fe_go_idle */
+ fe_go_idle_timeout_save =
+ gk20a_readl(g, gr_fe_go_idle_timeout_r());
+ gk20a_writel(g, gr_fe_go_idle_timeout_r(),
+ (fe_go_idle_timeout_save & gr_fe_go_idle_timeout_count_f(0)) |
+ gr_fe_go_idle_timeout_count_disabled_f());
+
+ /* override a few ctx state registers */
+ g->ops.gr.commit_global_cb_manager(g, NULL, false);
+ gr_gk20a_commit_global_timeslice(g, NULL, false);
+
+ /* floorsweep anything left */
+ g->ops.gr.init_fs_state(g);
+
+ err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (err)
+ goto restore_fe_go_idle;
+
+restore_fe_go_idle:
+ /* restore fe_go_idle */
+ gk20a_writel(g, gr_fe_go_idle_timeout_r(), fe_go_idle_timeout_save);
+
+ if (err || gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT))
+ goto out;
+
+ /* load method init */
+ if (sw_method_init->count) {
+ gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
+ sw_method_init->l[0].value);
+ gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
+ gr_pri_mme_shadow_raw_index_write_trigger_f() |
+ sw_method_init->l[0].addr);
+ last_method_data = sw_method_init->l[0].value;
+ }
+ for (i = 1; i < sw_method_init->count; i++) {
+ if (sw_method_init->l[i].value != last_method_data) {
+ gk20a_writel(g, gr_pri_mme_shadow_raw_data_r(),
+ sw_method_init->l[i].value);
+ last_method_data = sw_method_init->l[i].value;
+ }
+ gk20a_writel(g, gr_pri_mme_shadow_raw_index_r(),
+ gr_pri_mme_shadow_raw_index_write_trigger_f() |
+ sw_method_init->l[i].addr);
+ }
+
+ gk20a_mm_l2_invalidate(g);
+
+ err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (err)
+ goto out;
+
+out:
+ gk20a_dbg_fn("done");
+	return err;
+}
+
+static int gk20a_init_gr_prepare(struct gk20a *g)
+{
+ u32 gpfifo_ctrl, pmc_en;
+ u32 err = 0;
+
+ /* disable fifo access */
+ pmc_en = gk20a_readl(g, mc_enable_r());
+ if (pmc_en & mc_enable_pgraph_enabled_f()) {
+ gpfifo_ctrl = gk20a_readl(g, gr_gpfifo_ctl_r());
+ gpfifo_ctrl &= ~gr_gpfifo_ctl_access_enabled_f();
+ gk20a_writel(g, gr_gpfifo_ctl_r(), gpfifo_ctrl);
+ }
+
+ /* reset gr engine */
+ gk20a_reset(g, mc_enable_pgraph_enabled_f()
+ | mc_enable_blg_enabled_f()
+ | mc_enable_perfmon_enabled_f());
+
+ /* enable fifo access */
+ gk20a_writel(g, gr_gpfifo_ctl_r(),
+ gr_gpfifo_ctl_access_enabled_f() |
+ gr_gpfifo_ctl_semaphore_access_enabled_f());
+
+ if (!g->gr.ctx_vars.valid) {
+ err = gr_gk20a_init_ctx_vars(g, &g->gr);
+ if (err)
+ gk20a_err(dev_from_gk20a(g),
+ "fail to load gr init ctx");
+ }
+ return err;
+}
+
+static int gr_gk20a_wait_mem_scrubbing(struct gk20a *g)
+{
+ int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
+ bool fecs_scrubbing;
+ bool gpccs_scrubbing;
+
+ gk20a_dbg_fn("");
+
+ do {
+ fecs_scrubbing = gk20a_readl(g, gr_fecs_dmactl_r()) &
+ (gr_fecs_dmactl_imem_scrubbing_m() |
+ gr_fecs_dmactl_dmem_scrubbing_m());
+
+ gpccs_scrubbing = gk20a_readl(g, gr_gpccs_dmactl_r()) &
+ (gr_gpccs_dmactl_imem_scrubbing_m() |
+					 gr_gpccs_dmactl_dmem_scrubbing_m());
+
+ if (!fecs_scrubbing && !gpccs_scrubbing) {
+ gk20a_dbg_fn("done");
+ return 0;
+ }
+
+ udelay(GR_IDLE_CHECK_DEFAULT);
+ } while (--retries || !tegra_platform_is_silicon());
+
+ gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
+ return -ETIMEDOUT;
+}
+
+static int gk20a_init_gr_reset_enable_hw(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ struct av_list_gk20a *sw_non_ctx_load = &g->gr.ctx_vars.sw_non_ctx_load;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 i, err = 0;
+
+ gk20a_dbg_fn("");
+
+ /* enable interrupts */
+ gk20a_writel(g, gr_intr_r(), ~0);
+ gk20a_writel(g, gr_intr_en_r(), ~0);
+
+ /* reset ctx switch state */
+ gr_gk20a_ctx_reset(g, 0);
+
+ /* clear scc ram */
+ gk20a_writel(g, gr_scc_init_r(),
+ gr_scc_init_ram_trigger_f());
+
+ /* load non_ctx init */
+ for (i = 0; i < sw_non_ctx_load->count; i++)
+ gk20a_writel(g, sw_non_ctx_load->l[i].addr,
+ sw_non_ctx_load->l[i].value);
+
+ err = gr_gk20a_wait_mem_scrubbing(g);
+ if (err)
+ goto out;
+
+ err = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (err)
+ goto out;
+
+ err = gr_gk20a_load_ctxsw_ucode(g, gr);
+ if (err)
+ goto out;
+
+	/* this appears to query sw state, but fecs actually inits the
+	   ramchain, etc., so this is hw init */
+ err = gr_gk20a_init_ctx_state(g, gr);
+ if (err)
+ goto out;
+
+out:
+ if (err)
+ gk20a_err(dev_from_gk20a(g), "fail");
+ else
+ gk20a_dbg_fn("done");
+
+	return err;
+}
+
+/*
+ * XXX Merge this list with the debugger/profiler
+ * session regops whitelists?
+ */
+static u32 wl_addr_gk20a[] = {
+ /* this list must be sorted (low to high) */
+ 0x404468, /* gr_pri_mme_max_instructions */
+ 0x418800, /* gr_pri_gpcs_setup_debug */
+ 0x419a04, /* gr_pri_gpcs_tpcs_tex_lod_dbg */
+ 0x419a08, /* gr_pri_gpcs_tpcs_tex_samp_dbg */
+ 0x419e10, /* gr_pri_gpcs_tpcs_sm_dbgr_control0 */
+ 0x419f78, /* gr_pri_gpcs_tpcs_sm_disp_ctrl */
+};
+
+static int gr_gk20a_init_access_map(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ void *data;
+ int err = 0;
+ u32 w, nr_pages =
+ DIV_ROUND_UP(gr->ctx_vars.priv_access_map_size,
+ PAGE_SIZE);
+
+ data = vmap(gr->global_ctx_buffer[PRIV_ACCESS_MAP].pages,
+ PAGE_ALIGN(gr->global_ctx_buffer[PRIV_ACCESS_MAP].size) >>
+ PAGE_SHIFT, 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!data) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to map priv access map memory");
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ memset(data, 0x0, PAGE_SIZE * nr_pages);
+
+ for (w = 0; w < ARRAY_SIZE(wl_addr_gk20a); w++) {
+ u32 map_bit, map_byte, map_shift;
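+		/* one access-map bit per 32-bit register: bit index is the
+		 * register offset divided by 4 */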
+ map_bit = wl_addr_gk20a[w] >> 2;
+ map_byte = map_bit >> 3;
+ map_shift = map_bit & 0x7; /* i.e. 0-7 */
+ gk20a_dbg_info("access map addr:0x%x byte:0x%x bit:%d",
+ wl_addr_gk20a[w], map_byte, map_shift);
+ ((u8 *)data)[map_byte] |= 1 << map_shift;
+ }
+
+clean_up:
+ if (data)
+ vunmap(data);
+	return err;
+}
+
+static int gk20a_init_gr_setup_sw(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ if (gr->sw_ready) {
+ gk20a_dbg_fn("skip init");
+ return 0;
+ }
+
+ gr->g = g;
+
+ err = gr_gk20a_init_gr_config(g, gr);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_init_mmu_sw(g, gr);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_init_map_tiles(g, gr);
+ if (err)
+ goto clean_up;
+
+	if (tegra_cpu_is_asim()) {
+		gr->max_comptag_mem = 1; /* MBs worth of comptag coverage */
+	} else {
+ gk20a_dbg_info("total ram pages : %lu", totalram_pages);
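+		/* totalram_pages >> (20 - PAGE_SHIFT) == total system RAM
+		 * in MB */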
+ gr->max_comptag_mem = totalram_pages
+ >> (10 - (PAGE_SHIFT - 10));
+ }
+ err = g->ops.ltc.init_comptags(g, gr);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_init_zcull(g, gr);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_alloc_global_ctx_buffers(g);
+ if (err)
+ goto clean_up;
+
+ err = gr_gk20a_init_access_map(g);
+ if (err)
+ goto clean_up;
+
+ mutex_init(&gr->ctx_mutex);
+ spin_lock_init(&gr->ch_tlb_lock);
+
+ gr->remove_support = gk20a_remove_gr_support;
+ gr->sw_ready = true;
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+clean_up:
+ gk20a_err(dev_from_gk20a(g), "fail");
+ gk20a_remove_gr_support(gr);
+ return err;
+}
+
+int gk20a_init_gr_support(struct gk20a *g)
+{
+ u32 err;
+
+ gk20a_dbg_fn("");
+
+ err = gk20a_init_gr_prepare(g);
+ if (err)
+ return err;
+
+ /* this is required before gr_gk20a_init_ctx_state */
+ mutex_init(&g->gr.fecs_mutex);
+
+ err = gk20a_init_gr_reset_enable_hw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_gr_setup_sw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_gr_setup_hw(g);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+#define NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE 0x02dc
+#define NVA297_SET_CIRCULAR_BUFFER_SIZE 0x1280
+#define NVA297_SET_SHADER_EXCEPTIONS 0x1528
+#define NVA0C0_SET_SHADER_EXCEPTIONS 0x1528
+
+#define NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE 0
+
+struct gr_isr_data {
+ u32 addr;
+ u32 data_lo;
+ u32 data_hi;
+ u32 curr_ctx;
+ u32 chid;
+ u32 offset;
+ u32 sub_chan;
+ u32 class_num;
+};
+
+void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data)
+{
+ gk20a_dbg_fn("");
+
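+	/* a zero mask disables warp/global error reporting entirely;
+	 * otherwise report every warp and global error class */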
+ if (data == NVA297_SET_SHADER_EXCEPTIONS_ENABLE_FALSE) {
+ gk20a_writel(g,
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(), 0);
+ gk20a_writel(g,
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(), 0);
+ } else {
+ /* setup sm warp esr report masks */
+ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(),
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f());
+
+ /* setup sm global esr report mask */
+ gk20a_writel(g, gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(),
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f() |
+ gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f());
+ }
+}
+
+static void gk20a_gr_set_circular_buffer_size(struct gk20a *g, u32 data)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 gpc_index, ppc_index, stride, val, offset;
+ u32 cb_size = data * 4;
+
+ gk20a_dbg_fn("");
+
+ if (cb_size > gr->attrib_cb_size)
+ cb_size = gr->attrib_cb_size;
+
+ gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
+ (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
+ ~gr_ds_tga_constraintlogic_beta_cbsize_f(~0)) |
+ gr_ds_tga_constraintlogic_beta_cbsize_f(cb_size));
+
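+	/* program the per-PPC circular buffer config, scaling the size by
+	 * the number of TPCs served by each PES */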
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ stride = proj_gpc_stride_v() * gpc_index;
+
+ for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
+ ppc_index++) {
+
+ val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg_r() +
+ stride +
+ proj_ppc_in_gpc_stride_v() * ppc_index);
+
+ offset = gr_gpc0_ppc0_cbm_cfg_start_offset_v(val);
+
+ val = set_field(val,
+ gr_gpc0_ppc0_cbm_cfg_size_m(),
+ gr_gpc0_ppc0_cbm_cfg_size_f(cb_size *
+ gr->pes_tpc_count[ppc_index][gpc_index]));
+ val = set_field(val,
+ gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
+ (offset + 1));
+
+ gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
+ stride +
+ proj_ppc_in_gpc_stride_v() * ppc_index, val);
+
+ val = set_field(val,
+ gr_gpc0_ppc0_cbm_cfg_start_offset_m(),
+ offset);
+
+ gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg_r() +
+ stride +
+ proj_ppc_in_gpc_stride_v() * ppc_index, val);
+ }
+ }
+}
+
+static void gk20a_gr_set_alpha_circular_buffer_size(struct gk20a *g, u32 data)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 gpc_index, ppc_index, stride, val;
+ u32 pd_ab_max_output;
+ u32 alpha_cb_size = data * 4;
+
+ gk20a_dbg_fn("");
+ /* if (NO_ALPHA_BETA_TIMESLICE_SUPPORT_DEF)
+ return; */
+
+ if (alpha_cb_size > gr->alpha_cb_size)
+ alpha_cb_size = gr->alpha_cb_size;
+
+ gk20a_writel(g, gr_ds_tga_constraintlogic_r(),
+ (gk20a_readl(g, gr_ds_tga_constraintlogic_r()) &
+ ~gr_ds_tga_constraintlogic_alpha_cbsize_f(~0)) |
+ gr_ds_tga_constraintlogic_alpha_cbsize_f(alpha_cb_size));
+
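+	/* convert the alpha CB size from CBM size-granularity units into
+	 * the PD max-output granularity */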
+ pd_ab_max_output = alpha_cb_size *
+ gr_gpc0_ppc0_cbm_cfg_size_granularity_v() /
+ gr_pd_ab_dist_cfg1_max_output_granularity_v();
+
+ gk20a_writel(g, gr_pd_ab_dist_cfg1_r(),
+ gr_pd_ab_dist_cfg1_max_output_f(pd_ab_max_output));
+
+ for (gpc_index = 0; gpc_index < gr->gpc_count; gpc_index++) {
+ stride = proj_gpc_stride_v() * gpc_index;
+
+ for (ppc_index = 0; ppc_index < gr->gpc_ppc_count[gpc_index];
+ ppc_index++) {
+
+ val = gk20a_readl(g, gr_gpc0_ppc0_cbm_cfg2_r() +
+ stride +
+ proj_ppc_in_gpc_stride_v() * ppc_index);
+
+ val = set_field(val, gr_gpc0_ppc0_cbm_cfg2_size_m(),
+ gr_gpc0_ppc0_cbm_cfg2_size_f(alpha_cb_size *
+ gr->pes_tpc_count[ppc_index][gpc_index]));
+
+ gk20a_writel(g, gr_gpc0_ppc0_cbm_cfg2_r() +
+ stride +
+ proj_ppc_in_gpc_stride_v() * ppc_index, val);
+ }
+ }
+}
+
+void gk20a_gr_reset(struct gk20a *g)
+{
+ int err;
+ err = gk20a_init_gr_prepare(g);
+ BUG_ON(err);
+ err = gk20a_init_gr_reset_enable_hw(g);
+ BUG_ON(err);
+ err = gk20a_init_gr_setup_hw(g);
+ BUG_ON(err);
+}
+
+static int gr_gk20a_handle_sw_method(struct gk20a *g, u32 addr,
+ u32 class_num, u32 offset, u32 data)
+{
+ gk20a_dbg_fn("");
+
+ if (class_num == KEPLER_COMPUTE_A) {
+ switch (offset << 2) {
+ case NVA0C0_SET_SHADER_EXCEPTIONS:
+ gk20a_gr_set_shader_exceptions(g, data);
+ break;
+ default:
+ goto fail;
+ }
+ }
+
+ if (class_num == KEPLER_C) {
+ switch (offset << 2) {
+ case NVA297_SET_SHADER_EXCEPTIONS:
+ gk20a_gr_set_shader_exceptions(g, data);
+ break;
+ case NVA297_SET_CIRCULAR_BUFFER_SIZE:
+ g->ops.gr.set_circular_buffer_size(g, data);
+ break;
+ case NVA297_SET_ALPHA_CIRCULAR_BUFFER_SIZE:
+ g->ops.gr.set_alpha_circular_buffer_size(g, data);
+ break;
+ default:
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ return -EINVAL;
+}
+
+static int gk20a_gr_handle_semaphore_timeout_pending(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+ gk20a_dbg_fn("");
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_GR_SEMAPHORE_TIMEOUT);
+ gk20a_err(dev_from_gk20a(g),
+ "gr semaphore timeout\n");
+ return -EINVAL;
+}
+
+static int gk20a_gr_intr_illegal_notify_pending(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+ gk20a_dbg_fn("");
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_GR_ILLEGAL_NOTIFY);
+ /* This is an unrecoverable error, reset is needed */
+ gk20a_err(dev_from_gk20a(g),
+		   "gr illegal notify pending\n");
+ return -EINVAL;
+}
+
+static int gk20a_gr_handle_illegal_method(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ int ret = g->ops.gr.handle_sw_method(g, isr_data->addr,
+ isr_data->class_num, isr_data->offset,
+ isr_data->data_lo);
+ if (ret)
+ gk20a_err(dev_from_gk20a(g), "invalid method class 0x%08x"
+ ", offset 0x%08x address 0x%08x\n",
+ isr_data->class_num, isr_data->offset, isr_data->addr);
+
+ return ret;
+}
+
+static int gk20a_gr_handle_illegal_class(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+ gk20a_dbg_fn("");
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
+ gk20a_err(dev_from_gk20a(g),
+ "invalid class 0x%08x, offset 0x%08x",
+ isr_data->class_num, isr_data->offset);
+ return -EINVAL;
+}
+
+static int gk20a_gr_handle_class_error(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+ gk20a_dbg_fn("");
+
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
+ gk20a_err(dev_from_gk20a(g),
+ "class error 0x%08x, offset 0x%08x",
+ isr_data->class_num, isr_data->offset);
+ return -EINVAL;
+}
+
+static int gk20a_gr_handle_semaphore_pending(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+
+ wake_up(&ch->semaphore_wq);
+
+ return 0;
+}
+
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+static inline bool is_valid_cyclestats_bar0_offset_gk20a(struct gk20a *g,
+ u32 offset)
+{
+ /* support only 24-bit 4-byte aligned offsets */
+ bool valid = !(offset & 0xFF000003);
+ /* whitelist check */
+ valid = valid &&
+ is_bar0_global_offset_whitelisted_gk20a(offset);
+ /* resource size check in case there was a problem
+ * with allocating the assumed size of bar0 */
+ valid = valid &&
+ offset < resource_size(g->reg_mem);
+ return valid;
+}
+#endif
+
+static int gk20a_gr_handle_notify_pending(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data->chid];
+
+#if defined(CONFIG_GK20A_CYCLE_STATS)
+ void *virtual_address;
+ u32 buffer_size;
+ u32 offset;
+ u32 new_offset;
+ bool exit;
+ struct share_buffer_head *sh_hdr;
+ u32 raw_reg;
+ u64 mask_orig;
+ u64 v = 0;
+ struct gk20a_cyclestate_buffer_elem *op_elem;
+ /* GL will never use payload 0 for cycle state */
+ if ((ch->cyclestate.cyclestate_buffer == NULL) || (isr_data->data_lo == 0))
+ return 0;
+
+ mutex_lock(&ch->cyclestate.cyclestate_buffer_mutex);
+
+ virtual_address = ch->cyclestate.cyclestate_buffer;
+ buffer_size = ch->cyclestate.cyclestate_buffer_size;
+ offset = isr_data->data_lo;
+ exit = false;
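+	/* walk the shared cyclestats buffer, executing each whitelisted BAR0
+	 * read/write request until an OP_END marker or the end of the buffer
+	 * is reached */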
+ while (!exit) {
+ if (offset >= buffer_size) {
+ WARN_ON(1);
+ break;
+ }
+
+ sh_hdr = (struct share_buffer_head *)
+ ((char *)virtual_address + offset);
+
+ if (sh_hdr->size < sizeof(struct share_buffer_head)) {
+ WARN_ON(1);
+ break;
+ }
+ new_offset = offset + sh_hdr->size;
+
+ switch (sh_hdr->operation) {
+ case OP_END:
+ exit = true;
+ break;
+
+ case BAR0_READ32:
+ case BAR0_WRITE32:
+ {
+ bool valid;
+ op_elem =
+ (struct gk20a_cyclestate_buffer_elem *)
+ sh_hdr;
+ valid = is_valid_cyclestats_bar0_offset_gk20a(g,
+ op_elem->offset_bar0);
+ if (!valid) {
+ gk20a_err(dev_from_gk20a(g),
+				"invalid cyclestats op offset: 0x%x\n",
+ op_elem->offset_bar0);
+
+ sh_hdr->failed = exit = true;
+ break;
+ }
+
+
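+				/* build a mask covering bits
+				 * first_bit..last_bit inclusive */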
+ mask_orig =
+ ((1ULL <<
+ (op_elem->last_bit + 1))
+ -1)&~((1ULL <<
+ op_elem->first_bit)-1);
+
+ raw_reg =
+ gk20a_readl(g,
+ op_elem->offset_bar0);
+
+ switch (sh_hdr->operation) {
+ case BAR0_READ32:
+ op_elem->data =
+ (raw_reg & mask_orig)
+ >> op_elem->first_bit;
+ break;
+
+ case BAR0_WRITE32:
+ v = 0;
+ if ((unsigned int)mask_orig !=
+ (unsigned int)~0) {
+ v = (unsigned int)
+ (raw_reg & ~mask_orig);
+ }
+
+ v |= ((op_elem->data
+ << op_elem->first_bit)
+ & mask_orig);
+
+ gk20a_writel(g,
+ op_elem->offset_bar0,
+ (unsigned int)v);
+ break;
+ default:
+ /* nop ok?*/
+ break;
+ }
+ }
+ break;
+
+ default:
+ /* no operation content case */
+ exit = true;
+ break;
+ }
+ sh_hdr->completed = true;
+ offset = new_offset;
+ }
+ mutex_unlock(&ch->cyclestate.cyclestate_buffer_mutex);
+#endif
+ gk20a_dbg_fn("");
+ wake_up(&ch->notifier_wq);
+ return 0;
+}
+
+/* Used by sw interrupt thread to translate current ctx to chid.
+ * For performance, we don't want to go through 128 channels every time.
+ * A small tlb is used here to cache translation */
+static int gk20a_gr_get_chid_from_ctx(struct gk20a *g, u32 curr_ctx)
+{
+ struct fifo_gk20a *f = &g->fifo;
+ struct gr_gk20a *gr = &g->gr;
+ u32 chid = -1;
+ u32 i;
+
+ spin_lock(&gr->ch_tlb_lock);
+
+ /* check cache first */
+ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
+ if (gr->chid_tlb[i].curr_ctx == curr_ctx) {
+ chid = gr->chid_tlb[i].hw_chid;
+ goto unlock;
+ }
+ }
+
+ /* slow path */
+ for (chid = 0; chid < f->num_channels; chid++)
+ if (f->channel[chid].in_use) {
+ if ((u32)(f->channel[chid].inst_block.cpu_pa >>
+ ram_in_base_shift_v()) ==
+ gr_fecs_current_ctx_ptr_v(curr_ctx))
+ break;
+ }
+
+ if (chid >= f->num_channels) {
+ chid = -1;
+ goto unlock;
+ }
+
+ /* add to free tlb entry */
+ for (i = 0; i < GR_CHANNEL_MAP_TLB_SIZE; i++) {
+ if (gr->chid_tlb[i].curr_ctx == 0) {
+ gr->chid_tlb[i].curr_ctx = curr_ctx;
+ gr->chid_tlb[i].hw_chid = chid;
+ goto unlock;
+ }
+ }
+
+ /* no free entry, flush one */
+ gr->chid_tlb[gr->channel_tlb_flush_index].curr_ctx = curr_ctx;
+ gr->chid_tlb[gr->channel_tlb_flush_index].hw_chid = chid;
+
+ gr->channel_tlb_flush_index =
+ (gr->channel_tlb_flush_index + 1) &
+ (GR_CHANNEL_MAP_TLB_SIZE - 1);
+
+unlock:
+ spin_unlock(&gr->ch_tlb_lock);
+ return chid;
+}
+
+static int gk20a_gr_lock_down_sm(struct gk20a *g, u32 global_esr_mask)
+{
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 delay = GR_IDLE_CHECK_DEFAULT;
+ bool mmu_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled(g);
+ u32 dbgr_control0;
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locking down SM");
+
+ /* assert stop trigger */
+ dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
+ dbgr_control0 |= gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
+ gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
+
+ /* wait for the sm to lock down */
+ do {
+ u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
+ u32 warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
+ u32 dbgr_status0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_status0_r());
+ bool locked_down =
+ (gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(dbgr_status0) ==
+ gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v());
+ bool error_pending =
+ (gr_gpc0_tpc0_sm_hww_warp_esr_error_v(warp_esr) !=
+ gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v()) ||
+ ((global_esr & ~global_esr_mask) != 0);
+
+ if (locked_down || !error_pending) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "locked down SM");
+
+ /* de-assert stop trigger */
+ dbgr_control0 &= ~gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f();
+ gk20a_writel(g, gr_gpc0_tpc0_sm_dbgr_control0_r(), dbgr_control0);
+
+ return 0;
+ }
+
+ /* if an mmu fault is pending and mmu debug mode is not
+ * enabled, the sm will never lock down. */
+ if (!mmu_debug_mode_enabled && gk20a_fifo_mmu_fault_pending(g)) {
+ gk20a_err(dev_from_gk20a(g), "mmu fault pending, sm will"
+ " never lock down!");
+ return -EFAULT;
+ }
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+
+ } while (time_before(jiffies, end_jiffies)
+ || !tegra_platform_is_silicon());
+
+ gk20a_err(dev_from_gk20a(g), "timed out while trying to lock down SM");
+
+ return -EAGAIN;
+}
+
+bool gk20a_gr_sm_debugger_attached(struct gk20a *g)
+{
+ u32 dbgr_control0 = gk20a_readl(g, gr_gpc0_tpc0_sm_dbgr_control0_r());
+
+ /* check if an sm debugger is attached */
+ if (gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(dbgr_control0) ==
+ gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v())
+ return true;
+
+ return false;
+}
+
+static void gk20a_gr_clear_sm_hww(struct gk20a *g, u32 global_esr)
+{
+ gk20a_writel(g, gr_gpc0_tpc0_sm_hww_global_esr_r(), global_esr);
+
+ /* clear the warp hww */
+ gk20a_writel(g, gr_gpc0_tpc0_sm_hww_warp_esr_r(),
+ gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f());
+}
+
+static struct channel_gk20a *
+channel_from_hw_chid(struct gk20a *g, u32 hw_chid)
+{
+ return g->fifo.channel+hw_chid;
+}
+
+static int gk20a_gr_handle_sm_exception(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ int ret = 0;
+ bool do_warp_sync = false;
+ /* these three interrupts don't require locking down the SM. They can
+ * be handled by usermode clients as they aren't fatal. Additionally,
+ * usermode clients may wish to allow some warps to execute while others
+ * are at breakpoints, as opposed to fatal errors where all warps should
+ * halt. */
+ u32 global_mask = gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f() |
+ gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f() |
+ gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f();
+ u32 global_esr, warp_esr;
+ bool sm_debugger_attached = gk20a_gr_sm_debugger_attached(g);
+ struct channel_gk20a *fault_ch;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
+ warp_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_warp_esr_r());
+
+ /* if an sm debugger is attached, disable forwarding of tpc exceptions.
+ * the debugger will reenable exceptions after servicing them. */
+ if (sm_debugger_attached) {
+ u32 tpc_exception_en = gk20a_readl(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r());
+ tpc_exception_en &= ~gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f();
+ gk20a_writel(g, gr_gpc0_tpc0_tpccs_tpc_exception_en_r(), tpc_exception_en);
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM debugger attached");
+ }
+
+ /* if a debugger is present and an error has occurred, do a warp sync */
+ if (sm_debugger_attached && ((warp_esr != 0) || ((global_esr & ~global_mask) != 0))) {
+ gk20a_dbg(gpu_dbg_intr, "warp sync needed");
+ do_warp_sync = true;
+ }
+
+ if (do_warp_sync) {
+ ret = gk20a_gr_lock_down_sm(g, global_mask);
+ if (ret) {
+ gk20a_err(dev_from_gk20a(g), "sm did not lock down!\n");
+ return ret;
+ }
+ }
+
+ /* finally, signal any client waiting on an event */
+ fault_ch = channel_from_hw_chid(g, isr_data->chid);
+ if (fault_ch)
+ gk20a_dbg_gpu_post_events(fault_ch);
+
+ return ret;
+}
+
+static int gk20a_gr_handle_tpc_exception(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ int ret = 0;
+ u32 tpc_exception = gk20a_readl(g, gr_gpcs_tpcs_tpccs_tpc_exception_r());
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
+
+	/* check if an sm exception is pending */
+ if (gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(tpc_exception) ==
+ gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v()) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "SM exception pending");
+ ret = gk20a_gr_handle_sm_exception(g, isr_data);
+ }
+
+ return ret;
+}
+
+static int gk20a_gr_handle_gpc_exception(struct gk20a *g,
+ struct gr_isr_data *isr_data)
+{
+ int ret = 0;
+ u32 gpc_exception = gk20a_readl(g, gr_gpcs_gpccs_gpc_exception_r());
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "");
+
+ /* check if tpc 0 has an exception */
+ if (gr_gpcs_gpccs_gpc_exception_tpc_v(gpc_exception) ==
+ gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v()) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "TPC exception pending");
+ ret = gk20a_gr_handle_tpc_exception(g, isr_data);
+ }
+
+ return ret;
+}
+
+int gk20a_gr_isr(struct gk20a *g)
+{
+ struct gr_isr_data isr_data;
+ u32 grfifo_ctl;
+ u32 obj_table;
+ int need_reset = 0;
+ u32 gr_intr = gk20a_readl(g, gr_intr_r());
+
+ gk20a_dbg_fn("");
+ gk20a_dbg(gpu_dbg_intr, "pgraph intr %08x", gr_intr);
+
+ if (!gr_intr)
+ return 0;
+
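+	/* stall gpfifo access to gr while the interrupt is serviced;
+	 * access is restored at clean_up */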
+ grfifo_ctl = gk20a_readl(g, gr_gpfifo_ctl_r());
+ grfifo_ctl &= ~gr_gpfifo_ctl_semaphore_access_f(1);
+ grfifo_ctl &= ~gr_gpfifo_ctl_access_f(1);
+
+ gk20a_writel(g, gr_gpfifo_ctl_r(),
+ grfifo_ctl | gr_gpfifo_ctl_access_f(0) |
+ gr_gpfifo_ctl_semaphore_access_f(0));
+
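+	/* latch the trapped method state (address, data, current context,
+	 * subchannel and class) for the individual handlers below */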
+ isr_data.addr = gk20a_readl(g, gr_trapped_addr_r());
+ isr_data.data_lo = gk20a_readl(g, gr_trapped_data_lo_r());
+ isr_data.data_hi = gk20a_readl(g, gr_trapped_data_hi_r());
+ isr_data.curr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
+ isr_data.offset = gr_trapped_addr_mthd_v(isr_data.addr);
+ isr_data.sub_chan = gr_trapped_addr_subch_v(isr_data.addr);
+ obj_table = gk20a_readl(g,
+ gr_fe_object_table_r(isr_data.sub_chan));
+ isr_data.class_num = gr_fe_object_table_nvclass_v(obj_table);
+
+ isr_data.chid =
+ gk20a_gr_get_chid_from_ctx(g, isr_data.curr_ctx);
+ if (isr_data.chid == -1) {
+ gk20a_err(dev_from_gk20a(g), "invalid channel ctx 0x%08x",
+ isr_data.curr_ctx);
+ goto clean_up;
+ }
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
+ "channel %d: addr 0x%08x, "
+ "data 0x%08x 0x%08x,"
+ "ctx 0x%08x, offset 0x%08x, "
+ "subchannel 0x%08x, class 0x%08x",
+ isr_data.chid, isr_data.addr,
+ isr_data.data_hi, isr_data.data_lo,
+ isr_data.curr_ctx, isr_data.offset,
+ isr_data.sub_chan, isr_data.class_num);
+
+ if (gr_intr & gr_intr_notify_pending_f()) {
+ gk20a_gr_handle_notify_pending(g, &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_notify_reset_f());
+ gr_intr &= ~gr_intr_notify_pending_f();
+ }
+
+ if (gr_intr & gr_intr_semaphore_pending_f()) {
+ gk20a_gr_handle_semaphore_pending(g, &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_semaphore_reset_f());
+ gr_intr &= ~gr_intr_semaphore_pending_f();
+ }
+
+ if (gr_intr & gr_intr_semaphore_timeout_pending_f()) {
+ need_reset |= gk20a_gr_handle_semaphore_timeout_pending(g,
+ &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_semaphore_reset_f());
+		gr_intr &= ~gr_intr_semaphore_timeout_pending_f();
+ }
+
+ if (gr_intr & gr_intr_illegal_notify_pending_f()) {
+ need_reset |= gk20a_gr_intr_illegal_notify_pending(g,
+ &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_illegal_notify_reset_f());
+ gr_intr &= ~gr_intr_illegal_notify_pending_f();
+ }
+
+ if (gr_intr & gr_intr_illegal_method_pending_f()) {
+ need_reset |= gk20a_gr_handle_illegal_method(g, &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_illegal_method_reset_f());
+ gr_intr &= ~gr_intr_illegal_method_pending_f();
+ }
+
+ if (gr_intr & gr_intr_illegal_class_pending_f()) {
+ need_reset |= gk20a_gr_handle_illegal_class(g, &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_illegal_class_reset_f());
+ gr_intr &= ~gr_intr_illegal_class_pending_f();
+ }
+
+ if (gr_intr & gr_intr_class_error_pending_f()) {
+ need_reset |= gk20a_gr_handle_class_error(g, &isr_data);
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_class_error_reset_f());
+ gr_intr &= ~gr_intr_class_error_pending_f();
+ }
+
+ /* this one happens if someone tries to hit a non-whitelisted
+ * register using set_falcon[4] */
+ if (gr_intr & gr_intr_firmware_method_pending_f()) {
+ need_reset |= true;
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "firmware method intr pending\n");
+ gk20a_writel(g, gr_intr_r(),
+ gr_intr_firmware_method_reset_f());
+ gr_intr &= ~gr_intr_firmware_method_pending_f();
+ }
+
+ if (gr_intr & gr_intr_exception_pending_f()) {
+ u32 exception = gk20a_readl(g, gr_exception_r());
+ struct fifo_gk20a *f = &g->fifo;
+ struct channel_gk20a *ch = &f->channel[isr_data.chid];
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "exception %08x\n", exception);
+
+ if (exception & gr_exception_fe_m()) {
+ u32 fe = gk20a_readl(g, gr_fe_hww_esr_r());
+ gk20a_dbg(gpu_dbg_intr, "fe warning %08x\n", fe);
+ gk20a_writel(g, gr_fe_hww_esr_r(), fe);
+ }
+
+ /* check if a gpc exception has occurred */
+ if (exception & gr_exception_gpc_m() && need_reset == 0) {
+ u32 exception1 = gk20a_readl(g, gr_exception1_r());
+ u32 global_esr = gk20a_readl(g, gr_gpc0_tpc0_sm_hww_global_esr_r());
+
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg, "GPC exception pending");
+
+ /* if no sm debugger is present, clean up the channel */
+ if (!gk20a_gr_sm_debugger_attached(g)) {
+ gk20a_dbg(gpu_dbg_intr | gpu_dbg_gpu_dbg,
+ "SM debugger not attached, clearing interrupt");
+ need_reset |= -EFAULT;
+ } else {
+ /* check if gpc 0 has an exception */
+ if (exception1 & gr_exception1_gpc_0_pending_f())
+ need_reset |= gk20a_gr_handle_gpc_exception(g, &isr_data);
+ /* clear the hwws, also causes tpc and gpc
+ * exceptions to be cleared */
+ gk20a_gr_clear_sm_hww(g, global_esr);
+ }
+
+ if (need_reset)
+ gk20a_set_error_notifier(ch,
+ NVHOST_CHANNEL_GR_ERROR_SW_NOTIFY);
+ }
+
+ gk20a_writel(g, gr_intr_r(), gr_intr_exception_reset_f());
+ gr_intr &= ~gr_intr_exception_pending_f();
+ }
+
+ if (need_reset)
+ gk20a_fifo_recover(g, BIT(ENGINE_GR_GK20A), true);
+
+clean_up:
+ gk20a_writel(g, gr_gpfifo_ctl_r(),
+ grfifo_ctl | gr_gpfifo_ctl_access_f(1) |
+ gr_gpfifo_ctl_semaphore_access_f(1));
+
+ if (gr_intr)
+ gk20a_err(dev_from_gk20a(g),
+ "unhandled gr interrupt 0x%08x", gr_intr);
+
+ return 0;
+}
+
+int gk20a_gr_nonstall_isr(struct gk20a *g)
+{
+ u32 gr_intr = gk20a_readl(g, gr_intr_nonstall_r());
+ u32 clear_intr = 0;
+
+ gk20a_dbg(gpu_dbg_intr, "pgraph nonstall intr %08x", gr_intr);
+
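+	/* wake channel semaphore waiters on the nonstall trap interrupt */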
+ if (gr_intr & gr_intr_nonstall_trap_pending_f()) {
+ gk20a_channel_semaphore_wakeup(g);
+ clear_intr |= gr_intr_nonstall_trap_pending_f();
+ }
+
+ gk20a_writel(g, gr_intr_nonstall_r(), clear_intr);
+
+ return 0;
+}
+
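+/* query the FECS ucode for the size of the reglist save image; the size is
+ * returned through the FECS mailbox */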
+int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size)
+{
+ BUG_ON(size == NULL);
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .mailbox.id = 0,
+ .mailbox.data = 0,
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_discover_reglist_image_size_v(),
+ .mailbox.ret = size,
+ .cond.ok = GR_IS_UCODE_OP_NOT_EQUAL,
+ .mailbox.ok = 0,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
+}
+
+int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr)
+{
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a){
+ .mailbox.id = 4,
+ .mailbox.data = (gr_fecs_current_ctx_ptr_f(addr >> 12) |
+ gr_fecs_current_ctx_valid_f(1) |
+ gr_fecs_current_ctx_target_vid_mem_f()),
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_set_reglist_bind_instance_v(),
+ .mailbox.ret = NULL,
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .mailbox.ok = 1,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
+}
+
+int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va)
+{
+ return gr_gk20a_submit_fecs_method_op(g,
+ (struct fecs_method_op_gk20a) {
+ .mailbox.id = 4,
+ .mailbox.data = u64_lo32(pmu_va >> 8),
+ .mailbox.clr = ~0,
+ .method.data = 1,
+ .method.addr = gr_fecs_method_push_adr_set_reglist_virtual_address_v(),
+ .mailbox.ret = NULL,
+ .cond.ok = GR_IS_UCODE_OP_EQUAL,
+ .mailbox.ok = 1,
+ .cond.fail = GR_IS_UCODE_OP_SKIP,
+ .mailbox.fail = 0});
+}
+
+int gk20a_gr_suspend(struct gk20a *g)
+{
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 ret = 0;
+
+ gk20a_dbg_fn("");
+
+ ret = gr_gk20a_wait_idle(g, end_jiffies, GR_IDLE_CHECK_DEFAULT);
+ if (ret)
+ return ret;
+
+ gk20a_writel(g, gr_gpfifo_ctl_r(),
+ gr_gpfifo_ctl_access_disabled_f());
+
+ /* disable gr intr */
+ gk20a_writel(g, gr_intr_r(), 0);
+ gk20a_writel(g, gr_intr_en_r(), 0);
+
+ /* disable all exceptions */
+ gk20a_writel(g, gr_exception_r(), 0);
+ gk20a_writel(g, gr_exception_en_r(), 0);
+ gk20a_writel(g, gr_exception1_r(), 0);
+ gk20a_writel(g, gr_exception1_en_r(), 0);
+ gk20a_writel(g, gr_exception2_r(), 0);
+ gk20a_writel(g, gr_exception2_en_r(), 0);
+
+ gk20a_gr_flush_channel_tlb(&g->gr);
+
+ gk20a_dbg_fn("done");
+ return ret;
+}
+
+static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset);
+
+/* This function will decode a priv address and return the partition type and numbers. */
+int gr_gk20a_decode_priv_addr(struct gk20a *g, u32 addr,
+ int *addr_type, /* enum ctxsw_addr_type */
+ u32 *gpc_num, u32 *tpc_num, u32 *ppc_num, u32 *be_num,
+ u32 *broadcast_flags)
+{
+ u32 gpc_addr;
+ u32 ppc_address;
+ u32 ppc_broadcast_addr;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
+ /* setup defaults */
+ ppc_address = 0;
+ ppc_broadcast_addr = 0;
+ *addr_type = CTXSW_ADDR_TYPE_SYS;
+ *broadcast_flags = PRI_BROADCAST_FLAGS_NONE;
+ *gpc_num = 0;
+ *tpc_num = 0;
+ *ppc_num = 0;
+ *be_num = 0;
+
+ if (pri_is_gpc_addr(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_GPC;
+ gpc_addr = pri_gpccs_addr_mask(addr);
+ if (pri_is_gpc_addr_shared(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_GPC;
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_GPC;
+ } else
+ *gpc_num = pri_get_gpc_num(addr);
+
+ if (pri_is_tpc_addr(gpc_addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_TPC;
+ if (pri_is_tpc_addr_shared(gpc_addr)) {
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_TPC;
+ return 0;
+ }
+ *tpc_num = pri_get_tpc_num(gpc_addr);
+ }
+ return 0;
+ } else if (pri_is_be_addr(addr)) {
+ *addr_type = CTXSW_ADDR_TYPE_BE;
+ if (pri_is_be_addr_shared(addr)) {
+ *broadcast_flags |= PRI_BROADCAST_FLAGS_BE;
+ return 0;
+ }
+ *be_num = pri_get_be_num(addr);
+ return 0;
+ } else {
+ *addr_type = CTXSW_ADDR_TYPE_SYS;
+ return 0;
+ }
+ /* PPC!?!?!?! */
+
+ /*NOTREACHED*/
+ return -EINVAL;
+}
+
+static int gr_gk20a_split_ppc_broadcast_addr(struct gk20a *g, u32 addr,
+ u32 gpc_num,
+ u32 *priv_addr_table, u32 *t)
+{
+ u32 ppc_num;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
+ for (ppc_num = 0; ppc_num < g->gr.pe_count_per_gpc; ppc_num++)
+ priv_addr_table[(*t)++] = pri_ppc_addr(pri_ppccs_addr_mask(addr),
+ gpc_num, ppc_num);
+
+ return 0;
+}
+
+/*
+ * The context buffer is indexed using BE broadcast addresses and GPC/TPC
+ * unicast addresses. This function will convert a BE unicast address to a BE
+ * broadcast address and split a GPC/TPC broadcast address into a table of
+ * GPC/TPC addresses. The addresses generated by this function can be
+ * successfully processed by gr_gk20a_find_priv_offset_in_buffer
+ */
+static int gr_gk20a_create_priv_addr_table(struct gk20a *g,
+ u32 addr,
+ u32 *priv_addr_table,
+ u32 *num_registers)
+{
+ int addr_type; /*enum ctxsw_addr_type */
+ u32 gpc_num, tpc_num, ppc_num, be_num;
+ u32 broadcast_flags;
+ u32 t;
+ int err;
+
+ t = 0;
+ *num_registers = 0;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
+ err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
+ &gpc_num, &tpc_num, &ppc_num, &be_num,
+ &broadcast_flags);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "addr_type = %d", addr_type);
+ if (err)
+ return err;
+
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ /* The BE broadcast registers are included in the compressed PRI
+ * table. Convert a BE unicast address to a broadcast address
+ * so that we can look up the offset. */
+ if ((addr_type == CTXSW_ADDR_TYPE_BE) &&
+ !(broadcast_flags & PRI_BROADCAST_FLAGS_BE))
+ priv_addr_table[t++] = pri_be_shared_addr(addr);
+ else
+ priv_addr_table[t++] = addr;
+
+ *num_registers = t;
+ return 0;
+ }
+
+ /* The GPC/TPC unicast registers are included in the compressed PRI
+ * tables. Convert a GPC/TPC broadcast address to unicast addresses so
+ * that we can look up the offsets. */
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_GPC) {
+ for (gpc_num = 0; gpc_num < g->gr.gpc_count; gpc_num++) {
+
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
+ for (tpc_num = 0;
+ tpc_num < g->gr.gpc_tpc_count[gpc_num];
+ tpc_num++)
+ priv_addr_table[t++] =
+ pri_tpc_addr(pri_tpccs_addr_mask(addr),
+ gpc_num, tpc_num);
+
+ else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC) {
+ err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
+ priv_addr_table, &t);
+ if (err)
+ return err;
+ } else
+ priv_addr_table[t++] =
+ pri_gpc_addr(pri_gpccs_addr_mask(addr),
+ gpc_num);
+ }
+ } else {
+ if (broadcast_flags & PRI_BROADCAST_FLAGS_TPC)
+ for (tpc_num = 0;
+ tpc_num < g->gr.gpc_tpc_count[gpc_num];
+ tpc_num++)
+ priv_addr_table[t++] =
+ pri_tpc_addr(pri_tpccs_addr_mask(addr),
+ gpc_num, tpc_num);
+ else if (broadcast_flags & PRI_BROADCAST_FLAGS_PPC)
+ err = gr_gk20a_split_ppc_broadcast_addr(g, addr, gpc_num,
+ priv_addr_table, &t);
+ else
+ priv_addr_table[t++] = addr;
+ }
+
+ *num_registers = t;
+ return 0;
+}
+
+int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
+ u32 addr,
+ u32 max_offsets,
+ u32 *offsets, u32 *offset_addrs,
+ u32 *num_offsets,
+ bool is_quad, u32 quad)
+{
+ u32 i;
+ u32 priv_offset = 0;
+ u32 *priv_registers;
+ u32 num_registers = 0;
+ int err = 0;
+ u32 potential_offsets = proj_scal_litter_num_gpcs_v() *
+ proj_scal_litter_num_tpc_per_gpc_v();
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
+ /* implementation is crossed-up if either of these happen */
+ if (max_offsets > potential_offsets)
+ return -EINVAL;
+
+ if (!g->gr.ctx_vars.golden_image_initialized)
+ return -ENODEV;
+
+ priv_registers = kzalloc(sizeof(u32) * potential_offsets, GFP_KERNEL);
+	if (!priv_registers) {
+		gk20a_dbg_fn("failed alloc for potential_offsets=%d", potential_offsets);
+		err = -ENOMEM;
+ goto cleanup;
+ }
+ memset(offsets, 0, sizeof(u32) * max_offsets);
+ memset(offset_addrs, 0, sizeof(u32) * max_offsets);
+ *num_offsets = 0;
+
+ gr_gk20a_create_priv_addr_table(g, addr, &priv_registers[0], &num_registers);
+
+ if ((max_offsets > 1) && (num_registers > max_offsets)) {
+ err = -EINVAL;
+ goto cleanup;
+ }
+
+ if ((max_offsets == 1) && (num_registers > 1))
+ num_registers = 1;
+
+ if (!g->gr.ctx_vars.local_golden_image) {
+ gk20a_dbg_fn("no context switch header info to work with");
+ err = -EINVAL;
+ goto cleanup;
+ }
+
+ for (i = 0; i < num_registers; i++) {
+ err = gr_gk20a_find_priv_offset_in_buffer(g,
+ priv_registers[i],
+ is_quad, quad,
+ g->gr.ctx_vars.local_golden_image,
+ g->gr.ctx_vars.golden_image_size,
+ &priv_offset);
+ if (err) {
+ gk20a_dbg_fn("Could not determine priv_offset for addr:0x%x",
+ addr); /*, grPriRegStr(addr)));*/
+ goto cleanup;
+ }
+
+ offsets[i] = priv_offset;
+ offset_addrs[i] = priv_registers[i];
+ }
+
+ *num_offsets = num_registers;
+
+ cleanup:
+
+	kfree(priv_registers);
+
+ return err;
+}
+
+/* Setup some register tables. This looks hacky; our
+ * register/offset functions are just that, functions.
+ * So they can't be used as initializers... TBD: fix to
+ * generate consts at least on an as-needed basis.
+ */
+static const u32 _num_ovr_perf_regs = 17;
+static u32 _ovr_perf_regs[17] = { 0, };
+/* Following are the blocks of registers that the ucode
+ stores in the extended region.*/
+/* == ctxsw_extended_sm_dsm_perf_counter_register_stride_v() ? */
+static const u32 _num_sm_dsm_perf_regs = 5;
+/* == ctxsw_extended_sm_dsm_perf_counter_control_register_stride_v() ?*/
+static const u32 _num_sm_dsm_perf_ctrl_regs = 4;
+static u32 _sm_dsm_perf_regs[5];
+static u32 _sm_dsm_perf_ctrl_regs[4];
+
+static void init_sm_dsm_reg_info(void)
+{
+ if (_ovr_perf_regs[0] != 0)
+ return;
+
+ _ovr_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r();
+ _ovr_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r();
+ _ovr_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r();
+ _ovr_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r();
+ _ovr_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r();
+ _ovr_perf_regs[5] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r();
+ _ovr_perf_regs[6] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r();
+ _ovr_perf_regs[7] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r();
+ _ovr_perf_regs[8] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r();
+ _ovr_perf_regs[9] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r();
+ _ovr_perf_regs[10] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r();
+ _ovr_perf_regs[11] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r();
+ _ovr_perf_regs[12] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r();
+ _ovr_perf_regs[13] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r();
+ _ovr_perf_regs[14] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r();
+ _ovr_perf_regs[15] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r();
+ _ovr_perf_regs[16] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r();
+
+
+ _sm_dsm_perf_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r();
+ _sm_dsm_perf_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r();
+ _sm_dsm_perf_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r();
+ _sm_dsm_perf_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r();
+ _sm_dsm_perf_regs[4] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r();
+
+ _sm_dsm_perf_ctrl_regs[0] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r();
+ _sm_dsm_perf_ctrl_regs[1] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r();
+ _sm_dsm_perf_ctrl_regs[2] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r();
+ _sm_dsm_perf_ctrl_regs[3] = gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r();
+
+}
+
+/* TBD: would like to handle this elsewhere, at a higher level.
+ * these are currently constructed in a "test-then-write" style
+ * which makes it impossible to know externally whether a ctx
+ * write will actually occur. so later we should put a lazy,
+ * map-and-hold system in the patch write state */
+int gr_gk20a_ctx_patch_smpc(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data,
+ u8 *context)
+{
+ u32 num_gpc = g->gr.gpc_count;
+ u32 num_tpc;
+ u32 tpc, gpc, reg;
+ u32 chk_addr;
+ u32 vaddr_lo;
+ u32 vaddr_hi;
+ u32 tmp;
+
+ init_sm_dsm_reg_info();
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
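+	/* if addr matches one of the per-TPC SM DSM perf override registers,
+	 * patch the value into the context image and refresh the patch
+	 * count/address words in the main context header */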
+ for (reg = 0; reg < _num_ovr_perf_regs; reg++) {
+ for (gpc = 0; gpc < num_gpc; gpc++) {
+ num_tpc = g->gr.gpc_tpc_count[gpc];
+ for (tpc = 0; tpc < num_tpc; tpc++) {
+ chk_addr = ((proj_gpc_stride_v() * gpc) +
+ (proj_tpc_in_gpc_stride_v() * tpc) +
+ _ovr_perf_regs[reg]);
+ if (chk_addr != addr)
+ continue;
+				/* reset the patch count from previous
+				   runs, if ucode has already processed
+				   it */
+ tmp = gk20a_mem_rd32(context +
+ ctxsw_prog_main_image_patch_count_o(), 0);
+
+ if (!tmp)
+ ch_ctx->patch_ctx.data_count = 0;
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx,
+ addr, data, true);
+
+ vaddr_lo = u64_lo32(ch_ctx->patch_ctx.gpu_va);
+ vaddr_hi = u64_hi32(ch_ctx->patch_ctx.gpu_va);
+
+ gk20a_mem_wr32(context +
+ ctxsw_prog_main_image_patch_count_o(),
+ 0, ch_ctx->patch_ctx.data_count);
+ gk20a_mem_wr32(context +
+ ctxsw_prog_main_image_patch_adr_lo_o(),
+ 0, vaddr_lo);
+ gk20a_mem_wr32(context +
+ ctxsw_prog_main_image_patch_adr_hi_o(),
+ 0, vaddr_hi);
+
+ /* we're not caching these on cpu side,
+ but later watch for it */
+
+ /* the l2 invalidate in the patch_write
+ * would be too early for this? */
+ gk20a_mm_l2_invalidate(g);
+ return 0;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static void gr_gk20a_access_smpc_reg(struct gk20a *g, u32 quad, u32 offset)
+{
+ u32 reg;
+ u32 quad_ctrl;
+ u32 half_ctrl;
+ u32 tpc, gpc;
+ u32 gpc_tpc_addr;
+ u32 gpc_tpc_stride;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "offset=0x%x", offset);
+
+ gpc = pri_get_gpc_num(offset);
+ gpc_tpc_addr = pri_gpccs_addr_mask(offset);
+ tpc = pri_get_tpc_num(gpc_tpc_addr);
+
+ quad_ctrl = quad & 0x1; /* first bit tells us quad */
+ half_ctrl = (quad >> 1) & 0x1; /* second bit tells us half */
+
+ gpc_tpc_stride = gpc * proj_gpc_stride_v() +
+ tpc * proj_tpc_in_gpc_stride_v();
+ gpc_tpc_addr = gr_gpc0_tpc0_sm_halfctl_ctrl_r() + gpc_tpc_stride;
+
+ reg = gk20a_readl(g, gpc_tpc_addr);
+ reg = set_field(reg,
+ gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(),
+ gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(quad_ctrl));
+
+ gk20a_writel(g, gpc_tpc_addr, reg);
+
+ gpc_tpc_addr = gr_gpc0_tpc0_sm_debug_sfe_control_r() + gpc_tpc_stride;
+ reg = gk20a_readl(g, gpc_tpc_addr);
+ reg = set_field(reg,
+ gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(),
+ gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(half_ctrl));
+ gk20a_writel(g, gpc_tpc_addr, reg);
+}
+
+#define ILLEGAL_ID (~0)
+
+static inline bool check_main_image_header_magic(void *context)
+{
+ u32 magic = gk20a_mem_rd32(context +
+ ctxsw_prog_main_image_magic_value_o(), 0);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "main image magic=0x%x", magic);
+ return magic == ctxsw_prog_main_image_magic_value_v_value_v();
+}
+static inline bool check_local_header_magic(void *context)
+{
+ u32 magic = gk20a_mem_rd32(context +
+ ctxsw_prog_local_magic_value_o(), 0);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "local magic=0x%x", magic);
+ return magic == ctxsw_prog_local_magic_value_v_value_v();
+
+}
+
+/* most likely dupe of ctxsw_gpccs_header__size_1_v() */
+static inline int ctxsw_prog_ucode_header_size_in_bytes(void)
+{
+ return 256;
+}
+
+void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
+ u32 *num_sm_dsm_perf_regs,
+ u32 **sm_dsm_perf_regs,
+ u32 *perf_register_stride)
+{
+ *num_sm_dsm_perf_regs = _num_sm_dsm_perf_regs;
+ *sm_dsm_perf_regs = _sm_dsm_perf_regs;
+ *perf_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v();
+}
+
+void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
+ u32 *num_sm_dsm_perf_ctrl_regs,
+ u32 **sm_dsm_perf_ctrl_regs,
+ u32 *ctrl_register_stride)
+{
+ *num_sm_dsm_perf_ctrl_regs = _num_sm_dsm_perf_ctrl_regs;
+ *sm_dsm_perf_ctrl_regs = _sm_dsm_perf_ctrl_regs;
+ *ctrl_register_stride = ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v();
+}
+
+static int gr_gk20a_find_priv_offset_in_ext_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset)
+{
+ u32 i, data32;
+ u32 gpc_num, tpc_num;
+ u32 num_gpcs, num_tpcs;
+ u32 chk_addr;
+ u32 ext_priv_offset, ext_priv_size;
+ void *context;
+ u32 offset_to_segment, offset_to_segment_end;
+ u32 sm_dsm_perf_reg_id = ILLEGAL_ID;
+ u32 sm_dsm_perf_ctrl_reg_id = ILLEGAL_ID;
+ u32 num_ext_gpccs_ext_buffer_segments;
+ u32 inter_seg_offset;
+ u32 tpc_gpc_mask = (proj_tpc_in_gpc_stride_v() - 1);
+ u32 max_tpc_count;
+ u32 *sm_dsm_perf_ctrl_regs = NULL;
+ u32 num_sm_dsm_perf_ctrl_regs = 0;
+ u32 *sm_dsm_perf_regs = NULL;
+ u32 num_sm_dsm_perf_regs = 0;
+ u32 buffer_segments_size = 0;
+ u32 marker_size = 0;
+ u32 control_register_stride = 0;
+ u32 perf_register_stride = 0;
+
+	/* Only TPC registers are in the extended region, so if this is not a
+	   TPC reg, return an error so the caller can look elsewhere. */
+ if (pri_is_gpc_addr(addr)) {
+ u32 gpc_addr = 0;
+ gpc_num = pri_get_gpc_num(addr);
+ gpc_addr = pri_gpccs_addr_mask(addr);
+ if (pri_is_tpc_addr(gpc_addr))
+ tpc_num = pri_get_tpc_num(gpc_addr);
+ else
+ return -EINVAL;
+
+ gk20a_dbg_info(" gpc = %d tpc = %d",
+ gpc_num, tpc_num);
+ } else
+ return -EINVAL;
+
+ buffer_segments_size = ctxsw_prog_extended_buffer_segments_size_in_bytes_v();
+ /* note below is in words/num_registers */
+ marker_size = ctxsw_prog_extended_marker_size_in_bytes_v() >> 2;
+
+ context = context_buffer;
+ /* sanity check main header */
+ if (!check_main_image_header_magic(context)) {
+ gk20a_err(dev_from_gk20a(g),
+ "Invalid main header: magic value");
+ return -EINVAL;
+ }
+ num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
+ if (gpc_num >= num_gpcs) {
+ gk20a_err(dev_from_gk20a(g),
+ "GPC 0x%08x is greater than total count 0x%08x!\n",
+ gpc_num, num_gpcs);
+ return -EINVAL;
+ }
+
+ data32 = gk20a_mem_rd32(context + ctxsw_prog_main_extended_buffer_ctl_o(), 0);
+ ext_priv_size = ctxsw_prog_main_extended_buffer_ctl_size_v(data32);
+ if (0 == ext_priv_size) {
+ gk20a_dbg_info(" No extended memory in context buffer");
+ return -EINVAL;
+ }
+ ext_priv_offset = ctxsw_prog_main_extended_buffer_ctl_offset_v(data32);
+
+ offset_to_segment = ext_priv_offset * ctxsw_prog_ucode_header_size_in_bytes();
+ offset_to_segment_end = offset_to_segment +
+ (ext_priv_size * buffer_segments_size);
+
+ /* check local header magic */
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ gk20a_err(dev_from_gk20a(g),
+ "Invalid local header: magic value\n");
+ return -EINVAL;
+ }
+
+ /*
+ * See if the incoming register address is in the first table of
+ * registers. We check this by decoding only the TPC addr portion.
+ * If we get a hit on the TPC bit, we then double check the address
+ * by computing it from the base gpc/tpc strides. Then make sure
+ * it is a real match.
+ */
+ g->ops.gr.get_sm_dsm_perf_regs(g, &num_sm_dsm_perf_regs,
+ &sm_dsm_perf_regs,
+ &perf_register_stride);
+
+ init_sm_dsm_reg_info();
+
+ for (i = 0; i < num_sm_dsm_perf_regs; i++) {
+ if ((addr & tpc_gpc_mask) == (sm_dsm_perf_regs[i] & tpc_gpc_mask)) {
+ sm_dsm_perf_reg_id = i;
+
+ gk20a_dbg_info("register match: 0x%08x",
+ sm_dsm_perf_regs[i]);
+
+ chk_addr = (proj_gpc_base_v() +
+ (proj_gpc_stride_v() * gpc_num) +
+ proj_tpc_in_gpc_base_v() +
+ (proj_tpc_in_gpc_stride_v() * tpc_num) +
+ (sm_dsm_perf_regs[sm_dsm_perf_reg_id] & tpc_gpc_mask));
+
+ if (chk_addr != addr) {
+ gk20a_err(dev_from_gk20a(g),
+					"address mismatch: 0x%08x != 0x%08x\n",
+ addr, chk_addr);
+ return -EINVAL;
+ }
+ break;
+ }
+ }
+
+	/* Didn't find the reg in supported group 1,
+	 * so try the second group now */
+ g->ops.gr.get_sm_dsm_perf_ctrl_regs(g, &num_sm_dsm_perf_ctrl_regs,
+ &sm_dsm_perf_ctrl_regs,
+ &control_register_stride);
+
+ if (ILLEGAL_ID == sm_dsm_perf_reg_id) {
+ for (i = 0; i < num_sm_dsm_perf_ctrl_regs; i++) {
+ if ((addr & tpc_gpc_mask) ==
+ (sm_dsm_perf_ctrl_regs[i] & tpc_gpc_mask)) {
+ sm_dsm_perf_ctrl_reg_id = i;
+
+ gk20a_dbg_info("register match: 0x%08x",
+ sm_dsm_perf_ctrl_regs[i]);
+
+ chk_addr = (proj_gpc_base_v() +
+ (proj_gpc_stride_v() * gpc_num) +
+ proj_tpc_in_gpc_base_v() +
+ (proj_tpc_in_gpc_stride_v() * tpc_num) +
+ (sm_dsm_perf_ctrl_regs[sm_dsm_perf_ctrl_reg_id] &
+ tpc_gpc_mask));
+
+ if (chk_addr != addr) {
+ gk20a_err(dev_from_gk20a(g),
+						"address mismatch: 0x%08x != 0x%08x\n",
+ addr, chk_addr);
+ return -EINVAL;
+
+ }
+
+ break;
+ }
+ }
+ }
+
+ if ((ILLEGAL_ID == sm_dsm_perf_ctrl_reg_id) &&
+ (ILLEGAL_ID == sm_dsm_perf_reg_id))
+ return -EINVAL;
+
+ /* Skip the FECS extended header, nothing there for us now. */
+ offset_to_segment += buffer_segments_size;
+
+ /* skip through the GPCCS extended headers until we get to the data for
+ * our GPC. The size of each gpc extended segment is enough to hold the
+	 * max tpc count for the gpcs, in 256B chunks.
+ */
+
+ max_tpc_count = proj_scal_litter_num_tpc_per_gpc_v();
+
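+	/* each gpccs extended buffer segment covers two TPCs, hence the
+	 * round-up */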
+ num_ext_gpccs_ext_buffer_segments = (u32)((max_tpc_count + 1) / 2);
+
+ offset_to_segment += (num_ext_gpccs_ext_buffer_segments *
+ buffer_segments_size * gpc_num);
+
+ num_tpcs = g->gr.gpc_tpc_count[gpc_num];
+
+ /* skip the head marker to start with */
+ inter_seg_offset = marker_size;
+
+ if (ILLEGAL_ID != sm_dsm_perf_ctrl_reg_id) {
+		/* skip over the control regs of the TPCs before the one we
+		 * want, then skip to the register in this tpc */
+ inter_seg_offset = inter_seg_offset +
+ (tpc_num * control_register_stride) +
+ sm_dsm_perf_ctrl_reg_id;
+ } else {
+ /* skip all the control registers */
+ inter_seg_offset = inter_seg_offset +
+ (num_tpcs * control_register_stride);
+
+ /* skip the marker between control and counter segments */
+ inter_seg_offset += marker_size;
+
+ /* skip over counter regs of TPCs before the one we want */
+ inter_seg_offset = inter_seg_offset +
+ (tpc_num * perf_register_stride) *
+ ctxsw_prog_extended_num_smpc_quadrants_v();
+
+ /* skip over the register for the quadrants we do not want.
+ * then skip to the register in this tpc */
+ inter_seg_offset = inter_seg_offset +
+ (perf_register_stride * quad) +
+ sm_dsm_perf_reg_id;
+ }
+
+ /* set the offset to the segment offset plus the inter segment offset to
+ * our register */
+ offset_to_segment += (inter_seg_offset * 4);
+
+ /* last sanity check: did we somehow compute an offset outside the
+ * extended buffer? */
+ if (offset_to_segment > offset_to_segment_end) {
+ gk20a_err(dev_from_gk20a(g),
+ "Overflow ctxsw buffer! 0x%08x > 0x%08x\n",
+ offset_to_segment, offset_to_segment_end);
+ return -EINVAL;
+ }
+
+ *priv_offset = offset_to_segment;
+
+ return 0;
+}
+
+
+static int
+gr_gk20a_process_context_buffer_priv_segment(struct gk20a *g,
+ int addr_type,/* enum ctxsw_addr_type */
+ u32 pri_addr,
+ u32 gpc_num, u32 num_tpcs,
+ u32 num_ppcs, u32 ppc_mask,
+ u32 *priv_offset)
+{
+ u32 i;
+ u32 address, base_address;
+ u32 sys_offset, gpc_offset, tpc_offset, ppc_offset;
+ u32 ppc_num, tpc_num, tpc_addr, gpc_addr, ppc_addr;
+ struct aiv_gk20a *reg;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "pri_addr=0x%x", pri_addr);
+
+ if (!g->gr.ctx_vars.valid)
+ return -EINVAL;
+
+ /* Process the SYS/BE segment. */
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.sys.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.sys.l[i];
+ address = reg->addr;
+ sys_offset = reg->index;
+
+ if (pri_addr == address) {
+ *priv_offset = sys_offset;
+ return 0;
+ }
+ }
+ }
+
+ /* Process the TPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_TPC) {
+ for (tpc_num = 0; tpc_num < num_tpcs; tpc_num++) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.tpc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.tpc.l[i];
+ address = reg->addr;
+ tpc_addr = pri_tpccs_addr_mask(address);
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v()) +
+ proj_tpc_in_gpc_base_v() +
+ (tpc_num * proj_tpc_in_gpc_stride_v());
+ address = base_address + tpc_addr;
+ /*
+ * The data for the TPCs is interleaved in the context buffer.
+ * Example with num_tpcs = 2
+ * 0 1 2 3 4 5 6 7 8 9 10 11 ...
+ * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
+ */
+ tpc_offset = (reg->index * num_tpcs) + (tpc_num * 4);
+
+ if (pri_addr == address) {
+ *priv_offset = tpc_offset;
+ return 0;
+ }
+ }
+ }
+ }
+
+ /* Process the PPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_PPC) {
+ for (ppc_num = 0; ppc_num < num_ppcs; ppc_num++) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.ppc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.ppc.l[i];
+ address = reg->addr;
+ ppc_addr = pri_ppccs_addr_mask(address);
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v()) +
+ proj_ppc_in_gpc_base_v() +
+ (ppc_num * proj_ppc_in_gpc_stride_v());
+ address = base_address + ppc_addr;
+ /*
+ * The data for the PPCs is interleaved in the context buffer.
+ * Example with numPpcs = 2
+ * 0 1 2 3 4 5 6 7 8 9 10 11 ...
+ * 0-0 1-0 0-1 1-1 0-2 1-2 0-3 1-3 0-4 1-4 0-5 1-5 ...
+ */
+ ppc_offset = (reg->index * num_ppcs) + (ppc_num * 4);
+
+ if (pri_addr == address) {
+ *priv_offset = ppc_offset;
+ return 0;
+ }
+ }
+ }
+ }
+
+
+ /* Process the GPC segment. */
+ if (addr_type == CTXSW_ADDR_TYPE_GPC) {
+ for (i = 0; i < g->gr.ctx_vars.ctxsw_regs.gpc.count; i++) {
+ reg = &g->gr.ctx_vars.ctxsw_regs.gpc.l[i];
+
+ address = reg->addr;
+ gpc_addr = pri_gpccs_addr_mask(address);
+ gpc_offset = reg->index;
+
+ base_address = proj_gpc_base_v() +
+ (gpc_num * proj_gpc_stride_v());
+ address = base_address + gpc_addr;
+
+ if (pri_addr == address) {
+ *priv_offset = gpc_offset;
+ return 0;
+ }
+ }
+ }
+
+ return -EINVAL;
+}
+
+static int gr_gk20a_determine_ppc_configuration(struct gk20a *g,
+ void *context,
+ u32 *num_ppcs, u32 *ppc_mask,
+ u32 *reg_ppc_count)
+{
+ u32 data32;
+ u32 litter_num_pes_per_gpc = proj_scal_litter_num_pes_per_gpc_v();
+
+ /*
+ * if there is only 1 PES_PER_GPC, then we put the PES registers
+ * in the GPC reglist, so we can't error out if ppc.count == 0
+ */
+ if ((!g->gr.ctx_vars.valid) ||
+ ((g->gr.ctx_vars.ctxsw_regs.ppc.count == 0) &&
+ (litter_num_pes_per_gpc > 1)))
+ return -EINVAL;
+
+ data32 = gk20a_mem_rd32(context + ctxsw_prog_local_image_ppc_info_o(), 0);
+
+ *num_ppcs = ctxsw_prog_local_image_ppc_info_num_ppcs_v(data32);
+ *ppc_mask = ctxsw_prog_local_image_ppc_info_ppc_mask_v(data32);
+
+ *reg_ppc_count = g->gr.ctx_vars.ctxsw_regs.ppc.count;
+
+ return 0;
+}
+
+
+
+/*
+ * This function will return the 32 bit offset for a priv register if it is
+ * present in the context buffer.
+ */
+static int gr_gk20a_find_priv_offset_in_buffer(struct gk20a *g,
+ u32 addr,
+ bool is_quad, u32 quad,
+ u32 *context_buffer,
+ u32 context_buffer_size,
+ u32 *priv_offset)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 i, data32;
+ int err;
+ int addr_type; /*enum ctxsw_addr_type */
+ u32 broadcast_flags;
+ u32 gpc_num, tpc_num, ppc_num, be_num;
+ u32 num_gpcs, num_tpcs, num_ppcs;
+ u32 offset;
+ u32 sys_priv_offset, gpc_priv_offset;
+ u32 ppc_mask, reg_list_ppc_count;
+ void *context;
+ u32 offset_to_segment;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "addr=0x%x", addr);
+
+ err = gr_gk20a_decode_priv_addr(g, addr, &addr_type,
+ &gpc_num, &tpc_num, &ppc_num, &be_num,
+ &broadcast_flags);
+ if (err)
+ return err;
+
+ context = context_buffer;
+ if (!check_main_image_header_magic(context)) {
+ gk20a_err(dev_from_gk20a(g),
+ "Invalid main header: magic value");
+ return -EINVAL;
+ }
+ num_gpcs = gk20a_mem_rd32(context + ctxsw_prog_main_image_num_gpcs_o(), 0);
+
+ /* Parse the FECS local header. */
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ gk20a_err(dev_from_gk20a(g),
+ "Invalid FECS local header: magic value\n");
+ return -EINVAL;
+ }
+ data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
+ sys_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
+
+ /* If found in Ext buffer, ok.
+ * If it failed and we expected to find it there (quad offset)
+ * then return the error. Otherwise continue on.
+ */
+ err = gr_gk20a_find_priv_offset_in_ext_buffer(g,
+ addr, is_quad, quad, context_buffer,
+ context_buffer_size, priv_offset);
+	if (!err || is_quad)
+ return err;
+
+ if ((addr_type == CTXSW_ADDR_TYPE_SYS) ||
+ (addr_type == CTXSW_ADDR_TYPE_BE)) {
+ /* Find the offset in the FECS segment. */
+ offset_to_segment = sys_priv_offset *
+ ctxsw_prog_ucode_header_size_in_bytes();
+
+ err = gr_gk20a_process_context_buffer_priv_segment(g,
+ addr_type, addr,
+ 0, 0, 0, 0,
+ &offset);
+ if (err)
+ return err;
+
+ *priv_offset = (offset_to_segment + offset);
+ return 0;
+ }
+
+ if ((gpc_num + 1) > num_gpcs) {
+ gk20a_err(dev_from_gk20a(g),
+ "GPC %d not in this context buffer.\n",
+ gpc_num);
+ return -EINVAL;
+ }
+
+ /* Parse the GPCCS local header(s).*/
+ for (i = 0; i < num_gpcs; i++) {
+ context += ctxsw_prog_ucode_header_size_in_bytes();
+ if (!check_local_header_magic(context)) {
+ gk20a_err(dev_from_gk20a(g),
+ "Invalid GPCCS local header: magic value\n");
+ return -EINVAL;
+
+ }
+ data32 = gk20a_mem_rd32(context + ctxsw_prog_local_priv_register_ctl_o(), 0);
+ gpc_priv_offset = ctxsw_prog_local_priv_register_ctl_offset_v(data32);
+
+ err = gr_gk20a_determine_ppc_configuration(g, context,
+ &num_ppcs, &ppc_mask,
+ &reg_list_ppc_count);
+ if (err)
+ return err;
+
+ num_tpcs = gk20a_mem_rd32(context + ctxsw_prog_local_image_num_tpcs_o(), 0);
+
+ if ((i == gpc_num) && ((tpc_num + 1) > num_tpcs)) {
+ gk20a_err(dev_from_gk20a(g),
+ "GPC %d TPC %d not in this context buffer.\n",
+ gpc_num, tpc_num);
+ return -EINVAL;
+ }
+
+ /* Find the offset in the GPCCS segment.*/
+ if (i == gpc_num) {
+ offset_to_segment = gpc_priv_offset *
+ ctxsw_prog_ucode_header_size_in_bytes();
+
+ if (addr_type == CTXSW_ADDR_TYPE_TPC) {
+ /*reg = gr->ctx_vars.ctxsw_regs.tpc.l;*/
+ } else if (addr_type == CTXSW_ADDR_TYPE_PPC) {
+ /* The ucode stores TPC data before PPC data.
+ * Advance offset past TPC data to PPC data. */
+ offset_to_segment +=
+ ((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2);
+ } else if (addr_type == CTXSW_ADDR_TYPE_GPC) {
+ /* The ucode stores TPC/PPC data before GPC data.
+ * Advance offset past TPC/PPC data to GPC data. */
+ /* note 1 PES_PER_GPC case */
+ u32 litter_num_pes_per_gpc =
+ proj_scal_litter_num_pes_per_gpc_v();
+ if (litter_num_pes_per_gpc > 1) {
+ offset_to_segment +=
+ (((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2) +
+ ((reg_list_ppc_count * num_ppcs) << 2));
+ } else {
+ offset_to_segment +=
+ ((gr->ctx_vars.ctxsw_regs.tpc.count *
+ num_tpcs) << 2);
+ }
+ } else {
+ gk20a_err(dev_from_gk20a(g),
+ " Unknown address type.\n");
+ return -EINVAL;
+ }
+ err = gr_gk20a_process_context_buffer_priv_segment(g,
+ addr_type, addr,
+ i, num_tpcs,
+ num_ppcs, ppc_mask,
+ &offset);
+ if (err)
+ return -EINVAL;
+
+ *priv_offset = offset_to_segment + offset;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+
+int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
+ struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
+ u32 num_ctx_wr_ops, u32 num_ctx_rd_ops)
+{
+ struct gk20a *g = ch->g;
+ struct channel_ctx_gk20a *ch_ctx = &ch->ch_ctx;
+ void *ctx_ptr = NULL;
+ int curr_gr_chid, curr_gr_ctx;
+ bool ch_is_curr_ctx, restart_gr_ctxsw = false;
+ u32 i, j, offset, v;
+ u32 max_offsets = proj_scal_litter_num_gpcs_v() *
+ proj_scal_litter_num_tpc_per_gpc_v();
+ u32 *offsets = NULL;
+ u32 *offset_addrs = NULL;
+ u32 ctx_op_nr, num_ctx_ops[2] = {num_ctx_wr_ops, num_ctx_rd_ops};
+ int err, pass;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "wr_ops=%d rd_ops=%d",
+ num_ctx_wr_ops, num_ctx_rd_ops);
+
+ /* disable channel switching.
+ * at that point the hardware state can be inspected to
+ * determine if the context we're interested in is current.
+ */
+ err = gr_gk20a_disable_ctxsw(g);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g), "unable to stop gr ctxsw");
+ /* this should probably be ctx-fatal... */
+ goto cleanup;
+ }
+
+ restart_gr_ctxsw = true;
+
+ curr_gr_ctx = gk20a_readl(g, gr_fecs_current_ctx_r());
+ curr_gr_chid = gk20a_gr_get_chid_from_ctx(g, curr_gr_ctx);
+ ch_is_curr_ctx = (curr_gr_chid != -1) && (ch->hw_chid == curr_gr_chid);
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "is curr ctx=%d", ch_is_curr_ctx);
+ if (ch_is_curr_ctx) {
+ for (pass = 0; pass < 2; pass++) {
+ ctx_op_nr = 0;
+ for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
+ /* only do ctx ops and only on the right pass */
+ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
+ (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
+ ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
+ continue;
+
+ /* if this is a quad access, set up for special access */
+ if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD)
+ && g->ops.gr.access_smpc_reg)
+ g->ops.gr.access_smpc_reg(g,
+ ctx_ops[i].quad,
+ ctx_ops[i].offset);
+ offset = ctx_ops[i].offset;
+
+ if (pass == 0) { /* write pass */
+ v = gk20a_readl(g, offset);
+ v &= ~ctx_ops[i].and_n_mask_lo;
+ v |= ctx_ops[i].value_lo;
+ gk20a_writel(g, offset, v);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "direct wr: offset=0x%x v=0x%x",
+ offset, v);
+
+ if (ctx_ops[i].op == REGOP(WRITE_64)) {
+ v = gk20a_readl(g, offset + 4);
+ v &= ~ctx_ops[i].and_n_mask_hi;
+ v |= ctx_ops[i].value_hi;
+ gk20a_writel(g, offset + 4, v);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "direct wr: offset=0x%x v=0x%x",
+ offset + 4, v);
+ }
+
+ } else { /* read pass */
+ ctx_ops[i].value_lo =
+ gk20a_readl(g, offset);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "direct rd: offset=0x%x v=0x%x",
+ offset, ctx_ops[i].value_lo);
+
+ if (ctx_ops[i].op == REGOP(READ_64)) {
+ ctx_ops[i].value_hi =
+ gk20a_readl(g, offset + 4);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "direct rd: offset=0x%x v=0x%x",
+ offset + 4, ctx_ops[i].value_hi);
+ } else
+ ctx_ops[i].value_hi = 0;
+ }
+ ctx_op_nr++;
+ }
+ }
+ goto cleanup;
+ }
+
+ /* they're the same size, so just use one alloc for both */
+ offsets = kzalloc(2 * sizeof(u32) * max_offsets, GFP_KERNEL);
+ if (!offsets) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+ offset_addrs = offsets + max_offsets;
+
+ /* This would have been a variant of gr_gk20a_apply_instmem_overrides,
+ * but it was recoded in-place instead. */
+ ctx_ptr = vmap(ch_ctx->gr_ctx.pages,
+ PAGE_ALIGN(ch_ctx->gr_ctx.size) >> PAGE_SHIFT,
+ 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!ctx_ptr) {
+ err = -ENOMEM;
+ goto cleanup;
+ }
+
+ /* The channel gr_ctx buffer is gpu cacheable, so flush and invalidate.
+ * There should be no on-going/in-flight references by the gpu now. */
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ /* Write to the appropriate place in the context image;
+ * first we have to figure out where that really is. */
+
+ /* first pass is writes, second reads */
+ for (pass = 0; pass < 2; pass++) {
+ ctx_op_nr = 0;
+ for (i = 0; (ctx_op_nr < num_ctx_ops[pass]) && (i < num_ops); ++i) {
+ u32 num_offsets;
+
+ /* only do ctx ops and only on the right pass */
+ if ((ctx_ops[i].type == REGOP(TYPE_GLOBAL)) ||
+ (((pass == 0) && reg_op_is_read(ctx_ops[i].op)) ||
+ ((pass == 1) && !reg_op_is_read(ctx_ops[i].op))))
+ continue;
+
+ err = gr_gk20a_get_ctx_buffer_offsets(g,
+ ctx_ops[i].offset,
+ max_offsets,
+ offsets, offset_addrs,
+ &num_offsets,
+ ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD),
+ ctx_ops[i].quad);
+ if (err) {
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "ctx op invalid offset: offset=0x%x",
+ ctx_ops[i].offset);
+ ctx_ops[i].status =
+ NVHOST_DBG_GPU_REG_OP_STATUS_INVALID_OFFSET;
+ continue;
+ }
+
+ /* if this is a quad access, set up for special access */
+ if (ctx_ops[i].type == REGOP(TYPE_GR_CTX_QUAD) &&
+ g->ops.gr.access_smpc_reg)
+ g->ops.gr.access_smpc_reg(g, ctx_ops[i].quad,
+ ctx_ops[i].offset);
+
+ for (j = 0; j < num_offsets; j++) {
+ /* sanity check: don't write outside the golden context image */
+ if (offsets[j] >= g->gr.ctx_vars.golden_image_size)
+ continue;
+ if (pass == 0) { /* write pass */
+ v = gk20a_mem_rd32(ctx_ptr + offsets[j], 0);
+ v &= ~ctx_ops[i].and_n_mask_lo;
+ v |= ctx_ops[i].value_lo;
+ gk20a_mem_wr32(ctx_ptr + offsets[j], 0, v);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "context wr: offset=0x%x v=0x%x",
+ offsets[j], v);
+
+ if (ctx_ops[i].op == REGOP(WRITE_64)) {
+ v = gk20a_mem_rd32(ctx_ptr + offsets[j] + 4, 0);
+ v &= ~ctx_ops[i].and_n_mask_hi;
+ v |= ctx_ops[i].value_hi;
+ gk20a_mem_wr32(ctx_ptr + offsets[j] + 4, 0, v);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "context wr: offset=0x%x v=0x%x",
+ offsets[j] + 4, v);
+ }
+
+ /* check to see if we need to add a special WAR
+ for some of the SMPC perf regs */
+ gr_gk20a_ctx_patch_smpc(g, ch_ctx, offset_addrs[j],
+ v, ctx_ptr);
+
+ } else { /* read pass */
+ ctx_ops[i].value_lo =
+ gk20a_mem_rd32(ctx_ptr + offsets[0], 0);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg, "context rd: offset=0x%x v=0x%x",
+ offsets[0], ctx_ops[i].value_lo);
+
+ if (ctx_ops[i].op == REGOP(READ_64)) {
+ ctx_ops[i].value_hi =
+ gk20a_mem_rd32(ctx_ptr + offsets[0] + 4, 0);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg,
+ "context rd: offset=0x%x v=0x%x",
+ offsets[0] + 4, ctx_ops[i].value_hi);
+ } else
+ ctx_ops[i].value_hi = 0;
+ }
+ }
+ ctx_op_nr++;
+ }
+ }
+#if 0
+ /* flush cpu caches for the ctx buffer? only if cpu cached, of course.
+ * they aren't, yet */
+ if (cached) {
+ FLUSH_CPU_DCACHE(ctx_ptr,
+ sg_phys(ch_ctx->gr_ctx.mem.ref), size);
+ }
+#endif
+
+ cleanup:
+ if (offsets)
+ kfree(offsets);
+
+ if (ctx_ptr)
+ vunmap(ctx_ptr);
+
+ if (restart_gr_ctxsw) {
+ int tmp_err = gr_gk20a_enable_ctxsw(g);
+ if (tmp_err) {
+ gk20a_err(dev_from_gk20a(g), "unable to restart ctxsw!\n");
+ err = tmp_err;
+ }
+ }
+
+ return err;
+}
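A minimal usage sketch may help here (editorial, not part of the patch): it shows how a debugger-side caller could drive gr_gk20a_exec_ctx_ops() with a single 64-bit context read. The field names follow the usage in the function above; REGOP(TYPE_GR_CTX) is an assumed name for the non-global, non-quad context op type, and `ch` is assumed to be a channel already bound to a graphics context.

/* Sketch only: read one 64-bit register from a channel's gr context via the
 * reg-op interface implemented above.  REGOP(TYPE_GR_CTX) is an assumed name. */
static int example_read_ctx_reg64(struct channel_gk20a *ch, u32 reg_offset, u64 *out)
{
	struct nvhost_dbg_gpu_reg_op op = {
		.op     = REGOP(READ_64),
		.type   = REGOP(TYPE_GR_CTX),	/* assumed constant */
		.offset = reg_offset,
	};
	int err;

	/* one op total: zero context writes, one context read */
	err = gr_gk20a_exec_ctx_ops(ch, &op, 1, 0, 1);
	if (err)
		return err;

	*out = ((u64)op.value_hi << 32) | op.value_lo;
	return 0;
}

On success, value_lo and value_hi hold the register contents whether the op was serviced directly against the hardware (channel currently resident) or patched out of the context image.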
+
+static void gr_gk20a_cb_size_default(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+
+ gr->attrib_cb_default_size =
+ gr_gpc0_ppc0_cbm_cfg_size_default_v();
+ gr->alpha_cb_default_size =
+ gr_gpc0_ppc0_cbm_cfg2_size_default_v();
+}
+
+static int gr_gk20a_calc_global_ctx_buffer_size(struct gk20a *g)
+{
+ struct gr_gk20a *gr = &g->gr;
+ int size;
+
+ gr->attrib_cb_size = gr->attrib_cb_default_size;
+ gr->alpha_cb_size = gr->alpha_cb_default_size
+ + (gr->alpha_cb_default_size >> 1);
+
+ size = gr->attrib_cb_size *
+ gr_gpc0_ppc0_cbm_cfg_size_granularity_v() *
+ gr->max_tpc_count;
+
+ size += gr->alpha_cb_size *
+ gr_gpc0_ppc0_cbm_cfg2_size_granularity_v() *
+ gr->max_tpc_count;
+
+ return size;
+}
+
+void gr_gk20a_commit_global_pagepool(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, u32 size, bool patch)
+{
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_base_r(),
+ gr_scc_pagepool_base_addr_39_8_f(addr), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_scc_pagepool_r(),
+ gr_scc_pagepool_total_pages_f(size) |
+ gr_scc_pagepool_valid_true_f(), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_base_r(),
+ gr_gpcs_gcc_pagepool_base_addr_39_8_f(addr), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_gpcs_gcc_pagepool_r(),
+ gr_gpcs_gcc_pagepool_total_pages_f(size), patch);
+
+ gr_gk20a_ctx_patch_write(g, ch_ctx, gr_pd_pagepool_r(),
+ gr_pd_pagepool_total_pages_f(size) |
+ gr_pd_pagepool_valid_true_f(), patch);
+}
+
+void gk20a_init_gr(struct gpu_ops *gops)
+{
+ gops->gr.access_smpc_reg = gr_gk20a_access_smpc_reg;
+ gops->gr.bundle_cb_defaults = gr_gk20a_bundle_cb_defaults;
+ gops->gr.cb_size_default = gr_gk20a_cb_size_default;
+ gops->gr.calc_global_ctx_buffer_size =
+ gr_gk20a_calc_global_ctx_buffer_size;
+ gops->gr.commit_global_attrib_cb = gr_gk20a_commit_global_attrib_cb;
+ gops->gr.commit_global_bundle_cb = gr_gk20a_commit_global_bundle_cb;
+ gops->gr.commit_global_cb_manager = gr_gk20a_commit_global_cb_manager;
+ gops->gr.commit_global_pagepool = gr_gk20a_commit_global_pagepool;
+ gops->gr.handle_sw_method = gr_gk20a_handle_sw_method;
+ gops->gr.set_alpha_circular_buffer_size =
+ gk20a_gr_set_alpha_circular_buffer_size;
+ gops->gr.set_circular_buffer_size =
+ gk20a_gr_set_circular_buffer_size;
+ gops->gr.enable_hww_exceptions = gr_gk20a_enable_hww_exceptions;
+ gops->gr.is_valid_class = gr_gk20a_is_valid_class;
+ gops->gr.get_sm_dsm_perf_regs = gr_gk20a_get_sm_dsm_perf_regs;
+ gops->gr.get_sm_dsm_perf_ctrl_regs = gr_gk20a_get_sm_dsm_perf_ctrl_regs;
+ gops->gr.init_fs_state = gr_gk20a_ctx_state_floorsweep;
+ gops->gr.set_hww_esr_report_mask = gr_gk20a_set_hww_esr_report_mask;
+ gops->gr.setup_alpha_beta_tables = gr_gk20a_setup_alpha_beta_tables;
+}
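For orientation, a hedged sketch (not part of the patch) of how these per-chip hooks are consumed at runtime: callers dispatch through g->ops.gr and NULL-check optional entries, exactly as gr_gk20a_exec_ctx_ops() does with access_smpc_reg above.

/* Illustrative only: indirect dispatch through the ops table populated by
 * gk20a_init_gr().  Optional hooks are NULL-checked before the call. */
static void example_smpc_quad_setup(struct gk20a *g, u32 quad, u32 offset)
{
	if (g->ops.gr.access_smpc_reg)
		g->ops.gr.access_smpc_reg(g, quad, offset);
}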
diff --git a/drivers/gpu/nvgpu/gk20a/gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
new file mode 100644
index 000000000000..7eb2923ab2c3
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_gk20a.h
@@ -0,0 +1,406 @@
+/*
+ * GK20A Graphics Engine
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __GR_GK20A_H__
+#define __GR_GK20A_H__
+
+#include <linux/slab.h>
+
+#include "gr_ctx_gk20a.h"
+
+#define GR_IDLE_CHECK_DEFAULT 100 /* usec */
+#define GR_IDLE_CHECK_MAX 5000 /* usec */
+
+#define INVALID_SCREEN_TILE_ROW_OFFSET 0xFFFFFFFF
+#define INVALID_MAX_WAYS 0xFFFFFFFF
+
+#define GK20A_FECS_UCODE_IMAGE "fecs.bin"
+#define GK20A_GPCCS_UCODE_IMAGE "gpccs.bin"
+
+enum /* global_ctx_buffer */ {
+ CIRCULAR = 0,
+ PAGEPOOL = 1,
+ ATTRIBUTE = 2,
+ CIRCULAR_VPR = 3,
+ PAGEPOOL_VPR = 4,
+ ATTRIBUTE_VPR = 5,
+ GOLDEN_CTX = 6,
+ PRIV_ACCESS_MAP = 7,
+ NR_GLOBAL_CTX_BUF = 8
+};
+
+/* either ATTRIBUTE or ATTRIBUTE_VPR maps to ATTRIBUTE_VA */
+enum /*global_ctx_buffer_va */ {
+ CIRCULAR_VA = 0,
+ PAGEPOOL_VA = 1,
+ ATTRIBUTE_VA = 2,
+ GOLDEN_CTX_VA = 3,
+ PRIV_ACCESS_MAP_VA = 4,
+ NR_GLOBAL_CTX_BUF_VA = 5
+};
+
+enum {
+ WAIT_UCODE_LOOP,
+ WAIT_UCODE_TIMEOUT,
+ WAIT_UCODE_ERROR,
+ WAIT_UCODE_OK
+};
+
+enum {
+ GR_IS_UCODE_OP_EQUAL,
+ GR_IS_UCODE_OP_NOT_EQUAL,
+ GR_IS_UCODE_OP_AND,
+ GR_IS_UCODE_OP_LESSER,
+ GR_IS_UCODE_OP_LESSER_EQUAL,
+ GR_IS_UCODE_OP_SKIP
+};
+
+enum {
+ eUcodeHandshakeInitComplete = 1,
+ eUcodeHandshakeMethodFinished
+};
+
+enum {
+ ELCG_RUN, /* clk always run, i.e. disable elcg */
+ ELCG_STOP, /* clk is stopped */
+ ELCG_AUTO /* clk will run when non-idle, standard elcg mode */
+};
+
+enum {
+ BLCG_RUN, /* clk always run, i.e. disable blcg */
+ BLCG_AUTO /* clk will run when non-idle, standard blcg mode */
+};
+
+#ifndef GR_GO_IDLE_BUNDLE
+#define GR_GO_IDLE_BUNDLE 0x0000e100 /* --V-B */
+#endif
+
+struct gr_channel_map_tlb_entry {
+ u32 curr_ctx;
+ u32 hw_chid;
+};
+
+struct gr_zcull_gk20a {
+ u32 aliquot_width;
+ u32 aliquot_height;
+ u32 aliquot_size;
+ u32 total_aliquots;
+
+ u32 width_align_pixels;
+ u32 height_align_pixels;
+ u32 pixel_squares_by_aliquots;
+};
+
+struct gr_zcull_info {
+ u32 width_align_pixels;
+ u32 height_align_pixels;
+ u32 pixel_squares_by_aliquots;
+ u32 aliquot_total;
+ u32 region_byte_multiplier;
+ u32 region_header_size;
+ u32 subregion_header_size;
+ u32 subregion_width_align_pixels;
+ u32 subregion_height_align_pixels;
+ u32 subregion_count;
+};
+
+#define GK20A_ZBC_COLOR_VALUE_SIZE 4 /* RGBA */
+
+#define GK20A_STARTOF_ZBC_TABLE 1 /* index zero reserved to indicate "not ZBCd" */
+#define GK20A_SIZEOF_ZBC_TABLE 16 /* match ltcs_ltss_dstg_zbc_index_address width (4) */
+#define GK20A_ZBC_TABLE_SIZE (16 - 1)
+
+#define GK20A_ZBC_TYPE_INVALID 0
+#define GK20A_ZBC_TYPE_COLOR 1
+#define GK20A_ZBC_TYPE_DEPTH 2
+
+struct zbc_color_table {
+ u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 format;
+ u32 ref_cnt;
+};
+
+struct zbc_depth_table {
+ u32 depth;
+ u32 format;
+ u32 ref_cnt;
+};
+
+struct zbc_entry {
+ u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 depth;
+ u32 type; /* color or depth */
+ u32 format;
+};
+
+struct zbc_query_params {
+ u32 color_ds[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 color_l2[GK20A_ZBC_COLOR_VALUE_SIZE];
+ u32 depth;
+ u32 ref_cnt;
+ u32 format;
+ u32 type; /* color or depth */
+ u32 index_size; /* [out] size, [in] index */
+};
+
+struct gr_gk20a {
+ struct gk20a *g;
+ struct {
+ bool dynamic;
+
+ u32 buffer_size;
+ u32 buffer_total_size;
+
+ bool golden_image_initialized;
+ u32 golden_image_size;
+ u32 *local_golden_image;
+
+ u32 zcull_ctxsw_image_size;
+
+ u32 buffer_header_size;
+
+ u32 priv_access_map_size;
+
+ struct gr_ucode_gk20a ucode;
+
+ struct av_list_gk20a sw_bundle_init;
+ struct av_list_gk20a sw_method_init;
+ struct aiv_list_gk20a sw_ctx_load;
+ struct av_list_gk20a sw_non_ctx_load;
+ struct {
+ struct aiv_list_gk20a sys;
+ struct aiv_list_gk20a gpc;
+ struct aiv_list_gk20a tpc;
+ struct aiv_list_gk20a zcull_gpc;
+ struct aiv_list_gk20a ppc;
+ struct aiv_list_gk20a pm_sys;
+ struct aiv_list_gk20a pm_gpc;
+ struct aiv_list_gk20a pm_tpc;
+ } ctxsw_regs;
+ int regs_base_index;
+ bool valid;
+ } ctx_vars;
+
+ struct mutex ctx_mutex; /* protect golden ctx init */
+ struct mutex fecs_mutex; /* protect fecs method */
+
+#define GR_NETLIST_DYNAMIC -1
+#define GR_NETLIST_STATIC_A 'A'
+ int netlist;
+
+ int initialized;
+ u32 num_fbps;
+
+ u32 max_gpc_count;
+ u32 max_fbps_count;
+ u32 max_tpc_per_gpc_count;
+ u32 max_zcull_per_gpc_count;
+ u32 max_tpc_count;
+
+ u32 sys_count;
+ u32 gpc_count;
+ u32 pe_count_per_gpc;
+ u32 ppc_count;
+ u32 *gpc_ppc_count;
+ u32 tpc_count;
+ u32 *gpc_tpc_count;
+ u32 zcb_count;
+ u32 *gpc_zcb_count;
+ u32 *pes_tpc_count[2];
+ u32 *pes_tpc_mask[2];
+ u32 *gpc_skip_mask;
+
+ u32 bundle_cb_default_size;
+ u32 min_gpm_fifo_depth;
+ u32 bundle_cb_token_limit;
+ u32 attrib_cb_default_size;
+ u32 attrib_cb_size;
+ u32 alpha_cb_default_size;
+ u32 alpha_cb_size;
+ u32 timeslice_mode;
+
+ struct gr_ctx_buffer_desc global_ctx_buffer[NR_GLOBAL_CTX_BUF];
+
+ struct mmu_desc mmu_wr_mem;
+ u32 mmu_wr_mem_size;
+ struct mmu_desc mmu_rd_mem;
+ u32 mmu_rd_mem_size;
+
+ u8 *map_tiles;
+ u32 map_tile_count;
+ u32 map_row_offset;
+
+#define COMP_TAG_LINE_SIZE_SHIFT (17) /* one tag covers 128K */
+#define COMP_TAG_LINE_SIZE (1 << COMP_TAG_LINE_SIZE_SHIFT)
+
+ u32 max_comptag_mem; /* max memory size (MB) for comptag */
+ struct compbit_store_desc compbit_store;
+ struct gk20a_allocator comp_tags;
+
+ struct gr_zcull_gk20a zcull;
+
+ struct zbc_color_table zbc_col_tbl[GK20A_ZBC_TABLE_SIZE];
+ struct zbc_depth_table zbc_dep_tbl[GK20A_ZBC_TABLE_SIZE];
+
+ s32 max_default_color_index;
+ s32 max_default_depth_index;
+
+ s32 max_used_color_index;
+ s32 max_used_depth_index;
+
+ u32 status_disable_mask;
+
+#define GR_CHANNEL_MAP_TLB_SIZE 2 /* must be a power of 2 */
+ struct gr_channel_map_tlb_entry chid_tlb[GR_CHANNEL_MAP_TLB_SIZE];
+ u32 channel_tlb_flush_index;
+ spinlock_t ch_tlb_lock;
+
+ void (*remove_support)(struct gr_gk20a *gr);
+ bool sw_ready;
+ bool skip_ucode_init;
+};
+
+void gk20a_fecs_dump_falcon_stats(struct gk20a *g);
+
+struct gk20a_ctxsw_ucode_segment {
+ u32 offset;
+ u32 size;
+};
+
+struct gk20a_ctxsw_ucode_segments {
+ u32 boot_entry;
+ u32 boot_imem_offset;
+ struct gk20a_ctxsw_ucode_segment boot;
+ struct gk20a_ctxsw_ucode_segment code;
+ struct gk20a_ctxsw_ucode_segment data;
+};
+
+struct gk20a_ctxsw_ucode_info {
+ u64 *p_va;
+ struct inst_desc inst_blk_desc;
+ struct surface_mem_desc surface_desc;
+ u64 ucode_gpuva;
+ struct gk20a_ctxsw_ucode_segments fecs;
+ struct gk20a_ctxsw_ucode_segments gpccs;
+};
+
+struct gk20a_ctxsw_bootloader_desc {
+ u32 start_offset;
+ u32 size;
+ u32 imem_offset;
+ u32 entry_point;
+};
+
+struct gpu_ops;
+void gk20a_init_gr(struct gpu_ops *gops);
+int gk20a_init_gr_support(struct gk20a *g);
+void gk20a_gr_reset(struct gk20a *g);
+
+int gk20a_init_gr_channel(struct channel_gk20a *ch_gk20a);
+
+int gr_gk20a_init_ctx_vars(struct gk20a *g, struct gr_gk20a *gr);
+
+struct nvhost_alloc_obj_ctx_args;
+struct nvhost_free_obj_ctx_args;
+
+int gk20a_alloc_obj_ctx(struct channel_gk20a *c,
+ struct nvhost_alloc_obj_ctx_args *args);
+int gk20a_free_obj_ctx(struct channel_gk20a *c,
+ struct nvhost_free_obj_ctx_args *args);
+void gk20a_free_channel_ctx(struct channel_gk20a *c);
+
+int gk20a_gr_isr(struct gk20a *g);
+int gk20a_gr_nonstall_isr(struct gk20a *g);
+
+/* zcull */
+u32 gr_gk20a_get_ctxsw_zcull_size(struct gk20a *g, struct gr_gk20a *gr);
+int gr_gk20a_bind_ctxsw_zcull(struct gk20a *g, struct gr_gk20a *gr,
+ struct channel_gk20a *c, u64 zcull_va, u32 mode);
+int gr_gk20a_get_zcull_info(struct gk20a *g, struct gr_gk20a *gr,
+ struct gr_zcull_info *zcull_params);
+/* zbc */
+int gr_gk20a_add_zbc(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *zbc_val);
+int gr_gk20a_query_zbc(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_query_params *query_params);
+int gk20a_gr_zbc_set_table(struct gk20a *g, struct gr_gk20a *gr,
+ struct zbc_entry *zbc_val);
+int gr_gk20a_clear_zbc_table(struct gk20a *g, struct gr_gk20a *gr);
+int gr_gk20a_load_zbc_default_table(struct gk20a *g, struct gr_gk20a *gr);
+
+/* pmu */
+int gr_gk20a_fecs_get_reglist_img_size(struct gk20a *g, u32 *size);
+int gr_gk20a_fecs_set_reglist_bind_inst(struct gk20a *g, phys_addr_t addr);
+int gr_gk20a_fecs_set_reglist_virual_addr(struct gk20a *g, u64 pmu_va);
+
+void gr_gk20a_init_elcg_mode(struct gk20a *g, u32 mode, u32 engine);
+void gr_gk20a_init_blcg_mode(struct gk20a *g, u32 mode, u32 engine);
+
+/* sm */
+bool gk20a_gr_sm_debugger_attached(struct gk20a *g);
+
+#define gr_gk20a_elpg_protected_call(g, func) \
+ ({ \
+ int err; \
+ if (support_gk20a_pmu()) \
+ gk20a_pmu_disable_elpg(g); \
+ err = func; \
+ if (support_gk20a_pmu()) \
+ gk20a_pmu_enable_elpg(g); \
+ err; \
+ })
+
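A short usage sketch (editorial, not from the patch): the macro is a GCC statement expression, so it yields the wrapped call's return value while ELPG is held off around it. `g` and `size` are assumed locals in the caller.

/* Sketch: query the FECS reglist image size with ELPG disabled for the call. */
u32 size;
int ret = gr_gk20a_elpg_protected_call(g,
		gr_gk20a_fecs_get_reglist_img_size(g, &size));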
+int gk20a_gr_suspend(struct gk20a *g);
+
+struct nvhost_dbg_gpu_reg_op;
+int gr_gk20a_exec_ctx_ops(struct channel_gk20a *ch,
+ struct nvhost_dbg_gpu_reg_op *ctx_ops, u32 num_ops,
+ u32 num_ctx_wr_ops, u32 num_ctx_rd_ops);
+int gr_gk20a_get_ctx_buffer_offsets(struct gk20a *g,
+ u32 addr,
+ u32 max_offsets,
+ u32 *offsets, u32 *offset_addrs,
+ u32 *num_offsets,
+ bool is_quad, u32 quad);
+int gr_gk20a_update_smpc_ctxsw_mode(struct gk20a *g,
+ struct channel_gk20a *c,
+ bool enable_smpc_ctxsw);
+
+struct channel_ctx_gk20a;
+int gr_gk20a_ctx_patch_write(struct gk20a *g, struct channel_ctx_gk20a *ch_ctx,
+ u32 addr, u32 data, bool patch);
+int gr_gk20a_ctx_patch_write_begin(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx);
+int gr_gk20a_ctx_patch_write_end(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx);
+void gr_gk20a_commit_global_pagepool(struct gk20a *g,
+ struct channel_ctx_gk20a *ch_ctx,
+ u64 addr, u32 size, bool patch);
+void gk20a_gr_set_shader_exceptions(struct gk20a *g, u32 data);
+void gr_gk20a_enable_hww_exceptions(struct gk20a *g);
+void gr_gk20a_get_sm_dsm_perf_regs(struct gk20a *g,
+ u32 *num_sm_dsm_perf_regs,
+ u32 **sm_dsm_perf_regs,
+ u32 *perf_register_stride);
+void gr_gk20a_get_sm_dsm_perf_ctrl_regs(struct gk20a *g,
+ u32 *num_sm_dsm_perf_regs,
+ u32 **sm_dsm_perf_regs,
+ u32 *perf_register_stride);
+int gr_gk20a_setup_rop_mapping(struct gk20a *g, struct gr_gk20a *gr);
+#endif /*__GR_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h b/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h
new file mode 100644
index 000000000000..a82a1ee7caa8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/gr_pri_gk20a.h
@@ -0,0 +1,179 @@
+/*
+ * GK20A Graphics Context Pri Register Addressing
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _NVHOST_GR_PRI_GK20A_H_
+#define _NVHOST_GR_PRI_GK20A_H_
+
+/*
+ * These convenience macros are generally for use in the management/modification
+ * of the context state store for gr/compute contexts.
+ */
+
+/*
+ * GPC pri addressing
+ */
+static inline u32 pri_gpccs_addr_width(void)
+{
+ return 15; /*from where?*/
+}
+static inline u32 pri_gpccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_gpccs_addr_width()) - 1);
+}
+static inline u32 pri_gpc_addr(u32 addr, u32 gpc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) + addr;
+}
+static inline bool pri_is_gpc_addr_shared(u32 addr)
+{
+ return (addr >= proj_gpc_shared_base_v()) &&
+ (addr < proj_gpc_shared_base_v() + proj_gpc_stride_v());
+}
+static inline bool pri_is_gpc_addr(u32 addr)
+{
+ return ((addr >= proj_gpc_base_v()) &&
+ (addr < proj_gpc_base_v() +
+ proj_scal_litter_num_gpcs_v() * proj_gpc_stride_v())) ||
+ pri_is_gpc_addr_shared(addr);
+}
+static inline u32 pri_get_gpc_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_gpcs = proj_scal_litter_num_gpcs_v();
+
+ for (i = 0; i < num_gpcs; i++) {
+ start = proj_gpc_base_v() + (i * proj_gpc_stride_v());
+ if ((addr >= start) && (addr < (start + proj_gpc_stride_v())))
+ return i;
+ }
+ return 0;
+}
+/*
+ * TPC pri addressing
+ */
+static inline u32 pri_tpccs_addr_width(void)
+{
+ return 11; /* from where? */
+}
+static inline u32 pri_tpccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_tpccs_addr_width()) - 1);
+}
+static inline u32 pri_tpc_addr(u32 addr, u32 gpc, u32 tpc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
+ proj_tpc_in_gpc_base_v() + (tpc * proj_tpc_in_gpc_stride_v()) +
+ addr;
+}
+static inline bool pri_is_tpc_addr_shared(u32 addr)
+{
+ return (addr >= proj_tpc_in_gpc_shared_base_v()) &&
+ (addr < (proj_tpc_in_gpc_shared_base_v() +
+ proj_tpc_in_gpc_stride_v()));
+}
+static inline bool pri_is_tpc_addr(u32 addr)
+{
+ return ((addr >= proj_tpc_in_gpc_base_v()) &&
+ (addr < proj_tpc_in_gpc_base_v() + (proj_scal_litter_num_tpc_per_gpc_v() *
+ proj_tpc_in_gpc_stride_v())))
+ ||
+ pri_is_tpc_addr_shared(addr);
+}
+static inline u32 pri_get_tpc_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_tpcs = proj_scal_litter_num_tpc_per_gpc_v();
+
+ for (i = 0; i < num_tpcs; i++) {
+ start = proj_tpc_in_gpc_base_v() + (i * proj_tpc_in_gpc_stride_v());
+ if ((addr >= start) && (addr < (start + proj_tpc_in_gpc_stride_v())))
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * BE pri addressing
+ */
+static inline u32 pri_becs_addr_width(void)
+{
+ return 10;/* from where? */
+}
+static inline u32 pri_becs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_becs_addr_width()) - 1);
+}
+static inline bool pri_is_be_addr_shared(u32 addr)
+{
+ return (addr >= proj_rop_shared_base_v()) &&
+ (addr < proj_rop_shared_base_v() + proj_rop_stride_v());
+}
+static inline u32 pri_be_shared_addr(u32 addr)
+{
+ return proj_rop_shared_base_v() + pri_becs_addr_mask(addr);
+}
+static inline bool pri_is_be_addr(u32 addr)
+{
+ return ((addr >= proj_rop_base_v()) &&
+ (addr < proj_rop_base_v()+proj_scal_litter_num_fbps_v() * proj_rop_stride_v())) ||
+ pri_is_be_addr_shared(addr);
+}
+
+static inline u32 pri_get_be_num(u32 addr)
+{
+ u32 i, start;
+ u32 num_fbps = proj_scal_litter_num_fbps_v();
+ for (i = 0; i < num_fbps; i++) {
+ start = proj_rop_base_v() + (i * proj_rop_stride_v());
+ if ((addr >= start) && (addr < (start + proj_rop_stride_v())))
+ return i;
+ }
+ return 0;
+}
+
+/*
+ * PPC pri addressing
+ */
+static inline u32 pri_ppccs_addr_width(void)
+{
+ return 9; /* from where? */
+}
+static inline u32 pri_ppccs_addr_mask(u32 addr)
+{
+ return addr & ((1 << pri_ppccs_addr_width()) - 1);
+}
+static inline u32 pri_ppc_addr(u32 addr, u32 gpc, u32 ppc)
+{
+ return proj_gpc_base_v() + (gpc * proj_gpc_stride_v()) +
+ proj_ppc_in_gpc_base_v() + (ppc * proj_ppc_in_gpc_stride_v()) + addr;
+}
+
+enum ctxsw_addr_type {
+ CTXSW_ADDR_TYPE_SYS = 0,
+ CTXSW_ADDR_TYPE_GPC = 1,
+ CTXSW_ADDR_TYPE_TPC = 2,
+ CTXSW_ADDR_TYPE_BE = 3,
+ CTXSW_ADDR_TYPE_PPC = 4
+};
+
+#define PRI_BROADCAST_FLAGS_NONE 0
+#define PRI_BROADCAST_FLAGS_GPC BIT(0)
+#define PRI_BROADCAST_FLAGS_TPC BIT(1)
+#define PRI_BROADCAST_FLAGS_BE BIT(2)
+#define PRI_BROADCAST_FLAGS_PPC BIT(3)
+
+#endif /*_NVHOST_GR_PRI_GK20A_H_ */
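As an illustration of the helpers above (editorial sketch, not part of the patch), this composes the absolute pri address of a TPC-relative register for a given (gpc, tpc) pair; `tpc_local_reg` stands in for any TPC-relative register offset.

/* Sketch only: unicast TPC pri address for (gpc, tpc). */
static inline u32 example_tpc_unicast_addr(u32 tpc_local_reg, u32 gpc, u32 tpc)
{
	u32 addr = pri_tpc_addr(pri_tpccs_addr_mask(tpc_local_reg), gpc, tpc);

	/* pri_get_gpc_num(addr) recovers `gpc`; masking with
	 * pri_gpccs_addr_mask(addr) yields the GPC-relative part that
	 * pri_is_tpc_addr()/pri_get_tpc_num() operate on. */
	return addr;
}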
diff --git a/drivers/gpu/nvgpu/gk20a/hal.c b/drivers/gpu/nvgpu/gk20a/hal.c
new file mode 100644
index 000000000000..dea740c2da1a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal.c
@@ -0,0 +1,33 @@
+/*
+ * NVIDIA GPU HAL interface.
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "gk20a.h"
+#include "hal_gk20a.h"
+
+int gpu_init_hal(struct gk20a *g)
+{
+ u32 ver = g->gpu_characteristics.arch + g->gpu_characteristics.impl;
+ switch (ver) {
+ case GK20A_GPUID_GK20A:
+ gk20a_dbg_info("gk20a detected");
+ gk20a_init_hal(&g->ops);
+ break;
+ default:
+ gk20a_err(&g->dev->dev, "no support for %x", ver);
+ return -ENODEV;
+ }
+
+ return 0;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/hal.h b/drivers/gpu/nvgpu/gk20a/hal.h
new file mode 100644
index 000000000000..da02cf5f69d7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal.h
@@ -0,0 +1,25 @@
+/*
+ * NVIDIA GPU Hardware Abstraction Layer function definitions.
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __HAL_GPU__
+#define __HAL_GPU__
+
+#include <linux/kernel.h>
+
+struct gk20a;
+
+int gpu_init_hal(struct gk20a *g);
+
+#endif /* __HAL_GPU__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.c b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
new file mode 100644
index 000000000000..b3e9b0e6ee42
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.c
@@ -0,0 +1,50 @@
+/*
+ * drivers/video/tegra/host/gk20a/hal_gk20a.c
+ *
+ * GK20A Tegra HAL interface.
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include "hal_gk20a.h"
+#include "ltc_gk20a.h"
+#include "fb_gk20a.h"
+#include "gk20a.h"
+#include "gk20a_gating_reglist.h"
+#include "channel_gk20a.h"
+
+struct gpu_ops gk20a_ops = {
+ .clock_gating = {
+ .slcg_gr_load_gating_prod =
+ gr_gk20a_slcg_gr_load_gating_prod,
+ .slcg_perf_load_gating_prod =
+ gr_gk20a_slcg_perf_load_gating_prod,
+ .blcg_gr_load_gating_prod =
+ gr_gk20a_blcg_gr_load_gating_prod,
+ .pg_gr_load_gating_prod =
+ gr_gk20a_pg_gr_load_gating_prod,
+ .slcg_therm_load_gating_prod =
+ gr_gk20a_slcg_therm_load_gating_prod,
+ }
+};
+
+int gk20a_init_hal(struct gpu_ops *gops)
+{
+ *gops = gk20a_ops;
+ gk20a_init_ltc(gops);
+ gk20a_init_gr(gops);
+ gk20a_init_fb(gops);
+ gk20a_init_fifo(gops);
+ gops->name = "gk20a";
+
+ return 0;
+}
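A hedged sketch of the intended extension pattern (the chip name and the override below are hypothetical, not from this patch): a later chip's HAL init can start from the gk20a defaults assembled here and override only the hooks that differ.

/* Hypothetical example only. */
int gxxxb_init_hal(struct gpu_ops *gops)
{
	gk20a_init_hal(gops);			/* inherit gk20a defaults */
	gops->gr.cb_size_default = gxxxb_gr_cb_size_default;	/* hypothetical override */
	gops->name = "gxxxb";
	return 0;
}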
diff --git a/drivers/gpu/nvgpu/gk20a/hal_gk20a.h b/drivers/gpu/nvgpu/gk20a/hal_gk20a.h
new file mode 100644
index 000000000000..db77a4a75320
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hal_gk20a.h
@@ -0,0 +1,28 @@
+/*
+ * drivers/video/tegra/host/gk20a/hal_gk20a.h
+ *
+ * GK20A Hardware Abstraction Layer function definitions.
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef __HAL_GK20A__
+#define __HAL_GK20A__
+
+#include <linux/kernel.h>
+
+struct gpu_ops;
+struct gk20a;
+
+int gk20a_init_hal(struct gpu_ops *gops);
+
+#endif /* __HAL_GK20A__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
new file mode 100644
index 000000000000..ebf8a873e2cf
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_bus_gk20a.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_bus_gk20a_h_
+#define _hw_bus_gk20a_h_
+
+static inline u32 bus_bar1_block_r(void)
+{
+ return 0x00001704;
+}
+static inline u32 bus_bar1_block_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 bus_bar1_block_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 bus_bar1_block_mode_virtual_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 bus_bar1_block_ptr_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 bus_intr_0_r(void)
+{
+ return 0x00001100;
+}
+static inline u32 bus_intr_0_pri_squash_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 bus_intr_0_pri_fecserr_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 bus_intr_0_pri_timeout_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 bus_intr_en_0_r(void)
+{
+ return 0x00001140;
+}
+static inline u32 bus_intr_en_0_pri_squash_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 bus_intr_en_0_pri_fecserr_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 bus_intr_en_0_pri_timeout_m(void)
+{
+ return 0x1 << 3;
+}
+#endif
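To make the naming convention documented above concrete, here is an editorial sketch (not part of the patch) that composes a BAR1 block write from the _r()/_f()/_v() helpers; gk20a_writel() is the register-write helper used elsewhere in this patch, and `inst_pa` is a hypothetical instance-block physical address.

/* Sketch: program the BAR1 block pointer using the accessors above. */
static void example_program_bar1_block(struct gk20a *g, u64 inst_pa)
{
	u32 ptr = (u32)(inst_pa >> bus_bar1_block_ptr_shift_v());

	gk20a_writel(g, bus_bar1_block_r(),
		     bus_bar1_block_target_vid_mem_f() |
		     bus_bar1_block_mode_virtual_f() |
		     bus_bar1_block_ptr_f(ptr));
}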
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h
new file mode 100644
index 000000000000..573329f1fc2c
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ccsr_gk20a.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_ccsr_gk20a_h_
+#define _hw_ccsr_gk20a_h_
+
+static inline u32 ccsr_channel_inst_r(u32 i)
+{
+ return 0x00800000 + i*8;
+}
+static inline u32 ccsr_channel_inst__size_1_v(void)
+{
+ return 0x00000080;
+}
+static inline u32 ccsr_channel_inst_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 ccsr_channel_inst_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 ccsr_channel_inst_bind_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 ccsr_channel_inst_bind_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 ccsr_channel_r(u32 i)
+{
+ return 0x00800004 + i*8;
+}
+static inline u32 ccsr_channel__size_1_v(void)
+{
+ return 0x00000080;
+}
+static inline u32 ccsr_channel_enable_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 ccsr_channel_enable_set_f(u32 v)
+{
+ return (v & 0x1) << 10;
+}
+static inline u32 ccsr_channel_enable_set_true_f(void)
+{
+ return 0x400;
+}
+static inline u32 ccsr_channel_enable_clr_true_f(void)
+{
+ return 0x800;
+}
+static inline u32 ccsr_channel_runlist_f(u32 v)
+{
+ return (v & 0xf) << 16;
+}
+static inline u32 ccsr_channel_status_v(u32 r)
+{
+ return (r >> 24) & 0xf;
+}
+static inline u32 ccsr_channel_busy_v(u32 r)
+{
+ return (r >> 28) & 0x1;
+}
+#endif
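And the read direction of the same convention (editorial sketch, not part of the patch): gk20a_readl() is the register-read helper used elsewhere in this patch, and `chid` is a hardware channel id.

/* Sketch: check whether a channel is currently enabled. */
static bool example_channel_enabled(struct gk20a *g, u32 chid)
{
	u32 reg = gk20a_readl(g, ccsr_channel_r(chid));

	return ccsr_channel_enable_v(reg) != 0;
}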
diff --git a/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h
new file mode 100644
index 000000000000..66bf01b0e2d1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_chiplet_pwr_gk20a.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_chiplet_pwr_gk20a_h_
+#define _hw_chiplet_pwr_gk20a_h_
+
+static inline u32 chiplet_pwr_gpcs_weight_6_r(void)
+{
+ return 0x0010e018;
+}
+static inline u32 chiplet_pwr_gpcs_weight_7_r(void)
+{
+ return 0x0010e01c;
+}
+static inline u32 chiplet_pwr_gpcs_config_1_r(void)
+{
+ return 0x0010e03c;
+}
+static inline u32 chiplet_pwr_gpcs_config_1_ba_enable_yes_f(void)
+{
+ return 0x1;
+}
+static inline u32 chiplet_pwr_fbps_weight_0_r(void)
+{
+ return 0x0010e100;
+}
+static inline u32 chiplet_pwr_fbps_weight_1_r(void)
+{
+ return 0x0010e104;
+}
+static inline u32 chiplet_pwr_fbps_config_1_r(void)
+{
+ return 0x0010e13c;
+}
+static inline u32 chiplet_pwr_fbps_config_1_ba_enable_yes_f(void)
+{
+ return 0x1;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
new file mode 100644
index 000000000000..e2a4f2f21651
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ctxsw_prog_gk20a.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_ctxsw_prog_gk20a_h_
+#define _hw_ctxsw_prog_gk20a_h_
+
+static inline u32 ctxsw_prog_fecs_header_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 ctxsw_prog_main_image_num_gpcs_o(void)
+{
+ return 0x00000008;
+}
+static inline u32 ctxsw_prog_main_image_patch_count_o(void)
+{
+ return 0x00000010;
+}
+static inline u32 ctxsw_prog_main_image_patch_adr_lo_o(void)
+{
+ return 0x00000014;
+}
+static inline u32 ctxsw_prog_main_image_patch_adr_hi_o(void)
+{
+ return 0x00000018;
+}
+static inline u32 ctxsw_prog_main_image_zcull_o(void)
+{
+ return 0x0000001c;
+}
+static inline u32 ctxsw_prog_main_image_zcull_mode_no_ctxsw_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ctxsw_prog_main_image_zcull_mode_separate_buffer_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 ctxsw_prog_main_image_zcull_ptr_o(void)
+{
+ return 0x00000020;
+}
+static inline u32 ctxsw_prog_main_image_pm_o(void)
+{
+ return 0x00000028;
+}
+static inline u32 ctxsw_prog_main_image_pm_mode_m(void)
+{
+ return 0x7 << 0;
+}
+static inline u32 ctxsw_prog_main_image_pm_mode_v(u32 r)
+{
+ return (r >> 0) & 0x7;
+}
+static inline u32 ctxsw_prog_main_image_pm_mode_no_ctxsw_f(void)
+{
+ return 0x0;
+}
+static inline u32 ctxsw_prog_main_image_pm_smpc_mode_m(void)
+{
+ return 0x7 << 3;
+}
+static inline u32 ctxsw_prog_main_image_pm_smpc_mode_v(u32 r)
+{
+ return (r >> 3) & 0x7;
+}
+static inline u32 ctxsw_prog_main_image_pm_smpc_mode_no_ctxsw_f(void)
+{
+ return 0x0;
+}
+static inline u32 ctxsw_prog_main_image_pm_smpc_mode_ctxsw_f(void)
+{
+ return 0x8;
+}
+static inline u32 ctxsw_prog_main_image_pm_ptr_o(void)
+{
+ return 0x0000002c;
+}
+static inline u32 ctxsw_prog_main_image_num_save_ops_o(void)
+{
+ return 0x000000f4;
+}
+static inline u32 ctxsw_prog_main_image_num_restore_ops_o(void)
+{
+ return 0x000000f8;
+}
+static inline u32 ctxsw_prog_main_image_magic_value_o(void)
+{
+ return 0x000000fc;
+}
+static inline u32 ctxsw_prog_main_image_magic_value_v_value_v(void)
+{
+ return 0x600dc0de;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_config_o(void)
+{
+ return 0x000000a0;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_allow_all_f(void)
+{
+ return 0x0;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_allow_none_f(void)
+{
+ return 0x1;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_config_mode_use_map_f(void)
+{
+ return 0x2;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_addr_lo_o(void)
+{
+ return 0x000000a4;
+}
+static inline u32 ctxsw_prog_main_image_priv_access_map_addr_hi_o(void)
+{
+ return 0x000000a8;
+}
+static inline u32 ctxsw_prog_main_image_misc_options_o(void)
+{
+ return 0x0000003c;
+}
+static inline u32 ctxsw_prog_main_image_misc_options_verif_features_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 ctxsw_prog_main_image_misc_options_verif_features_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 ctxsw_prog_main_image_misc_options_verif_features_enabled_f(void)
+{
+ return 0x8;
+}
+static inline u32 ctxsw_prog_local_priv_register_ctl_o(void)
+{
+ return 0x0000000c;
+}
+static inline u32 ctxsw_prog_local_priv_register_ctl_offset_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 ctxsw_prog_local_image_ppc_info_o(void)
+{
+ return 0x000000f4;
+}
+static inline u32 ctxsw_prog_local_image_ppc_info_num_ppcs_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 ctxsw_prog_local_image_ppc_info_ppc_mask_v(u32 r)
+{
+ return (r >> 16) & 0xffff;
+}
+static inline u32 ctxsw_prog_local_image_num_tpcs_o(void)
+{
+ return 0x000000f8;
+}
+static inline u32 ctxsw_prog_local_magic_value_o(void)
+{
+ return 0x000000fc;
+}
+static inline u32 ctxsw_prog_local_magic_value_v_value_v(void)
+{
+ return 0xad0becab;
+}
+static inline u32 ctxsw_prog_main_extended_buffer_ctl_o(void)
+{
+ return 0x000000ec;
+}
+static inline u32 ctxsw_prog_main_extended_buffer_ctl_offset_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 ctxsw_prog_main_extended_buffer_ctl_size_v(u32 r)
+{
+ return (r >> 16) & 0xff;
+}
+static inline u32 ctxsw_prog_extended_buffer_segments_size_in_bytes_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 ctxsw_prog_extended_marker_size_in_bytes_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_register_stride_v(void)
+{
+ return 0x00000005;
+}
+static inline u32 ctxsw_prog_extended_sm_dsm_perf_counter_control_register_stride_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 ctxsw_prog_extended_num_smpc_quadrants_v(void)
+{
+ return 0x00000004;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h
new file mode 100644
index 000000000000..b7edc29d8d7e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_fb_gk20a.h
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_fb_gk20a_h_
+#define _hw_fb_gk20a_h_
+
+static inline u32 fb_mmu_ctrl_r(void)
+{
+ return 0x00100c80;
+}
+static inline u32 fb_mmu_ctrl_vm_pg_size_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 fb_mmu_ctrl_vm_pg_size_128kb_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_ctrl_pri_fifo_empty_v(u32 r)
+{
+ return (r >> 15) & 0x1;
+}
+static inline u32 fb_mmu_ctrl_pri_fifo_empty_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_ctrl_pri_fifo_space_v(u32 r)
+{
+ return (r >> 16) & 0xff;
+}
+static inline u32 fb_mmu_invalidate_pdb_r(void)
+{
+ return 0x00100cb8;
+}
+static inline u32 fb_mmu_invalidate_pdb_aperture_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_invalidate_pdb_addr_f(u32 v)
+{
+ return (v & 0xfffffff) << 4;
+}
+static inline u32 fb_mmu_invalidate_r(void)
+{
+ return 0x00100cbc;
+}
+static inline u32 fb_mmu_invalidate_all_va_true_f(void)
+{
+ return 0x1;
+}
+static inline u32 fb_mmu_invalidate_all_pdb_true_f(void)
+{
+ return 0x2;
+}
+static inline u32 fb_mmu_invalidate_trigger_s(void)
+{
+ return 1;
+}
+static inline u32 fb_mmu_invalidate_trigger_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 fb_mmu_invalidate_trigger_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 fb_mmu_invalidate_trigger_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 fb_mmu_invalidate_trigger_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 fb_mmu_debug_wr_r(void)
+{
+ return 0x00100cc8;
+}
+static inline u32 fb_mmu_debug_wr_aperture_s(void)
+{
+ return 2;
+}
+static inline u32 fb_mmu_debug_wr_aperture_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 fb_mmu_debug_wr_aperture_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 fb_mmu_debug_wr_aperture_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 fb_mmu_debug_wr_aperture_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_debug_wr_vol_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_debug_wr_vol_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fb_mmu_debug_wr_vol_true_f(void)
+{
+ return 0x4;
+}
+static inline u32 fb_mmu_debug_wr_addr_v(u32 r)
+{
+ return (r >> 4) & 0xfffffff;
+}
+static inline u32 fb_mmu_debug_wr_addr_alignment_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 fb_mmu_debug_rd_r(void)
+{
+ return 0x00100ccc;
+}
+static inline u32 fb_mmu_debug_rd_aperture_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_debug_rd_vol_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 fb_mmu_debug_rd_addr_v(u32 r)
+{
+ return (r >> 4) & 0xfffffff;
+}
+static inline u32 fb_mmu_debug_rd_addr_alignment_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 fb_mmu_debug_ctrl_r(void)
+{
+ return 0x00100cc4;
+}
+static inline u32 fb_mmu_debug_ctrl_debug_v(u32 r)
+{
+ return (r >> 16) & 0x1;
+}
+static inline u32 fb_mmu_debug_ctrl_debug_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fb_mmu_vpr_info_r(void)
+{
+ return 0x00100cd0;
+}
+static inline u32 fb_mmu_vpr_info_fetch_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 fb_mmu_vpr_info_fetch_false_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fb_mmu_vpr_info_fetch_true_v(void)
+{
+ return 0x00000001;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h
new file mode 100644
index 000000000000..a39d3c51e1ea
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_fifo_gk20a.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_fifo_gk20a_h_
+#define _hw_fifo_gk20a_h_
+
+static inline u32 fifo_bar1_base_r(void)
+{
+ return 0x00002254;
+}
+static inline u32 fifo_bar1_base_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 fifo_bar1_base_ptr_align_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 fifo_bar1_base_valid_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_bar1_base_valid_true_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 fifo_runlist_base_r(void)
+{
+ return 0x00002270;
+}
+static inline u32 fifo_runlist_base_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 fifo_runlist_base_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_runlist_r(void)
+{
+ return 0x00002274;
+}
+static inline u32 fifo_runlist_engine_f(u32 v)
+{
+ return (v & 0xf) << 20;
+}
+static inline u32 fifo_eng_runlist_base_r(u32 i)
+{
+ return 0x00002280 + i*8;
+}
+static inline u32 fifo_eng_runlist_base__size_1_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_eng_runlist_r(u32 i)
+{
+ return 0x00002284 + i*8;
+}
+static inline u32 fifo_eng_runlist__size_1_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_eng_runlist_length_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 fifo_eng_runlist_pending_true_f(void)
+{
+ return 0x100000;
+}
+static inline u32 fifo_eng_timeslice_r(u32 i)
+{
+ return 0x00002310 + i*4;
+}
+static inline u32 fifo_eng_timeslice_timeout_128_f(void)
+{
+ return 0x80;
+}
+static inline u32 fifo_eng_timeslice_timescale_3_f(void)
+{
+ return 0x3000;
+}
+static inline u32 fifo_eng_timeslice_enable_true_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 fifo_pb_timeslice_r(u32 i)
+{
+ return 0x00002350 + i*4;
+}
+static inline u32 fifo_pb_timeslice_timeout_16_f(void)
+{
+ return 0x10;
+}
+static inline u32 fifo_pb_timeslice_timescale_0_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_pb_timeslice_enable_true_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 fifo_pbdma_map_r(u32 i)
+{
+ return 0x00002390 + i*4;
+}
+static inline u32 fifo_intr_0_r(void)
+{
+ return 0x00002100;
+}
+static inline u32 fifo_intr_0_bind_error_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 fifo_intr_0_bind_error_reset_f(void)
+{
+ return 0x1;
+}
+static inline u32 fifo_intr_0_pio_error_pending_f(void)
+{
+ return 0x10;
+}
+static inline u32 fifo_intr_0_pio_error_reset_f(void)
+{
+ return 0x10;
+}
+static inline u32 fifo_intr_0_sched_error_pending_f(void)
+{
+ return 0x100;
+}
+static inline u32 fifo_intr_0_sched_error_reset_f(void)
+{
+ return 0x100;
+}
+static inline u32 fifo_intr_0_chsw_error_pending_f(void)
+{
+ return 0x10000;
+}
+static inline u32 fifo_intr_0_chsw_error_reset_f(void)
+{
+ return 0x10000;
+}
+static inline u32 fifo_intr_0_fb_flush_timeout_pending_f(void)
+{
+ return 0x800000;
+}
+static inline u32 fifo_intr_0_fb_flush_timeout_reset_f(void)
+{
+ return 0x800000;
+}
+static inline u32 fifo_intr_0_lb_error_pending_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 fifo_intr_0_lb_error_reset_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 fifo_intr_0_dropped_mmu_fault_pending_f(void)
+{
+ return 0x8000000;
+}
+static inline u32 fifo_intr_0_dropped_mmu_fault_reset_f(void)
+{
+ return 0x8000000;
+}
+static inline u32 fifo_intr_0_mmu_fault_pending_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 fifo_intr_0_pbdma_intr_pending_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 fifo_intr_0_runlist_event_pending_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 fifo_intr_0_channel_intr_pending_f(void)
+{
+ return 0x80000000;
+}
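+/*
+ * Minimal usage sketch: the _pending_f() constants double as bit masks, so
+ * a raw fifo_intr_0 value (read elsewhere with the driver's register read
+ * accessor) can be tested bit by bit, and the handled bits written back
+ * through the matching _reset_f() constants.
+ */
+static inline bool example_fifo_intr_0_has_mmu_fault(u32 intr_0)
+{
+	return (intr_0 & fifo_intr_0_mmu_fault_pending_f()) != 0;
+}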
+static inline u32 fifo_intr_en_0_r(void)
+{
+ return 0x00002140;
+}
+static inline u32 fifo_intr_en_1_r(void)
+{
+ return 0x00002528;
+}
+static inline u32 fifo_intr_bind_error_r(void)
+{
+ return 0x0000252c;
+}
+static inline u32 fifo_intr_sched_error_r(void)
+{
+ return 0x0000254c;
+}
+static inline u32 fifo_intr_sched_error_code_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 fifo_intr_sched_error_code_ctxsw_timeout_v(void)
+{
+ return 0x0000000a;
+}
+static inline u32 fifo_intr_chsw_error_r(void)
+{
+ return 0x0000256c;
+}
+static inline u32 fifo_intr_mmu_fault_id_r(void)
+{
+ return 0x0000259c;
+}
+static inline u32 fifo_intr_mmu_fault_eng_id_graphics_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_intr_mmu_fault_eng_id_graphics_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_intr_mmu_fault_inst_r(u32 i)
+{
+ return 0x00002800 + i*16;
+}
+static inline u32 fifo_intr_mmu_fault_inst_ptr_v(u32 r)
+{
+ return (r >> 0) & 0xfffffff;
+}
+static inline u32 fifo_intr_mmu_fault_inst_ptr_align_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 fifo_intr_mmu_fault_lo_r(u32 i)
+{
+ return 0x00002804 + i*16;
+}
+static inline u32 fifo_intr_mmu_fault_hi_r(u32 i)
+{
+ return 0x00002808 + i*16;
+}
+static inline u32 fifo_intr_mmu_fault_info_r(u32 i)
+{
+ return 0x0000280c + i*16;
+}
+static inline u32 fifo_intr_mmu_fault_info_type_v(u32 r)
+{
+ return (r >> 0) & 0xf;
+}
+static inline u32 fifo_intr_mmu_fault_info_engine_subid_v(u32 r)
+{
+ return (r >> 6) & 0x1;
+}
+static inline u32 fifo_intr_mmu_fault_info_engine_subid_gpc_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_intr_mmu_fault_info_engine_subid_hub_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_intr_mmu_fault_info_client_v(u32 r)
+{
+ return (r >> 8) & 0x1f;
+}
+static inline u32 fifo_intr_pbdma_id_r(void)
+{
+ return 0x000025a0;
+}
+static inline u32 fifo_intr_pbdma_id_status_f(u32 v, u32 i)
+{
+ return (v & 0x1) << (0 + i*1);
+}
+static inline u32 fifo_intr_pbdma_id_status__size_1_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_intr_runlist_r(void)
+{
+ return 0x00002a00;
+}
+static inline u32 fifo_fb_timeout_r(void)
+{
+ return 0x00002a04;
+}
+static inline u32 fifo_fb_timeout_period_m(void)
+{
+ return 0x3fffffff << 0;
+}
+static inline u32 fifo_fb_timeout_period_max_f(void)
+{
+ return 0x3fffffff;
+}
+static inline u32 fifo_pb_timeout_r(void)
+{
+ return 0x00002a08;
+}
+static inline u32 fifo_pb_timeout_detection_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 fifo_eng_timeout_r(void)
+{
+ return 0x00002a0c;
+}
+static inline u32 fifo_eng_timeout_period_m(void)
+{
+ return 0x7fffffff << 0;
+}
+static inline u32 fifo_eng_timeout_period_max_f(void)
+{
+ return 0x7fffffff;
+}
+static inline u32 fifo_eng_timeout_detection_m(void)
+{
+	return 0x1U << 31;
+}
+static inline u32 fifo_eng_timeout_detection_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 fifo_eng_timeout_detection_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_error_sched_disable_r(void)
+{
+ return 0x0000262c;
+}
+static inline u32 fifo_sched_disable_r(void)
+{
+ return 0x00002630;
+}
+static inline u32 fifo_sched_disable_runlist_f(u32 v, u32 i)
+{
+ return (v & 0x1) << (0 + i*1);
+}
+static inline u32 fifo_sched_disable_runlist_m(u32 i)
+{
+ return 0x1 << (0 + i*1);
+}
+static inline u32 fifo_sched_disable_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_preempt_r(void)
+{
+ return 0x00002634;
+}
+static inline u32 fifo_preempt_pending_true_f(void)
+{
+ return 0x100000;
+}
+static inline u32 fifo_preempt_type_channel_f(void)
+{
+ return 0x0;
+}
+static inline u32 fifo_preempt_chid_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 fifo_trigger_mmu_fault_r(u32 i)
+{
+ return 0x00002a30 + i*4;
+}
+static inline u32 fifo_trigger_mmu_fault_id_f(u32 v)
+{
+ return (v & 0x1f) << 0;
+}
+static inline u32 fifo_trigger_mmu_fault_enable_f(u32 v)
+{
+ return (v & 0x1) << 8;
+}
+static inline u32 fifo_engine_status_r(u32 i)
+{
+ return 0x00002640 + i*8;
+}
+static inline u32 fifo_engine_status__size_1_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 fifo_engine_status_id_v(u32 r)
+{
+ return (r >> 0) & 0xfff;
+}
+static inline u32 fifo_engine_status_id_type_v(u32 r)
+{
+ return (r >> 12) & 0x1;
+}
+static inline u32 fifo_engine_status_id_type_chid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_engine_status_ctx_status_v(u32 r)
+{
+ return (r >> 13) & 0x7;
+}
+static inline u32 fifo_engine_status_ctx_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_engine_status_ctx_status_ctxsw_load_v(void)
+{
+ return 0x00000005;
+}
+static inline u32 fifo_engine_status_ctx_status_ctxsw_save_v(void)
+{
+ return 0x00000006;
+}
+static inline u32 fifo_engine_status_ctx_status_ctxsw_switch_v(void)
+{
+ return 0x00000007;
+}
+static inline u32 fifo_engine_status_next_id_v(u32 r)
+{
+ return (r >> 16) & 0xfff;
+}
+static inline u32 fifo_engine_status_next_id_type_v(u32 r)
+{
+ return (r >> 28) & 0x1;
+}
+static inline u32 fifo_engine_status_next_id_type_chid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_engine_status_faulted_v(u32 r)
+{
+ return (r >> 30) & 0x1;
+}
+static inline u32 fifo_engine_status_faulted_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_engine_status_engine_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 fifo_engine_status_engine_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_engine_status_engine_busy_v(void)
+{
+ return 0x00000001;
+}
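+/*
+ * Minimal usage sketch: the _v() helpers pull individual fields out of a
+ * raw engine status word so they can be compared directly against the
+ * unshifted _..._v() constants; the status value itself is assumed to have
+ * been read from fifo_engine_status_r(i) elsewhere.
+ */
+static inline bool example_fifo_engine_is_faulted_busy(u32 status)
+{
+	return fifo_engine_status_engine_v(status) ==
+			fifo_engine_status_engine_busy_v() &&
+		fifo_engine_status_faulted_v(status) ==
+			fifo_engine_status_faulted_true_v();
+}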
+static inline u32 fifo_engine_status_ctxsw_v(u32 r)
+{
+ return (r >> 15) & 0x1;
+}
+static inline u32 fifo_engine_status_ctxsw_in_progress_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_engine_status_ctxsw_in_progress_f(void)
+{
+ return 0x8000;
+}
+static inline u32 fifo_pbdma_status_r(u32 i)
+{
+ return 0x00003080 + i*4;
+}
+static inline u32 fifo_pbdma_status__size_1_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_pbdma_status_id_v(u32 r)
+{
+ return (r >> 0) & 0xfff;
+}
+static inline u32 fifo_pbdma_status_id_type_v(u32 r)
+{
+ return (r >> 12) & 0x1;
+}
+static inline u32 fifo_pbdma_status_id_type_chid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_pbdma_status_chan_status_v(u32 r)
+{
+ return (r >> 13) & 0x7;
+}
+static inline u32 fifo_pbdma_status_chan_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 fifo_pbdma_status_chan_status_chsw_load_v(void)
+{
+ return 0x00000005;
+}
+static inline u32 fifo_pbdma_status_chan_status_chsw_save_v(void)
+{
+ return 0x00000006;
+}
+static inline u32 fifo_pbdma_status_chan_status_chsw_switch_v(void)
+{
+ return 0x00000007;
+}
+static inline u32 fifo_pbdma_status_next_id_v(u32 r)
+{
+ return (r >> 16) & 0xfff;
+}
+static inline u32 fifo_pbdma_status_next_id_type_v(u32 r)
+{
+ return (r >> 28) & 0x1;
+}
+static inline u32 fifo_pbdma_status_next_id_type_chid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 fifo_pbdma_status_chsw_v(u32 r)
+{
+ return (r >> 15) & 0x1;
+}
+static inline u32 fifo_pbdma_status_chsw_in_progress_v(void)
+{
+ return 0x00000001;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h
new file mode 100644
index 000000000000..0aeb11f92bf2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_flush_gk20a.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_flush_gk20a_h_
+#define _hw_flush_gk20a_h_
+
+static inline u32 flush_l2_system_invalidate_r(void)
+{
+ return 0x00070004;
+}
+static inline u32 flush_l2_system_invalidate_pending_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 flush_l2_system_invalidate_pending_busy_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 flush_l2_system_invalidate_pending_busy_f(void)
+{
+ return 0x1;
+}
+static inline u32 flush_l2_system_invalidate_outstanding_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 flush_l2_system_invalidate_outstanding_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 flush_l2_flush_dirty_r(void)
+{
+ return 0x00070010;
+}
+static inline u32 flush_l2_flush_dirty_pending_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 flush_l2_flush_dirty_pending_empty_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 flush_l2_flush_dirty_pending_empty_f(void)
+{
+ return 0x0;
+}
+static inline u32 flush_l2_flush_dirty_pending_busy_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 flush_l2_flush_dirty_pending_busy_f(void)
+{
+ return 0x1;
+}
+static inline u32 flush_l2_flush_dirty_outstanding_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 flush_l2_flush_dirty_outstanding_false_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 flush_l2_flush_dirty_outstanding_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 flush_l2_flush_dirty_outstanding_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 flush_fb_flush_r(void)
+{
+ return 0x00070000;
+}
+static inline u32 flush_fb_flush_pending_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 flush_fb_flush_pending_busy_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 flush_fb_flush_pending_busy_f(void)
+{
+ return 0x1;
+}
+static inline u32 flush_fb_flush_outstanding_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 flush_fb_flush_outstanding_true_v(void)
+{
+ return 0x00000001;
+}
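+/*
+ * Minimal usage sketch: flush completion is typically detected by
+ * re-reading flush_fb_flush_r() and checking the pending/outstanding
+ * fields with the _v() helpers until neither reports busy.
+ */
+static inline bool example_flush_fb_flush_done(u32 reg)
+{
+	return flush_fb_flush_pending_v(reg) !=
+			flush_fb_flush_pending_busy_v() &&
+		flush_fb_flush_outstanding_v(reg) !=
+			flush_fb_flush_outstanding_true_v();
+}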
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h
new file mode 100644
index 000000000000..e0118946aec6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_gmmu_gk20a.h
@@ -0,0 +1,1141 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_gmmu_gk20a_h_
+#define _hw_gmmu_gk20a_h_
+
+static inline u32 gmmu_pde_aperture_big_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pde_aperture_big_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pde_aperture_big_video_memory_f(void)
+{
+ return 0x1;
+}
+static inline u32 gmmu_pde_size_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pde_size_full_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pde_address_big_sys_f(u32 v)
+{
+ return (v & 0xfffffff) << 4;
+}
+static inline u32 gmmu_pde_address_big_sys_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pde_aperture_small_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pde_aperture_small_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pde_aperture_small_video_memory_f(void)
+{
+ return 0x1;
+}
+static inline u32 gmmu_pde_vol_small_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pde_vol_small_true_f(void)
+{
+ return 0x4;
+}
+static inline u32 gmmu_pde_vol_small_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pde_vol_big_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pde_vol_big_true_f(void)
+{
+ return 0x8;
+}
+static inline u32 gmmu_pde_vol_big_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pde_address_small_sys_f(u32 v)
+{
+ return (v & 0xfffffff) << 4;
+}
+static inline u32 gmmu_pde_address_small_sys_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pde_address_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 gmmu_pde__size_v(void)
+{
+ return 0x00000008;
+}
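+/*
+ * Minimal usage sketch: PDE fields are spread across two 32-bit words; the
+ * _w() helpers give the word index and the _f() helpers the shifted field
+ * values. The page-table address argument is hypothetical and assumed to
+ * be already right-shifted by gmmu_pde_address_shift_v().
+ */
+static inline u32 example_gmmu_pde_small_word1(u32 pt_addr_shifted)
+{
+	/* stored at word index gmmu_pde_address_small_sys_w() of the PDE */
+	return gmmu_pde_aperture_small_video_memory_f() |
+		gmmu_pde_vol_small_true_f() |
+		gmmu_pde_address_small_sys_f(pt_addr_shifted);
+}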
+static inline u32 gmmu_pte__size_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gmmu_pte_valid_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pte_valid_true_f(void)
+{
+ return 0x1;
+}
+static inline u32 gmmu_pte_address_sys_f(u32 v)
+{
+ return (v & 0xfffffff) << 4;
+}
+static inline u32 gmmu_pte_address_sys_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pte_vol_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_vol_true_f(void)
+{
+ return 0x1;
+}
+static inline u32 gmmu_pte_vol_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pte_aperture_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_aperture_video_memory_f(void)
+{
+ return 0x0;
+}
+static inline u32 gmmu_pte_read_only_w(void)
+{
+ return 0;
+}
+static inline u32 gmmu_pte_read_only_true_f(void)
+{
+ return 0x4;
+}
+static inline u32 gmmu_pte_write_disable_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_write_disable_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gmmu_pte_read_disable_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_read_disable_true_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gmmu_pte_comptagline_f(u32 v)
+{
+ return (v & 0x1ffff) << 12;
+}
+static inline u32 gmmu_pte_comptagline_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_address_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 gmmu_pte_kind_f(u32 v)
+{
+ return (v & 0xff) << 4;
+}
+static inline u32 gmmu_pte_kind_w(void)
+{
+ return 1;
+}
+static inline u32 gmmu_pte_kind_invalid_v(void)
+{
+ return 0x000000ff;
+}
+static inline u32 gmmu_pte_kind_pitch_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gmmu_pte_kind_z16_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gmmu_pte_kind_z16_2c_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 gmmu_pte_kind_z16_ms2_2c_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 gmmu_pte_kind_z16_ms4_2c_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 gmmu_pte_kind_z16_ms8_2c_v(void)
+{
+ return 0x00000005;
+}
+static inline u32 gmmu_pte_kind_z16_ms16_2c_v(void)
+{
+ return 0x00000006;
+}
+static inline u32 gmmu_pte_kind_z16_2z_v(void)
+{
+ return 0x00000007;
+}
+static inline u32 gmmu_pte_kind_z16_ms2_2z_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gmmu_pte_kind_z16_ms4_2z_v(void)
+{
+ return 0x00000009;
+}
+static inline u32 gmmu_pte_kind_z16_ms8_2z_v(void)
+{
+ return 0x0000000a;
+}
+static inline u32 gmmu_pte_kind_z16_ms16_2z_v(void)
+{
+ return 0x0000000b;
+}
+static inline u32 gmmu_pte_kind_z16_4cz_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 gmmu_pte_kind_z16_ms2_4cz_v(void)
+{
+ return 0x0000000d;
+}
+static inline u32 gmmu_pte_kind_z16_ms4_4cz_v(void)
+{
+ return 0x0000000e;
+}
+static inline u32 gmmu_pte_kind_z16_ms8_4cz_v(void)
+{
+ return 0x0000000f;
+}
+static inline u32 gmmu_pte_kind_z16_ms16_4cz_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 gmmu_pte_kind_s8z24_v(void)
+{
+ return 0x00000011;
+}
+static inline u32 gmmu_pte_kind_s8z24_1z_v(void)
+{
+ return 0x00000012;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms2_1z_v(void)
+{
+ return 0x00000013;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms4_1z_v(void)
+{
+ return 0x00000014;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms8_1z_v(void)
+{
+ return 0x00000015;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms16_1z_v(void)
+{
+ return 0x00000016;
+}
+static inline u32 gmmu_pte_kind_s8z24_2cz_v(void)
+{
+ return 0x00000017;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms2_2cz_v(void)
+{
+ return 0x00000018;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms4_2cz_v(void)
+{
+ return 0x00000019;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms8_2cz_v(void)
+{
+ return 0x0000001a;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms16_2cz_v(void)
+{
+ return 0x0000001b;
+}
+static inline u32 gmmu_pte_kind_s8z24_2cs_v(void)
+{
+ return 0x0000001c;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms2_2cs_v(void)
+{
+ return 0x0000001d;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms4_2cs_v(void)
+{
+ return 0x0000001e;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms8_2cs_v(void)
+{
+ return 0x0000001f;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms16_2cs_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gmmu_pte_kind_s8z24_4cszv_v(void)
+{
+ return 0x00000021;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms2_4cszv_v(void)
+{
+ return 0x00000022;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms4_4cszv_v(void)
+{
+ return 0x00000023;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms8_4cszv_v(void)
+{
+ return 0x00000024;
+}
+static inline u32 gmmu_pte_kind_s8z24_ms16_4cszv_v(void)
+{
+ return 0x00000025;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_v(void)
+{
+ return 0x00000026;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_v(void)
+{
+ return 0x00000027;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_v(void)
+{
+ return 0x00000028;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_v(void)
+{
+ return 0x00000029;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_1zv_v(void)
+{
+ return 0x0000002e;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_1zv_v(void)
+{
+ return 0x0000002f;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_1zv_v(void)
+{
+ return 0x00000030;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_1zv_v(void)
+{
+ return 0x00000031;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2cs_v(void)
+{
+ return 0x00000032;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2cs_v(void)
+{
+ return 0x00000033;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2cs_v(void)
+{
+ return 0x00000034;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2cs_v(void)
+{
+ return 0x00000035;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2czv_v(void)
+{
+ return 0x0000003a;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2czv_v(void)
+{
+ return 0x0000003b;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2czv_v(void)
+{
+ return 0x0000003c;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2czv_v(void)
+{
+ return 0x0000003d;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_2zv_v(void)
+{
+ return 0x0000003e;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_2zv_v(void)
+{
+ return 0x0000003f;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_2zv_v(void)
+{
+ return 0x00000040;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_2zv_v(void)
+{
+ return 0x00000041;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc12_4cszv_v(void)
+{
+ return 0x00000042;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms4_vc4_4cszv_v(void)
+{
+ return 0x00000043;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc8_4cszv_v(void)
+{
+ return 0x00000044;
+}
+static inline u32 gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v(void)
+{
+ return 0x00000045;
+}
+static inline u32 gmmu_pte_kind_z24s8_v(void)
+{
+ return 0x00000046;
+}
+static inline u32 gmmu_pte_kind_z24s8_1z_v(void)
+{
+ return 0x00000047;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms2_1z_v(void)
+{
+ return 0x00000048;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms4_1z_v(void)
+{
+ return 0x00000049;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms8_1z_v(void)
+{
+ return 0x0000004a;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms16_1z_v(void)
+{
+ return 0x0000004b;
+}
+static inline u32 gmmu_pte_kind_z24s8_2cs_v(void)
+{
+ return 0x0000004c;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms2_2cs_v(void)
+{
+ return 0x0000004d;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms4_2cs_v(void)
+{
+ return 0x0000004e;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms8_2cs_v(void)
+{
+ return 0x0000004f;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms16_2cs_v(void)
+{
+ return 0x00000050;
+}
+static inline u32 gmmu_pte_kind_z24s8_2cz_v(void)
+{
+ return 0x00000051;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms2_2cz_v(void)
+{
+ return 0x00000052;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms4_2cz_v(void)
+{
+ return 0x00000053;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms8_2cz_v(void)
+{
+ return 0x00000054;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms16_2cz_v(void)
+{
+ return 0x00000055;
+}
+static inline u32 gmmu_pte_kind_z24s8_4cszv_v(void)
+{
+ return 0x00000056;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms2_4cszv_v(void)
+{
+ return 0x00000057;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms4_4cszv_v(void)
+{
+ return 0x00000058;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms8_4cszv_v(void)
+{
+ return 0x00000059;
+}
+static inline u32 gmmu_pte_kind_z24s8_ms16_4cszv_v(void)
+{
+ return 0x0000005a;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_v(void)
+{
+ return 0x0000005b;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_v(void)
+{
+ return 0x0000005c;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_v(void)
+{
+ return 0x0000005d;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_v(void)
+{
+ return 0x0000005e;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_1zv_v(void)
+{
+ return 0x00000063;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_1zv_v(void)
+{
+ return 0x00000064;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_1zv_v(void)
+{
+ return 0x00000065;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_1zv_v(void)
+{
+ return 0x00000066;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2cs_v(void)
+{
+ return 0x00000067;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2cs_v(void)
+{
+ return 0x00000068;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2cs_v(void)
+{
+ return 0x00000069;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2cs_v(void)
+{
+ return 0x0000006a;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2czv_v(void)
+{
+ return 0x0000006f;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2czv_v(void)
+{
+ return 0x00000070;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2czv_v(void)
+{
+ return 0x00000071;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2czv_v(void)
+{
+ return 0x00000072;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_2zv_v(void)
+{
+ return 0x00000073;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_2zv_v(void)
+{
+ return 0x00000074;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_2zv_v(void)
+{
+ return 0x00000075;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_2zv_v(void)
+{
+ return 0x00000076;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc12_4cszv_v(void)
+{
+ return 0x00000077;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms4_vc4_4cszv_v(void)
+{
+ return 0x00000078;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc8_4cszv_v(void)
+{
+ return 0x00000079;
+}
+static inline u32 gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v(void)
+{
+ return 0x0000007a;
+}
+static inline u32 gmmu_pte_kind_zf32_v(void)
+{
+ return 0x0000007b;
+}
+static inline u32 gmmu_pte_kind_zf32_1z_v(void)
+{
+ return 0x0000007c;
+}
+static inline u32 gmmu_pte_kind_zf32_ms2_1z_v(void)
+{
+ return 0x0000007d;
+}
+static inline u32 gmmu_pte_kind_zf32_ms4_1z_v(void)
+{
+ return 0x0000007e;
+}
+static inline u32 gmmu_pte_kind_zf32_ms8_1z_v(void)
+{
+ return 0x0000007f;
+}
+static inline u32 gmmu_pte_kind_zf32_ms16_1z_v(void)
+{
+ return 0x00000080;
+}
+static inline u32 gmmu_pte_kind_zf32_2cs_v(void)
+{
+ return 0x00000081;
+}
+static inline u32 gmmu_pte_kind_zf32_ms2_2cs_v(void)
+{
+ return 0x00000082;
+}
+static inline u32 gmmu_pte_kind_zf32_ms4_2cs_v(void)
+{
+ return 0x00000083;
+}
+static inline u32 gmmu_pte_kind_zf32_ms8_2cs_v(void)
+{
+ return 0x00000084;
+}
+static inline u32 gmmu_pte_kind_zf32_ms16_2cs_v(void)
+{
+ return 0x00000085;
+}
+static inline u32 gmmu_pte_kind_zf32_2cz_v(void)
+{
+ return 0x00000086;
+}
+static inline u32 gmmu_pte_kind_zf32_ms2_2cz_v(void)
+{
+ return 0x00000087;
+}
+static inline u32 gmmu_pte_kind_zf32_ms4_2cz_v(void)
+{
+ return 0x00000088;
+}
+static inline u32 gmmu_pte_kind_zf32_ms8_2cz_v(void)
+{
+ return 0x00000089;
+}
+static inline u32 gmmu_pte_kind_zf32_ms16_2cz_v(void)
+{
+ return 0x0000008a;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v(void)
+{
+ return 0x0000008b;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v(void)
+{
+ return 0x0000008c;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v(void)
+{
+ return 0x0000008d;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v(void)
+{
+ return 0x0000008e;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v(void)
+{
+ return 0x0000008f;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1cs_v(void)
+{
+ return 0x00000090;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1cs_v(void)
+{
+ return 0x00000091;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v(void)
+{
+ return 0x00000092;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v(void)
+{
+ return 0x00000097;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1zv_v(void)
+{
+ return 0x00000098;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1zv_v(void)
+{
+ return 0x00000099;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1zv_v(void)
+{
+ return 0x0000009a;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1czv_v(void)
+{
+ return 0x0000009b;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_1czv_v(void)
+{
+ return 0x0000009c;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_1czv_v(void)
+{
+ return 0x0000009d;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1czv_v(void)
+{
+ return 0x0000009e;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cs_v(void)
+{
+ return 0x0000009f;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cs_v(void)
+{
+ return 0x000000a0;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cs_v(void)
+{
+ return 0x000000a1;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cs_v(void)
+{
+ return 0x000000a2;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v(void)
+{
+ return 0x000000a3;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cszv_v(void)
+{
+ return 0x000000a4;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cszv_v(void)
+{
+ return 0x000000a5;
+}
+static inline u32 gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v(void)
+{
+ return 0x000000a6;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v(void)
+{
+ return 0x000000a7;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v(void)
+{
+ return 0x000000a8;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v(void)
+{
+ return 0x000000a9;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v(void)
+{
+ return 0x000000aa;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v(void)
+{
+ return 0x000000ab;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1cs_v(void)
+{
+ return 0x000000ac;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1cs_v(void)
+{
+ return 0x000000ad;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v(void)
+{
+ return 0x000000ae;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v(void)
+{
+ return 0x000000b3;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1zv_v(void)
+{
+ return 0x000000b4;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1zv_v(void)
+{
+ return 0x000000b5;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1zv_v(void)
+{
+ return 0x000000b6;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1czv_v(void)
+{
+ return 0x000000b7;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_1czv_v(void)
+{
+ return 0x000000b8;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_1czv_v(void)
+{
+ return 0x000000b9;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1czv_v(void)
+{
+ return 0x000000ba;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cs_v(void)
+{
+ return 0x000000bb;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cs_v(void)
+{
+ return 0x000000bc;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cs_v(void)
+{
+ return 0x000000bd;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cs_v(void)
+{
+ return 0x000000be;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v(void)
+{
+ return 0x000000bf;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cszv_v(void)
+{
+ return 0x000000c0;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cszv_v(void)
+{
+ return 0x000000c1;
+}
+static inline u32 gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v(void)
+{
+ return 0x000000c2;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_v(void)
+{
+ return 0x000000c3;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_1cs_v(void)
+{
+ return 0x000000c4;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_1cs_v(void)
+{
+ return 0x000000c5;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_1cs_v(void)
+{
+ return 0x000000c6;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_1cs_v(void)
+{
+ return 0x000000c7;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_1cs_v(void)
+{
+ return 0x000000c8;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_2cszv_v(void)
+{
+ return 0x000000ce;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_2cszv_v(void)
+{
+ return 0x000000cf;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_2cszv_v(void)
+{
+ return 0x000000d0;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v(void)
+{
+ return 0x000000d1;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_2cszv_v(void)
+{
+ return 0x000000d2;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_2cs_v(void)
+{
+ return 0x000000d3;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms2_2cs_v(void)
+{
+ return 0x000000d4;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms4_2cs_v(void)
+{
+ return 0x000000d5;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms8_2cs_v(void)
+{
+ return 0x000000d6;
+}
+static inline u32 gmmu_pte_kind_zf32_x24s8_ms16_2cs_v(void)
+{
+ return 0x000000d7;
+}
+static inline u32 gmmu_pte_kind_generic_16bx2_v(void)
+{
+ return 0x000000fe;
+}
+static inline u32 gmmu_pte_kind_c32_2c_v(void)
+{
+ return 0x000000d8;
+}
+static inline u32 gmmu_pte_kind_c32_2cbr_v(void)
+{
+ return 0x000000d9;
+}
+static inline u32 gmmu_pte_kind_c32_2cba_v(void)
+{
+ return 0x000000da;
+}
+static inline u32 gmmu_pte_kind_c32_2cra_v(void)
+{
+ return 0x000000db;
+}
+static inline u32 gmmu_pte_kind_c32_2bra_v(void)
+{
+ return 0x000000dc;
+}
+static inline u32 gmmu_pte_kind_c32_ms2_2c_v(void)
+{
+ return 0x000000dd;
+}
+static inline u32 gmmu_pte_kind_c32_ms2_2cbr_v(void)
+{
+ return 0x000000de;
+}
+static inline u32 gmmu_pte_kind_c32_ms2_2cra_v(void)
+{
+ return 0x000000cc;
+}
+static inline u32 gmmu_pte_kind_c32_ms4_2c_v(void)
+{
+ return 0x000000df;
+}
+static inline u32 gmmu_pte_kind_c32_ms4_2cbr_v(void)
+{
+ return 0x000000e0;
+}
+static inline u32 gmmu_pte_kind_c32_ms4_2cba_v(void)
+{
+ return 0x000000e1;
+}
+static inline u32 gmmu_pte_kind_c32_ms4_2cra_v(void)
+{
+ return 0x000000e2;
+}
+static inline u32 gmmu_pte_kind_c32_ms4_2bra_v(void)
+{
+ return 0x000000e3;
+}
+static inline u32 gmmu_pte_kind_c32_ms8_ms16_2c_v(void)
+{
+ return 0x000000e4;
+}
+static inline u32 gmmu_pte_kind_c32_ms8_ms16_2cra_v(void)
+{
+ return 0x000000e5;
+}
+static inline u32 gmmu_pte_kind_c64_2c_v(void)
+{
+ return 0x000000e6;
+}
+static inline u32 gmmu_pte_kind_c64_2cbr_v(void)
+{
+ return 0x000000e7;
+}
+static inline u32 gmmu_pte_kind_c64_2cba_v(void)
+{
+ return 0x000000e8;
+}
+static inline u32 gmmu_pte_kind_c64_2cra_v(void)
+{
+ return 0x000000e9;
+}
+static inline u32 gmmu_pte_kind_c64_2bra_v(void)
+{
+ return 0x000000ea;
+}
+static inline u32 gmmu_pte_kind_c64_ms2_2c_v(void)
+{
+ return 0x000000eb;
+}
+static inline u32 gmmu_pte_kind_c64_ms2_2cbr_v(void)
+{
+ return 0x000000ec;
+}
+static inline u32 gmmu_pte_kind_c64_ms2_2cra_v(void)
+{
+ return 0x000000cd;
+}
+static inline u32 gmmu_pte_kind_c64_ms4_2c_v(void)
+{
+ return 0x000000ed;
+}
+static inline u32 gmmu_pte_kind_c64_ms4_2cbr_v(void)
+{
+ return 0x000000ee;
+}
+static inline u32 gmmu_pte_kind_c64_ms4_2cba_v(void)
+{
+ return 0x000000ef;
+}
+static inline u32 gmmu_pte_kind_c64_ms4_2cra_v(void)
+{
+ return 0x000000f0;
+}
+static inline u32 gmmu_pte_kind_c64_ms4_2bra_v(void)
+{
+ return 0x000000f1;
+}
+static inline u32 gmmu_pte_kind_c64_ms8_ms16_2c_v(void)
+{
+ return 0x000000f2;
+}
+static inline u32 gmmu_pte_kind_c64_ms8_ms16_2cra_v(void)
+{
+ return 0x000000f3;
+}
+static inline u32 gmmu_pte_kind_c128_2c_v(void)
+{
+ return 0x000000f4;
+}
+static inline u32 gmmu_pte_kind_c128_2cr_v(void)
+{
+ return 0x000000f5;
+}
+static inline u32 gmmu_pte_kind_c128_ms2_2c_v(void)
+{
+ return 0x000000f6;
+}
+static inline u32 gmmu_pte_kind_c128_ms2_2cr_v(void)
+{
+ return 0x000000f7;
+}
+static inline u32 gmmu_pte_kind_c128_ms4_2c_v(void)
+{
+ return 0x000000f8;
+}
+static inline u32 gmmu_pte_kind_c128_ms4_2cr_v(void)
+{
+ return 0x000000f9;
+}
+static inline u32 gmmu_pte_kind_c128_ms8_ms16_2c_v(void)
+{
+ return 0x000000fa;
+}
+static inline u32 gmmu_pte_kind_c128_ms8_ms16_2cr_v(void)
+{
+ return 0x000000fb;
+}
+static inline u32 gmmu_pte_kind_x8c24_v(void)
+{
+ return 0x000000fc;
+}
+static inline u32 gmmu_pte_kind_pitch_no_swizzle_v(void)
+{
+ return 0x000000fd;
+}
+static inline u32 gmmu_pte_kind_smsked_message_v(void)
+{
+ return 0x000000ca;
+}
+static inline u32 gmmu_pte_kind_smhost_message_v(void)
+{
+ return 0x000000cb;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h
new file mode 100644
index 000000000000..ece7602d43d1
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_gr_gk20a.h
@@ -0,0 +1,3173 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_gr_gk20a_h_
+#define _hw_gr_gk20a_h_
+
+static inline u32 gr_intr_r(void)
+{
+ return 0x00400100;
+}
+static inline u32 gr_intr_notify_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_intr_notify_reset_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_intr_semaphore_pending_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_intr_semaphore_reset_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_intr_semaphore_timeout_not_pending_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_intr_semaphore_timeout_pending_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_intr_semaphore_timeout_reset_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_intr_illegal_method_pending_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_intr_illegal_method_reset_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_intr_illegal_notify_pending_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_intr_illegal_notify_reset_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_intr_illegal_class_pending_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_intr_illegal_class_reset_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_intr_class_error_pending_f(void)
+{
+ return 0x100000;
+}
+static inline u32 gr_intr_class_error_reset_f(void)
+{
+ return 0x100000;
+}
+static inline u32 gr_intr_exception_pending_f(void)
+{
+ return 0x200000;
+}
+static inline u32 gr_intr_exception_reset_f(void)
+{
+ return 0x200000;
+}
+static inline u32 gr_intr_firmware_method_pending_f(void)
+{
+ return 0x100;
+}
+static inline u32 gr_intr_firmware_method_reset_f(void)
+{
+ return 0x100;
+}
+static inline u32 gr_intr_nonstall_r(void)
+{
+ return 0x00400120;
+}
+static inline u32 gr_intr_nonstall_trap_pending_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_intr_en_r(void)
+{
+ return 0x0040013c;
+}
+static inline u32 gr_exception_r(void)
+{
+ return 0x00400108;
+}
+static inline u32 gr_exception_fe_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 gr_exception_gpc_m(void)
+{
+ return 0x1 << 24;
+}
+static inline u32 gr_exception1_r(void)
+{
+ return 0x00400118;
+}
+static inline u32 gr_exception1_gpc_0_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_exception2_r(void)
+{
+ return 0x0040011c;
+}
+static inline u32 gr_exception_en_r(void)
+{
+ return 0x00400138;
+}
+static inline u32 gr_exception_en_fe_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 gr_exception1_en_r(void)
+{
+ return 0x00400130;
+}
+static inline u32 gr_exception2_en_r(void)
+{
+ return 0x00400134;
+}
+static inline u32 gr_gpfifo_ctl_r(void)
+{
+ return 0x00400500;
+}
+static inline u32 gr_gpfifo_ctl_access_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 gr_gpfifo_ctl_access_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpfifo_ctl_access_enabled_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_gpfifo_ctl_semaphore_access_f(u32 v)
+{
+ return (v & 0x1) << 16;
+}
+static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpfifo_ctl_semaphore_access_enabled_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_trapped_addr_r(void)
+{
+ return 0x00400704;
+}
+static inline u32 gr_trapped_addr_mthd_v(u32 r)
+{
+ return (r >> 2) & 0xfff;
+}
+static inline u32 gr_trapped_addr_subch_v(u32 r)
+{
+ return (r >> 16) & 0x7;
+}
+static inline u32 gr_trapped_data_lo_r(void)
+{
+ return 0x00400708;
+}
+static inline u32 gr_trapped_data_hi_r(void)
+{
+ return 0x0040070c;
+}
+static inline u32 gr_status_r(void)
+{
+ return 0x00400700;
+}
+static inline u32 gr_status_fe_method_lower_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 gr_status_fe_method_lower_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_status_mask_r(void)
+{
+ return 0x00400610;
+}
+static inline u32 gr_engine_status_r(void)
+{
+ return 0x0040060c;
+}
+static inline u32 gr_engine_status_value_busy_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_pipe_bundle_address_r(void)
+{
+ return 0x00400200;
+}
+static inline u32 gr_pipe_bundle_address_value_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 gr_pipe_bundle_data_r(void)
+{
+ return 0x00400204;
+}
+static inline u32 gr_pipe_bundle_config_r(void)
+{
+ return 0x00400208;
+}
+static inline u32 gr_pipe_bundle_config_override_pipe_mode_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_pipe_bundle_config_override_pipe_mode_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_fe_hww_esr_r(void)
+{
+ return 0x00404000;
+}
+static inline u32 gr_fe_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_fe_hww_esr_en_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_fe_go_idle_timeout_r(void)
+{
+ return 0x00404154;
+}
+static inline u32 gr_fe_go_idle_timeout_count_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_fe_go_idle_timeout_count_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fe_object_table_r(u32 i)
+{
+ return 0x00404200 + i*4;
+}
+static inline u32 gr_fe_object_table_nvclass_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 gr_pri_mme_shadow_raw_index_r(void)
+{
+ return 0x00404488;
+}
+static inline u32 gr_pri_mme_shadow_raw_index_write_trigger_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_pri_mme_shadow_raw_data_r(void)
+{
+ return 0x0040448c;
+}
+static inline u32 gr_mme_hww_esr_r(void)
+{
+ return 0x00404490;
+}
+static inline u32 gr_mme_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_mme_hww_esr_en_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_memfmt_hww_esr_r(void)
+{
+ return 0x00404600;
+}
+static inline u32 gr_memfmt_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_memfmt_hww_esr_en_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_fecs_cpuctl_r(void)
+{
+ return 0x00409100;
+}
+static inline u32 gr_fecs_cpuctl_startcpu_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 gr_fecs_dmactl_r(void)
+{
+ return 0x0040910c;
+}
+static inline u32 gr_fecs_dmactl_require_ctx_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 gr_fecs_dmactl_dmem_scrubbing_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 gr_fecs_dmactl_imem_scrubbing_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 gr_fecs_os_r(void)
+{
+ return 0x00409080;
+}
+static inline u32 gr_fecs_idlestate_r(void)
+{
+ return 0x0040904c;
+}
+static inline u32 gr_fecs_mailbox0_r(void)
+{
+ return 0x00409040;
+}
+static inline u32 gr_fecs_mailbox1_r(void)
+{
+ return 0x00409044;
+}
+static inline u32 gr_fecs_irqstat_r(void)
+{
+ return 0x00409008;
+}
+static inline u32 gr_fecs_irqmode_r(void)
+{
+ return 0x0040900c;
+}
+static inline u32 gr_fecs_irqmask_r(void)
+{
+ return 0x00409018;
+}
+static inline u32 gr_fecs_irqdest_r(void)
+{
+ return 0x0040901c;
+}
+static inline u32 gr_fecs_curctx_r(void)
+{
+ return 0x00409050;
+}
+static inline u32 gr_fecs_nxtctx_r(void)
+{
+ return 0x00409054;
+}
+static inline u32 gr_fecs_engctl_r(void)
+{
+ return 0x004090a4;
+}
+static inline u32 gr_fecs_debug1_r(void)
+{
+ return 0x00409090;
+}
+static inline u32 gr_fecs_debuginfo_r(void)
+{
+ return 0x00409094;
+}
+static inline u32 gr_fecs_icd_cmd_r(void)
+{
+ return 0x00409200;
+}
+static inline u32 gr_fecs_icd_cmd_opc_s(void)
+{
+ return 4;
+}
+static inline u32 gr_fecs_icd_cmd_opc_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_fecs_icd_cmd_opc_m(void)
+{
+ return 0xf << 0;
+}
+static inline u32 gr_fecs_icd_cmd_opc_v(u32 r)
+{
+ return (r >> 0) & 0xf;
+}
+static inline u32 gr_fecs_icd_cmd_opc_rreg_f(void)
+{
+ return 0x8;
+}
+static inline u32 gr_fecs_icd_cmd_opc_rstat_f(void)
+{
+ return 0xe;
+}
+static inline u32 gr_fecs_icd_cmd_idx_f(u32 v)
+{
+ return (v & 0x1f) << 8;
+}
+static inline u32 gr_fecs_icd_rdata_r(void)
+{
+ return 0x0040920c;
+}
+static inline u32 gr_fecs_imemc_r(u32 i)
+{
+ return 0x00409180 + i*16;
+}
+static inline u32 gr_fecs_imemc_offs_f(u32 v)
+{
+ return (v & 0x3f) << 2;
+}
+static inline u32 gr_fecs_imemc_blk_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_fecs_imemc_aincw_f(u32 v)
+{
+ return (v & 0x1) << 24;
+}
+static inline u32 gr_fecs_imemd_r(u32 i)
+{
+ return 0x00409184 + i*16;
+}
+static inline u32 gr_fecs_imemt_r(u32 i)
+{
+ return 0x00409188 + i*16;
+}
+static inline u32 gr_fecs_imemt_tag_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_fecs_dmemc_r(u32 i)
+{
+ return 0x004091c0 + i*8;
+}
+static inline u32 gr_fecs_dmemc_offs_s(void)
+{
+ return 6;
+}
+static inline u32 gr_fecs_dmemc_offs_f(u32 v)
+{
+ return (v & 0x3f) << 2;
+}
+static inline u32 gr_fecs_dmemc_offs_m(void)
+{
+ return 0x3f << 2;
+}
+static inline u32 gr_fecs_dmemc_offs_v(u32 r)
+{
+ return (r >> 2) & 0x3f;
+}
+static inline u32 gr_fecs_dmemc_blk_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_fecs_dmemc_aincw_f(u32 v)
+{
+ return (v & 0x1) << 24;
+}
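+/*
+ * Minimal usage sketch: the _m() masks are meant to be inverted and AND-ed
+ * to clear a field before OR-ing in a new _f() value, here updating only
+ * the DMEM word offset of an existing gr_fecs_dmemc control word.
+ */
+static inline u32 example_gr_fecs_dmemc_set_offs(u32 dmemc, u32 offs)
+{
+	return (dmemc & ~gr_fecs_dmemc_offs_m()) | gr_fecs_dmemc_offs_f(offs);
+}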
+static inline u32 gr_fecs_dmemd_r(u32 i)
+{
+ return 0x004091c4 + i*8;
+}
+static inline u32 gr_fecs_dmatrfbase_r(void)
+{
+ return 0x00409110;
+}
+static inline u32 gr_fecs_dmatrfmoffs_r(void)
+{
+ return 0x00409114;
+}
+static inline u32 gr_fecs_dmatrffboffs_r(void)
+{
+ return 0x0040911c;
+}
+static inline u32 gr_fecs_dmatrfcmd_r(void)
+{
+ return 0x00409118;
+}
+static inline u32 gr_fecs_dmatrfcmd_imem_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 gr_fecs_dmatrfcmd_write_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 gr_fecs_dmatrfcmd_size_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 gr_fecs_dmatrfcmd_ctxdma_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 gr_fecs_bootvec_r(void)
+{
+ return 0x00409104;
+}
+static inline u32 gr_fecs_bootvec_vec_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_fecs_falcon_hwcfg_r(void)
+{
+ return 0x00409108;
+}
+static inline u32 gr_gpcs_gpccs_falcon_hwcfg_r(void)
+{
+ return 0x0041a108;
+}
+static inline u32 gr_fecs_falcon_rm_r(void)
+{
+ return 0x00409084;
+}
+static inline u32 gr_fecs_current_ctx_r(void)
+{
+ return 0x00409b00;
+}
+static inline u32 gr_fecs_current_ctx_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 gr_fecs_current_ctx_ptr_v(u32 r)
+{
+ return (r >> 0) & 0xfffffff;
+}
+static inline u32 gr_fecs_current_ctx_target_s(void)
+{
+ return 2;
+}
+static inline u32 gr_fecs_current_ctx_target_f(u32 v)
+{
+ return (v & 0x3) << 28;
+}
+static inline u32 gr_fecs_current_ctx_target_m(void)
+{
+ return 0x3 << 28;
+}
+static inline u32 gr_fecs_current_ctx_target_v(u32 r)
+{
+ return (r >> 28) & 0x3;
+}
+static inline u32 gr_fecs_current_ctx_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_current_ctx_valid_s(void)
+{
+ return 1;
+}
+static inline u32 gr_fecs_current_ctx_valid_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 gr_fecs_current_ctx_valid_m(void)
+{
+	return 0x1U << 31;
+}
+static inline u32 gr_fecs_current_ctx_valid_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 gr_fecs_current_ctx_valid_false_f(void)
+{
+ return 0x0;
+}
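+/*
+ * Minimal usage sketch: a FECS current-context word combines the 28-bit
+ * pointer, the target aperture and the valid bit. The context pointer
+ * argument is hypothetical and assumed to be already shifted to its
+ * 4KB-aligned form.
+ */
+static inline u32 example_gr_fecs_current_ctx_value(u32 ctx_ptr_shifted)
+{
+	return gr_fecs_current_ctx_ptr_f(ctx_ptr_shifted) |
+		gr_fecs_current_ctx_target_vid_mem_f() |
+		gr_fecs_current_ctx_valid_f(1);
+}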
+static inline u32 gr_fecs_method_data_r(void)
+{
+ return 0x00409500;
+}
+static inline u32 gr_fecs_method_push_r(void)
+{
+ return 0x00409504;
+}
+static inline u32 gr_fecs_method_push_adr_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 gr_fecs_method_push_adr_bind_pointer_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 gr_fecs_method_push_adr_bind_pointer_f(void)
+{
+ return 0x3;
+}
+static inline u32 gr_fecs_method_push_adr_discover_image_size_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 gr_fecs_method_push_adr_wfi_golden_save_v(void)
+{
+ return 0x00000009;
+}
+static inline u32 gr_fecs_method_push_adr_restore_golden_v(void)
+{
+ return 0x00000015;
+}
+static inline u32 gr_fecs_method_push_adr_discover_zcull_image_size_v(void)
+{
+ return 0x00000016;
+}
+static inline u32 gr_fecs_method_push_adr_discover_pm_image_size_v(void)
+{
+ return 0x00000025;
+}
+static inline u32 gr_fecs_method_push_adr_discover_reglist_image_size_v(void)
+{
+ return 0x00000030;
+}
+static inline u32 gr_fecs_method_push_adr_set_reglist_bind_instance_v(void)
+{
+ return 0x00000031;
+}
+static inline u32 gr_fecs_method_push_adr_set_reglist_virtual_address_v(void)
+{
+ return 0x00000032;
+}
+static inline u32 gr_fecs_method_push_adr_stop_ctxsw_v(void)
+{
+ return 0x00000038;
+}
+static inline u32 gr_fecs_method_push_adr_start_ctxsw_v(void)
+{
+ return 0x00000039;
+}
+static inline u32 gr_fecs_method_push_adr_set_watchdog_timeout_f(void)
+{
+ return 0x21;
+}
+static inline u32 gr_fecs_host_int_enable_r(void)
+{
+ return 0x00409c24;
+}
+static inline u32 gr_fecs_host_int_enable_fault_during_ctxsw_enable_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_fecs_host_int_enable_umimp_firmware_method_enable_f(void)
+{
+ return 0x20000;
+}
+static inline u32 gr_fecs_host_int_enable_umimp_illegal_method_enable_f(void)
+{
+ return 0x40000;
+}
+static inline u32 gr_fecs_host_int_enable_watchdog_enable_f(void)
+{
+ return 0x80000;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_r(void)
+{
+ return 0x00409614;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_sys_halt_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_halt_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_halt_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_sys_engine_reset_disabled_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_engine_reset_disabled_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_engine_reset_disabled_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_enabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_sys_context_reset_disabled_f(void)
+{
+ return 0x100;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_enabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_gpc_context_reset_disabled_f(void)
+{
+ return 0x200;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_s(void)
+{
+ return 1;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_f(u32 v)
+{
+ return (v & 0x1) << 10;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_m(void)
+{
+ return 0x1 << 10;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_v(u32 r)
+{
+ return (r >> 10) & 0x1;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_enabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fecs_ctxsw_reset_ctl_be_context_reset_disabled_f(void)
+{
+ return 0x400;
+}
+static inline u32 gr_fecs_ctx_state_store_major_rev_id_r(void)
+{
+ return 0x0040960c;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_r(u32 i)
+{
+ return 0x00409800 + i*4;
+}
+static inline u32 gr_fecs_ctxsw_mailbox__size_1_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_value_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_value_pass_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_value_fail_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_set_r(u32 i)
+{
+ return 0x00409820 + i*4;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_set_value_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_clear_r(u32 i)
+{
+ return 0x00409840 + i*4;
+}
+static inline u32 gr_fecs_ctxsw_mailbox_clear_value_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_fecs_fs_r(void)
+{
+ return 0x00409604;
+}
+static inline u32 gr_fecs_fs_num_available_gpcs_s(void)
+{
+ return 5;
+}
+static inline u32 gr_fecs_fs_num_available_gpcs_f(u32 v)
+{
+ return (v & 0x1f) << 0;
+}
+static inline u32 gr_fecs_fs_num_available_gpcs_m(void)
+{
+ return 0x1f << 0;
+}
+static inline u32 gr_fecs_fs_num_available_gpcs_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 gr_fecs_fs_num_available_fbps_s(void)
+{
+ return 5;
+}
+static inline u32 gr_fecs_fs_num_available_fbps_f(u32 v)
+{
+ return (v & 0x1f) << 16;
+}
+static inline u32 gr_fecs_fs_num_available_fbps_m(void)
+{
+ return 0x1f << 16;
+}
+static inline u32 gr_fecs_fs_num_available_fbps_v(u32 r)
+{
+ return (r >> 16) & 0x1f;
+}
+static inline u32 gr_fecs_cfg_r(void)
+{
+ return 0x00409620;
+}
+static inline u32 gr_fecs_cfg_imem_sz_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 gr_fecs_rc_lanes_r(void)
+{
+ return 0x00409880;
+}
+static inline u32 gr_fecs_rc_lanes_num_chains_s(void)
+{
+ return 6;
+}
+static inline u32 gr_fecs_rc_lanes_num_chains_f(u32 v)
+{
+ return (v & 0x3f) << 0;
+}
+static inline u32 gr_fecs_rc_lanes_num_chains_m(void)
+{
+ return 0x3f << 0;
+}
+static inline u32 gr_fecs_rc_lanes_num_chains_v(u32 r)
+{
+ return (r >> 0) & 0x3f;
+}
+static inline u32 gr_fecs_ctxsw_status_1_r(void)
+{
+ return 0x00409400;
+}
+static inline u32 gr_fecs_ctxsw_status_1_arb_busy_s(void)
+{
+ return 1;
+}
+static inline u32 gr_fecs_ctxsw_status_1_arb_busy_f(u32 v)
+{
+ return (v & 0x1) << 12;
+}
+static inline u32 gr_fecs_ctxsw_status_1_arb_busy_m(void)
+{
+ return 0x1 << 12;
+}
+static inline u32 gr_fecs_ctxsw_status_1_arb_busy_v(u32 r)
+{
+ return (r >> 12) & 0x1;
+}
+static inline u32 gr_fecs_arb_ctx_adr_r(void)
+{
+ return 0x00409a24;
+}
+static inline u32 gr_fecs_new_ctx_r(void)
+{
+ return 0x00409b04;
+}
+static inline u32 gr_fecs_new_ctx_ptr_s(void)
+{
+ return 28;
+}
+static inline u32 gr_fecs_new_ctx_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 gr_fecs_new_ctx_ptr_m(void)
+{
+ return 0xfffffff << 0;
+}
+static inline u32 gr_fecs_new_ctx_ptr_v(u32 r)
+{
+ return (r >> 0) & 0xfffffff;
+}
+static inline u32 gr_fecs_new_ctx_target_s(void)
+{
+ return 2;
+}
+static inline u32 gr_fecs_new_ctx_target_f(u32 v)
+{
+ return (v & 0x3) << 28;
+}
+static inline u32 gr_fecs_new_ctx_target_m(void)
+{
+ return 0x3 << 28;
+}
+static inline u32 gr_fecs_new_ctx_target_v(u32 r)
+{
+ return (r >> 28) & 0x3;
+}
+static inline u32 gr_fecs_new_ctx_valid_s(void)
+{
+ return 1;
+}
+static inline u32 gr_fecs_new_ctx_valid_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 gr_fecs_new_ctx_valid_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 gr_fecs_new_ctx_valid_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_r(void)
+{
+ return 0x00409a0c;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_ptr_s(void)
+{
+ return 28;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_ptr_m(void)
+{
+ return 0xfffffff << 0;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_ptr_v(u32 r)
+{
+ return (r >> 0) & 0xfffffff;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_target_s(void)
+{
+ return 2;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_target_f(u32 v)
+{
+ return (v & 0x3) << 28;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_target_m(void)
+{
+ return 0x3 << 28;
+}
+static inline u32 gr_fecs_arb_ctx_ptr_target_v(u32 r)
+{
+ return (r >> 28) & 0x3;
+}
+static inline u32 gr_fecs_arb_ctx_cmd_r(void)
+{
+ return 0x00409a10;
+}
+static inline u32 gr_fecs_arb_ctx_cmd_cmd_s(void)
+{
+ return 5;
+}
+static inline u32 gr_fecs_arb_ctx_cmd_cmd_f(u32 v)
+{
+ return (v & 0x1f) << 0;
+}
+static inline u32 gr_fecs_arb_ctx_cmd_cmd_m(void)
+{
+ return 0x1f << 0;
+}
+static inline u32 gr_fecs_arb_ctx_cmd_cmd_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 gr_rstr2d_gpc_map0_r(void)
+{
+ return 0x0040780c;
+}
+static inline u32 gr_rstr2d_gpc_map1_r(void)
+{
+ return 0x00407810;
+}
+static inline u32 gr_rstr2d_gpc_map2_r(void)
+{
+ return 0x00407814;
+}
+static inline u32 gr_rstr2d_gpc_map3_r(void)
+{
+ return 0x00407818;
+}
+static inline u32 gr_rstr2d_gpc_map4_r(void)
+{
+ return 0x0040781c;
+}
+static inline u32 gr_rstr2d_gpc_map5_r(void)
+{
+ return 0x00407820;
+}
+static inline u32 gr_rstr2d_map_table_cfg_r(void)
+{
+ return 0x004078bc;
+}
+static inline u32 gr_rstr2d_map_table_cfg_row_offset_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_rstr2d_map_table_cfg_num_entries_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_pd_hww_esr_r(void)
+{
+ return 0x00406018;
+}
+static inline u32 gr_pd_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_pd_hww_esr_en_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_r(u32 i)
+{
+ return 0x00406028 + i*4;
+}
+static inline u32 gr_pd_num_tpc_per_gpc__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count0_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count1_f(u32 v)
+{
+ return (v & 0xf) << 4;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count2_f(u32 v)
+{
+ return (v & 0xf) << 8;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count3_f(u32 v)
+{
+ return (v & 0xf) << 12;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count4_f(u32 v)
+{
+ return (v & 0xf) << 16;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count5_f(u32 v)
+{
+ return (v & 0xf) << 20;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count6_f(u32 v)
+{
+ return (v & 0xf) << 24;
+}
+static inline u32 gr_pd_num_tpc_per_gpc_count7_f(u32 v)
+{
+ return (v & 0xf) << 28;
+}
+static inline u32 gr_pd_ab_dist_cfg0_r(void)
+{
+ return 0x004064c0;
+}
+static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_en_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_pd_ab_dist_cfg0_timeslice_enable_dis_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_pd_ab_dist_cfg1_r(void)
+{
+ return 0x004064c4;
+}
+static inline u32 gr_pd_ab_dist_cfg1_max_batches_init_f(void)
+{
+ return 0xffff;
+}
+static inline u32 gr_pd_ab_dist_cfg1_max_output_f(u32 v)
+{
+ return (v & 0x7ff) << 16;
+}
+static inline u32 gr_pd_ab_dist_cfg1_max_output_granularity_v(void)
+{
+ return 0x00000080;
+}
+static inline u32 gr_pd_ab_dist_cfg2_r(void)
+{
+ return 0x004064c8;
+}
+static inline u32 gr_pd_ab_dist_cfg2_token_limit_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 gr_pd_ab_dist_cfg2_token_limit_init_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 gr_pd_ab_dist_cfg2_state_limit_f(u32 v)
+{
+ return (v & 0xfff) << 16;
+}
+static inline u32 gr_pd_ab_dist_cfg2_state_limit_scc_bundle_granularity_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gr_pd_ab_dist_cfg2_state_limit_min_gpm_fifo_depths_v(void)
+{
+ return 0x00000062;
+}
+static inline u32 gr_pd_pagepool_r(void)
+{
+ return 0x004064cc;
+}
+static inline u32 gr_pd_pagepool_total_pages_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_pd_pagepool_valid_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_pd_dist_skip_table_r(u32 i)
+{
+ return 0x004064d0 + i*4;
+}
+static inline u32 gr_pd_dist_skip_table__size_1_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gr_pd_dist_skip_table_gpc_4n0_mask_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_pd_dist_skip_table_gpc_4n1_mask_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_pd_dist_skip_table_gpc_4n2_mask_f(u32 v)
+{
+ return (v & 0xff) << 16;
+}
+static inline u32 gr_pd_dist_skip_table_gpc_4n3_mask_f(u32 v)
+{
+ return (v & 0xff) << 24;
+}
+static inline u32 gr_pd_alpha_ratio_table_r(u32 i)
+{
+ return 0x00406800 + i*4;
+}
+static inline u32 gr_pd_alpha_ratio_table__size_1_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 gr_pd_alpha_ratio_table_gpc_4n0_mask_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_pd_alpha_ratio_table_gpc_4n1_mask_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_pd_alpha_ratio_table_gpc_4n2_mask_f(u32 v)
+{
+ return (v & 0xff) << 16;
+}
+static inline u32 gr_pd_alpha_ratio_table_gpc_4n3_mask_f(u32 v)
+{
+ return (v & 0xff) << 24;
+}
+static inline u32 gr_pd_beta_ratio_table_r(u32 i)
+{
+ return 0x00406c00 + i*4;
+}
+static inline u32 gr_pd_beta_ratio_table__size_1_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 gr_pd_beta_ratio_table_gpc_4n0_mask_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_pd_beta_ratio_table_gpc_4n1_mask_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_pd_beta_ratio_table_gpc_4n2_mask_f(u32 v)
+{
+ return (v & 0xff) << 16;
+}
+static inline u32 gr_pd_beta_ratio_table_gpc_4n3_mask_f(u32 v)
+{
+ return (v & 0xff) << 24;
+}
+static inline u32 gr_ds_debug_r(void)
+{
+ return 0x00405800;
+}
+static inline u32 gr_ds_debug_timeslice_mode_disable_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_debug_timeslice_mode_enable_f(void)
+{
+ return 0x8000000;
+}
+static inline u32 gr_ds_zbc_color_r_r(void)
+{
+ return 0x00405804;
+}
+static inline u32 gr_ds_zbc_color_r_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_ds_zbc_color_g_r(void)
+{
+ return 0x00405808;
+}
+static inline u32 gr_ds_zbc_color_g_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_ds_zbc_color_b_r(void)
+{
+ return 0x0040580c;
+}
+static inline u32 gr_ds_zbc_color_b_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_ds_zbc_color_a_r(void)
+{
+ return 0x00405810;
+}
+static inline u32 gr_ds_zbc_color_a_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_ds_zbc_color_fmt_r(void)
+{
+ return 0x00405814;
+}
+static inline u32 gr_ds_zbc_color_fmt_val_f(u32 v)
+{
+ return (v & 0x7f) << 0;
+}
+static inline u32 gr_ds_zbc_color_fmt_val_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_zbc_color_fmt_val_zero_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_ds_zbc_color_fmt_val_unorm_one_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 gr_ds_zbc_color_fmt_val_rf32_gf32_bf32_af32_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 gr_ds_zbc_z_r(void)
+{
+ return 0x00405818;
+}
+static inline u32 gr_ds_zbc_z_val_s(void)
+{
+ return 32;
+}
+static inline u32 gr_ds_zbc_z_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_ds_zbc_z_val_m(void)
+{
+ return 0xffffffff << 0;
+}
+static inline u32 gr_ds_zbc_z_val_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 gr_ds_zbc_z_val__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_ds_zbc_z_val__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_zbc_z_fmt_r(void)
+{
+ return 0x0040581c;
+}
+static inline u32 gr_ds_zbc_z_fmt_val_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 gr_ds_zbc_z_fmt_val_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_zbc_z_fmt_val_fp32_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_ds_zbc_tbl_index_r(void)
+{
+ return 0x00405820;
+}
+static inline u32 gr_ds_zbc_tbl_index_val_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_ds_zbc_tbl_ld_r(void)
+{
+ return 0x00405824;
+}
+static inline u32 gr_ds_zbc_tbl_ld_select_c_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_zbc_tbl_ld_select_z_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_ds_zbc_tbl_ld_action_write_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_ds_zbc_tbl_ld_trigger_active_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_ds_tga_constraintlogic_r(void)
+{
+ return 0x00405830;
+}
+static inline u32 gr_ds_tga_constraintlogic_beta_cbsize_f(u32 v)
+{
+ return (v & 0xfff) << 16;
+}
+static inline u32 gr_ds_tga_constraintlogic_alpha_cbsize_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 gr_ds_hww_esr_r(void)
+{
+ return 0x00405840;
+}
+static inline u32 gr_ds_hww_esr_reset_s(void)
+{
+ return 1;
+}
+static inline u32 gr_ds_hww_esr_reset_f(u32 v)
+{
+ return (v & 0x1) << 30;
+}
+static inline u32 gr_ds_hww_esr_reset_m(void)
+{
+ return 0x1 << 30;
+}
+static inline u32 gr_ds_hww_esr_reset_v(u32 r)
+{
+ return (r >> 30) & 0x1;
+}
+static inline u32 gr_ds_hww_esr_reset_task_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_ds_hww_esr_reset_task_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_ds_hww_esr_en_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_ds_hww_report_mask_r(void)
+{
+ return 0x00405844;
+}
+static inline u32 gr_ds_hww_report_mask_sph0_err_report_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_ds_hww_report_mask_sph1_err_report_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_ds_hww_report_mask_sph2_err_report_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_ds_hww_report_mask_sph3_err_report_f(void)
+{
+ return 0x8;
+}
+static inline u32 gr_ds_hww_report_mask_sph4_err_report_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_ds_hww_report_mask_sph5_err_report_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_ds_hww_report_mask_sph6_err_report_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_ds_hww_report_mask_sph7_err_report_f(void)
+{
+ return 0x80;
+}
+static inline u32 gr_ds_hww_report_mask_sph8_err_report_f(void)
+{
+ return 0x100;
+}
+static inline u32 gr_ds_hww_report_mask_sph9_err_report_f(void)
+{
+ return 0x200;
+}
+static inline u32 gr_ds_hww_report_mask_sph10_err_report_f(void)
+{
+ return 0x400;
+}
+static inline u32 gr_ds_hww_report_mask_sph11_err_report_f(void)
+{
+ return 0x800;
+}
+static inline u32 gr_ds_hww_report_mask_sph12_err_report_f(void)
+{
+ return 0x1000;
+}
+static inline u32 gr_ds_hww_report_mask_sph13_err_report_f(void)
+{
+ return 0x2000;
+}
+static inline u32 gr_ds_hww_report_mask_sph14_err_report_f(void)
+{
+ return 0x4000;
+}
+static inline u32 gr_ds_hww_report_mask_sph15_err_report_f(void)
+{
+ return 0x8000;
+}
+static inline u32 gr_ds_hww_report_mask_sph16_err_report_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_ds_hww_report_mask_sph17_err_report_f(void)
+{
+ return 0x20000;
+}
+static inline u32 gr_ds_hww_report_mask_sph18_err_report_f(void)
+{
+ return 0x40000;
+}
+static inline u32 gr_ds_hww_report_mask_sph19_err_report_f(void)
+{
+ return 0x80000;
+}
+static inline u32 gr_ds_hww_report_mask_sph20_err_report_f(void)
+{
+ return 0x100000;
+}
+static inline u32 gr_ds_hww_report_mask_sph21_err_report_f(void)
+{
+ return 0x200000;
+}
+static inline u32 gr_ds_hww_report_mask_sph22_err_report_f(void)
+{
+ return 0x400000;
+}
+static inline u32 gr_ds_hww_report_mask_sph23_err_report_f(void)
+{
+ return 0x800000;
+}
+static inline u32 gr_ds_num_tpc_per_gpc_r(u32 i)
+{
+ return 0x00405870 + i*4;
+}
+static inline u32 gr_scc_bundle_cb_base_r(void)
+{
+ return 0x00408004;
+}
+static inline u32 gr_scc_bundle_cb_base_addr_39_8_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_scc_bundle_cb_base_addr_39_8_align_bits_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gr_scc_bundle_cb_size_r(void)
+{
+ return 0x00408008;
+}
+static inline u32 gr_scc_bundle_cb_size_div_256b_f(u32 v)
+{
+ return (v & 0x7ff) << 0;
+}
+static inline u32 gr_scc_bundle_cb_size_div_256b__prod_v(void)
+{
+ return 0x00000018;
+}
+static inline u32 gr_scc_bundle_cb_size_div_256b_byte_granularity_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 gr_scc_bundle_cb_size_valid_false_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_scc_bundle_cb_size_valid_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_scc_bundle_cb_size_valid_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_scc_pagepool_base_r(void)
+{
+ return 0x0040800c;
+}
+static inline u32 gr_scc_pagepool_base_addr_39_8_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_scc_pagepool_base_addr_39_8_align_bits_v(void)
+{
+ return 0x00000008;
+}
+static inline u32 gr_scc_pagepool_r(void)
+{
+ return 0x00408010;
+}
+static inline u32 gr_scc_pagepool_total_pages_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_scc_pagepool_total_pages_hwmax_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_scc_pagepool_total_pages_hwmax_value_v(void)
+{
+ return 0x00000080;
+}
+static inline u32 gr_scc_pagepool_total_pages_byte_granularity_v(void)
+{
+ return 0x00000100;
+}
+static inline u32 gr_scc_pagepool_max_valid_pages_s(void)
+{
+ return 8;
+}
+static inline u32 gr_scc_pagepool_max_valid_pages_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_scc_pagepool_max_valid_pages_m(void)
+{
+ return 0xff << 8;
+}
+static inline u32 gr_scc_pagepool_max_valid_pages_v(u32 r)
+{
+ return (r >> 8) & 0xff;
+}
+static inline u32 gr_scc_pagepool_valid_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_scc_init_r(void)
+{
+ return 0x0040802c;
+}
+static inline u32 gr_scc_init_ram_trigger_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_scc_hww_esr_r(void)
+{
+ return 0x00408030;
+}
+static inline u32 gr_scc_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_scc_hww_esr_en_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_sked_hww_esr_r(void)
+{
+ return 0x00407020;
+}
+static inline u32 gr_sked_hww_esr_reset_active_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 gr_cwd_fs_r(void)
+{
+ return 0x00405b00;
+}
+static inline u32 gr_cwd_fs_num_gpcs_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_cwd_fs_num_tpcs_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_gpc0_fs_gpc_r(void)
+{
+ return 0x00502608;
+}
+static inline u32 gr_gpc0_fs_gpc_num_available_tpcs_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 gr_gpc0_fs_gpc_num_available_zculls_v(u32 r)
+{
+ return (r >> 16) & 0x1f;
+}
+static inline u32 gr_gpc0_cfg_r(void)
+{
+ return 0x00502620;
+}
+static inline u32 gr_gpc0_cfg_imem_sz_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 gr_gpccs_rc_lanes_r(void)
+{
+ return 0x00502880;
+}
+static inline u32 gr_gpccs_rc_lanes_num_chains_s(void)
+{
+ return 6;
+}
+static inline u32 gr_gpccs_rc_lanes_num_chains_f(u32 v)
+{
+ return (v & 0x3f) << 0;
+}
+static inline u32 gr_gpccs_rc_lanes_num_chains_m(void)
+{
+ return 0x3f << 0;
+}
+static inline u32 gr_gpccs_rc_lanes_num_chains_v(u32 r)
+{
+ return (r >> 0) & 0x3f;
+}
+static inline u32 gr_gpccs_rc_lane_size_r(u32 i)
+{
+ return 0x00502910 + i*0;
+}
+static inline u32 gr_gpccs_rc_lane_size__size_1_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_s(void)
+{
+ return 24;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_f(u32 v)
+{
+ return (v & 0xffffff) << 0;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_m(void)
+{
+ return 0xffffff << 0;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_v(u32 r)
+{
+ return (r >> 0) & 0xffffff;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_0_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpccs_rc_lane_size_v_0_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpc0_zcull_fs_r(void)
+{
+ return 0x00500910;
+}
+static inline u32 gr_gpc0_zcull_fs_num_sms_f(u32 v)
+{
+ return (v & 0x1ff) << 0;
+}
+static inline u32 gr_gpc0_zcull_fs_num_active_banks_f(u32 v)
+{
+ return (v & 0xf) << 16;
+}
+static inline u32 gr_gpc0_zcull_ram_addr_r(void)
+{
+ return 0x00500914;
+}
+static inline u32 gr_gpc0_zcull_ram_addr_tiles_per_hypertile_row_per_gpc_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_gpc0_zcull_ram_addr_row_offset_f(u32 v)
+{
+ return (v & 0xf) << 8;
+}
+static inline u32 gr_gpc0_zcull_sm_num_rcp_r(void)
+{
+ return 0x00500918;
+}
+static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative_f(u32 v)
+{
+ return (v & 0xffffff) << 0;
+}
+static inline u32 gr_gpc0_zcull_sm_num_rcp_conservative__max_v(void)
+{
+ return 0x00800000;
+}
+static inline u32 gr_gpc0_zcull_total_ram_size_r(void)
+{
+ return 0x00500920;
+}
+static inline u32 gr_gpc0_zcull_total_ram_size_num_aliquots_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_zcull_zcsize_r(u32 i)
+{
+ return 0x00500a04 + i*32;
+}
+static inline u32 gr_gpc0_zcull_zcsize_height_subregion__multiple_v(void)
+{
+ return 0x00000040;
+}
+static inline u32 gr_gpc0_zcull_zcsize_width_subregion__multiple_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 gr_gpc0_gpm_pd_active_tpcs_r(void)
+{
+ return 0x00500c08;
+}
+static inline u32 gr_gpc0_gpm_pd_active_tpcs_num_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpc0_gpm_pd_sm_id_r(u32 i)
+{
+ return 0x00500c10 + i*4;
+}
+static inline u32 gr_gpc0_gpm_pd_sm_id_id_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_r(u32 i)
+{
+ return 0x00500c30 + i*4;
+}
+static inline u32 gr_gpc0_gpm_pd_pes_tpc_id_mask_mask_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 gr_gpc0_gpm_sd_active_tpcs_r(void)
+{
+ return 0x00500c8c;
+}
+static inline u32 gr_gpc0_gpm_sd_active_tpcs_num_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpc0_tpc0_pe_cfg_smid_r(void)
+{
+ return 0x00504088;
+}
+static inline u32 gr_gpc0_tpc0_pe_cfg_smid_value_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_r(void)
+{
+ return 0x005044e8;
+}
+static inline u32 gr_gpc0_tpc0_l1c_cfg_smid_value_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_tpc0_sm_cfg_r(void)
+{
+ return 0x00504698;
+}
+static inline u32 gr_gpc0_tpc0_sm_cfg_sm_id_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_ppc0_pes_vsc_strem_r(void)
+{
+ return 0x00503018;
+}
+static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 gr_gpc0_ppc0_pes_vsc_strem_master_pe_true_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_r(void)
+{
+ return 0x005030c0;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_m(void)
+{
+ return 0xffff << 0;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_start_offset_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_size_f(u32 v)
+{
+ return (v & 0xfff) << 16;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_size_m(void)
+{
+ return 0xfff << 16;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_size_v(u32 r)
+{
+ return (r >> 16) & 0xfff;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_size_default_v(void)
+{
+ return 0x00000240;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_size_granularity_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg_timeslice_mode_f(u32 v)
+{
+ return (v & 0x1) << 28;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_r(void)
+{
+ return 0x005030e4;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_start_offset_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_f(u32 v)
+{
+ return (v & 0xfff) << 16;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_m(void)
+{
+ return 0xfff << 16;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_v(u32 r)
+{
+ return (r >> 16) & 0xfff;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_default_v(void)
+{
+ return 0x00000648;
+}
+static inline u32 gr_gpc0_ppc0_cbm_cfg2_size_granularity_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gr_gpccs_falcon_addr_r(void)
+{
+ return 0x0041a0ac;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_s(void)
+{
+ return 6;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_f(u32 v)
+{
+ return (v & 0x3f) << 0;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_m(void)
+{
+ return 0x3f << 0;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_v(u32 r)
+{
+ return (r >> 0) & 0x3f;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpccs_falcon_addr_lsb_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_s(void)
+{
+ return 6;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_f(u32 v)
+{
+ return (v & 0x3f) << 6;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_m(void)
+{
+ return 0x3f << 6;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_v(u32 r)
+{
+ return (r >> 6) & 0x3f;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpccs_falcon_addr_msb_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpccs_falcon_addr_ext_s(void)
+{
+ return 12;
+}
+static inline u32 gr_gpccs_falcon_addr_ext_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 gr_gpccs_falcon_addr_ext_m(void)
+{
+ return 0xfff << 0;
+}
+static inline u32 gr_gpccs_falcon_addr_ext_v(u32 r)
+{
+ return (r >> 0) & 0xfff;
+}
+static inline u32 gr_gpccs_cpuctl_r(void)
+{
+ return 0x0041a100;
+}
+static inline u32 gr_gpccs_cpuctl_startcpu_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 gr_gpccs_dmactl_r(void)
+{
+ return 0x0041a10c;
+}
+static inline u32 gr_gpccs_dmactl_require_ctx_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 gr_gpccs_dmactl_dmem_scrubbing_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 gr_gpccs_dmactl_imem_scrubbing_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 gr_gpccs_imemc_r(u32 i)
+{
+ return 0x0041a180 + i*16;
+}
+static inline u32 gr_gpccs_imemc_offs_f(u32 v)
+{
+ return (v & 0x3f) << 2;
+}
+static inline u32 gr_gpccs_imemc_blk_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_gpccs_imemc_aincw_f(u32 v)
+{
+ return (v & 0x1) << 24;
+}
+static inline u32 gr_gpccs_imemd_r(u32 i)
+{
+ return 0x0041a184 + i*16;
+}
+static inline u32 gr_gpccs_imemt_r(u32 i)
+{
+ return 0x0041a188 + i*16;
+}
+static inline u32 gr_gpccs_imemt__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 gr_gpccs_imemt_tag_f(u32 v)
+{
+ return (v & 0xffff) << 0;
+}
+static inline u32 gr_gpccs_dmemc_r(u32 i)
+{
+ return 0x0041a1c0 + i*8;
+}
+static inline u32 gr_gpccs_dmemc_offs_f(u32 v)
+{
+ return (v & 0x3f) << 2;
+}
+static inline u32 gr_gpccs_dmemc_blk_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_gpccs_dmemc_aincw_f(u32 v)
+{
+ return (v & 0x1) << 24;
+}
+static inline u32 gr_gpccs_dmemd_r(u32 i)
+{
+ return 0x0041a1c4 + i*8;
+}
+static inline u32 gr_gpccs_ctxsw_mailbox_r(u32 i)
+{
+ return 0x0041a800 + i*4;
+}
+static inline u32 gr_gpccs_ctxsw_mailbox_value_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_r(void)
+{
+ return 0x00418808;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_s(void)
+{
+ return 32;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_m(void)
+{
+ return 0xffffffff << 0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_base_addr_39_8_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_r(void)
+{
+ return 0x0041880c;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_s(void)
+{
+ return 11;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_f(u32 v)
+{
+ return (v & 0x7ff) << 0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_m(void)
+{
+ return 0x7ff << 0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_v(u32 r)
+{
+ return (r >> 0) & 0x7ff;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_v(void)
+{
+ return 0x00000018;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_div_256b__prod_f(void)
+{
+ return 0x18;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_s(void)
+{
+ return 1;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_false_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpcs_setup_bundle_cb_size_valid_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_gpcs_setup_attrib_cb_base_r(void)
+{
+ return 0x00418810;
+}
+static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 gr_gpcs_setup_attrib_cb_base_addr_39_12_align_bits_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 gr_gpcs_setup_attrib_cb_base_valid_true_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_crstr_gpc_map0_r(void)
+{
+ return 0x00418b08;
+}
+static inline u32 gr_crstr_gpc_map0_tile0_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map0_tile1_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map0_tile2_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map0_tile3_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map0_tile4_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map0_tile5_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_gpc_map1_r(void)
+{
+ return 0x00418b0c;
+}
+static inline u32 gr_crstr_gpc_map1_tile6_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map1_tile7_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map1_tile8_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map1_tile9_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map1_tile10_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map1_tile11_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_gpc_map2_r(void)
+{
+ return 0x00418b10;
+}
+static inline u32 gr_crstr_gpc_map2_tile12_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map2_tile13_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map2_tile14_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map2_tile15_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map2_tile16_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map2_tile17_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_gpc_map3_r(void)
+{
+ return 0x00418b14;
+}
+static inline u32 gr_crstr_gpc_map3_tile18_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map3_tile19_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map3_tile20_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map3_tile21_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map3_tile22_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map3_tile23_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_gpc_map4_r(void)
+{
+ return 0x00418b18;
+}
+static inline u32 gr_crstr_gpc_map4_tile24_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map4_tile25_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map4_tile26_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map4_tile27_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map4_tile28_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map4_tile29_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_gpc_map5_r(void)
+{
+ return 0x00418b1c;
+}
+static inline u32 gr_crstr_gpc_map5_tile30_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_crstr_gpc_map5_tile31_f(u32 v)
+{
+ return (v & 0x7) << 5;
+}
+static inline u32 gr_crstr_gpc_map5_tile32_f(u32 v)
+{
+ return (v & 0x7) << 10;
+}
+static inline u32 gr_crstr_gpc_map5_tile33_f(u32 v)
+{
+ return (v & 0x7) << 15;
+}
+static inline u32 gr_crstr_gpc_map5_tile34_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_crstr_gpc_map5_tile35_f(u32 v)
+{
+ return (v & 0x7) << 25;
+}
+static inline u32 gr_crstr_map_table_cfg_r(void)
+{
+ return 0x00418bb8;
+}
+static inline u32 gr_crstr_map_table_cfg_row_offset_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_crstr_map_table_cfg_num_entries_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_r(void)
+{
+ return 0x00418980;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_0_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_1_f(u32 v)
+{
+ return (v & 0x7) << 4;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_2_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_3_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_4_f(u32 v)
+{
+ return (v & 0x7) << 16;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_5_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_6_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map0_tile_7_f(u32 v)
+{
+ return (v & 0x7) << 28;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_r(void)
+{
+ return 0x00418984;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_8_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_9_f(u32 v)
+{
+ return (v & 0x7) << 4;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_10_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_11_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_12_f(u32 v)
+{
+ return (v & 0x7) << 16;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_13_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_14_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map1_tile_15_f(u32 v)
+{
+ return (v & 0x7) << 28;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_r(void)
+{
+ return 0x00418988;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_16_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_17_f(u32 v)
+{
+ return (v & 0x7) << 4;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_18_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_19_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_20_f(u32 v)
+{
+ return (v & 0x7) << 16;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_21_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_22_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_s(void)
+{
+ return 3;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_f(u32 v)
+{
+ return (v & 0x7) << 28;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_m(void)
+{
+ return 0x7 << 28;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map2_tile_23_v(u32 r)
+{
+ return (r >> 28) & 0x7;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_r(void)
+{
+ return 0x0041898c;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_24_f(u32 v)
+{
+ return (v & 0x7) << 0;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_25_f(u32 v)
+{
+ return (v & 0x7) << 4;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_26_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_27_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_28_f(u32 v)
+{
+ return (v & 0x7) << 16;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_29_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_30_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 gr_gpcs_zcull_sm_in_gpc_number_map3_tile_31_f(u32 v)
+{
+ return (v & 0x7) << 28;
+}
+static inline u32 gr_gpcs_gpm_pd_cfg_r(void)
+{
+ return 0x00418c6c;
+}
+static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_disable_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_gpm_pd_cfg_timeslice_mode_enable_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_gpcs_gcc_pagepool_base_r(void)
+{
+ return 0x00419004;
+}
+static inline u32 gr_gpcs_gcc_pagepool_base_addr_39_8_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 gr_gpcs_gcc_pagepool_r(void)
+{
+ return 0x00419008;
+}
+static inline u32 gr_gpcs_gcc_pagepool_total_pages_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_gpcs_tpcs_pe_vaf_r(void)
+{
+ return 0x0041980c;
+}
+static inline u32 gr_gpcs_tpcs_pe_vaf_fast_mode_switch_true_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_r(void)
+{
+ return 0x00419848;
+}
+static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_v_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_f(u32 v)
+{
+ return (v & 0x1) << 28;
+}
+static inline u32 gr_gpcs_tpcs_pe_pin_cb_global_base_addr_valid_true_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 gr_gpcs_tpcs_l1c_pm_r(void)
+{
+ return 0x00419ca8;
+}
+static inline u32 gr_gpcs_tpcs_l1c_pm_enable_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 gr_gpcs_tpcs_l1c_pm_enable_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_gpcs_tpcs_l1c_cfg_r(void)
+{
+ return 0x00419cb8;
+}
+static inline u32 gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_r(void)
+{
+ return 0x00419c00;
+}
+static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_tpcs_mpc_vtg_debug_timeslice_mode_enabled_f(void)
+{
+ return 0x8;
+}
+static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_r(void)
+{
+ return 0x00419e00;
+}
+static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_core_enable_m(void)
+{
+ return 0x1 << 7;
+}
+static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_core_enable_enable_f(void)
+{
+ return 0x80;
+}
+static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_m(void)
+{
+ return 0x1 << 15;
+}
+static inline u32 gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_enable_f(void)
+{
+ return 0x8000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_r(void)
+{
+ return 0x00419e44;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_stack_error_report_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_api_stack_error_report_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_ret_empty_stack_error_report_f(void)
+{
+ return 0x8;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_wrap_report_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_pc_report_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_pc_overflow_report_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_immc_addr_report_f(void)
+{
+ return 0x80;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_reg_report_f(void)
+{
+ return 0x100;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_encoding_report_f(void)
+{
+ return 0x200;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_sph_instr_combo_report_f(void)
+{
+ return 0x400;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param_report_f(void)
+{
+ return 0x800;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_report_f(void)
+{
+ return 0x1000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_reg_report_f(void)
+{
+ return 0x2000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_oor_addr_report_f(void)
+{
+ return 0x4000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_misaligned_addr_report_f(void)
+{
+ return 0x8000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_addr_space_report_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_illegal_instr_param2_report_f(void)
+{
+ return 0x20000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_invalid_const_addr_ldc_report_f(void)
+{
+ return 0x40000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_geometry_sm_error_report_f(void)
+{
+ return 0x80000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_warp_esr_report_mask_divergent_report_f(void)
+{
+ return 0x100000;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_r(void)
+{
+ return 0x00419e4c;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_sm_to_sm_fault_report_f(void)
+{
+ return 0x1;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_l1_error_report_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_multiple_warp_errors_report_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_physical_stack_overflow_error_report_f(void)
+{
+ return 0x8;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_int_report_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_bpt_pause_report_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_gpcs_tpcs_sm_hww_global_esr_report_mask_single_step_complete_report_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_r(void)
+{
+ return 0x0050450c;
+}
+static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_enabled_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_gpc0_tpc0_tpccs_tpc_exception_en_sm_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpc0_gpccs_gpc_exception_en_r(void)
+{
+ return 0x00502c94;
+}
+static inline u32 gr_gpc0_gpccs_gpc_exception_en_tpc_0_enabled_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_gpc0_gpccs_gpc_exception_en_tpc_0_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpcs_gpccs_gpc_exception_r(void)
+{
+ return 0x0041ac90;
+}
+static inline u32 gr_gpcs_gpccs_gpc_exception_tpc_v(u32 r)
+{
+ return (r >> 16) & 0xff;
+}
+static inline u32 gr_gpcs_gpccs_gpc_exception_tpc_0_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_r(void)
+{
+ return 0x00419d08;
+}
+static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_sm_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 gr_gpcs_tpcs_tpccs_tpc_exception_sm_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_r(void)
+{
+ return 0x00504610;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_debugger_mode_on_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_control0_stop_trigger_enable_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_r(void)
+{
+ return 0x0050460c;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 gr_gpc0_tpc0_sm_dbgr_status0_locked_down_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_r(void)
+{
+ return 0x00504650;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_int_pending_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_bpt_pause_pending_f(void)
+{
+ return 0x20;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_global_esr_single_step_complete_pending_f(void)
+{
+ return 0x40;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_r(void)
+{
+ return 0x00504648;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpc0_tpc0_sm_hww_warp_esr_error_none_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_gpc0_tpc0_sm_halfctl_ctrl_r(void)
+{
+ return 0x00504770;
+}
+static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_r(void)
+{
+ return 0x00419f70;
+}
+static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_enable_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_m(void)
+{
+ return 0x1 << 4;
+}
+static inline u32 gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_read_quad_ctl_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 gr_gpc0_tpc0_sm_debug_sfe_control_r(void)
+{
+ return 0x0050477c;
+}
+static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_r(void)
+{
+ return 0x00419f7c;
+}
+static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_read_half_ctl_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_m(void)
+{
+ return 0x1 << 16;
+}
+static inline u32 gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_enable_f(void)
+{
+ return 0x10000;
+}
+static inline u32 gr_gpcs_tpcs_sm_power_throttle_r(void)
+{
+ return 0x00419ed0;
+}
+static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_r(void)
+{
+ return 0x0041be08;
+}
+static inline u32 gr_gpcs_tpcs_pes_vsc_vpc_fast_mode_switch_true_f(void)
+{
+ return 0x4;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map0_r(void)
+{
+ return 0x0041bf00;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map1_r(void)
+{
+ return 0x0041bf04;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map2_r(void)
+{
+ return 0x0041bf08;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map3_r(void)
+{
+ return 0x0041bf0c;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map4_r(void)
+{
+ return 0x0041bf10;
+}
+static inline u32 gr_ppcs_wwdx_map_gpc_map5_r(void)
+{
+ return 0x0041bf14;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_r(void)
+{
+ return 0x0041bfd0;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_row_offset_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_num_entries_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_num_entries_f(u32 v)
+{
+ return (v & 0x1f) << 16;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_normalized_shift_value_f(u32 v)
+{
+ return (v & 0x7) << 21;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg_coeff5_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 24;
+}
+static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_r(void)
+{
+ return 0x0041bfd4;
+}
+static inline u32 gr_gpcs_ppcs_wwdx_sm_num_rcp_conservative_f(u32 v)
+{
+ return (v & 0xffffff) << 0;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_r(void)
+{
+ return 0x0041bfe4;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff6_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 0;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff7_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 5;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff8_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 10;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff9_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 15;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff10_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 20;
+}
+static inline u32 gr_ppcs_wwdx_map_table_cfg2_coeff11_mod_value_f(u32 v)
+{
+ return (v & 0x1f) << 25;
+}
+static inline u32 gr_gpcs_ppcs_cbm_cfg_r(void)
+{
+ return 0x0041bec0;
+}
+static inline u32 gr_gpcs_ppcs_cbm_cfg_timeslice_mode_enable_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 gr_bes_zrop_settings_r(void)
+{
+ return 0x00408850;
+}
+static inline u32 gr_bes_zrop_settings_num_active_fbps_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_bes_crop_settings_r(void)
+{
+ return 0x00408958;
+}
+static inline u32 gr_bes_crop_settings_num_active_fbps_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 gr_zcull_bytes_per_aliquot_per_gpu_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gr_zcull_save_restore_header_bytes_per_gpc_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 gr_zcull_save_restore_subregion_header_bytes_per_gpc_v(void)
+{
+ return 0x000000c0;
+}
+static inline u32 gr_zcull_subregion_qty_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel0_r(void)
+{
+ return 0x00504604;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control_sel1_r(void)
+{
+ return 0x00504608;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control0_r(void)
+{
+ return 0x0050465c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control1_r(void)
+{
+ return 0x00504660;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control2_r(void)
+{
+ return 0x00504664;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control3_r(void)
+{
+ return 0x00504668;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control4_r(void)
+{
+ return 0x0050466c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_control5_r(void)
+{
+ return 0x00504658;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status_r(void)
+{
+ return 0x00504670;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter_status1_r(void)
+{
+ return 0x00504694;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_control_r(void)
+{
+ return 0x00504730;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_control_r(void)
+{
+ return 0x00504734;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_control_r(void)
+{
+ return 0x00504738;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_control_r(void)
+{
+ return 0x0050473c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_control_r(void)
+{
+ return 0x00504740;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_control_r(void)
+{
+ return 0x00504744;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_control_r(void)
+{
+ return 0x00504748;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_control_r(void)
+{
+ return 0x0050474c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter0_r(void)
+{
+ return 0x00504674;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter1_r(void)
+{
+ return 0x00504678;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter2_r(void)
+{
+ return 0x0050467c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter3_r(void)
+{
+ return 0x00504680;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter4_r(void)
+{
+ return 0x00504684;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter5_r(void)
+{
+ return 0x00504688;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter6_r(void)
+{
+ return 0x0050468c;
+}
+static inline u32 gr_pri_gpc0_tpc0_sm_dsm_perf_counter7_r(void)
+{
+ return 0x00504690;
+}
+static inline u32 gr_fe_pwr_mode_r(void)
+{
+ return 0x00404170;
+}
+static inline u32 gr_fe_pwr_mode_mode_auto_f(void)
+{
+ return 0x0;
+}
+static inline u32 gr_fe_pwr_mode_mode_force_on_f(void)
+{
+ return 0x2;
+}
+static inline u32 gr_fe_pwr_mode_req_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 gr_fe_pwr_mode_req_send_f(void)
+{
+ return 0x10;
+}
+static inline u32 gr_fe_pwr_mode_req_done_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 gr_gpc0_tpc0_l1c_dbg_r(void)
+{
+ return 0x005044b0;
+}
+static inline u32 gr_gpc0_tpc0_l1c_dbg_cya15_en_f(void)
+{
+ return 0x8000000;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
new file mode 100644
index 000000000000..65221b59909a
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ltc_gk20a.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
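+/*
+ * Illustrative sketch only, not part of the generated interface: the
+ * helpers in this header are meant to be composed when building or
+ * decoding register values. Assuming the driver's register accessors
+ * gk20a_readl()/gk20a_writel() (from gk20a.h), a read-modify-write of a
+ * field <y> in a register <x> would look roughly like:
+ *
+ *   u32 reg = gk20a_readl(g, <x>_r());
+ *
+ *   reg = (reg & ~<x>_<y>_m()) | <x>_<y>_f(new_val);
+ *   gk20a_writel(g, <x>_r(), reg);
+ */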
+#ifndef _hw_ltc_gk20a_h_
+#define _hw_ltc_gk20a_h_
+
+static inline u32 ltc_ltcs_lts0_cbc_ctrl1_r(void)
+{
+ return 0x001410c8;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_r(void)
+{
+ return 0x00141104;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_ways_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_v(u32 r)
+{
+ return (r >> 16) & 0x3;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_all_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_half_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ltc_ltc0_lts0_tstg_cfg1_active_sets_quarter_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl1_r(void)
+{
+ return 0x0017e8c8;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl1_clear_active_f(void)
+{
+ return 0x4;
+}
+static inline u32 ltc_ltc0_lts0_cbc_ctrl1_r(void)
+{
+ return 0x0017e8c8;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl2_r(void)
+{
+ return 0x0017e8cc;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(u32 v)
+{
+ return (v & 0x1ffff) << 0;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl3_r(void)
+{
+ return 0x0017e8d0;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(u32 v)
+{
+ return (v & 0x1ffff) << 0;
+}
+static inline u32 ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v(void)
+{
+ return 0x0001ffff;
+}
+static inline u32 ltc_ltcs_ltss_cbc_base_r(void)
+{
+ return 0x0017e8d4;
+}
+static inline u32 ltc_ltcs_ltss_cbc_base_alignment_shift_v(void)
+{
+ return 0x0000000b;
+}
+static inline u32 ltc_ltcs_ltss_cbc_base_address_v(u32 r)
+{
+ return (r >> 0) & 0x3ffffff;
+}
+static inline u32 ltc_ltcs_ltss_cbc_param_r(void)
+{
+ return 0x0017e8dc;
+}
+static inline u32 ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(u32 r)
+{
+ return (r >> 0) & 0xffff;
+}
+static inline u32 ltc_ltcs_ltss_cbc_param_cache_line_size_v(u32 r)
+{
+ return (r >> 24) & 0xf;
+}
+static inline u32 ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(u32 r)
+{
+ return (r >> 28) & 0xf;
+}
+static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_r(void)
+{
+ return 0x0017e91c;
+}
+static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(u32 v)
+{
+ return (v & 0x1f) << 16;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_index_r(void)
+{
+ return 0x0017ea44;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_index_address_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(u32 i)
+{
+ return 0x0017ea48 + i*4;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(void)
+{
+ return 0x0017ea58;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_s(void)
+{
+ return 32;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_m(void)
+{
+ return 0xffffffff << 0;
+}
+static inline u32 ltc_ltcs_ltss_dstg_zbc_depth_clear_value_field_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_r(void)
+{
+ return 0x0017e924;
+}
+static inline u32 ltc_ltcs_ltss_tstg_set_mgmt_2_l2_bypass_mode_enabled_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 ltc_ltss_g_elpg_r(void)
+{
+ return 0x0017e828;
+}
+static inline u32 ltc_ltss_g_elpg_flush_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 ltc_ltss_g_elpg_flush_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ltc_ltss_g_elpg_flush_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 ltc_ltc0_ltss_intr_r(void)
+{
+ return 0x00140820;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h
new file mode 100644
index 000000000000..1692bb5430cb
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_mc_gk20a.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_mc_gk20a_h_
+#define _hw_mc_gk20a_h_
+
+static inline u32 mc_boot_0_r(void)
+{
+ return 0x00000000;
+}
+static inline u32 mc_boot_0_architecture_v(u32 r)
+{
+ return (r >> 24) & 0x1f;
+}
+static inline u32 mc_boot_0_implementation_v(u32 r)
+{
+ return (r >> 20) & 0xf;
+}
+static inline u32 mc_boot_0_major_revision_v(u32 r)
+{
+ return (r >> 4) & 0xf;
+}
+static inline u32 mc_boot_0_minor_revision_v(u32 r)
+{
+ return (r >> 0) & 0xf;
+}
+static inline u32 mc_intr_0_r(void)
+{
+ return 0x00000100;
+}
+static inline u32 mc_intr_0_pfifo_pending_f(void)
+{
+ return 0x100;
+}
+static inline u32 mc_intr_0_pgraph_pending_f(void)
+{
+ return 0x1000;
+}
+static inline u32 mc_intr_0_pmu_pending_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 mc_intr_0_ltc_pending_f(void)
+{
+ return 0x2000000;
+}
+static inline u32 mc_intr_0_priv_ring_pending_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 mc_intr_0_pbus_pending_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 mc_intr_1_r(void)
+{
+ return 0x00000104;
+}
+static inline u32 mc_intr_mask_0_r(void)
+{
+ return 0x00000640;
+}
+static inline u32 mc_intr_mask_0_pmu_enabled_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 mc_intr_mask_1_r(void)
+{
+ return 0x00000644;
+}
+static inline u32 mc_intr_mask_1_pmu_enabled_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 mc_intr_en_0_r(void)
+{
+ return 0x00000140;
+}
+static inline u32 mc_intr_en_0_inta_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 mc_intr_en_0_inta_hardware_f(void)
+{
+ return 0x1;
+}
+static inline u32 mc_intr_en_1_r(void)
+{
+ return 0x00000144;
+}
+static inline u32 mc_intr_en_1_inta_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 mc_intr_en_1_inta_hardware_f(void)
+{
+ return 0x1;
+}
+static inline u32 mc_enable_r(void)
+{
+ return 0x00000200;
+}
+static inline u32 mc_enable_xbar_enabled_f(void)
+{
+ return 0x4;
+}
+static inline u32 mc_enable_l2_enabled_f(void)
+{
+ return 0x8;
+}
+static inline u32 mc_enable_pmedia_s(void)
+{
+ return 1;
+}
+static inline u32 mc_enable_pmedia_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 mc_enable_pmedia_m(void)
+{
+ return 0x1 << 4;
+}
+static inline u32 mc_enable_pmedia_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 mc_enable_priv_ring_enabled_f(void)
+{
+ return 0x20;
+}
+static inline u32 mc_enable_ce0_m(void)
+{
+ return 0x1 << 6;
+}
+static inline u32 mc_enable_pfifo_enabled_f(void)
+{
+ return 0x100;
+}
+static inline u32 mc_enable_pgraph_enabled_f(void)
+{
+ return 0x1000;
+}
+static inline u32 mc_enable_pwr_v(u32 r)
+{
+ return (r >> 13) & 0x1;
+}
+static inline u32 mc_enable_pwr_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 mc_enable_pwr_enabled_f(void)
+{
+ return 0x2000;
+}
+static inline u32 mc_enable_pfb_enabled_f(void)
+{
+ return 0x100000;
+}
+static inline u32 mc_enable_ce2_m(void)
+{
+ return 0x1 << 21;
+}
+static inline u32 mc_enable_ce2_enabled_f(void)
+{
+ return 0x200000;
+}
+static inline u32 mc_enable_blg_enabled_f(void)
+{
+ return 0x8000000;
+}
+static inline u32 mc_enable_perfmon_enabled_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 mc_enable_hub_enabled_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 mc_enable_pb_r(void)
+{
+ return 0x00000204;
+}
+static inline u32 mc_enable_pb_0_s(void)
+{
+ return 1;
+}
+static inline u32 mc_enable_pb_0_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 mc_enable_pb_0_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 mc_enable_pb_0_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 mc_enable_pb_0_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 mc_enable_pb_sel_f(u32 v, u32 i)
+{
+ return (v & 0x1) << (0 + i*1);
+}
+#endif
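The naming scheme documented in the header comment above is easiest to see in use. The following sketch assumes hypothetical read32()/write32() MMIO helpers and placeholder interrupt handling; it shows the three common patterns: _r() for offsets, shifted _f() constants tested directly against a raw register value, and _v() extractors that pull a field down to bit 0.

#include <linux/types.h>	/* u32 */
#include "hw_mc_gk20a.h"

extern u32 read32(u32 offset);			/* assumed MMIO read helper */
extern void write32(u32 offset, u32 val);	/* assumed MMIO write helper */

static void mc_sketch(void)
{
	u32 boot0 = read32(mc_boot_0_r());
	u32 intr = read32(mc_intr_0_r());
	u32 arch = mc_boot_0_architecture_v(boot0);	/* field at bit 0 */
	u32 impl = mc_boot_0_implementation_v(boot0);

	/* _f() constants are already shifted into place, so they can
	 * be &'d against the raw interrupt status word. */
	if (intr & mc_intr_0_pfifo_pending_f()) {
		/* service FIFO */
	}
	if (intr & mc_intr_0_pgraph_pending_f()) {
		/* service GR */
	}

	/* Full register values are built by |'ing _f() constants. */
	write32(mc_enable_r(),
		mc_enable_pfifo_enabled_f() |
		mc_enable_pgraph_enabled_f() |
		mc_enable_priv_ring_enabled_f());

	(void)arch;
	(void)impl;
}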
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h
new file mode 100644
index 000000000000..df1a6d48541f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pbdma_gk20a.h
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_pbdma_gk20a_h_
+#define _hw_pbdma_gk20a_h_
+
+static inline u32 pbdma_gp_entry1_r(void)
+{
+ return 0x10000004;
+}
+static inline u32 pbdma_gp_entry1_get_hi_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 pbdma_gp_entry1_length_f(u32 v)
+{
+ return (v & 0x1fffff) << 10;
+}
+static inline u32 pbdma_gp_entry1_length_v(u32 r)
+{
+ return (r >> 10) & 0x1fffff;
+}
+static inline u32 pbdma_gp_base_r(u32 i)
+{
+ return 0x00040048 + i*8192;
+}
+static inline u32 pbdma_gp_base__size_1_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pbdma_gp_base_offset_f(u32 v)
+{
+ return (v & 0x1fffffff) << 3;
+}
+static inline u32 pbdma_gp_base_rsvd_s(void)
+{
+ return 3;
+}
+static inline u32 pbdma_gp_base_hi_r(u32 i)
+{
+ return 0x0004004c + i*8192;
+}
+static inline u32 pbdma_gp_base_hi_offset_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 pbdma_gp_base_hi_limit2_f(u32 v)
+{
+ return (v & 0x1f) << 16;
+}
+static inline u32 pbdma_gp_fetch_r(u32 i)
+{
+ return 0x00040050 + i*8192;
+}
+static inline u32 pbdma_gp_get_r(u32 i)
+{
+ return 0x00040014 + i*8192;
+}
+static inline u32 pbdma_gp_put_r(u32 i)
+{
+ return 0x00040000 + i*8192;
+}
+static inline u32 pbdma_pb_fetch_r(u32 i)
+{
+ return 0x00040054 + i*8192;
+}
+static inline u32 pbdma_pb_fetch_hi_r(u32 i)
+{
+ return 0x00040058 + i*8192;
+}
+static inline u32 pbdma_get_r(u32 i)
+{
+ return 0x00040018 + i*8192;
+}
+static inline u32 pbdma_get_hi_r(u32 i)
+{
+ return 0x0004001c + i*8192;
+}
+static inline u32 pbdma_put_r(u32 i)
+{
+ return 0x0004005c + i*8192;
+}
+static inline u32 pbdma_put_hi_r(u32 i)
+{
+ return 0x00040060 + i*8192;
+}
+static inline u32 pbdma_formats_r(u32 i)
+{
+ return 0x0004009c + i*8192;
+}
+static inline u32 pbdma_formats_gp_fermi0_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_formats_pb_fermi1_f(void)
+{
+ return 0x100;
+}
+static inline u32 pbdma_formats_mp_fermi0_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_syncpointa_r(u32 i)
+{
+ return 0x000400a4 + i*8192;
+}
+static inline u32 pbdma_syncpointa_payload_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 pbdma_syncpointb_r(u32 i)
+{
+ return 0x000400a8 + i*8192;
+}
+static inline u32 pbdma_syncpointb_op_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 pbdma_syncpointb_op_wait_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pbdma_syncpointb_wait_switch_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 pbdma_syncpointb_wait_switch_en_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pbdma_syncpointb_syncpt_index_v(u32 r)
+{
+ return (r >> 8) & 0xff;
+}
+static inline u32 pbdma_pb_header_r(u32 i)
+{
+ return 0x00040084 + i*8192;
+}
+static inline u32 pbdma_pb_header_priv_user_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_pb_header_method_zero_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_pb_header_subchannel_zero_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_pb_header_level_main_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_pb_header_first_true_f(void)
+{
+ return 0x400000;
+}
+static inline u32 pbdma_pb_header_type_inc_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 pbdma_subdevice_r(u32 i)
+{
+ return 0x00040094 + i*8192;
+}
+static inline u32 pbdma_subdevice_id_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 pbdma_subdevice_status_active_f(void)
+{
+ return 0x10000000;
+}
+static inline u32 pbdma_subdevice_channel_dma_enable_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 pbdma_method0_r(u32 i)
+{
+ return 0x000400c0 + i*8192;
+}
+static inline u32 pbdma_data0_r(u32 i)
+{
+ return 0x000400c4 + i*8192;
+}
+static inline u32 pbdma_target_r(u32 i)
+{
+ return 0x000400ac + i*8192;
+}
+static inline u32 pbdma_target_engine_sw_f(void)
+{
+ return 0x1f;
+}
+static inline u32 pbdma_acquire_r(u32 i)
+{
+ return 0x00040030 + i*8192;
+}
+static inline u32 pbdma_acquire_retry_man_2_f(void)
+{
+ return 0x2;
+}
+static inline u32 pbdma_acquire_retry_exp_2_f(void)
+{
+ return 0x100;
+}
+static inline u32 pbdma_acquire_timeout_exp_max_f(void)
+{
+ return 0x7800;
+}
+static inline u32 pbdma_acquire_timeout_man_max_f(void)
+{
+ return 0x7fff8000;
+}
+static inline u32 pbdma_acquire_timeout_en_disable_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_status_r(u32 i)
+{
+ return 0x00040100 + i*8192;
+}
+static inline u32 pbdma_channel_r(u32 i)
+{
+ return 0x00040120 + i*8192;
+}
+static inline u32 pbdma_signature_r(u32 i)
+{
+ return 0x00040010 + i*8192;
+}
+static inline u32 pbdma_signature_hw_valid_f(void)
+{
+ return 0xface;
+}
+static inline u32 pbdma_signature_sw_zero_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_userd_r(u32 i)
+{
+ return 0x00040008 + i*8192;
+}
+static inline u32 pbdma_userd_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 pbdma_userd_addr_f(u32 v)
+{
+ return (v & 0x7fffff) << 9;
+}
+static inline u32 pbdma_userd_hi_r(u32 i)
+{
+ return 0x0004000c + i*8192;
+}
+static inline u32 pbdma_userd_hi_addr_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 pbdma_hce_ctrl_r(u32 i)
+{
+ return 0x000400e4 + i*8192;
+}
+static inline u32 pbdma_hce_ctrl_hce_priv_mode_yes_f(void)
+{
+ return 0x20;
+}
+static inline u32 pbdma_intr_0_r(u32 i)
+{
+ return 0x00040108 + i*8192;
+}
+static inline u32 pbdma_intr_0_memreq_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 pbdma_intr_0_memreq_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 pbdma_intr_0_memack_timeout_pending_f(void)
+{
+ return 0x2;
+}
+static inline u32 pbdma_intr_0_memack_extra_pending_f(void)
+{
+ return 0x4;
+}
+static inline u32 pbdma_intr_0_memdat_timeout_pending_f(void)
+{
+ return 0x8;
+}
+static inline u32 pbdma_intr_0_memdat_extra_pending_f(void)
+{
+ return 0x10;
+}
+static inline u32 pbdma_intr_0_memflush_pending_f(void)
+{
+ return 0x20;
+}
+static inline u32 pbdma_intr_0_memop_pending_f(void)
+{
+ return 0x40;
+}
+static inline u32 pbdma_intr_0_lbconnect_pending_f(void)
+{
+ return 0x80;
+}
+static inline u32 pbdma_intr_0_lbreq_pending_f(void)
+{
+ return 0x100;
+}
+static inline u32 pbdma_intr_0_lback_timeout_pending_f(void)
+{
+ return 0x200;
+}
+static inline u32 pbdma_intr_0_lback_extra_pending_f(void)
+{
+ return 0x400;
+}
+static inline u32 pbdma_intr_0_lbdat_timeout_pending_f(void)
+{
+ return 0x800;
+}
+static inline u32 pbdma_intr_0_lbdat_extra_pending_f(void)
+{
+ return 0x1000;
+}
+static inline u32 pbdma_intr_0_gpfifo_pending_f(void)
+{
+ return 0x2000;
+}
+static inline u32 pbdma_intr_0_gpptr_pending_f(void)
+{
+ return 0x4000;
+}
+static inline u32 pbdma_intr_0_gpentry_pending_f(void)
+{
+ return 0x8000;
+}
+static inline u32 pbdma_intr_0_gpcrc_pending_f(void)
+{
+ return 0x10000;
+}
+static inline u32 pbdma_intr_0_pbptr_pending_f(void)
+{
+ return 0x20000;
+}
+static inline u32 pbdma_intr_0_pbentry_pending_f(void)
+{
+ return 0x40000;
+}
+static inline u32 pbdma_intr_0_pbcrc_pending_f(void)
+{
+ return 0x80000;
+}
+static inline u32 pbdma_intr_0_xbarconnect_pending_f(void)
+{
+ return 0x100000;
+}
+static inline u32 pbdma_intr_0_method_pending_f(void)
+{
+ return 0x200000;
+}
+static inline u32 pbdma_intr_0_methodcrc_pending_f(void)
+{
+ return 0x400000;
+}
+static inline u32 pbdma_intr_0_device_pending_f(void)
+{
+ return 0x800000;
+}
+static inline u32 pbdma_intr_0_semaphore_pending_f(void)
+{
+ return 0x2000000;
+}
+static inline u32 pbdma_intr_0_acquire_pending_f(void)
+{
+ return 0x4000000;
+}
+static inline u32 pbdma_intr_0_pri_pending_f(void)
+{
+ return 0x8000000;
+}
+static inline u32 pbdma_intr_0_no_ctxsw_seg_pending_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 pbdma_intr_0_pbseg_pending_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 pbdma_intr_0_signature_pending_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 pbdma_intr_1_r(u32 i)
+{
+ return 0x00040148 + i*8192;
+}
+static inline u32 pbdma_intr_en_0_r(u32 i)
+{
+ return 0x0004010c + i*8192;
+}
+static inline u32 pbdma_intr_en_0_lbreq_enabled_f(void)
+{
+ return 0x100;
+}
+static inline u32 pbdma_intr_en_1_r(u32 i)
+{
+ return 0x0004014c + i*8192;
+}
+static inline u32 pbdma_intr_stall_r(u32 i)
+{
+ return 0x0004013c + i*8192;
+}
+static inline u32 pbdma_intr_stall_lbreq_enabled_f(void)
+{
+ return 0x100;
+}
+static inline u32 pbdma_udma_nop_r(void)
+{
+ return 0x00000008;
+}
+#endif
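A short sketch of the per-unit indexing used throughout this header: registers that take a u32 i are PBDMA instances spaced 8192 bytes apart. The read32()/write32() helpers are hypothetical stand-ins for the driver's MMIO accessors, and write-1-to-clear acknowledgement is an assumption made for illustration.

#include <linux/types.h>	/* u32 */
#include "hw_pbdma_gk20a.h"

extern u32 read32(u32 offset);			/* assumed MMIO read helper */
extern void write32(u32 offset, u32 val);	/* assumed MMIO write helper */

static void pbdma_isr_sketch(u32 pbdma_id)
{
	/* pbdma_id selects the unit; the generated _r(i) applies the
	 * 8192-byte stride. */
	u32 intr = read32(pbdma_intr_0_r(pbdma_id));

	if (intr & pbdma_intr_0_acquire_pending_f()) {
		/* semaphore/syncpoint acquire timed out */
	}
	if (intr & (pbdma_intr_0_gpentry_pending_f() |
		    pbdma_intr_0_pbentry_pending_f())) {
		/* malformed GPFIFO/pushbuffer entry */
	}

	/* Acknowledge by writing the pending bits back (assumed
	 * write-1-to-clear semantics for this sketch). */
	write32(pbdma_intr_0_r(pbdma_id), intr);
}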
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h
new file mode 100644
index 000000000000..d40076139aa6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringmaster_gk20a.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_pri_ringmaster_gk20a_h_
+#define _hw_pri_ringmaster_gk20a_h_
+
+static inline u32 pri_ringmaster_command_r(void)
+{
+ return 0x0012004c;
+}
+static inline u32 pri_ringmaster_command_cmd_m(void)
+{
+ return 0x3f << 0;
+}
+static inline u32 pri_ringmaster_command_cmd_v(u32 r)
+{
+ return (r >> 0) & 0x3f;
+}
+static inline u32 pri_ringmaster_command_cmd_no_cmd_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringmaster_command_cmd_start_ring_f(void)
+{
+ return 0x1;
+}
+static inline u32 pri_ringmaster_command_cmd_ack_interrupt_f(void)
+{
+ return 0x2;
+}
+static inline u32 pri_ringmaster_command_cmd_enumerate_stations_f(void)
+{
+ return 0x3;
+}
+static inline u32 pri_ringmaster_command_cmd_enumerate_stations_bc_grp_all_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringmaster_command_data_r(void)
+{
+ return 0x00120048;
+}
+static inline u32 pri_ringmaster_start_results_r(void)
+{
+ return 0x00120050;
+}
+static inline u32 pri_ringmaster_start_results_connectivity_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 pri_ringmaster_start_results_connectivity_pass_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pri_ringmaster_intr_status0_r(void)
+{
+ return 0x00120058;
+}
+static inline u32 pri_ringmaster_intr_status1_r(void)
+{
+ return 0x0012005c;
+}
+static inline u32 pri_ringmaster_global_ctl_r(void)
+{
+ return 0x00120060;
+}
+static inline u32 pri_ringmaster_global_ctl_ring_reset_asserted_f(void)
+{
+ return 0x1;
+}
+static inline u32 pri_ringmaster_global_ctl_ring_reset_deasserted_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringmaster_enum_fbp_r(void)
+{
+ return 0x00120074;
+}
+static inline u32 pri_ringmaster_enum_fbp_count_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 pri_ringmaster_enum_gpc_r(void)
+{
+ return 0x00120078;
+}
+static inline u32 pri_ringmaster_enum_gpc_count_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+#endif
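A sketch of the command/enumeration pattern these accessors support, assuming hypothetical read32()/write32() helpers; the poll loop is illustrative only. It shows the read-modify-write idiom: ~_m() clears a field, _f() installs the new value.

#include <linux/types.h>	/* u32 */
#include "hw_pri_ringmaster_gk20a.h"

extern u32 read32(u32 offset);			/* assumed MMIO read helper */
extern void write32(u32 offset, u32 val);	/* assumed MMIO write helper */

static u32 ring_enumerate_sketch(void)
{
	u32 cmd = read32(pri_ringmaster_command_r());

	/* Clear the cmd field with the mask, then install the
	 * enumerate_stations command already shifted into place. */
	cmd &= ~pri_ringmaster_command_cmd_m();
	cmd |= pri_ringmaster_command_cmd_enumerate_stations_f();
	write32(pri_ringmaster_command_r(), cmd);

	/* Wait for the command field to return to no_cmd, then read
	 * back how many FBPs the ring enumerated. */
	while (pri_ringmaster_command_cmd_v(
			read32(pri_ringmaster_command_r())) !=
	       pri_ringmaster_command_cmd_no_cmd_v())
		;

	return pri_ringmaster_enum_fbp_count_v(
		read32(pri_ringmaster_enum_fbp_r()));
}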
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h
new file mode 100644
index 000000000000..db16a8de991e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_fbp_gk20a.h
@@ -0,0 +1,226 @@
+/*
+ * drivers/video/tegra/host/gk20a/hw_pri_ringstation_fbp_gk20a.h
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+
+#ifndef __hw_pri_ringstation_fbp_gk20a_h__
+#define __hw_pri_ringstation_fbp_gk20a_h__
+/* This file is autogenerated. Do not edit. */
+
+static inline u32 pri_ringstation_fbp_master_config_r(u32 i)
+{
+ return 0x00124300+((i)*4);
+}
+static inline u32 pri_ringstation_fbp_master_config__size_1_v(void)
+{
+ return 64;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_s(void)
+{
+ return 18;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_f(u32 v)
+{
+ return (v & 0x3ffff) << 0;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_m(void)
+{
+ return 0x3ffff << 0;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_v(u32 r)
+{
+ return (r >> 0) & 0x3ffff;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_i_v(void)
+{
+ return 0x00000064;
+}
+static inline u32 pri_ringstation_fbp_master_config_timeout_i_f(void)
+{
+ return 0x64;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_s(void)
+{
+ return 1;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_f(u32 v)
+{
+ return (v & 0x1) << 30;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_m(void)
+{
+ return 0x1 << 30;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_v(u32 r)
+{
+ return (r >> 30) & 0x1;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_error_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_error_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pri_ringstation_fbp_master_config_fs_action_soldier_on_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_s(void)
+{
+ return 1;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_error_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_error_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pri_ringstation_fbp_master_config_reset_action_soldier_on_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_m(void)
+{
+ return 0x7 << 20;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_v(u32 r)
+{
+ return (r >> 20) & 0x7;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_setup_clocks_i_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_m(void)
+{
+ return 0x7 << 24;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_v(u32 r)
+{
+ return (r >> 24) & 0x7;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_wait_clocks_i_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_f(u32 v)
+{
+ return (v & 0x7) << 27;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_m(void)
+{
+ return 0x7 << 27;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_v(u32 r)
+{
+ return (r >> 27) & 0x7;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_fbp_master_config_hold_clocks_i_f(void)
+{
+ return 0x0;
+}
+
+#endif /* __hw_pri_ringstation_fbp_gk20a_h__ */
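This header generates the full _s()/_f()/_m()/_v() quartet for each field; the sketch below (hypothetical read32()/write32() helpers, illustrative range check) shows how they cooperate: _s() gives the field width for validation, _m() clears the old value, _f() shifts the new one into place.

#include <linux/types.h>	/* u32 */
#include "hw_pri_ringstation_fbp_gk20a.h"

extern u32 read32(u32 offset);			/* assumed MMIO read helper */
extern void write32(u32 offset, u32 val);	/* assumed MMIO write helper */

static void fbp_station_timeout_sketch(u32 station, u32 timeout)
{
	u32 reg = read32(pri_ringstation_fbp_master_config_r(station));

	/* Reject values wider than the 18-bit field reported by _s(). */
	if (timeout >= (1u << pri_ringstation_fbp_master_config_timeout_s()))
		return;

	reg &= ~pri_ringstation_fbp_master_config_timeout_m();
	reg |= pri_ringstation_fbp_master_config_timeout_f(timeout);
	write32(pri_ringstation_fbp_master_config_r(station), reg);
}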
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h
new file mode 100644
index 000000000000..e8aad933336d
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_gpc_gk20a.h
@@ -0,0 +1,226 @@
+/*
+ * drivers/video/tegra/host/gk20a/hw_pri_ringstation_gpc_gk20a.h
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+
+#ifndef __hw_pri_ringstation_gpc_gk20a_h__
+#define __hw_pri_ringstation_gpc_gk20a_h__
+/* This file is autogenerated. Do not edit. */
+
+static inline u32 pri_ringstation_gpc_master_config_r(u32 i)
+{
+ return 0x00128300+((i)*4);
+}
+static inline u32 pri_ringstation_gpc_master_config__size_1_v(void)
+{
+ return 64;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_s(void)
+{
+ return 18;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_f(u32 v)
+{
+ return (v & 0x3ffff) << 0;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_m(void)
+{
+ return 0x3ffff << 0;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_v(u32 r)
+{
+ return (r >> 0) & 0x3ffff;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_i_v(void)
+{
+ return 0x00000064;
+}
+static inline u32 pri_ringstation_gpc_master_config_timeout_i_f(void)
+{
+ return 0x64;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_s(void)
+{
+ return 1;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_f(u32 v)
+{
+ return (v & 0x1) << 30;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_m(void)
+{
+ return 0x1 << 30;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_v(u32 r)
+{
+ return (r >> 30) & 0x1;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_error_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_error_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_soldier_on_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pri_ringstation_gpc_master_config_fs_action_soldier_on_f(void)
+{
+ return 0x40000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_s(void)
+{
+ return 1;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_error_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_error_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_soldier_on_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pri_ringstation_gpc_master_config_reset_action_soldier_on_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_f(u32 v)
+{
+ return (v & 0x7) << 20;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_m(void)
+{
+ return 0x7 << 20;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_v(u32 r)
+{
+ return (r >> 20) & 0x7;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_setup_clocks_i_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_f(u32 v)
+{
+ return (v & 0x7) << 24;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_m(void)
+{
+ return 0x7 << 24;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_v(u32 r)
+{
+ return (r >> 24) & 0x7;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_wait_clocks_i_f(void)
+{
+ return 0x0;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_s(void)
+{
+ return 3;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_f(u32 v)
+{
+ return (v & 0x7) << 27;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_m(void)
+{
+ return 0x7 << 27;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_v(u32 r)
+{
+ return (r >> 27) & 0x7;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_i_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pri_ringstation_gpc_master_config_hold_clocks_i_f(void)
+{
+ return 0x0;
+}
+
+#endif /* __hw_pri_ringstation_gpc_gk20a_h__ */
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h
new file mode 100644
index 000000000000..c281dd54dea9
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pri_ringstation_sys_gk20a.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_pri_ringstation_sys_gk20a_h_
+#define _hw_pri_ringstation_sys_gk20a_h_
+
+static inline u32 pri_ringstation_sys_master_config_r(u32 i)
+{
+ return 0x00122300 + i*4;
+}
+static inline u32 pri_ringstation_sys_decode_config_r(void)
+{
+ return 0x00122204;
+}
+static inline u32 pri_ringstation_sys_decode_config_ring_m(void)
+{
+ return 0x7 << 0;
+}
+static inline u32 pri_ringstation_sys_decode_config_ring_drop_on_ring_not_started_f(void)
+{
+ return 0x1;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h
new file mode 100644
index 000000000000..93c55c307c75
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_proj_gk20a.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_proj_gk20a_h_
+#define _hw_proj_gk20a_h_
+
+static inline u32 proj_gpc_base_v(void)
+{
+ return 0x00500000;
+}
+static inline u32 proj_gpc_shared_base_v(void)
+{
+ return 0x00418000;
+}
+static inline u32 proj_gpc_stride_v(void)
+{
+ return 0x00008000;
+}
+static inline u32 proj_ltc_stride_v(void)
+{
+ return 0x00002000;
+}
+static inline u32 proj_lts_stride_v(void)
+{
+ return 0x00000400;
+}
+static inline u32 proj_ppc_in_gpc_base_v(void)
+{
+ return 0x00003000;
+}
+static inline u32 proj_ppc_in_gpc_stride_v(void)
+{
+ return 0x00000200;
+}
+static inline u32 proj_rop_base_v(void)
+{
+ return 0x00410000;
+}
+static inline u32 proj_rop_shared_base_v(void)
+{
+ return 0x00408800;
+}
+static inline u32 proj_rop_stride_v(void)
+{
+ return 0x00000400;
+}
+static inline u32 proj_tpc_in_gpc_base_v(void)
+{
+ return 0x00004000;
+}
+static inline u32 proj_tpc_in_gpc_stride_v(void)
+{
+ return 0x00000800;
+}
+static inline u32 proj_tpc_in_gpc_shared_base_v(void)
+{
+ return 0x00001800;
+}
+static inline u32 proj_host_num_pbdma_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_tpc_per_gpc_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_fbps_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_gpcs_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_pes_per_gpc_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_tpcs_per_pes_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 proj_scal_litter_num_zcull_banks_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 proj_scal_max_gpcs_v(void)
+{
+ return 0x00000020;
+}
+static inline u32 proj_scal_max_tpc_per_gpc_v(void)
+{
+ return 0x00000008;
+}
+#endif
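Unlike the other headers, proj_* values describe chip topology (bases, strides, unit counts) rather than individual registers; per-unit addresses are formed as base + index * stride. A minimal illustration, with the helper name and offset parameter being hypothetical:

#include <linux/types.h>	/* u32 */
#include "hw_proj_gk20a.h"

/* Hypothetical helper: compute the unicast address of a register at
 * 'tpc_offset' within TPC 'tpc' of GPC 'gpc'. */
static u32 gpc_tpc_reg_addr(u32 gpc, u32 tpc, u32 tpc_offset)
{
	return proj_gpc_base_v() +
	       gpc * proj_gpc_stride_v() +
	       proj_tpc_in_gpc_base_v() +
	       tpc * proj_tpc_in_gpc_stride_v() +
	       tpc_offset;
}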
diff --git a/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h
new file mode 100644
index 000000000000..d7d26b806cd2
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_pwr_gk20a.h
@@ -0,0 +1,737 @@
+/*
+ * Copyright (c) 2012-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_pwr_gk20a_h_
+#define _hw_pwr_gk20a_h_
+
+static inline u32 pwr_falcon_irqsset_r(void)
+{
+ return 0x0010a000;
+}
+static inline u32 pwr_falcon_irqsset_swgen0_set_f(void)
+{
+ return 0x40;
+}
+static inline u32 pwr_falcon_irqsclr_r(void)
+{
+ return 0x0010a004;
+}
+static inline u32 pwr_falcon_irqstat_r(void)
+{
+ return 0x0010a008;
+}
+static inline u32 pwr_falcon_irqstat_halt_true_f(void)
+{
+ return 0x10;
+}
+static inline u32 pwr_falcon_irqstat_exterr_true_f(void)
+{
+ return 0x20;
+}
+static inline u32 pwr_falcon_irqstat_swgen0_true_f(void)
+{
+ return 0x40;
+}
+static inline u32 pwr_falcon_irqmode_r(void)
+{
+ return 0x0010a00c;
+}
+static inline u32 pwr_falcon_irqmset_r(void)
+{
+ return 0x0010a010;
+}
+static inline u32 pwr_falcon_irqmset_gptmr_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 pwr_falcon_irqmset_wdtmr_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 pwr_falcon_irqmset_mthd_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 pwr_falcon_irqmset_ctxsw_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 pwr_falcon_irqmset_halt_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 pwr_falcon_irqmset_exterr_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 pwr_falcon_irqmset_swgen0_f(u32 v)
+{
+ return (v & 0x1) << 6;
+}
+static inline u32 pwr_falcon_irqmset_swgen1_f(u32 v)
+{
+ return (v & 0x1) << 7;
+}
+static inline u32 pwr_falcon_irqmclr_r(void)
+{
+ return 0x0010a014;
+}
+static inline u32 pwr_falcon_irqmclr_gptmr_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 pwr_falcon_irqmclr_wdtmr_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 pwr_falcon_irqmclr_mthd_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 pwr_falcon_irqmclr_ctxsw_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 pwr_falcon_irqmclr_halt_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 pwr_falcon_irqmclr_exterr_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 pwr_falcon_irqmclr_swgen0_f(u32 v)
+{
+ return (v & 0x1) << 6;
+}
+static inline u32 pwr_falcon_irqmclr_swgen1_f(u32 v)
+{
+ return (v & 0x1) << 7;
+}
+static inline u32 pwr_falcon_irqmclr_ext_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 pwr_falcon_irqmask_r(void)
+{
+ return 0x0010a018;
+}
+static inline u32 pwr_falcon_irqdest_r(void)
+{
+ return 0x0010a01c;
+}
+static inline u32 pwr_falcon_irqdest_host_gptmr_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 pwr_falcon_irqdest_host_wdtmr_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 pwr_falcon_irqdest_host_mthd_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 pwr_falcon_irqdest_host_ctxsw_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 pwr_falcon_irqdest_host_halt_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 pwr_falcon_irqdest_host_exterr_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 pwr_falcon_irqdest_host_swgen0_f(u32 v)
+{
+ return (v & 0x1) << 6;
+}
+static inline u32 pwr_falcon_irqdest_host_swgen1_f(u32 v)
+{
+ return (v & 0x1) << 7;
+}
+static inline u32 pwr_falcon_irqdest_host_ext_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 pwr_falcon_irqdest_target_gptmr_f(u32 v)
+{
+ return (v & 0x1) << 16;
+}
+static inline u32 pwr_falcon_irqdest_target_wdtmr_f(u32 v)
+{
+ return (v & 0x1) << 17;
+}
+static inline u32 pwr_falcon_irqdest_target_mthd_f(u32 v)
+{
+ return (v & 0x1) << 18;
+}
+static inline u32 pwr_falcon_irqdest_target_ctxsw_f(u32 v)
+{
+ return (v & 0x1) << 19;
+}
+static inline u32 pwr_falcon_irqdest_target_halt_f(u32 v)
+{
+ return (v & 0x1) << 20;
+}
+static inline u32 pwr_falcon_irqdest_target_exterr_f(u32 v)
+{
+ return (v & 0x1) << 21;
+}
+static inline u32 pwr_falcon_irqdest_target_swgen0_f(u32 v)
+{
+ return (v & 0x1) << 22;
+}
+static inline u32 pwr_falcon_irqdest_target_swgen1_f(u32 v)
+{
+ return (v & 0x1) << 23;
+}
+static inline u32 pwr_falcon_irqdest_target_ext_f(u32 v)
+{
+ return (v & 0xff) << 24;
+}
+static inline u32 pwr_falcon_curctx_r(void)
+{
+ return 0x0010a050;
+}
+static inline u32 pwr_falcon_nxtctx_r(void)
+{
+ return 0x0010a054;
+}
+static inline u32 pwr_falcon_mailbox0_r(void)
+{
+ return 0x0010a040;
+}
+static inline u32 pwr_falcon_mailbox1_r(void)
+{
+ return 0x0010a044;
+}
+static inline u32 pwr_falcon_itfen_r(void)
+{
+ return 0x0010a048;
+}
+static inline u32 pwr_falcon_itfen_ctxen_enable_f(void)
+{
+ return 0x1;
+}
+static inline u32 pwr_falcon_idlestate_r(void)
+{
+ return 0x0010a04c;
+}
+static inline u32 pwr_falcon_idlestate_falcon_busy_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 pwr_falcon_idlestate_ext_busy_v(u32 r)
+{
+ return (r >> 1) & 0x7fff;
+}
+static inline u32 pwr_falcon_os_r(void)
+{
+ return 0x0010a080;
+}
+static inline u32 pwr_falcon_engctl_r(void)
+{
+ return 0x0010a0a4;
+}
+static inline u32 pwr_falcon_cpuctl_r(void)
+{
+ return 0x0010a100;
+}
+static inline u32 pwr_falcon_cpuctl_startcpu_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 pwr_falcon_bootvec_r(void)
+{
+ return 0x0010a104;
+}
+static inline u32 pwr_falcon_bootvec_vec_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 pwr_falcon_dmactl_r(void)
+{
+ return 0x0010a10c;
+}
+static inline u32 pwr_falcon_dmactl_dmem_scrubbing_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 pwr_falcon_dmactl_imem_scrubbing_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 pwr_falcon_hwcfg_r(void)
+{
+ return 0x0010a108;
+}
+static inline u32 pwr_falcon_hwcfg_imem_size_v(u32 r)
+{
+ return (r >> 0) & 0x1ff;
+}
+static inline u32 pwr_falcon_hwcfg_dmem_size_v(u32 r)
+{
+ return (r >> 9) & 0x1ff;
+}
+static inline u32 pwr_falcon_dmatrfbase_r(void)
+{
+ return 0x0010a110;
+}
+static inline u32 pwr_falcon_dmatrfmoffs_r(void)
+{
+ return 0x0010a114;
+}
+static inline u32 pwr_falcon_dmatrfcmd_r(void)
+{
+ return 0x0010a118;
+}
+static inline u32 pwr_falcon_dmatrfcmd_imem_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 pwr_falcon_dmatrfcmd_write_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 pwr_falcon_dmatrfcmd_size_f(u32 v)
+{
+ return (v & 0x7) << 8;
+}
+static inline u32 pwr_falcon_dmatrfcmd_ctxdma_f(u32 v)
+{
+ return (v & 0x7) << 12;
+}
+static inline u32 pwr_falcon_dmatrffboffs_r(void)
+{
+ return 0x0010a11c;
+}
+static inline u32 pwr_falcon_exterraddr_r(void)
+{
+ return 0x0010a168;
+}
+static inline u32 pwr_falcon_exterrstat_r(void)
+{
+ return 0x0010a16c;
+}
+static inline u32 pwr_falcon_exterrstat_valid_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 pwr_falcon_exterrstat_valid_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 pwr_falcon_exterrstat_valid_true_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_r(void)
+{
+ return 0x0010a200;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_s(void)
+{
+ return 4;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_f(u32 v)
+{
+ return (v & 0xf) << 0;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_m(void)
+{
+ return 0xf << 0;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_v(u32 r)
+{
+ return (r >> 0) & 0xf;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_rreg_f(void)
+{
+ return 0x8;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_opc_rstat_f(void)
+{
+ return 0xe;
+}
+static inline u32 pwr_pmu_falcon_icd_cmd_idx_f(u32 v)
+{
+ return (v & 0x1f) << 8;
+}
+static inline u32 pwr_pmu_falcon_icd_rdata_r(void)
+{
+ return 0x0010a20c;
+}
+static inline u32 pwr_falcon_dmemc_r(u32 i)
+{
+ return 0x0010a1c0 + i*8;
+}
+static inline u32 pwr_falcon_dmemc_offs_f(u32 v)
+{
+ return (v & 0x3f) << 2;
+}
+static inline u32 pwr_falcon_dmemc_offs_m(void)
+{
+ return 0x3f << 2;
+}
+static inline u32 pwr_falcon_dmemc_blk_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 pwr_falcon_dmemc_blk_m(void)
+{
+ return 0xff << 8;
+}
+static inline u32 pwr_falcon_dmemc_aincw_f(u32 v)
+{
+ return (v & 0x1) << 24;
+}
+static inline u32 pwr_falcon_dmemc_aincr_f(u32 v)
+{
+ return (v & 0x1) << 25;
+}
+static inline u32 pwr_falcon_dmemd_r(u32 i)
+{
+ return 0x0010a1c4 + i*8;
+}
+static inline u32 pwr_pmu_new_instblk_r(void)
+{
+ return 0x0010a480;
+}
+static inline u32 pwr_pmu_new_instblk_ptr_f(u32 v)
+{
+ return (v & 0xfffffff) << 0;
+}
+static inline u32 pwr_pmu_new_instblk_target_fb_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_pmu_new_instblk_target_sys_coh_f(void)
+{
+ return 0x20000000;
+}
+static inline u32 pwr_pmu_new_instblk_valid_f(u32 v)
+{
+ return (v & 0x1) << 30;
+}
+static inline u32 pwr_pmu_mutex_id_r(void)
+{
+ return 0x0010a488;
+}
+static inline u32 pwr_pmu_mutex_id_value_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 pwr_pmu_mutex_id_value_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pwr_pmu_mutex_id_value_not_avail_v(void)
+{
+ return 0x000000ff;
+}
+static inline u32 pwr_pmu_mutex_id_release_r(void)
+{
+ return 0x0010a48c;
+}
+static inline u32 pwr_pmu_mutex_id_release_value_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 pwr_pmu_mutex_id_release_value_m(void)
+{
+ return 0xff << 0;
+}
+static inline u32 pwr_pmu_mutex_id_release_value_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 pwr_pmu_mutex_id_release_value_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_pmu_mutex_r(u32 i)
+{
+ return 0x0010a580 + i*4;
+}
+static inline u32 pwr_pmu_mutex__size_1_v(void)
+{
+ return 0x00000010;
+}
+static inline u32 pwr_pmu_mutex_value_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 pwr_pmu_mutex_value_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 pwr_pmu_mutex_value_initial_lock_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_pmu_queue_head_r(u32 i)
+{
+ return 0x0010a4a0 + i*4;
+}
+static inline u32 pwr_pmu_queue_head__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 pwr_pmu_queue_head_address_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 pwr_pmu_queue_head_address_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 pwr_pmu_queue_tail_r(u32 i)
+{
+ return 0x0010a4b0 + i*4;
+}
+static inline u32 pwr_pmu_queue_tail__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 pwr_pmu_queue_tail_address_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 pwr_pmu_queue_tail_address_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 pwr_pmu_msgq_head_r(void)
+{
+ return 0x0010a4c8;
+}
+static inline u32 pwr_pmu_msgq_head_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 pwr_pmu_msgq_head_val_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 pwr_pmu_msgq_tail_r(void)
+{
+ return 0x0010a4cc;
+}
+static inline u32 pwr_pmu_msgq_tail_val_f(u32 v)
+{
+ return (v & 0xffffffff) << 0;
+}
+static inline u32 pwr_pmu_msgq_tail_val_v(u32 r)
+{
+ return (r >> 0) & 0xffffffff;
+}
+static inline u32 pwr_pmu_idle_mask_r(u32 i)
+{
+ return 0x0010a504 + i*16;
+}
+static inline u32 pwr_pmu_idle_mask_gr_enabled_f(void)
+{
+ return 0x1;
+}
+static inline u32 pwr_pmu_idle_mask_ce_2_enabled_f(void)
+{
+ return 0x200000;
+}
+static inline u32 pwr_pmu_idle_count_r(u32 i)
+{
+ return 0x0010a508 + i*16;
+}
+static inline u32 pwr_pmu_idle_count_value_f(u32 v)
+{
+ return (v & 0x7fffffff) << 0;
+}
+static inline u32 pwr_pmu_idle_count_value_v(u32 r)
+{
+ return (r >> 0) & 0x7fffffff;
+}
+static inline u32 pwr_pmu_idle_count_reset_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 pwr_pmu_idle_ctrl_r(u32 i)
+{
+ return 0x0010a50c + i*16;
+}
+static inline u32 pwr_pmu_idle_ctrl_value_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 pwr_pmu_idle_ctrl_value_busy_f(void)
+{
+ return 0x2;
+}
+static inline u32 pwr_pmu_idle_ctrl_value_always_f(void)
+{
+ return 0x3;
+}
+static inline u32 pwr_pmu_idle_ctrl_filter_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 pwr_pmu_idle_ctrl_filter_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_pmu_idle_mask_supp_r(u32 i)
+{
+ return 0x0010a9f0 + i*8;
+}
+static inline u32 pwr_pmu_idle_mask_1_supp_r(u32 i)
+{
+ return 0x0010a9f4 + i*8;
+}
+static inline u32 pwr_pmu_idle_ctrl_supp_r(u32 i)
+{
+ return 0x0010aa30 + i*8;
+}
+static inline u32 pwr_pmu_debug_r(u32 i)
+{
+ return 0x0010a5c0 + i*4;
+}
+static inline u32 pwr_pmu_debug__size_1_v(void)
+{
+ return 0x00000004;
+}
+static inline u32 pwr_pmu_mailbox_r(u32 i)
+{
+ return 0x0010a450 + i*4;
+}
+static inline u32 pwr_pmu_mailbox__size_1_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 pwr_pmu_bar0_addr_r(void)
+{
+ return 0x0010a7a0;
+}
+static inline u32 pwr_pmu_bar0_data_r(void)
+{
+ return 0x0010a7a4;
+}
+static inline u32 pwr_pmu_bar0_ctl_r(void)
+{
+ return 0x0010a7ac;
+}
+static inline u32 pwr_pmu_bar0_timeout_r(void)
+{
+ return 0x0010a7a8;
+}
+static inline u32 pwr_pmu_bar0_fecs_error_r(void)
+{
+ return 0x0010a988;
+}
+static inline u32 pwr_pmu_bar0_error_status_r(void)
+{
+ return 0x0010a7b0;
+}
+static inline u32 pwr_pmu_pg_idlefilth_r(u32 i)
+{
+ return 0x0010a6c0 + i*4;
+}
+static inline u32 pwr_pmu_pg_ppuidlefilth_r(u32 i)
+{
+ return 0x0010a6e8 + i*4;
+}
+static inline u32 pwr_pmu_pg_idle_cnt_r(u32 i)
+{
+ return 0x0010a710 + i*4;
+}
+static inline u32 pwr_pmu_pg_intren_r(u32 i)
+{
+ return 0x0010a760 + i*4;
+}
+static inline u32 pwr_fbif_transcfg_r(u32 i)
+{
+ return 0x0010a600 + i*4;
+}
+static inline u32 pwr_fbif_transcfg_target_local_fb_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_fbif_transcfg_target_coherent_sysmem_f(void)
+{
+ return 0x1;
+}
+static inline u32 pwr_fbif_transcfg_target_noncoherent_sysmem_f(void)
+{
+ return 0x2;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_s(void)
+{
+ return 1;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_virtual_f(void)
+{
+ return 0x0;
+}
+static inline u32 pwr_fbif_transcfg_mem_type_physical_f(void)
+{
+ return 0x4;
+}
+#endif
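A usage sketch (editorial, not part of the patch): the generated helpers above are meant to be OR'd together to build register values, and indexed registers are addressed through the _r(i)/__size_1_v() pair. The include name below is inferred from the pwr_ prefix and the helper function names are placeholders; only the accessors defined in this header are real.

#include <linux/types.h>
#include "hw_pwr_gk20a.h"	/* this header; name inferred from the pwr_ prefix */

static u32 pmu_fbif_vidmem_transcfg(void)
{
	/* Two independent fields of the same register OR'd together:
	 * physical accesses through this aperture target local FB. */
	return pwr_fbif_transcfg_target_local_fb_f() |
	       pwr_fbif_transcfg_mem_type_physical_f();
}

static u32 pmu_queue_tail_reg(u32 i)
{
	/* _r(i) is the MMIO offset of the i-th TAIL register;
	 * __size_1_v() says how many of them exist. */
	return i < pwr_pmu_queue_tail__size_1_v() ?
		pwr_pmu_queue_tail_r(i) : 0;
}

Emitting inline functions rather than macros keeps these accessors type-checked and free of argument re-evaluation, which is presumably why the generator takes this form.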
diff --git a/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h
new file mode 100644
index 000000000000..7eff3881e864
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_ram_gk20a.h
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_ram_gk20a_h_
+#define _hw_ram_gk20a_h_
+
+static inline u32 ram_in_ramfc_s(void)
+{
+ return 4096;
+}
+static inline u32 ram_in_ramfc_w(void)
+{
+ return 0;
+}
+static inline u32 ram_in_page_dir_base_target_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 ram_in_page_dir_base_target_w(void)
+{
+ return 128;
+}
+static inline u32 ram_in_page_dir_base_target_vid_mem_f(void)
+{
+ return 0x0;
+}
+static inline u32 ram_in_page_dir_base_vol_w(void)
+{
+ return 128;
+}
+static inline u32 ram_in_page_dir_base_vol_true_f(void)
+{
+ return 0x4;
+}
+static inline u32 ram_in_page_dir_base_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 ram_in_page_dir_base_lo_w(void)
+{
+ return 128;
+}
+static inline u32 ram_in_page_dir_base_hi_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 ram_in_page_dir_base_hi_w(void)
+{
+ return 129;
+}
+static inline u32 ram_in_adr_limit_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 ram_in_adr_limit_lo_w(void)
+{
+ return 130;
+}
+static inline u32 ram_in_adr_limit_hi_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 ram_in_adr_limit_hi_w(void)
+{
+ return 131;
+}
+static inline u32 ram_in_engine_cs_w(void)
+{
+ return 132;
+}
+static inline u32 ram_in_engine_cs_wfi_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 ram_in_engine_cs_wfi_f(void)
+{
+ return 0x0;
+}
+static inline u32 ram_in_engine_cs_fg_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ram_in_engine_cs_fg_f(void)
+{
+ return 0x8;
+}
+static inline u32 ram_in_gr_cs_w(void)
+{
+ return 132;
+}
+static inline u32 ram_in_gr_cs_wfi_f(void)
+{
+ return 0x0;
+}
+static inline u32 ram_in_gr_wfi_target_w(void)
+{
+ return 132;
+}
+static inline u32 ram_in_gr_wfi_mode_w(void)
+{
+ return 132;
+}
+static inline u32 ram_in_gr_wfi_mode_physical_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 ram_in_gr_wfi_mode_physical_f(void)
+{
+ return 0x0;
+}
+static inline u32 ram_in_gr_wfi_mode_virtual_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 ram_in_gr_wfi_mode_virtual_f(void)
+{
+ return 0x4;
+}
+static inline u32 ram_in_gr_wfi_ptr_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 ram_in_gr_wfi_ptr_lo_w(void)
+{
+ return 132;
+}
+static inline u32 ram_in_gr_wfi_ptr_hi_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 ram_in_gr_wfi_ptr_hi_w(void)
+{
+ return 133;
+}
+static inline u32 ram_in_base_shift_v(void)
+{
+ return 0x0000000c;
+}
+static inline u32 ram_in_alloc_size_v(void)
+{
+ return 0x00001000;
+}
+static inline u32 ram_fc_size_val_v(void)
+{
+ return 0x00000200;
+}
+static inline u32 ram_fc_gp_put_w(void)
+{
+ return 0;
+}
+static inline u32 ram_fc_userd_w(void)
+{
+ return 2;
+}
+static inline u32 ram_fc_userd_hi_w(void)
+{
+ return 3;
+}
+static inline u32 ram_fc_signature_w(void)
+{
+ return 4;
+}
+static inline u32 ram_fc_gp_get_w(void)
+{
+ return 5;
+}
+static inline u32 ram_fc_pb_get_w(void)
+{
+ return 6;
+}
+static inline u32 ram_fc_pb_get_hi_w(void)
+{
+ return 7;
+}
+static inline u32 ram_fc_pb_top_level_get_w(void)
+{
+ return 8;
+}
+static inline u32 ram_fc_pb_top_level_get_hi_w(void)
+{
+ return 9;
+}
+static inline u32 ram_fc_acquire_w(void)
+{
+ return 12;
+}
+static inline u32 ram_fc_semaphorea_w(void)
+{
+ return 14;
+}
+static inline u32 ram_fc_semaphoreb_w(void)
+{
+ return 15;
+}
+static inline u32 ram_fc_semaphorec_w(void)
+{
+ return 16;
+}
+static inline u32 ram_fc_semaphored_w(void)
+{
+ return 17;
+}
+static inline u32 ram_fc_gp_base_w(void)
+{
+ return 18;
+}
+static inline u32 ram_fc_gp_base_hi_w(void)
+{
+ return 19;
+}
+static inline u32 ram_fc_gp_fetch_w(void)
+{
+ return 20;
+}
+static inline u32 ram_fc_pb_fetch_w(void)
+{
+ return 21;
+}
+static inline u32 ram_fc_pb_fetch_hi_w(void)
+{
+ return 22;
+}
+static inline u32 ram_fc_pb_put_w(void)
+{
+ return 23;
+}
+static inline u32 ram_fc_pb_put_hi_w(void)
+{
+ return 24;
+}
+static inline u32 ram_fc_pb_header_w(void)
+{
+ return 33;
+}
+static inline u32 ram_fc_pb_count_w(void)
+{
+ return 34;
+}
+static inline u32 ram_fc_subdevice_w(void)
+{
+ return 37;
+}
+static inline u32 ram_fc_formats_w(void)
+{
+ return 39;
+}
+static inline u32 ram_fc_syncpointa_w(void)
+{
+ return 41;
+}
+static inline u32 ram_fc_syncpointb_w(void)
+{
+ return 42;
+}
+static inline u32 ram_fc_target_w(void)
+{
+ return 43;
+}
+static inline u32 ram_fc_hce_ctrl_w(void)
+{
+ return 57;
+}
+static inline u32 ram_fc_chid_w(void)
+{
+ return 58;
+}
+static inline u32 ram_fc_chid_id_f(u32 v)
+{
+ return (v & 0xfff) << 0;
+}
+static inline u32 ram_fc_chid_id_w(void)
+{
+ return 0;
+}
+static inline u32 ram_fc_eng_timeslice_w(void)
+{
+ return 62;
+}
+static inline u32 ram_fc_pb_timeslice_w(void)
+{
+ return 63;
+}
+static inline u32 ram_userd_base_shift_v(void)
+{
+ return 0x00000009;
+}
+static inline u32 ram_userd_chan_size_v(void)
+{
+ return 0x00000200;
+}
+static inline u32 ram_userd_put_w(void)
+{
+ return 16;
+}
+static inline u32 ram_userd_get_w(void)
+{
+ return 17;
+}
+static inline u32 ram_userd_ref_w(void)
+{
+ return 18;
+}
+static inline u32 ram_userd_put_hi_w(void)
+{
+ return 19;
+}
+static inline u32 ram_userd_ref_threshold_w(void)
+{
+ return 20;
+}
+static inline u32 ram_userd_top_level_get_w(void)
+{
+ return 22;
+}
+static inline u32 ram_userd_top_level_get_hi_w(void)
+{
+ return 23;
+}
+static inline u32 ram_userd_get_hi_w(void)
+{
+ return 24;
+}
+static inline u32 ram_userd_gp_get_w(void)
+{
+ return 34;
+}
+static inline u32 ram_userd_gp_put_w(void)
+{
+ return 35;
+}
+static inline u32 ram_userd_gp_top_level_get_w(void)
+{
+ return 22;
+}
+static inline u32 ram_userd_gp_top_level_get_hi_w(void)
+{
+ return 23;
+}
+static inline u32 ram_rl_entry_size_v(void)
+{
+ return 0x00000008;
+}
+#endif
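A usage sketch (editorial, not from the patch): unlike the MMIO headers, the _w() helpers here index 32-bit words of a CPU-mapped instance block. example_set_pdb(), inst_mem and pdb_addr are placeholder names; only the accessors defined above are real.

#include <linux/kernel.h>
#include <linux/types.h>
#include "hw_ram_gk20a.h"

static void example_set_pdb(u32 *inst_mem, u64 pdb_addr)
{
	u32 lo = lower_32_bits(pdb_addr) >> ram_in_base_shift_v();

	/* Target, volatile and address-low share word 128, so their
	 * _f() values are OR'd into a single word. */
	inst_mem[ram_in_page_dir_base_lo_w()] =
		ram_in_page_dir_base_target_vid_mem_f() |
		ram_in_page_dir_base_vol_true_f() |
		ram_in_page_dir_base_lo_f(lo);

	/* The high address bits land in the following word (129). */
	inst_mem[ram_in_page_dir_base_hi_w()] =
		ram_in_page_dir_base_hi_f(upper_32_bits(pdb_addr));
}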
diff --git a/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h
new file mode 100644
index 000000000000..b1e6658d2338
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h
@@ -0,0 +1,2150 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/hw_sim_gk20a.h
+ *
+ * Copyright (c) 2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+
+#ifndef __hw_sim_gk20a_h__
+#define __hw_sim_gk20a_h__
+/* This file is autogenerated. Do not edit. */
+
+static inline u32 sim_send_ring_r(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_target_s(void)
+{
+ return 2;
+}
+static inline u32 sim_send_ring_target_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 sim_send_ring_target_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 sim_send_ring_target_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 sim_send_ring_target_phys_init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_target_phys_init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_send_ring_target_phys__init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_target_phys__init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_send_ring_target_phys__prod_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_target_phys__prod_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_send_ring_target_phys_nvm_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_target_phys_nvm_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_send_ring_target_phys_pci_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_send_ring_target_phys_pci_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_send_ring_target_phys_pci_coherent_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_send_ring_target_phys_pci_coherent_f(void)
+{
+ return 0x3;
+}
+static inline u32 sim_send_ring_status_s(void)
+{
+ return 1;
+}
+static inline u32 sim_send_ring_status_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_send_ring_status_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_send_ring_status_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_send_ring_status_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_status_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_status__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_status__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_status__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_status__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_status_invalid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_status_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_status_valid_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_send_ring_size_s(void)
+{
+ return 2;
+}
+static inline u32 sim_send_ring_size_f(u32 v)
+{
+ return (v & 0x3) << 4;
+}
+static inline u32 sim_send_ring_size_m(void)
+{
+ return 0x3 << 4;
+}
+static inline u32 sim_send_ring_size_v(u32 r)
+{
+ return (r >> 4) & 0x3;
+}
+static inline u32 sim_send_ring_size_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_size_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_size__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_size__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_size__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_size__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_size_4kb_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_size_4kb_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_size_8kb_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_size_8kb_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_send_ring_size_12kb_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_send_ring_size_12kb_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_send_ring_size_16kb_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_send_ring_size_16kb_f(void)
+{
+ return 0x30;
+}
+static inline u32 sim_send_ring_gp_in_ring_s(void)
+{
+ return 1;
+}
+static inline u32 sim_send_ring_gp_in_ring_f(u32 v)
+{
+ return (v & 0x1) << 11;
+}
+static inline u32 sim_send_ring_gp_in_ring_m(void)
+{
+ return 0x1 << 11;
+}
+static inline u32 sim_send_ring_gp_in_ring_v(u32 r)
+{
+ return (r >> 11) & 0x1;
+}
+static inline u32 sim_send_ring_gp_in_ring__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_gp_in_ring__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_gp_in_ring__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_gp_in_ring__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_gp_in_ring_no_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_gp_in_ring_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_gp_in_ring_yes_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_send_ring_gp_in_ring_yes_f(void)
+{
+ return 0x800;
+}
+static inline u32 sim_send_ring_addr_lo_s(void)
+{
+ return 20;
+}
+static inline u32 sim_send_ring_addr_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 sim_send_ring_addr_lo_m(void)
+{
+ return 0xfffff << 12;
+}
+static inline u32 sim_send_ring_addr_lo_v(u32 r)
+{
+ return (r >> 12) & 0xfffff;
+}
+static inline u32 sim_send_ring_addr_lo__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_addr_lo__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_addr_lo__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_addr_lo__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_hi_r(void)
+{
+ return 0x00000004;
+}
+static inline u32 sim_send_ring_hi_addr_s(void)
+{
+ return 20;
+}
+static inline u32 sim_send_ring_hi_addr_f(u32 v)
+{
+ return (v & 0xfffff) << 0;
+}
+static inline u32 sim_send_ring_hi_addr_m(void)
+{
+ return 0xfffff << 0;
+}
+static inline u32 sim_send_ring_hi_addr_v(u32 r)
+{
+ return (r >> 0) & 0xfffff;
+}
+static inline u32 sim_send_ring_hi_addr__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_hi_addr__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_ring_hi_addr__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_send_ring_hi_addr__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_send_put_r(void)
+{
+ return 0x00000008;
+}
+static inline u32 sim_send_put_pointer_s(void)
+{
+ return 29;
+}
+static inline u32 sim_send_put_pointer_f(u32 v)
+{
+ return (v & 0x1fffffff) << 3;
+}
+static inline u32 sim_send_put_pointer_m(void)
+{
+ return 0x1fffffff << 3;
+}
+static inline u32 sim_send_put_pointer_v(u32 r)
+{
+ return (r >> 3) & 0x1fffffff;
+}
+static inline u32 sim_send_get_r(void)
+{
+ return 0x0000000c;
+}
+static inline u32 sim_send_get_pointer_s(void)
+{
+ return 29;
+}
+static inline u32 sim_send_get_pointer_f(u32 v)
+{
+ return (v & 0x1fffffff) << 3;
+}
+static inline u32 sim_send_get_pointer_m(void)
+{
+ return 0x1fffffff << 3;
+}
+static inline u32 sim_send_get_pointer_v(u32 r)
+{
+ return (r >> 3) & 0x1fffffff;
+}
+static inline u32 sim_recv_ring_r(void)
+{
+ return 0x00000010;
+}
+static inline u32 sim_recv_ring_target_s(void)
+{
+ return 2;
+}
+static inline u32 sim_recv_ring_target_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 sim_recv_ring_target_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 sim_recv_ring_target_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 sim_recv_ring_target_phys_init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_target_phys_init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_recv_ring_target_phys__init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_target_phys__init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_recv_ring_target_phys__prod_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_target_phys__prod_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_recv_ring_target_phys_nvm_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_target_phys_nvm_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_recv_ring_target_phys_pci_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_recv_ring_target_phys_pci_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_recv_ring_target_phys_pci_coherent_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_recv_ring_target_phys_pci_coherent_f(void)
+{
+ return 0x3;
+}
+static inline u32 sim_recv_ring_status_s(void)
+{
+ return 1;
+}
+static inline u32 sim_recv_ring_status_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_recv_ring_status_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_recv_ring_status_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_recv_ring_status_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_status_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_status__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_status__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_status__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_status__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_status_invalid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_status_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_status_valid_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_recv_ring_size_s(void)
+{
+ return 2;
+}
+static inline u32 sim_recv_ring_size_f(u32 v)
+{
+ return (v & 0x3) << 4;
+}
+static inline u32 sim_recv_ring_size_m(void)
+{
+ return 0x3 << 4;
+}
+static inline u32 sim_recv_ring_size_v(u32 r)
+{
+ return (r >> 4) & 0x3;
+}
+static inline u32 sim_recv_ring_size_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_size_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_size__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_size__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_size__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_size__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_size_4kb_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_size_4kb_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_size_8kb_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_size_8kb_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_recv_ring_size_12kb_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_recv_ring_size_12kb_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_recv_ring_size_16kb_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_recv_ring_size_16kb_f(void)
+{
+ return 0x30;
+}
+static inline u32 sim_recv_ring_gp_in_ring_s(void)
+{
+ return 1;
+}
+static inline u32 sim_recv_ring_gp_in_ring_f(u32 v)
+{
+ return (v & 0x1) << 11;
+}
+static inline u32 sim_recv_ring_gp_in_ring_m(void)
+{
+ return 0x1 << 11;
+}
+static inline u32 sim_recv_ring_gp_in_ring_v(u32 r)
+{
+ return (r >> 11) & 0x1;
+}
+static inline u32 sim_recv_ring_gp_in_ring__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_gp_in_ring__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_gp_in_ring__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_gp_in_ring__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_gp_in_ring_no_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_gp_in_ring_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_gp_in_ring_yes_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_recv_ring_gp_in_ring_yes_f(void)
+{
+ return 0x800;
+}
+static inline u32 sim_recv_ring_addr_lo_s(void)
+{
+ return 20;
+}
+static inline u32 sim_recv_ring_addr_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 sim_recv_ring_addr_lo_m(void)
+{
+ return 0xfffff << 12;
+}
+static inline u32 sim_recv_ring_addr_lo_v(u32 r)
+{
+ return (r >> 12) & 0xfffff;
+}
+static inline u32 sim_recv_ring_addr_lo__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_addr_lo__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_addr_lo__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_addr_lo__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_hi_r(void)
+{
+ return 0x00000014;
+}
+static inline u32 sim_recv_ring_hi_addr_s(void)
+{
+ return 20;
+}
+static inline u32 sim_recv_ring_hi_addr_f(u32 v)
+{
+ return (v & 0xfffff) << 0;
+}
+static inline u32 sim_recv_ring_hi_addr_m(void)
+{
+ return 0xfffff << 0;
+}
+static inline u32 sim_recv_ring_hi_addr_v(u32 r)
+{
+ return (r >> 0) & 0xfffff;
+}
+static inline u32 sim_recv_ring_hi_addr__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_hi_addr__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_ring_hi_addr__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_recv_ring_hi_addr__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_recv_put_r(void)
+{
+ return 0x00000018;
+}
+static inline u32 sim_recv_put_pointer_s(void)
+{
+ return 11;
+}
+static inline u32 sim_recv_put_pointer_f(u32 v)
+{
+ return (v & 0x7ff) << 3;
+}
+static inline u32 sim_recv_put_pointer_m(void)
+{
+ return 0x7ff << 3;
+}
+static inline u32 sim_recv_put_pointer_v(u32 r)
+{
+ return (r >> 3) & 0x7ff;
+}
+static inline u32 sim_recv_get_r(void)
+{
+ return 0x0000001c;
+}
+static inline u32 sim_recv_get_pointer_s(void)
+{
+ return 11;
+}
+static inline u32 sim_recv_get_pointer_f(u32 v)
+{
+ return (v & 0x7ff) << 3;
+}
+static inline u32 sim_recv_get_pointer_m(void)
+{
+ return 0x7ff << 3;
+}
+static inline u32 sim_recv_get_pointer_v(u32 r)
+{
+ return (r >> 3) & 0x7ff;
+}
+static inline u32 sim_config_r(void)
+{
+ return 0x00000020;
+}
+static inline u32 sim_config_mode_s(void)
+{
+ return 1;
+}
+static inline u32 sim_config_mode_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 sim_config_mode_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 sim_config_mode_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 sim_config_mode_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_config_mode_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_config_mode_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_mode_enabled_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_config_channels_s(void)
+{
+ return 7;
+}
+static inline u32 sim_config_channels_f(u32 v)
+{
+ return (v & 0x7f) << 1;
+}
+static inline u32 sim_config_channels_m(void)
+{
+ return 0x7f << 1;
+}
+static inline u32 sim_config_channels_v(u32 r)
+{
+ return (r >> 1) & 0x7f;
+}
+static inline u32 sim_config_channels_none_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_config_channels_none_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_config_cached_only_s(void)
+{
+ return 1;
+}
+static inline u32 sim_config_cached_only_f(u32 v)
+{
+ return (v & 0x1) << 8;
+}
+static inline u32 sim_config_cached_only_m(void)
+{
+ return 0x1 << 8;
+}
+static inline u32 sim_config_cached_only_v(u32 r)
+{
+ return (r >> 8) & 0x1;
+}
+static inline u32 sim_config_cached_only_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_config_cached_only_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_config_cached_only_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_cached_only_enabled_f(void)
+{
+ return 0x100;
+}
+static inline u32 sim_config_validity_s(void)
+{
+ return 2;
+}
+static inline u32 sim_config_validity_f(u32 v)
+{
+ return (v & 0x3) << 9;
+}
+static inline u32 sim_config_validity_m(void)
+{
+ return 0x3 << 9;
+}
+static inline u32 sim_config_validity_v(u32 r)
+{
+ return (r >> 9) & 0x3;
+}
+static inline u32 sim_config_validity__init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_validity__init_f(void)
+{
+ return 0x200;
+}
+static inline u32 sim_config_validity_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_validity_valid_f(void)
+{
+ return 0x200;
+}
+static inline u32 sim_config_simulation_s(void)
+{
+ return 2;
+}
+static inline u32 sim_config_simulation_f(u32 v)
+{
+ return (v & 0x3) << 12;
+}
+static inline u32 sim_config_simulation_m(void)
+{
+ return 0x3 << 12;
+}
+static inline u32 sim_config_simulation_v(u32 r)
+{
+ return (r >> 12) & 0x3;
+}
+static inline u32 sim_config_simulation_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_config_simulation_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_config_simulation_fmodel_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_simulation_fmodel_f(void)
+{
+ return 0x1000;
+}
+static inline u32 sim_config_simulation_rtlsim_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_config_simulation_rtlsim_f(void)
+{
+ return 0x2000;
+}
+static inline u32 sim_config_secondary_display_s(void)
+{
+ return 1;
+}
+static inline u32 sim_config_secondary_display_f(u32 v)
+{
+ return (v & 0x1) << 14;
+}
+static inline u32 sim_config_secondary_display_m(void)
+{
+ return 0x1 << 14;
+}
+static inline u32 sim_config_secondary_display_v(u32 r)
+{
+ return (r >> 14) & 0x1;
+}
+static inline u32 sim_config_secondary_display_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_config_secondary_display_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_config_secondary_display_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_config_secondary_display_enabled_f(void)
+{
+ return 0x4000;
+}
+static inline u32 sim_config_num_heads_s(void)
+{
+ return 8;
+}
+static inline u32 sim_config_num_heads_f(u32 v)
+{
+ return (v & 0xff) << 17;
+}
+static inline u32 sim_config_num_heads_m(void)
+{
+ return 0xff << 17;
+}
+static inline u32 sim_config_num_heads_v(u32 r)
+{
+ return (r >> 17) & 0xff;
+}
+static inline u32 sim_event_ring_r(void)
+{
+ return 0x00000030;
+}
+static inline u32 sim_event_ring_target_s(void)
+{
+ return 2;
+}
+static inline u32 sim_event_ring_target_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 sim_event_ring_target_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 sim_event_ring_target_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 sim_event_ring_target_phys_init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_target_phys_init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_event_ring_target_phys__init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_target_phys__init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_event_ring_target_phys__prod_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_target_phys__prod_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_event_ring_target_phys_nvm_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_target_phys_nvm_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_event_ring_target_phys_pci_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_event_ring_target_phys_pci_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_event_ring_target_phys_pci_coherent_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_event_ring_target_phys_pci_coherent_f(void)
+{
+ return 0x3;
+}
+static inline u32 sim_event_ring_status_s(void)
+{
+ return 1;
+}
+static inline u32 sim_event_ring_status_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_event_ring_status_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_event_ring_status_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_event_ring_status_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_status_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_status__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_status__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_status__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_status__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_status_invalid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_status_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_status_valid_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_event_ring_size_s(void)
+{
+ return 2;
+}
+static inline u32 sim_event_ring_size_f(u32 v)
+{
+ return (v & 0x3) << 4;
+}
+static inline u32 sim_event_ring_size_m(void)
+{
+ return 0x3 << 4;
+}
+static inline u32 sim_event_ring_size_v(u32 r)
+{
+ return (r >> 4) & 0x3;
+}
+static inline u32 sim_event_ring_size_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_size_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_size__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_size__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_size__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_size__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_size_4kb_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_size_4kb_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_size_8kb_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_size_8kb_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_event_ring_size_12kb_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_event_ring_size_12kb_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_event_ring_size_16kb_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_event_ring_size_16kb_f(void)
+{
+ return 0x30;
+}
+static inline u32 sim_event_ring_gp_in_ring_s(void)
+{
+ return 1;
+}
+static inline u32 sim_event_ring_gp_in_ring_f(u32 v)
+{
+ return (v & 0x1) << 11;
+}
+static inline u32 sim_event_ring_gp_in_ring_m(void)
+{
+ return 0x1 << 11;
+}
+static inline u32 sim_event_ring_gp_in_ring_v(u32 r)
+{
+ return (r >> 11) & 0x1;
+}
+static inline u32 sim_event_ring_gp_in_ring__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_gp_in_ring__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_gp_in_ring__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_gp_in_ring__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_gp_in_ring_no_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_gp_in_ring_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_gp_in_ring_yes_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_event_ring_gp_in_ring_yes_f(void)
+{
+ return 0x800;
+}
+static inline u32 sim_event_ring_addr_lo_s(void)
+{
+ return 20;
+}
+static inline u32 sim_event_ring_addr_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 sim_event_ring_addr_lo_m(void)
+{
+ return 0xfffff << 12;
+}
+static inline u32 sim_event_ring_addr_lo_v(u32 r)
+{
+ return (r >> 12) & 0xfffff;
+}
+static inline u32 sim_event_ring_addr_lo__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_addr_lo__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_addr_lo__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_addr_lo__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_hi_v(void)
+{
+ return 0x00000034;
+}
+static inline u32 sim_event_ring_hi_addr_s(void)
+{
+ return 20;
+}
+static inline u32 sim_event_ring_hi_addr_f(u32 v)
+{
+ return (v & 0xfffff) << 0;
+}
+static inline u32 sim_event_ring_hi_addr_m(void)
+{
+ return 0xfffff << 0;
+}
+static inline u32 sim_event_ring_hi_addr_v(u32 r)
+{
+ return (r >> 0) & 0xfffff;
+}
+static inline u32 sim_event_ring_hi_addr__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_hi_addr__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_ring_hi_addr__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_event_ring_hi_addr__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_event_put_r(void)
+{
+ return 0x00000038;
+}
+static inline u32 sim_event_put_pointer_s(void)
+{
+ return 30;
+}
+static inline u32 sim_event_put_pointer_f(u32 v)
+{
+ return (v & 0x3fffffff) << 2;
+}
+static inline u32 sim_event_put_pointer_m(void)
+{
+ return 0x3fffffff << 2;
+}
+static inline u32 sim_event_put_pointer_v(u32 r)
+{
+ return (r >> 2) & 0x3fffffff;
+}
+static inline u32 sim_event_get_r(void)
+{
+ return 0x0000003c;
+}
+static inline u32 sim_event_get_pointer_s(void)
+{
+ return 30;
+}
+static inline u32 sim_event_get_pointer_f(u32 v)
+{
+ return (v & 0x3fffffff) << 2;
+}
+static inline u32 sim_event_get_pointer_m(void)
+{
+ return 0x3fffffff << 2;
+}
+static inline u32 sim_event_get_pointer_v(u32 r)
+{
+ return (r >> 2) & 0x3fffffff;
+}
+static inline u32 sim_status_r(void)
+{
+ return 0x00000028;
+}
+static inline u32 sim_status_send_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_send_put_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 sim_status_send_put_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 sim_status_send_put_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 sim_status_send_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_send_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_send_put_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_send_put_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_send_put_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_send_put_pending_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_status_send_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_send_get_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 sim_status_send_get_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 sim_status_send_get_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 sim_status_send_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_send_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_send_get_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_send_get_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_send_get_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_send_get_pending_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_status_send_get_clear_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_send_get_clear_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_status_recv_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_recv_put_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 sim_status_recv_put_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 sim_status_recv_put_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 sim_status_recv_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_recv_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_recv_put_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_recv_put_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_recv_put_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_recv_put_pending_f(void)
+{
+ return 0x4;
+}
+static inline u32 sim_status_recv_put_clear_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_recv_put_clear_f(void)
+{
+ return 0x4;
+}
+static inline u32 sim_status_recv_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_recv_get_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_status_recv_get_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_status_recv_get_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_status_recv_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_recv_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_recv_get_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_recv_get_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_recv_get_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_recv_get_pending_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_status_event_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_event_put_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 sim_status_event_put_m(void)
+{
+ return 0x1 << 4;
+}
+static inline u32 sim_status_event_put_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 sim_status_event_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_event_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_event_put_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_event_put_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_event_put_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_event_put_pending_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_status_event_put_clear_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_event_put_clear_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_status_event_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_status_event_get_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 sim_status_event_get_m(void)
+{
+ return 0x1 << 5;
+}
+static inline u32 sim_status_event_get_v(u32 r)
+{
+ return (r >> 5) & 0x1;
+}
+static inline u32 sim_status_event_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_event_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_event_get_idle_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_status_event_get_idle_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_status_event_get_pending_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_status_event_get_pending_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_control_r(void)
+{
+ return 0x0000002c;
+}
+static inline u32 sim_control_send_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_send_put_f(u32 v)
+{
+ return (v & 0x1) << 0;
+}
+static inline u32 sim_control_send_put_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 sim_control_send_put_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 sim_control_send_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_send_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_send_put_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_send_put_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_send_put_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_send_put_enabled_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_control_send_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_send_get_f(u32 v)
+{
+ return (v & 0x1) << 1;
+}
+static inline u32 sim_control_send_get_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 sim_control_send_get_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 sim_control_send_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_send_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_send_get_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_send_get_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_send_get_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_send_get_enabled_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_control_recv_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_recv_put_f(u32 v)
+{
+ return (v & 0x1) << 2;
+}
+static inline u32 sim_control_recv_put_m(void)
+{
+ return 0x1 << 2;
+}
+static inline u32 sim_control_recv_put_v(u32 r)
+{
+ return (r >> 2) & 0x1;
+}
+static inline u32 sim_control_recv_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_recv_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_recv_put_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_recv_put_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_recv_put_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_recv_put_enabled_f(void)
+{
+ return 0x4;
+}
+static inline u32 sim_control_recv_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_recv_get_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_control_recv_get_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_control_recv_get_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_control_recv_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_recv_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_recv_get_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_recv_get_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_recv_get_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_recv_get_enabled_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_control_event_put_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_event_put_f(u32 v)
+{
+ return (v & 0x1) << 4;
+}
+static inline u32 sim_control_event_put_m(void)
+{
+ return 0x1 << 4;
+}
+static inline u32 sim_control_event_put_v(u32 r)
+{
+ return (r >> 4) & 0x1;
+}
+static inline u32 sim_control_event_put__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_event_put__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_event_put_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_event_put_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_event_put_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_event_put_enabled_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_control_event_get_s(void)
+{
+ return 1;
+}
+static inline u32 sim_control_event_get_f(u32 v)
+{
+ return (v & 0x1) << 5;
+}
+static inline u32 sim_control_event_get_m(void)
+{
+ return 0x1 << 5;
+}
+static inline u32 sim_control_event_get_v(u32 r)
+{
+ return (r >> 5) & 0x1;
+}
+static inline u32 sim_control_event_get__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_event_get__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_event_get_disabled_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_control_event_get_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_control_event_get_enabled_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_control_event_get_enabled_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_dma_r(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_target_s(void)
+{
+ return 2;
+}
+static inline u32 sim_dma_target_f(u32 v)
+{
+ return (v & 0x3) << 0;
+}
+static inline u32 sim_dma_target_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 sim_dma_target_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 sim_dma_target_phys_init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_target_phys_init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_dma_target_phys__init_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_target_phys__init_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_dma_target_phys__prod_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_target_phys__prod_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_dma_target_phys_nvm_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_target_phys_nvm_f(void)
+{
+ return 0x1;
+}
+static inline u32 sim_dma_target_phys_pci_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_dma_target_phys_pci_f(void)
+{
+ return 0x2;
+}
+static inline u32 sim_dma_target_phys_pci_coherent_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_dma_target_phys_pci_coherent_f(void)
+{
+ return 0x3;
+}
+static inline u32 sim_dma_status_s(void)
+{
+ return 1;
+}
+static inline u32 sim_dma_status_f(u32 v)
+{
+ return (v & 0x1) << 3;
+}
+static inline u32 sim_dma_status_m(void)
+{
+ return 0x1 << 3;
+}
+static inline u32 sim_dma_status_v(u32 r)
+{
+ return (r >> 3) & 0x1;
+}
+static inline u32 sim_dma_status_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_status_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_status__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_status__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_status__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_status__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_status_invalid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_status_invalid_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_status_valid_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_status_valid_f(void)
+{
+ return 0x8;
+}
+static inline u32 sim_dma_size_s(void)
+{
+ return 2;
+}
+static inline u32 sim_dma_size_f(u32 v)
+{
+ return (v & 0x3) << 4;
+}
+static inline u32 sim_dma_size_m(void)
+{
+ return 0x3 << 4;
+}
+static inline u32 sim_dma_size_v(u32 r)
+{
+ return (r >> 4) & 0x3;
+}
+static inline u32 sim_dma_size_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_size_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_size__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_size__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_size__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_size__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_size_4kb_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_size_4kb_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_size_8kb_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 sim_dma_size_8kb_f(void)
+{
+ return 0x10;
+}
+static inline u32 sim_dma_size_12kb_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 sim_dma_size_12kb_f(void)
+{
+ return 0x20;
+}
+static inline u32 sim_dma_size_16kb_v(void)
+{
+ return 0x00000003;
+}
+static inline u32 sim_dma_size_16kb_f(void)
+{
+ return 0x30;
+}
+static inline u32 sim_dma_addr_lo_s(void)
+{
+ return 20;
+}
+static inline u32 sim_dma_addr_lo_f(u32 v)
+{
+ return (v & 0xfffff) << 12;
+}
+static inline u32 sim_dma_addr_lo_m(void)
+{
+ return 0xfffff << 12;
+}
+static inline u32 sim_dma_addr_lo_v(u32 r)
+{
+ return (r >> 12) & 0xfffff;
+}
+static inline u32 sim_dma_addr_lo__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_addr_lo__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_addr_lo__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_addr_lo__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_hi_r(void)
+{
+ return 0x00000004;
+}
+static inline u32 sim_dma_hi_addr_s(void)
+{
+ return 20;
+}
+static inline u32 sim_dma_hi_addr_f(u32 v)
+{
+ return (v & 0xfffff) << 0;
+}
+static inline u32 sim_dma_hi_addr_m(void)
+{
+ return 0xfffff << 0;
+}
+static inline u32 sim_dma_hi_addr_v(u32 r)
+{
+ return (r >> 0) & 0xfffff;
+}
+static inline u32 sim_dma_hi_addr__init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_hi_addr__init_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_dma_hi_addr__prod_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_dma_hi_addr__prod_f(void)
+{
+ return 0x0;
+}
+static inline u32 sim_msg_signature_r(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_msg_signature_valid_v(void)
+{
+ return 0x43505256;
+}
+static inline u32 sim_msg_length_r(void)
+{
+ return 0x00000004;
+}
+static inline u32 sim_msg_function_r(void)
+{
+ return 0x00000008;
+}
+static inline u32 sim_msg_function_sim_escape_read_v(void)
+{
+ return 0x00000023;
+}
+static inline u32 sim_msg_function_sim_escape_write_v(void)
+{
+ return 0x00000024;
+}
+static inline u32 sim_msg_result_r(void)
+{
+ return 0x0000000c;
+}
+static inline u32 sim_msg_result_success_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 sim_msg_result_rpc_pending_v(void)
+{
+ return 0xffffffff;
+}
+static inline u32 sim_msg_sequence_r(void)
+{
+ return 0x00000010;
+}
+static inline u32 sim_msg_spare_r(void)
+{
+ return 0x00000014;
+}
+static inline u32 sim_msg_spare__init_v(void)
+{
+ return 0x00000000;
+}
+
+#endif /* __hw_sim_gk20a_h__ */
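A usage sketch (editorial, not from the patch) for the simulator ring descriptors above: _f() values for distinct fields are OR'd into one register image, and _v() extractions are compared against the matching _v() constants. The function names and the 4 KB/coherent-PCI choice are illustrative only.

#include <linux/kernel.h>
#include <linux/types.h>
#include "hw_sim_gk20a.h"

static u32 example_send_ring_cfg(u64 ring_base)
{
	/* One 4 KB ring in coherent PCI memory, marked valid; bits
	 * 31:12 of the base address fill the addr_lo field. */
	return sim_send_ring_target_phys_pci_coherent_f() |
	       sim_send_ring_size_4kb_f() |
	       sim_send_ring_status_valid_f() |
	       sim_send_ring_addr_lo_f(lower_32_bits(ring_base) >> 12);
}

static bool example_send_ring_valid(u32 reg)
{
	/* _v() extracts the field so it compares against the _v() constant. */
	return sim_send_ring_status_v(reg) == sim_send_ring_status_valid_v();
}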
diff --git a/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h
new file mode 100644
index 000000000000..5d6397b4d10b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_therm_gk20a.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_therm_gk20a_h_
+#define _hw_therm_gk20a_h_
+
+static inline u32 therm_use_a_r(void)
+{
+ return 0x00020798;
+}
+static inline u32 therm_evt_ext_therm_0_r(void)
+{
+ return 0x00020700;
+}
+static inline u32 therm_evt_ext_therm_1_r(void)
+{
+ return 0x00020704;
+}
+static inline u32 therm_evt_ext_therm_2_r(void)
+{
+ return 0x00020708;
+}
+static inline u32 therm_evt_ba_w0_t1h_r(void)
+{
+ return 0x00020750;
+}
+static inline u32 therm_weight_1_r(void)
+{
+ return 0x00020024;
+}
+static inline u32 therm_peakpower_config1_r(u32 i)
+{
+ return 0x00020154 + i*4;
+}
+static inline u32 therm_peakpower_config1_window_period_2m_v(void)
+{
+ return 0x0000000f;
+}
+static inline u32 therm_peakpower_config1_window_period_2m_f(void)
+{
+ return 0xf;
+}
+static inline u32 therm_peakpower_config1_ba_sum_shift_s(void)
+{
+ return 6;
+}
+static inline u32 therm_peakpower_config1_ba_sum_shift_f(u32 v)
+{
+ return (v & 0x3f) << 8;
+}
+static inline u32 therm_peakpower_config1_ba_sum_shift_m(void)
+{
+ return 0x3f << 8;
+}
+static inline u32 therm_peakpower_config1_ba_sum_shift_v(u32 r)
+{
+ return (r >> 8) & 0x3f;
+}
+static inline u32 therm_peakpower_config1_ba_sum_shift_20_f(void)
+{
+ return 0x1400;
+}
+static inline u32 therm_peakpower_config1_window_en_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 therm_peakpower_config2_r(u32 i)
+{
+ return 0x00020170 + i*4;
+}
+static inline u32 therm_peakpower_config4_r(u32 i)
+{
+ return 0x000201c0 + i*4;
+}
+static inline u32 therm_peakpower_config6_r(u32 i)
+{
+ return 0x00020270 + i*4;
+}
+static inline u32 therm_peakpower_config8_r(u32 i)
+{
+ return 0x000202e8 + i*4;
+}
+static inline u32 therm_peakpower_config9_r(u32 i)
+{
+ return 0x000202f4 + i*4;
+}
+static inline u32 therm_config1_r(void)
+{
+ return 0x00020050;
+}
+static inline u32 therm_gate_ctrl_r(u32 i)
+{
+ return 0x00020200 + i*4;
+}
+static inline u32 therm_gate_ctrl_eng_clk_m(void)
+{
+ return 0x3 << 0;
+}
+static inline u32 therm_gate_ctrl_eng_clk_run_f(void)
+{
+ return 0x0;
+}
+static inline u32 therm_gate_ctrl_eng_clk_auto_f(void)
+{
+ return 0x1;
+}
+static inline u32 therm_gate_ctrl_eng_clk_stop_f(void)
+{
+ return 0x2;
+}
+static inline u32 therm_gate_ctrl_blk_clk_m(void)
+{
+ return 0x3 << 2;
+}
+static inline u32 therm_gate_ctrl_blk_clk_run_f(void)
+{
+ return 0x0;
+}
+static inline u32 therm_gate_ctrl_blk_clk_auto_f(void)
+{
+ return 0x4;
+}
+static inline u32 therm_gate_ctrl_eng_pwr_m(void)
+{
+ return 0x3 << 4;
+}
+static inline u32 therm_gate_ctrl_eng_pwr_auto_f(void)
+{
+ return 0x10;
+}
+static inline u32 therm_gate_ctrl_eng_pwr_off_v(void)
+{
+ return 0x00000002;
+}
+static inline u32 therm_gate_ctrl_eng_pwr_off_f(void)
+{
+ return 0x20;
+}
+static inline u32 therm_gate_ctrl_eng_idle_filt_exp_f(u32 v)
+{
+ return (v & 0x1f) << 8;
+}
+static inline u32 therm_gate_ctrl_eng_idle_filt_exp_m(void)
+{
+ return 0x1f << 8;
+}
+static inline u32 therm_gate_ctrl_eng_idle_filt_mant_f(u32 v)
+{
+ return (v & 0x7) << 13;
+}
+static inline u32 therm_gate_ctrl_eng_idle_filt_mant_m(void)
+{
+ return 0x7 << 13;
+}
+static inline u32 therm_gate_ctrl_eng_delay_after_f(u32 v)
+{
+ return (v & 0xf) << 20;
+}
+static inline u32 therm_gate_ctrl_eng_delay_after_m(void)
+{
+ return 0xf << 20;
+}
+static inline u32 therm_fecs_idle_filter_r(void)
+{
+ return 0x00020288;
+}
+static inline u32 therm_fecs_idle_filter_value_m(void)
+{
+ return 0xffffffff << 0;
+}
+static inline u32 therm_hubmmu_idle_filter_r(void)
+{
+ return 0x0002028c;
+}
+static inline u32 therm_hubmmu_idle_filter_value_m(void)
+{
+ return 0xffffffff << 0;
+}
+#endif
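
A usage sketch for the accessor convention documented above (not part of the original commit): the helper below is hypothetical and assumes the driver's gk20a_readl()/gk20a_writel() register wrappers from gk20a.h; the idle-filter values are illustrative, not tuned settings.

static void example_enable_elcg(struct gk20a *g, u32 i)
{
        u32 gate = gk20a_readl(g, therm_gate_ctrl_r(i));

        /* replace the eng_clk field: clear via the _m() mask, OR in the _f() value */
        gate &= ~therm_gate_ctrl_eng_clk_m();
        gate |= therm_gate_ctrl_eng_clk_auto_f();

        /* program the idle filter fields (values are illustrative only) */
        gate &= ~therm_gate_ctrl_eng_idle_filt_exp_m();
        gate |= therm_gate_ctrl_eng_idle_filt_exp_f(2);
        gate &= ~therm_gate_ctrl_eng_idle_filt_mant_m();
        gate |= therm_gate_ctrl_eng_idle_filt_mant_f(4);

        gk20a_writel(g, therm_gate_ctrl_r(i), gate);
}

The _m()/_f() pairing is what makes such read-modify-write updates mechanical: clear the field with its mask, then OR in the shifted value.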
diff --git a/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
new file mode 100644
index 000000000000..22bc50acfaf4
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_timer_gk20a.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_timer_gk20a_h_
+#define _hw_timer_gk20a_h_
+
+static inline u32 timer_pri_timeout_r(void)
+{
+ return 0x00009080;
+}
+static inline u32 timer_pri_timeout_period_f(u32 v)
+{
+ return (v & 0xffffff) << 0;
+}
+static inline u32 timer_pri_timeout_period_m(void)
+{
+ return 0xffffff << 0;
+}
+static inline u32 timer_pri_timeout_period_v(u32 r)
+{
+ return (r >> 0) & 0xffffff;
+}
+static inline u32 timer_pri_timeout_en_f(u32 v)
+{
+ return (v & 0x1) << 31;
+}
+static inline u32 timer_pri_timeout_en_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 timer_pri_timeout_en_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 timer_pri_timeout_en_en_enabled_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 timer_pri_timeout_en_en_disabled_f(void)
+{
+ return 0x0;
+}
+static inline u32 timer_pri_timeout_save_0_r(void)
+{
+ return 0x00009084;
+}
+static inline u32 timer_pri_timeout_save_1_r(void)
+{
+ return 0x00009088;
+}
+static inline u32 timer_pri_timeout_fecs_errcode_r(void)
+{
+ return 0x0000908c;
+}
+#endif
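
The same pattern applies to the PRI timeout register defined above. A minimal, hypothetical sketch (again assuming the gk20a_readl()/gk20a_writel() wrappers; the period value is arbitrary):

static void example_set_pri_timeout(struct gk20a *g)
{
        u32 val = gk20a_readl(g, timer_pri_timeout_r());

        val &= ~timer_pri_timeout_period_m();
        val |= timer_pri_timeout_period_f(0x100000);    /* 24-bit period field */
        val |= timer_pri_timeout_en_en_enabled_f();     /* enable bit (bit 31) */

        gk20a_writel(g, timer_pri_timeout_r(), val);
}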
diff --git a/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h
new file mode 100644
index 000000000000..c2922814a7ab
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_top_gk20a.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_top_gk20a_h_
+#define _hw_top_gk20a_h_
+
+static inline u32 top_num_gpcs_r(void)
+{
+ return 0x00022430;
+}
+static inline u32 top_num_gpcs_value_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 top_tpc_per_gpc_r(void)
+{
+ return 0x00022434;
+}
+static inline u32 top_tpc_per_gpc_value_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 top_num_fbps_r(void)
+{
+ return 0x00022438;
+}
+static inline u32 top_num_fbps_value_v(u32 r)
+{
+ return (r >> 0) & 0x1f;
+}
+static inline u32 top_fs_status_r(void)
+{
+ return 0x00022500;
+}
+static inline u32 top_device_info_r(u32 i)
+{
+ return 0x00022700 + i*4;
+}
+static inline u32 top_device_info__size_1_v(void)
+{
+ return 0x00000040;
+}
+static inline u32 top_device_info_chain_v(u32 r)
+{
+ return (r >> 31) & 0x1;
+}
+static inline u32 top_device_info_chain_enable_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 top_device_info_engine_enum_v(u32 r)
+{
+ return (r >> 26) & 0xf;
+}
+static inline u32 top_device_info_runlist_enum_v(u32 r)
+{
+ return (r >> 21) & 0xf;
+}
+static inline u32 top_device_info_type_enum_v(u32 r)
+{
+ return (r >> 2) & 0x1fffffff;
+}
+static inline u32 top_device_info_type_enum_graphics_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 top_device_info_type_enum_graphics_f(void)
+{
+ return 0x0;
+}
+static inline u32 top_device_info_type_enum_copy0_v(void)
+{
+ return 0x00000001;
+}
+static inline u32 top_device_info_type_enum_copy0_f(void)
+{
+ return 0x4;
+}
+static inline u32 top_device_info_entry_v(u32 r)
+{
+ return (r >> 0) & 0x3;
+}
+static inline u32 top_device_info_entry_not_valid_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 top_device_info_entry_enum_v(void)
+{
+ return 0x00000002;
+}
+#endif
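
As an illustration of how the device_info accessors above are meant to compose (a sketch only, not the driver's actual parser; it ignores chained entries for brevity):

static u32 example_count_graphics_engines(struct gk20a *g)
{
        u32 i, count = 0;

        for (i = 0; i < top_device_info__size_1_v(); i++) {
                u32 entry = gk20a_readl(g, top_device_info_r(i));

                /* only "enum" entries describe engines */
                if (top_device_info_entry_v(entry) !=
                    top_device_info_entry_enum_v())
                        continue;

                if (top_device_info_type_enum_v(entry) ==
                    top_device_info_type_enum_graphics_v())
                        count++;
        }
        return count;
}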
diff --git a/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h b/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h
new file mode 100644
index 000000000000..826e9bd11fc7
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/hw_trim_gk20a.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+/*
+ * Function naming determines intended use:
+ *
+ * <x>_r(void) : Returns the offset for register <x>.
+ *
+ * <x>_o(void) : Returns the offset for element <x>.
+ *
+ * <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+ *
+ * <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+ *
+ * <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+ * and masked to place it at field <y> of register <x>. This value
+ * can be |'d with others to produce a full register value for
+ * register <x>.
+ *
+ * <x>_<y>_m(void) : Returns a mask for field <y> of register <x>. This
+ * value can be ~'d and then &'d to clear the value of field <y> for
+ * register <x>.
+ *
+ * <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+ * to place it at field <y> of register <x>. This value can be |'d
+ * with others to produce a full register value for <x>.
+ *
+ * <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+ * <x> value 'r' after being shifted to place its LSB at bit 0.
+ * This value is suitable for direct comparison with other unshifted
+ * values appropriate for use in field <y> of register <x>.
+ *
+ * <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+ * field <y> of register <x>. This value is suitable for direct
+ * comparison with unshifted values appropriate for use in field <y>
+ * of register <x>.
+ */
+#ifndef _hw_trim_gk20a_h_
+#define _hw_trim_gk20a_h_
+
+static inline u32 trim_sys_gpcpll_cfg_r(void)
+{
+ return 0x00137000;
+}
+static inline u32 trim_sys_gpcpll_cfg_enable_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 trim_sys_gpcpll_cfg_enable_v(u32 r)
+{
+ return (r >> 0) & 0x1;
+}
+static inline u32 trim_sys_gpcpll_cfg_enable_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_gpcpll_cfg_enable_yes_f(void)
+{
+ return 0x1;
+}
+static inline u32 trim_sys_gpcpll_cfg_iddq_m(void)
+{
+ return 0x1 << 1;
+}
+static inline u32 trim_sys_gpcpll_cfg_iddq_v(u32 r)
+{
+ return (r >> 1) & 0x1;
+}
+static inline u32 trim_sys_gpcpll_cfg_iddq_power_on_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_m(void)
+{
+ return 0x1 << 4;
+}
+static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_on_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_gpcpll_cfg_enb_lckdet_power_off_f(void)
+{
+ return 0x10;
+}
+static inline u32 trim_sys_gpcpll_cfg_pll_lock_v(u32 r)
+{
+ return (r >> 17) & 0x1;
+}
+static inline u32 trim_sys_gpcpll_cfg_pll_lock_true_f(void)
+{
+ return 0x20000;
+}
+static inline u32 trim_sys_gpcpll_coeff_r(void)
+{
+ return 0x00137004;
+}
+static inline u32 trim_sys_gpcpll_coeff_mdiv_f(u32 v)
+{
+ return (v & 0xff) << 0;
+}
+static inline u32 trim_sys_gpcpll_coeff_mdiv_v(u32 r)
+{
+ return (r >> 0) & 0xff;
+}
+static inline u32 trim_sys_gpcpll_coeff_ndiv_f(u32 v)
+{
+ return (v & 0xff) << 8;
+}
+static inline u32 trim_sys_gpcpll_coeff_ndiv_m(void)
+{
+ return 0xff << 8;
+}
+static inline u32 trim_sys_gpcpll_coeff_ndiv_v(u32 r)
+{
+ return (r >> 8) & 0xff;
+}
+static inline u32 trim_sys_gpcpll_coeff_pldiv_f(u32 v)
+{
+ return (v & 0x3f) << 16;
+}
+static inline u32 trim_sys_gpcpll_coeff_pldiv_v(u32 r)
+{
+ return (r >> 16) & 0x3f;
+}
+static inline u32 trim_sys_sel_vco_r(void)
+{
+ return 0x00137100;
+}
+static inline u32 trim_sys_sel_vco_gpc2clk_out_m(void)
+{
+ return 0x1 << 0;
+}
+static inline u32 trim_sys_sel_vco_gpc2clk_out_init_v(void)
+{
+ return 0x00000000;
+}
+static inline u32 trim_sys_sel_vco_gpc2clk_out_init_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_sel_vco_gpc2clk_out_bypass_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_sel_vco_gpc2clk_out_vco_f(void)
+{
+ return 0x1;
+}
+static inline u32 trim_sys_gpc2clk_out_r(void)
+{
+ return 0x00137250;
+}
+static inline u32 trim_sys_gpc2clk_out_bypdiv_s(void)
+{
+ return 6;
+}
+static inline u32 trim_sys_gpc2clk_out_bypdiv_f(u32 v)
+{
+ return (v & 0x3f) << 0;
+}
+static inline u32 trim_sys_gpc2clk_out_bypdiv_m(void)
+{
+ return 0x3f << 0;
+}
+static inline u32 trim_sys_gpc2clk_out_bypdiv_v(u32 r)
+{
+ return (r >> 0) & 0x3f;
+}
+static inline u32 trim_sys_gpc2clk_out_bypdiv_by31_f(void)
+{
+ return 0x3c;
+}
+static inline u32 trim_sys_gpc2clk_out_vcodiv_s(void)
+{
+ return 6;
+}
+static inline u32 trim_sys_gpc2clk_out_vcodiv_f(u32 v)
+{
+ return (v & 0x3f) << 8;
+}
+static inline u32 trim_sys_gpc2clk_out_vcodiv_m(void)
+{
+ return 0x3f << 8;
+}
+static inline u32 trim_sys_gpc2clk_out_vcodiv_v(u32 r)
+{
+ return (r >> 8) & 0x3f;
+}
+static inline u32 trim_sys_gpc2clk_out_vcodiv_by1_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_gpc2clk_out_sdiv14_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 trim_sys_gpc2clk_out_sdiv14_indiv4_mode_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_r(u32 i)
+{
+ return 0x00134124 + i*512;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_noofipclks_f(u32 v)
+{
+ return (v & 0x3fff) << 0;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_write_en_asserted_f(void)
+{
+ return 0x10000;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_enable_asserted_f(void)
+{
+ return 0x100000;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cfg_reset_asserted_f(void)
+{
+ return 0x1000000;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_r(u32 i)
+{
+ return 0x00134128 + i*512;
+}
+static inline u32 trim_gpc_clk_cntr_ncgpcclk_cnt_value_v(u32 r)
+{
+ return (r >> 0) & 0xfffff;
+}
+static inline u32 trim_sys_gpcpll_cfg2_r(void)
+{
+ return 0x0013700c;
+}
+static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_f(u32 v)
+{
+ return (v & 0xff) << 24;
+}
+static inline u32 trim_sys_gpcpll_cfg2_pll_stepa_m(void)
+{
+ return 0xff << 24;
+}
+static inline u32 trim_sys_gpcpll_cfg3_r(void)
+{
+ return 0x00137018;
+}
+static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_f(u32 v)
+{
+ return (v & 0xff) << 16;
+}
+static inline u32 trim_sys_gpcpll_cfg3_pll_stepb_m(void)
+{
+ return 0xff << 16;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_r(void)
+{
+ return 0x0013701c;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_m(void)
+{
+ return 0x1 << 22;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_yes_f(void)
+{
+ return 0x400000;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_slowdown_using_pll_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_m(void)
+{
+ return 0x1 << 31;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_yes_f(void)
+{
+ return 0x80000000;
+}
+static inline u32 trim_sys_gpcpll_ndiv_slowdown_en_dynramp_no_f(void)
+{
+ return 0x0;
+}
+static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_r(void)
+{
+ return 0x001328a0;
+}
+static inline u32 trim_gpc_bcast_gpcpll_ndiv_slowdown_debug_pll_dynramp_done_synced_v(u32 r)
+{
+ return (r >> 24) & 0x1;
+}
+#endif
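
To show how the GPCPLL coefficient accessors are typically consumed, here is a hypothetical rate read-back. It assumes the usual PLL relation (VCO = ref * N / M, output = VCO / PL) and a caller-supplied reference rate; the driver's clock code handles dividers and bypass more carefully:

static unsigned long example_gpcpll_rate(struct gk20a *g,
                                         unsigned long ref_rate_hz)
{
        u32 coeff = gk20a_readl(g, trim_sys_gpcpll_coeff_r());
        u32 m = trim_sys_gpcpll_coeff_mdiv_v(coeff);
        u32 n = trim_sys_gpcpll_coeff_ndiv_v(coeff);
        u32 pl = trim_sys_gpcpll_coeff_pldiv_v(coeff);

        if (m == 0 || pl == 0)
                return 0;       /* PLL not programmed */

        /* integer math; good enough for a sketch */
        return ref_rate_hz / m * n / pl;
}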
diff --git a/drivers/gpu/nvgpu/gk20a/kind_gk20a.c b/drivers/gpu/nvgpu/gk20a/kind_gk20a.c
new file mode 100644
index 000000000000..b0a740563691
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/kind_gk20a.c
@@ -0,0 +1,424 @@
+/*
+ * drivers/video/tegra/host/gk20a/kind_gk20a.c
+ *
+ * GK20A memory kind management
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <linux/bitops.h>
+
+#include "hw_gmmu_gk20a.h"
+#include "kind_gk20a.h"
+
+/* TBD: generate these from kind_macros.h */
+
+/* TBD: unclear whether the work-creation kinds apply to gk20a; doubtful */
+static inline bool gk20a_kind_work_creation_sked(u8 k)
+{
+ return false;
+}
+static inline bool gk20a_kind_work_creation_host(u8 k)
+{
+ return false;
+}
+
+static inline bool gk20a_kind_work_creation(u8 k)
+{
+ return gk20a_kind_work_creation_sked(k) ||
+ gk20a_kind_work_creation_host(k);
+}
+
+/* note: taken from the !2cs_compression case */
+static inline bool gk20a_kind_supported(u8 k)
+{
+ return gk20a_kind_work_creation(k) ||
+ (k == gmmu_pte_kind_invalid_v()) ||
+ (k == gmmu_pte_kind_pitch_v()) ||
+ (k >= gmmu_pte_kind_z16_v() &&
+ k <= gmmu_pte_kind_z16_ms8_2c_v()) ||
+ (k >= gmmu_pte_kind_z16_2z_v() &&
+ k <= gmmu_pte_kind_z16_ms8_2z_v()) ||
+ (k == gmmu_pte_kind_s8z24_v()) ||
+ (k >= gmmu_pte_kind_s8z24_2cz_v() &&
+ k <= gmmu_pte_kind_s8z24_ms8_2cz_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_2zv_v()) ||
+ (k == gmmu_pte_kind_z24s8_v()) ||
+ (k >= gmmu_pte_kind_z24s8_2cz_v() &&
+ k <= gmmu_pte_kind_z24s8_ms8_2cz_v()) ||
+ (k == gmmu_pte_kind_zf32_v()) ||
+ (k >= gmmu_pte_kind_zf32_2cz_v() &&
+ k <= gmmu_pte_kind_zf32_ms8_2cz_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v() &&
+ k <= gmmu_pte_kind_zf32_x24s8_v()) ||
+ (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
+ k <= gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v()) ||
+ (k == gmmu_pte_kind_generic_16bx2_v()) ||
+ (k == gmmu_pte_kind_c32_2c_v()) ||
+ (k == gmmu_pte_kind_c32_2cra_v()) ||
+ (k == gmmu_pte_kind_c32_ms2_2c_v()) ||
+ (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c32_ms4_2cbr_v()) ||
+ (k >= gmmu_pte_kind_c32_ms4_2cra_v() &&
+ k <= gmmu_pte_kind_c64_2c_v()) ||
+ (k == gmmu_pte_kind_c64_2cra_v()) ||
+ (k == gmmu_pte_kind_c64_ms2_2c_v()) ||
+ (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c64_ms4_2cbr_v()) ||
+ (k >= gmmu_pte_kind_c64_ms4_2cra_v() &&
+ k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v()) ||
+ (k == gmmu_pte_kind_pitch_no_swizzle_v());
+}
+
+static inline bool gk20a_kind_z(u8 k)
+{
+ return (k >= gmmu_pte_kind_z16_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v())
+ /* ||
+ (k >= gmmu_pte_kind_zv32_x24s8_2cszv_v() &&
+ k <= gmmu_pte_kind_xf32_x24s8_ms16_2cs_v())*/;
+}
+
+static inline bool gk20a_kind_c(u8 k)
+{
+ return gk20a_kind_work_creation(k) ||
+ (k == gmmu_pte_kind_pitch_v()) ||
+ (k == gmmu_pte_kind_generic_16bx2_v()) ||
+ (k >= gmmu_pte_kind_c32_2c_v() &&
+ k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
+ k <= gmmu_pte_kind_pitch_no_swizzle_v());
+}
+
+static inline bool gk20a_kind_compressible(u8 k)
+{
+ return (k >= gmmu_pte_kind_z16_2c_v() &&
+ k <= gmmu_pte_kind_z16_ms16_4cz_v()) ||
+ (k >= gmmu_pte_kind_s8z24_1z_v() &&
+ k <= gmmu_pte_kind_s8z24_ms16_4cszv_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v()) ||
+ (k >= gmmu_pte_kind_z24s8_1z_v() &&
+ k <= gmmu_pte_kind_z24s8_ms16_4cszv_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_1z_v() &&
+ k <= gmmu_pte_kind_zf32_ms16_2cz_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1zv_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_x24s8_1cs_v() &&
+ k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v()) ||
+ (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
+ k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v());
+}
+
+static inline bool gk20a_kind_zbc(u8 k)
+{
+ return (k >= gmmu_pte_kind_z16_2c_v() &&
+ k <= gmmu_pte_kind_z16_ms16_2c_v()) ||
+ (k >= gmmu_pte_kind_z16_4cz_v() &&
+ k <= gmmu_pte_kind_z16_ms16_4cz_v()) ||
+ (k >= gmmu_pte_kind_s8z24_2cz_v() &&
+ k <= gmmu_pte_kind_s8z24_ms16_4cszv_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_2cs_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_2czv_v()) ||
+ (k >= gmmu_pte_kind_v8z24_ms4_vc12_4cszv_v() &&
+ k <= gmmu_pte_kind_v8z24_ms8_vc24_4cszv_v()) ||
+ (k >= gmmu_pte_kind_z24s8_2cs_v() &&
+ k <= gmmu_pte_kind_z24s8_ms16_4cszv_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_2cs_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_2czv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_2czv_v()) ||
+ (k >= gmmu_pte_kind_z24v8_ms4_vc12_4cszv_v() &&
+ k <= gmmu_pte_kind_z24v8_ms8_vc24_4cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_2cs_v() &&
+ k <= gmmu_pte_kind_zf32_ms16_2cz_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1cs_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_1czv_v() &&
+ k <= gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1cs_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_1cs_v()) ||
+ (k >= gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_1czv_v() &&
+ k <= gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()) ||
+ (k >= gmmu_pte_kind_zf32_x24s8_1cs_v() &&
+ k <= gmmu_pte_kind_zf32_x24s8_ms16_1cs_v()) ||
+ (k >= gmmu_pte_kind_zf32_x24s8_2cszv_v() &&
+ k <= gmmu_pte_kind_c32_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms2_2c_v() &&
+ k <= gmmu_pte_kind_c32_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c32_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c32_ms4_2cra_v()) ||
+ (k >= gmmu_pte_kind_c32_ms8_ms16_2c_v() &&
+ k <= gmmu_pte_kind_c64_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms2_2c_v() &&
+ k <= gmmu_pte_kind_c64_ms2_2cbr_v()) ||
+ (k == gmmu_pte_kind_c64_ms2_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms4_2c_v() &&
+ k <= gmmu_pte_kind_c64_ms4_2cra_v()) ||
+ (k >= gmmu_pte_kind_c64_ms8_ms16_2c_v() &&
+ k <= gmmu_pte_kind_c128_ms8_ms16_2cr_v());
+}
+
+u8 gk20a_uc_kind_map[256];
+void gk20a_init_uncompressed_kind_map(void)
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ gk20a_uc_kind_map[i] = gmmu_pte_kind_invalid_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms2_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms4_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms8_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_2z_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms2_2z_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms4_2z_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z16_ms8_2z_v()] =
+ gmmu_pte_kind_z16_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms2_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms4_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms8_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms2_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_s8z24_ms8_2cs_v()] =
+ gmmu_pte_kind_s8z24_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc4_2zv_v()] =
+ gmmu_pte_kind_v8z24_ms4_vc4_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc8_2zv_v()] =
+ gmmu_pte_kind_v8z24_ms8_vc8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms4_vc12_2zv_v()] =
+ gmmu_pte_kind_v8z24_ms4_vc12_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_v8z24_ms8_vc24_2zv_v()] =
+ gmmu_pte_kind_v8z24_ms8_vc24_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms2_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms2_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms4_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24s8_ms8_2cz_v()] =
+ gmmu_pte_kind_z24s8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms2_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms2_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms4_2cz_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_ms8_2cz_v()] =
+ gmmu_pte_kind_zf32_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_2cszv_v()] =
+ gmmu_pte_kind_x8z24_x16v8s8_ms4_vc12_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_2cszv_v()] =
+ gmmu_pte_kind_x8z24_x16v8s8_ms4_vc4_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_2cszv_v()] =
+ gmmu_pte_kind_x8z24_x16v8s8_ms8_vc8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_2cszv_v()] =
+ gmmu_pte_kind_x8z24_x16v8s8_ms8_vc24_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_2cszv_v()] =
+ gmmu_pte_kind_zf32_x16v8s8_ms4_vc12_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_2cszv_v()] =
+ gmmu_pte_kind_zf32_x16v8s8_ms4_vc4_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_2cszv_v()] =
+ gmmu_pte_kind_zf32_x16v8s8_ms8_vc8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_2cszv_v()] =
+ gmmu_pte_kind_zf32_x16v8s8_ms8_vc24_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_2cszv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms2_2cszv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms4_2cszv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms8_2cszv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms2_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_zf32_x24s8_ms8_2cs_v()] =
+ gmmu_pte_kind_zf32_x24s8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_2cba_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_2bra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms2_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms2_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cbr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cba_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms4_2bra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms8_ms16_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c32_ms8_ms16_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_2cbr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_2cba_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_2bra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms2_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms2_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cbr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cba_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms4_2bra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms8_ms16_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c64_ms8_ms16_2cra_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_2cr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms2_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms2_2cr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms4_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms4_2cr_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms8_ms16_2c_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_c128_ms8_ms16_2cr_v()] =
+ gmmu_pte_kind_generic_16bx2_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc4_2zv_v()] =
+ gmmu_pte_kind_z24v8_ms4_vc4_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms4_vc12_2zv_v()] =
+ gmmu_pte_kind_z24v8_ms4_vc12_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc8_2zv_v()] =
+ gmmu_pte_kind_z24v8_ms8_vc8_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2cs_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2czv_v()] =
+ gk20a_uc_kind_map[gmmu_pte_kind_z24v8_ms8_vc24_2zv_v()] =
+ gmmu_pte_kind_z24v8_ms8_vc24_v();
+
+ gk20a_uc_kind_map[gmmu_pte_kind_x8c24_v()] =
+ gmmu_pte_kind_x8c24_v();
+}
+
+u16 gk20a_kind_attr[256];
+void gk20a_init_kind_attr(void)
+{
+ u16 k;
+ for (k = 0; k < 256; k++) {
+ gk20a_kind_attr[k] = 0;
+ if (gk20a_kind_supported((u8)k))
+ gk20a_kind_attr[k] |= GK20A_KIND_ATTR_SUPPORTED;
+ if (gk20a_kind_compressible((u8)k))
+ gk20a_kind_attr[k] |= GK20A_KIND_ATTR_COMPRESSIBLE;
+ if (gk20a_kind_z((u8)k))
+ gk20a_kind_attr[k] |= GK20A_KIND_ATTR_Z;
+ if (gk20a_kind_c((u8)k))
+ gk20a_kind_attr[k] |= GK20A_KIND_ATTR_C;
+ if (gk20a_kind_zbc((u8)k))
+ gk20a_kind_attr[k] |= GK20A_KIND_ATTR_ZBC;
+ }
+}
diff --git a/drivers/gpu/nvgpu/gk20a/kind_gk20a.h b/drivers/gpu/nvgpu/gk20a/kind_gk20a.h
new file mode 100644
index 000000000000..93f011d4a84b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/kind_gk20a.h
@@ -0,0 +1,67 @@
+/*
+ * drivers/video/tegra/host/gk20a/kind_gk20a.h
+ *
+ * GK20A memory kind management
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __KIND_GK20A_H__
+#define __KIND_GK20A_H__
+
+
+void gk20a_init_uncompressed_kind_map(void);
+void gk20a_init_kind_attr(void);
+
+extern u16 gk20a_kind_attr[];
+#define NV_KIND_DEFAULT -1
+
+#define GK20A_KIND_ATTR_SUPPORTED BIT(0)
+#define GK20A_KIND_ATTR_COMPRESSIBLE BIT(1)
+#define GK20A_KIND_ATTR_Z BIT(2)
+#define GK20A_KIND_ATTR_C BIT(3)
+#define GK20A_KIND_ATTR_ZBC BIT(4)
+
+static inline bool gk20a_kind_is_supported(u8 k)
+{
+ return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_SUPPORTED);
+}
+static inline bool gk20a_kind_is_compressible(u8 k)
+{
+ return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_COMPRESSIBLE);
+}
+
+static inline bool gk20a_kind_is_z(u8 k)
+{
+ return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_Z);
+}
+
+static inline bool gk20a_kind_is_c(u8 k)
+{
+ return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_C);
+}
+static inline bool gk20a_kind_is_zbc(u8 k)
+{
+ return !!(gk20a_kind_attr[k] & GK20A_KIND_ATTR_ZBC);
+}
+
+/* maps kind to its uncompressed version */
+extern u8 gk20a_uc_kind_map[];
+static inline u8 gk20a_get_uncompressed_kind(u8 k)
+{
+ return gk20a_uc_kind_map[k];
+}
+
+#endif /* __KIND_GK20A_H__ */
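
A short usage sketch for the kind tables (hypothetical; assumes kind_gk20a.h and the usual kernel headers are included). The init calls build the lookup tables once, after which per-kind queries are plain array lookups:

void example_kind_setup_and_query(u8 kind)
{
        gk20a_init_uncompressed_kind_map();
        gk20a_init_kind_attr();

        if (gk20a_kind_is_supported(kind) && gk20a_kind_is_compressible(kind))
                pr_info("kind 0x%02x compressible, uncompressed pair 0x%02x\n",
                        kind, gk20a_get_uncompressed_kind(kind));
}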
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_common.c b/drivers/gpu/nvgpu/gk20a/ltc_common.c
new file mode 100644
index 000000000000..cbb27cc77a1e
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_common.c
@@ -0,0 +1,243 @@
+/*
+ * drivers/video/tegra/host/gk20a/ltc_common.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/dma-mapping.h>
+#include <linux/delay.h>
+
+#include "gk20a.h"
+#include "gr_gk20a.h"
+
+static int gk20a_determine_L2_size_bytes(struct gk20a *g)
+{
+ const u32 gpuid = GK20A_GPUID(g->gpu_characteristics.arch,
+ g->gpu_characteristics.impl);
+ u32 lts_per_ltc;
+ u32 ways;
+ u32 sets;
+ u32 bytes_per_line;
+ u32 active_ltcs;
+ u32 cache_size;
+
+ u32 tmp;
+ u32 active_sets_value;
+
+ tmp = gk20a_readl(g, ltc_ltc0_lts0_tstg_cfg1_r());
+ ways = hweight32(ltc_ltc0_lts0_tstg_cfg1_active_ways_v(tmp));
+
+ active_sets_value = ltc_ltc0_lts0_tstg_cfg1_active_sets_v(tmp);
+ if (active_sets_value == ltc_ltc0_lts0_tstg_cfg1_active_sets_all_v()) {
+ sets = 64;
+ } else if (active_sets_value ==
+ ltc_ltc0_lts0_tstg_cfg1_active_sets_half_v()) {
+ sets = 32;
+ } else if (active_sets_value ==
+ ltc_ltc0_lts0_tstg_cfg1_active_sets_quarter_v()) {
+ sets = 16;
+ } else {
+ dev_err(dev_from_gk20a(g),
+ "Unknown constant %u for active sets",
+ (unsigned)active_sets_value);
+ sets = 0;
+ }
+
+ active_ltcs = g->gr.num_fbps;
+
+ /* chip-specific values */
+ switch (gpuid) {
+ case GK20A_GPUID_GK20A:
+ lts_per_ltc = 1;
+ bytes_per_line = 128;
+ break;
+
+ default:
+ dev_err(dev_from_gk20a(g), "Unknown GPU id 0x%02x\n",
+ (unsigned)gpuid);
+ lts_per_ltc = 0;
+ bytes_per_line = 0;
+ }
+
+ cache_size = active_ltcs * lts_per_ltc * ways * sets * bytes_per_line;
+
+ return cache_size;
+}
+
+/*
+ * Set the maximum number of ways that can have the "EVICT_LAST" class.
+ */
+static void gk20a_ltc_set_max_ways_evict_last(struct gk20a *g, u32 max_ways)
+{
+ u32 mgmt_reg;
+
+ mgmt_reg = gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_r()) &
+ ~ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(~0);
+ mgmt_reg |= ltc_ltcs_ltss_tstg_set_mgmt_max_ways_evict_last_f(max_ways);
+
+ gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_r(), mgmt_reg);
+}
+
+/*
+ * Sets the ZBC color for the passed index.
+ */
+static void gk20a_ltc_set_zbc_color_entry(struct gk20a *g,
+ struct zbc_entry *color_val,
+ u32 index)
+{
+ u32 i;
+ u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+ ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
+
+ for (i = 0;
+ i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i),
+ color_val->color_l2[i]);
+}
+
+/*
+ * Sets the ZBC depth for the passed index.
+ */
+static void gk20a_ltc_set_zbc_depth_entry(struct gk20a *g,
+ struct zbc_entry *depth_val,
+ u32 index)
+{
+ u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+ ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(),
+ depth_val->depth);
+}
+
+/*
+ * Clear the L2 ZBC color entry for the passed index.
+ */
+static void gk20a_ltc_clear_zbc_color_entry(struct gk20a *g, u32 index)
+{
+ u32 i;
+ u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+ ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
+
+ for (i = 0;
+ i < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); i++)
+ gk20a_writel(g,
+ ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(i), 0);
+}
+
+/*
+ * Clear the L2 ZBC depth entry for the passed index.
+ */
+static void gk20a_ltc_clear_zbc_depth_entry(struct gk20a *g, u32 index)
+{
+ u32 real_index = index + GK20A_STARTOF_ZBC_TABLE;
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+ ltc_ltcs_ltss_dstg_zbc_index_address_f(real_index));
+
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
+}
+
+static int gk20a_ltc_init_zbc(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 i, j;
+
+ /* reset zbc clear */
+ for (i = 0; i < GK20A_SIZEOF_ZBC_TABLE -
+ GK20A_STARTOF_ZBC_TABLE; i++) {
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_index_r(),
+ (gk20a_readl(g, ltc_ltcs_ltss_dstg_zbc_index_r()) &
+ ~ltc_ltcs_ltss_dstg_zbc_index_address_f(~0)) |
+ ltc_ltcs_ltss_dstg_zbc_index_address_f(
+ i + GK20A_STARTOF_ZBC_TABLE));
+ for (j = 0; j < ltc_ltcs_ltss_dstg_zbc_color_clear_value__size_1_v(); j++)
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_color_clear_value_r(j), 0);
+ gk20a_writel(g, ltc_ltcs_ltss_dstg_zbc_depth_clear_value_r(), 0);
+ }
+
+ gr_gk20a_clear_zbc_table(g, gr);
+ gr_gk20a_load_zbc_default_table(g, gr);
+
+ return 0;
+}
+
+static void gk20a_ltc_init_cbc(struct gk20a *g, struct gr_gk20a *gr)
+{
+ u32 compbit_base_post_divide;
+ u64 compbit_base_post_multiply64;
+ u64 compbit_store_base_iova =
+ NV_MC_SMMU_VADDR_TRANSLATE(gr->compbit_store.base_iova);
+ u64 compbit_base_post_divide64 = (compbit_store_base_iova >>
+ ltc_ltcs_ltss_cbc_base_alignment_shift_v());
+
+ do_div(compbit_base_post_divide64, gr->num_fbps);
+ compbit_base_post_divide = u64_lo32(compbit_base_post_divide64);
+
+ compbit_base_post_multiply64 = ((u64)compbit_base_post_divide *
+ gr->num_fbps) << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
+
+ if (compbit_base_post_multiply64 < compbit_store_base_iova)
+ compbit_base_post_divide++;
+
+ gk20a_writel(g, ltc_ltcs_ltss_cbc_base_r(),
+ compbit_base_post_divide);
+
+ gk20a_dbg(gpu_dbg_info | gpu_dbg_map | gpu_dbg_pte,
+ "compbit base.pa: 0x%x,%08x cbc_base:0x%08x\n",
+ (u32)(compbit_store_base_iova >> 32),
+ (u32)(compbit_store_base_iova & 0xffffffff),
+ compbit_base_post_divide);
+}
+
+/* Flushes the compression bit cache as well as "data".
+ * Note: the name here is a bit of a misnomer. ELPG uses this
+ * internally... but ELPG doesn't have to be on to do it manually.
+ */
+static void gk20a_mm_g_elpg_flush_locked(struct gk20a *g)
+{
+ u32 data;
+ s32 retry = 100;
+
+ gk20a_dbg_fn("");
+
+ /* Make sure all previous writes are committed to the L2. There's no
+ * guarantee that writes are to DRAM. This will be a sysmembar internal
+ * to the L2. */
+ gk20a_writel(g, ltc_ltss_g_elpg_r(),
+ ltc_ltss_g_elpg_flush_pending_f());
+ do {
+ data = gk20a_readl(g, ltc_ltss_g_elpg_r());
+
+ if (ltc_ltss_g_elpg_flush_v(data) ==
+ ltc_ltss_g_elpg_flush_pending_v()) {
+ gk20a_dbg_info("g_elpg_flush 0x%x", data);
+ retry--;
+ usleep_range(20, 40);
+ } else
+ break;
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "g_elpg_flush too many retries");
+
+}
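
For concreteness, gk20a_determine_L2_size_bytes() above is a straight product. With, say, one active LTC, one LTS per LTC, 16 active ways, all 64 sets and 128-byte cache lines (illustrative numbers, not a claim about any particular SKU), it yields 1 * 1 * 16 * 64 * 128 = 131072 bytes, i.e. a 128 KiB L2.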
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
new file mode 100644
index 000000000000..08aedecd5db0
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.c
@@ -0,0 +1,203 @@
+/*
+ * drivers/video/tegra/host/gk20a/ltc_gk20a.c
+ *
+ * GK20A Graphics
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+
+#include "hw_ltc_gk20a.h"
+#include "hw_proj_gk20a.h"
+
+#include "ltc_common.c"
+
+static int gk20a_ltc_init_comptags(struct gk20a *g, struct gr_gk20a *gr)
+{
+ struct device *d = dev_from_gk20a(g);
+ DEFINE_DMA_ATTRS(attrs);
+ dma_addr_t iova;
+
+ /* max memory size (MB) to cover */
+ u32 max_size = gr->max_comptag_mem;
+ /* one tag line covers 128KB */
+ u32 max_comptag_lines = max_size << 3;
+
+ u32 hw_max_comptag_lines =
+ ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_init_v();
+
+ u32 cbc_param =
+ gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r());
+ u32 comptags_per_cacheline =
+ ltc_ltcs_ltss_cbc_param_comptags_per_cache_line_v(cbc_param);
+ u32 slices_per_fbp =
+ ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(cbc_param);
+ u32 cacheline_size =
+ 512 << ltc_ltcs_ltss_cbc_param_cache_line_size_v(cbc_param);
+
+ u32 compbit_backing_size;
+
+ gk20a_dbg_fn("");
+
+ if (max_comptag_lines == 0) {
+ gr->compbit_store.size = 0;
+ return 0;
+ }
+
+ if (max_comptag_lines > hw_max_comptag_lines)
+ max_comptag_lines = hw_max_comptag_lines;
+
+ /* no hybrid fb */
+ compbit_backing_size =
+ DIV_ROUND_UP(max_comptag_lines, comptags_per_cacheline) *
+ cacheline_size * slices_per_fbp * gr->num_fbps;
+
+ /* aligned to 2KB * num_fbps */
+ compbit_backing_size +=
+ gr->num_fbps << ltc_ltcs_ltss_cbc_base_alignment_shift_v();
+
+ /* must be a multiple of 64KB */
+ compbit_backing_size = roundup(compbit_backing_size, 64*1024);
+
+ max_comptag_lines =
+ (compbit_backing_size * comptags_per_cacheline) /
+ cacheline_size * slices_per_fbp * gr->num_fbps;
+
+ if (max_comptag_lines > hw_max_comptag_lines)
+ max_comptag_lines = hw_max_comptag_lines;
+
+ gk20a_dbg_info("compbit backing store size : %d",
+ compbit_backing_size);
+ gk20a_dbg_info("max comptag lines : %d",
+ max_comptag_lines);
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ gr->compbit_store.size = compbit_backing_size;
+ gr->compbit_store.pages = dma_alloc_attrs(d, gr->compbit_store.size,
+ &iova, GFP_KERNEL, &attrs);
+ if (!gr->compbit_store.pages) {
+ gk20a_err(dev_from_gk20a(g), "failed to allocate "
+ "backing store for compbit: size %d",
+ compbit_backing_size);
+ return -ENOMEM;
+ }
+ gr->compbit_store.base_iova = iova;
+
+ gk20a_allocator_init(&gr->comp_tags, "comptag",
+ 1, /* start */
+ max_comptag_lines - 1, /* length */
+ 1); /* align */
+
+ return 0;
+}
+
+static int gk20a_ltc_clear_comptags(struct gk20a *g, u32 min, u32 max)
+{
+ struct gr_gk20a *gr = &g->gr;
+ u32 fbp, slice, ctrl1, val;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g));
+ u32 delay = GR_IDLE_CHECK_DEFAULT;
+ u32 slices_per_fbp =
+ ltc_ltcs_ltss_cbc_param_slices_per_fbp_v(
+ gk20a_readl(g, ltc_ltcs_ltss_cbc_param_r()));
+
+ gk20a_dbg_fn("");
+
+ if (gr->compbit_store.size == 0)
+ return 0;
+
+ gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl2_r(),
+ ltc_ltcs_ltss_cbc_ctrl2_clear_lower_bound_f(min));
+ gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl3_r(),
+ ltc_ltcs_ltss_cbc_ctrl3_clear_upper_bound_f(max));
+ gk20a_writel(g, ltc_ltcs_ltss_cbc_ctrl1_r(),
+ gk20a_readl(g, ltc_ltcs_ltss_cbc_ctrl1_r()) |
+ ltc_ltcs_ltss_cbc_ctrl1_clear_active_f());
+
+ for (fbp = 0; fbp < gr->num_fbps; fbp++) {
+ for (slice = 0; slice < slices_per_fbp; slice++) {
+
+ delay = GR_IDLE_CHECK_DEFAULT;
+
+ ctrl1 = ltc_ltc0_lts0_cbc_ctrl1_r() +
+ fbp * proj_ltc_stride_v() +
+ slice * proj_lts_stride_v();
+
+ do {
+ val = gk20a_readl(g, ctrl1);
+ if (ltc_ltcs_ltss_cbc_ctrl1_clear_v(val) !=
+ ltc_ltcs_ltss_cbc_ctrl1_clear_active_v())
+ break;
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1,
+ GR_IDLE_CHECK_MAX);
+
+ } while (time_before(jiffies, end_jiffies) ||
+ !tegra_platform_is_silicon());
+
+ if (!time_before(jiffies, end_jiffies)) {
+ gk20a_err(dev_from_gk20a(g),
+ "comp tag clear timeout\n");
+ return -EBUSY;
+ }
+ }
+ }
+
+ return 0;
+}
+
+
+#ifdef CONFIG_DEBUG_FS
+static void gk20a_ltc_sync_debugfs(struct gk20a *g)
+{
+ u32 reg_f = ltc_ltcs_ltss_tstg_set_mgmt_2_l2_bypass_mode_enabled_f();
+
+ spin_lock(&g->debugfs_lock);
+ if (g->mm.ltc_enabled != g->mm.ltc_enabled_debug) {
+ u32 reg = gk20a_readl(g, ltc_ltcs_ltss_tstg_set_mgmt_2_r());
+ if (g->mm.ltc_enabled_debug)
+ /* bypass disabled (normal caching ops)*/
+ reg &= ~reg_f;
+ else
+ /* bypass enabled (no caching) */
+ reg |= reg_f;
+
+ gk20a_writel(g, ltc_ltcs_ltss_tstg_set_mgmt_2_r(), reg);
+ g->mm.ltc_enabled = g->mm.ltc_enabled_debug;
+ }
+ spin_unlock(&g->debugfs_lock);
+}
+#endif
+
+void gk20a_init_ltc(struct gpu_ops *gops)
+{
+ gops->ltc.determine_L2_size_bytes = gk20a_determine_L2_size_bytes;
+ gops->ltc.set_max_ways_evict_last = gk20a_ltc_set_max_ways_evict_last;
+ gops->ltc.init_comptags = gk20a_ltc_init_comptags;
+ gops->ltc.clear_comptags = gk20a_ltc_clear_comptags;
+ gops->ltc.set_zbc_color_entry = gk20a_ltc_set_zbc_color_entry;
+ gops->ltc.set_zbc_depth_entry = gk20a_ltc_set_zbc_depth_entry;
+ gops->ltc.clear_zbc_color_entry = gk20a_ltc_clear_zbc_color_entry;
+ gops->ltc.clear_zbc_depth_entry = gk20a_ltc_clear_zbc_depth_entry;
+ gops->ltc.init_zbc = gk20a_ltc_init_zbc;
+ gops->ltc.init_cbc = gk20a_ltc_init_cbc;
+#ifdef CONFIG_DEBUG_FS
+ gops->ltc.sync_debugfs = gk20a_ltc_sync_debugfs;
+#endif
+ gops->ltc.elpg_flush = gk20a_mm_g_elpg_flush_locked;
+}
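
To make the sizing in gk20a_ltc_init_comptags() concrete: max_comptag_mem is in MB and one comptag line covers 128 KB, so max_size << 3 gives 8 lines per MB (512 MB of coverage needs 4096 lines). With illustrative CBC parameters, say 32 comptags per cacheline, 1024-byte cachelines, 2 slices per FBP and 1 FBP, the backing store becomes DIV_ROUND_UP(4096, 32) * 1024 * 2 * 1 = 262144 bytes, plus 2 KB of per-FBP alignment slack, rounded up to the next 64 KB multiple, i.e. 320 KB. The real parameters come from ltc_ltcs_ltss_cbc_param_r(), so these numbers are examples only.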
diff --git a/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h
new file mode 100644
index 000000000000..208811b256cc
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/ltc_gk20a.h
@@ -0,0 +1,21 @@
+/*
+ * GK20A L2
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _NVHOST_GK20A_LTC
+#define _NVHOST_GK20A_LTC
+struct gk20a;
+
+void gk20a_init_ltc(struct gpu_ops *gops);
+#endif
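
A hypothetical caller sketch showing how the HAL table populated by gk20a_init_ltc() is consumed. The ltc member names mirror the assignments above; the surrounding gpu_ops/gk20a definitions are assumed to come from gk20a.h, and error handling is kept minimal:

static int example_setup_ltc(struct gk20a *g)
{
        int err;

        gk20a_init_ltc(&g->ops);

        err = g->ops.ltc.init_comptags(g, &g->gr);
        if (err)
                return err;

        g->ops.ltc.init_cbc(g, &g->gr);
        return 0;
}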
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.c b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
new file mode 100644
index 000000000000..b22df5e87de6
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.c
@@ -0,0 +1,2984 @@
+/*
+ * drivers/video/tegra/host/gk20a/mm_gk20a.c
+ *
+ * GK20A memory management
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/delay.h>
+#include <linux/highmem.h>
+#include <linux/log2.h>
+#include <linux/nvhost.h>
+#include <linux/pm_runtime.h>
+#include <linux/scatterlist.h>
+#include <linux/nvmap.h>
+#include <linux/tegra-soc.h>
+#include <linux/vmalloc.h>
+#include <linux/dma-buf.h>
+#include <asm/cacheflush.h>
+
+#include "gk20a.h"
+#include "mm_gk20a.h"
+#include "hw_gmmu_gk20a.h"
+#include "hw_fb_gk20a.h"
+#include "hw_bus_gk20a.h"
+#include "hw_ram_gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_flush_gk20a.h"
+#include "hw_ltc_gk20a.h"
+
+#include "kind_gk20a.h"
+
+#ifdef CONFIG_ARM64
+#define outer_flush_range(a, b)
+#define __cpuc_flush_dcache_area __flush_dcache_area
+#endif
+
+/*
+ * GPU mapping life cycle
+ * ======================
+ *
+ * Kernel mappings
+ * ---------------
+ *
+ * Kernel mappings are created through vm.map(..., false):
+ *
+ * - Mappings to the same allocations are reused and refcounted.
+ * - This path does not support deferred unmapping (i.e. kernel must wait for
+ * all hw operations on the buffer to complete before unmapping).
+ * - References to dmabuf are owned and managed by the (kernel) clients of
+ * the gk20a_vm layer.
+ *
+ *
+ * User space mappings
+ * -------------------
+ *
+ * User space mappings are created through as.map_buffer -> vm.map(..., true):
+ *
+ * - Mappings to the same allocations are reused and refcounted.
+ * - This path supports deferred unmapping (i.e. we delay the actual unmapping
+ * until all hw operations have completed).
+ * - References to dmabuf are owned and managed by the vm_gk20a
+ * layer itself. vm.map acquires these refs, and sets
+ * mapped_buffer->own_mem_ref to record that we must release the refs when we
+ * actually unmap.
+ *
+ */
+
+static inline int vm_aspace_id(struct vm_gk20a *vm)
+{
+ /* -1 is bar1 or pmu, etc. */
+ return vm->as_share ? vm->as_share->id : -1;
+}
+static inline u32 hi32(u64 f)
+{
+ return (u32)(f >> 32);
+}
+static inline u32 lo32(u64 f)
+{
+ return (u32)(f & 0xffffffff);
+}
+
+#define FLUSH_CPU_DCACHE(va, pa, size) \
+ do { \
+ __cpuc_flush_dcache_area((void *)(va), (size_t)(size)); \
+ outer_flush_range(pa, pa + (size_t)(size)); \
+ } while (0)
+
+static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
+static struct mapped_buffer_node *find_mapped_buffer_locked(
+ struct rb_root *root, u64 addr);
+static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
+ struct rb_root *root, struct dma_buf *dmabuf,
+ u32 kind);
+static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
+ enum gmmu_pgsz_gk20a pgsz_idx,
+ struct sg_table *sgt,
+ u64 first_vaddr, u64 last_vaddr,
+ u8 kind_v, u32 ctag_offset, bool cacheable,
+ int rw_flag);
+static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
+static void gk20a_vm_remove_support(struct vm_gk20a *vm);
+
+
+/* note: keep the page sizes sorted lowest to highest here */
+static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
+static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 };
+static const u64 gmmu_page_offset_masks[gmmu_nr_page_sizes] = { 0xfffLL,
+ 0x1ffffLL };
+static const u64 gmmu_page_masks[gmmu_nr_page_sizes] = { ~0xfffLL, ~0x1ffffLL };
+
+struct gk20a_comptags {
+ u32 offset;
+ u32 lines;
+};
+
+struct gk20a_dmabuf_priv {
+ struct mutex lock;
+
+ struct gk20a_allocator *comptag_allocator;
+ struct gk20a_comptags comptags;
+
+ struct dma_buf_attachment *attach;
+ struct sg_table *sgt;
+
+ int pin_count;
+};
+
+static void gk20a_mm_delete_priv(void *_priv)
+{
+ struct gk20a_dmabuf_priv *priv = _priv;
+ if (!priv)
+ return;
+
+ if (priv->comptags.lines) {
+ BUG_ON(!priv->comptag_allocator);
+ priv->comptag_allocator->free(priv->comptag_allocator,
+ priv->comptags.offset,
+ priv->comptags.lines);
+ }
+
+ kfree(priv);
+}
+
+struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
+{
+ struct gk20a_dmabuf_priv *priv;
+
+ priv = dma_buf_get_drvdata(dmabuf, dev);
+ if (WARN_ON(!priv))
+ return ERR_PTR(-EINVAL);
+
+ mutex_lock(&priv->lock);
+
+ if (priv->pin_count == 0) {
+ priv->attach = dma_buf_attach(dmabuf, dev);
+ if (IS_ERR(priv->attach)) {
+ mutex_unlock(&priv->lock);
+ return (struct sg_table *)priv->attach;
+ }
+
+ priv->sgt = dma_buf_map_attachment(priv->attach,
+ DMA_BIDIRECTIONAL);
+ if (IS_ERR(priv->sgt)) {
+ dma_buf_detach(dmabuf, priv->attach);
+ mutex_unlock(&priv->lock);
+ return priv->sgt;
+ }
+ }
+
+ priv->pin_count++;
+ mutex_unlock(&priv->lock);
+ return priv->sgt;
+}
+
+void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
+ struct sg_table *sgt)
+{
+ struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
+ dma_addr_t dma_addr;
+
+ if (IS_ERR(priv) || !priv)
+ return;
+
+ mutex_lock(&priv->lock);
+ WARN_ON(priv->sgt != sgt);
+ priv->pin_count--;
+ WARN_ON(priv->pin_count < 0);
+ dma_addr = sg_dma_address(priv->sgt->sgl);
+ if (priv->pin_count == 0) {
+ dma_buf_unmap_attachment(priv->attach, priv->sgt,
+ DMA_BIDIRECTIONAL);
+ dma_buf_detach(dmabuf, priv->attach);
+ }
+ mutex_unlock(&priv->lock);
+}
+
+
+static void gk20a_get_comptags(struct device *dev,
+ struct dma_buf *dmabuf,
+ struct gk20a_comptags *comptags)
+{
+ struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
+
+ if (!comptags)
+ return;
+
+ if (!priv) {
+ comptags->lines = 0;
+ comptags->offset = 0;
+ return;
+ }
+
+ *comptags = priv->comptags;
+}
+
+static int gk20a_alloc_comptags(struct device *dev,
+ struct dma_buf *dmabuf,
+ struct gk20a_allocator *allocator,
+ int lines)
+{
+ struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
+ u32 offset = 0;
+ int err;
+
+ if (!priv)
+ return -ENOSYS;
+
+ if (!lines)
+ return -EINVAL;
+
+ /* store the allocator so we can use it when we free the ctags */
+ priv->comptag_allocator = allocator;
+ err = allocator->alloc(allocator, &offset, lines);
+ if (!err) {
+ priv->comptags.lines = lines;
+ priv->comptags.offset = offset;
+ }
+ return err;
+}
+
+static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
+{
+ gk20a_dbg_fn("");
+ if (g->ops.fb.reset)
+ g->ops.fb.reset(g);
+
+ if (g->ops.fb.init_fs_state)
+ g->ops.fb.init_fs_state(g);
+
+ return 0;
+}
+
+void gk20a_remove_mm_support(struct mm_gk20a *mm)
+{
+ struct gk20a *g = mm->g;
+ struct device *d = dev_from_gk20a(g);
+ struct vm_gk20a *vm = &mm->bar1.vm;
+ struct inst_desc *inst_block = &mm->bar1.inst_block;
+
+ gk20a_dbg_fn("");
+
+ if (inst_block->cpuva)
+ dma_free_coherent(d, inst_block->size,
+ inst_block->cpuva, inst_block->iova);
+ inst_block->cpuva = NULL;
+ inst_block->iova = 0;
+
+ gk20a_vm_remove_support(vm);
+}
+
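+/*
+ * One-time software init for the MM unit: set up locks, PDE stride and
+ * per-page-size page table sizing, the channel VM size, and the BAR1 VM.
+ */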
+int gk20a_init_mm_setup_sw(struct gk20a *g)
+{
+ struct mm_gk20a *mm = &g->mm;
+ int i;
+
+ gk20a_dbg_fn("");
+
+ if (mm->sw_ready) {
+ gk20a_dbg_fn("skip init");
+ return 0;
+ }
+
+ mm->g = g;
+ mutex_init(&mm->tlb_lock);
+ mutex_init(&mm->l2_op_lock);
+ mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
+ mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
+ mm->pde_stride = mm->big_page_size << 10;
+ mm->pde_stride_shift = ilog2(mm->pde_stride);
+ BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */
+
+ for (i = 0; i < ARRAY_SIZE(gmmu_page_sizes); i++) {
+
+ u32 num_ptes, pte_space, num_pages;
+
+ /* assuming "full" page tables */
+ num_ptes = mm->pde_stride / gmmu_page_sizes[i];
+
+ pte_space = num_ptes * gmmu_pte__size_v();
+ /* allocate whole pages */
+ pte_space = roundup(pte_space, PAGE_SIZE);
+
+ num_pages = pte_space / PAGE_SIZE;
+ /* make sure "order" is viable */
+ BUG_ON(!is_power_of_2(num_pages));
+
+ mm->page_table_sizing[i].num_ptes = num_ptes;
+ mm->page_table_sizing[i].order = ilog2(num_pages);
+ }
+
+ /*TBD: make channel vm size configurable */
+ mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
+
+ gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
+
+ gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
+ gmmu_page_sizes[gmmu_page_size_small] >> 10,
+ (mm->page_table_sizing[gmmu_page_size_small].num_ptes *
+ gmmu_pte__size_v()) >> 10);
+
+ gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
+ gmmu_page_sizes[gmmu_page_size_big] >> 10,
+ (mm->page_table_sizing[gmmu_page_size_big].num_ptes *
+ gmmu_pte__size_v()) >> 10);
+
+
+ gk20a_init_bar1_vm(mm);
+
+ mm->remove_support = gk20a_remove_mm_support;
+ mm->sw_ready = true;
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+/* make sure gk20a_init_mm_support is called before */
+static int gk20a_init_mm_setup_hw(struct gk20a *g)
+{
+ struct mm_gk20a *mm = &g->mm;
+ struct inst_desc *inst_block = &mm->bar1.inst_block;
+ phys_addr_t inst_pa = inst_block->cpu_pa;
+
+ gk20a_dbg_fn("");
+
+ /* set large page size in fb
+ * note this is very early on, can we defer it ? */
+ {
+ u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
+
+ if (gmmu_page_sizes[gmmu_page_size_big] == SZ_128K)
+ fb_mmu_ctrl = (fb_mmu_ctrl &
+ ~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
+ fb_mmu_ctrl_vm_pg_size_128kb_f();
+ else
+ BUG_ON(1); /* no support/testing for larger ones yet */
+
+ gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
+ }
+
+ inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
+ gk20a_dbg_info("bar1 inst block ptr: 0x%08x", (u32)inst_pa);
+
+ /* this is very early in init... can we defer this? */
+ {
+ gk20a_writel(g, bus_bar1_block_r(),
+ bus_bar1_block_target_vid_mem_f() |
+ bus_bar1_block_mode_virtual_f() |
+ bus_bar1_block_ptr_f(inst_pa));
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+int gk20a_init_mm_support(struct gk20a *g)
+{
+ int err;
+
+ err = gk20a_init_mm_reset_enable_hw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_mm_setup_sw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_mm_setup_hw(g);
+ if (err)
+ return err;
+
+ return err;
+}
+
+#ifdef CONFIG_GK20A_PHYS_PAGE_TABLES
+static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
+ void **handle,
+ struct sg_table **sgt,
+ size_t *size)
+{
+ u32 num_pages = 1 << order;
+ u32 len = num_pages * PAGE_SIZE;
+ int err;
+ struct page *pages;
+
+ gk20a_dbg_fn("");
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ if (!pages) {
+ gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
+ goto err_out;
+ }
+ *sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
+ if (!(*sgt)) {
+ gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
+ goto err_alloced;
+ }
+ err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
+ if (err) {
+ gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
+ goto err_sg_table;
+ }
+ sg_set_page((*sgt)->sgl, pages, len, 0);
+ *handle = page_address(pages);
+ memset(*handle, 0, len);
+ *size = len;
+ FLUSH_CPU_DCACHE(*handle, sg_phys((*sgt)->sgl), len);
+
+ return 0;
+
+err_sg_table:
+ kfree(*sgt);
+err_alloced:
+ __free_pages(pages, order);
+err_out:
+ return -ENOMEM;
+}
+
+static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
+ struct sg_table *sgt, u32 order,
+ size_t size)
+{
+ gk20a_dbg_fn("");
+ BUG_ON(sgt == NULL);
+ free_pages((unsigned long)handle, order);
+ sg_free_table(sgt);
+ kfree(sgt);
+}
+
+static int map_gmmu_pages(void *handle, struct sg_table *sgt,
+ void **va, size_t size)
+{
+ FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
+ *va = handle;
+ return 0;
+}
+
+static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
+{
+ FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
+}
+#else
+static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
+ void **handle,
+ struct sg_table **sgt,
+ size_t *size)
+{
+ struct device *d = dev_from_vm(vm);
+ u32 num_pages = 1 << order;
+ u32 len = num_pages * PAGE_SIZE;
+ dma_addr_t iova;
+ DEFINE_DMA_ATTRS(attrs);
+ struct page **pages;
+ int err = 0;
+
+ gk20a_dbg_fn("");
+
+ *size = len;
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
+ if (!pages) {
+ gk20a_err(d, "memory allocation failed\n");
+ goto err_out;
+ }
+
+ err = gk20a_get_sgtable_from_pages(d, sgt, pages,
+ iova, len);
+ if (err) {
+ gk20a_err(d, "sgt allocation failed\n");
+ goto err_free;
+ }
+
+ *handle = (void *)pages;
+
+ return 0;
+
+err_free:
+ dma_free_attrs(d, len, pages, iova, &attrs);
+ pages = NULL;
+ iova = 0;
+err_out:
+ return -ENOMEM;
+}
+
+static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
+ struct sg_table *sgt, u32 order,
+ size_t size)
+{
+ struct device *d = dev_from_vm(vm);
+ u64 iova;
+ DEFINE_DMA_ATTRS(attrs);
+ struct page **pages = (struct page **)handle;
+
+ gk20a_dbg_fn("");
+ BUG_ON(sgt == NULL);
+
+ iova = sg_dma_address(sgt->sgl);
+
+ gk20a_free_sgtable(&sgt);
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+ dma_free_attrs(d, size, pages, iova, &attrs);
+ pages = NULL;
+ iova = 0;
+}
+
+static int map_gmmu_pages(void *handle, struct sg_table *sgt,
+ void **kva, size_t size)
+{
+ int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct page **pages = (struct page **)handle;
+ gk20a_dbg_fn("");
+
+ *kva = vmap(pages, count, 0, pgprot_dmacoherent(PAGE_KERNEL));
+ if (!(*kva))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
+{
+ gk20a_dbg_fn("");
+ vunmap(va);
+}
+#endif
+
+/* allocate a phys contig region big enough for a full
+ * sized gmmu page table for the given gmmu_page_size.
+ * the whole range is zeroed so it's "invalid"/will fault
+ */
+
+static int zalloc_gmmu_page_table_gk20a(struct vm_gk20a *vm,
+ enum gmmu_pgsz_gk20a gmmu_pgsz_idx,
+ struct page_table_gk20a *pte)
+{
+ int err;
+ u32 pte_order;
+ void *handle = NULL;
+ struct sg_table *sgt;
+ size_t size;
+
+ gk20a_dbg_fn("");
+
+ /* allocate enough pages for the table */
+ pte_order = vm->mm->page_table_sizing[gmmu_pgsz_idx].order;
+
+ err = alloc_gmmu_pages(vm, pte_order, &handle, &sgt, &size);
+ if (err)
+ return err;
+
+ gk20a_dbg(gpu_dbg_pte, "pte = 0x%p, addr=%08llx, size %d",
+ pte, gk20a_mm_iova_addr(sgt->sgl), pte_order);
+
+ pte->ref = handle;
+ pte->sgt = sgt;
+ pte->size = size;
+
+ return 0;
+}
+
+/* given address range (inclusive) determine the pdes crossed */
+static inline void pde_range_from_vaddr_range(struct vm_gk20a *vm,
+ u64 addr_lo, u64 addr_hi,
+ u32 *pde_lo, u32 *pde_hi)
+{
+ *pde_lo = (u32)(addr_lo >> vm->mm->pde_stride_shift);
+ *pde_hi = (u32)(addr_hi >> vm->mm->pde_stride_shift);
+ gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
+ addr_lo, addr_hi, vm->mm->pde_stride_shift);
+ gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
+ *pde_lo, *pde_hi);
+}
+
+static inline u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
+{
+ return (u32 *) (((u8 *)vm->pdes.kv) + i*gmmu_pde__size_v());
+}
+
+static inline u32 pte_index_from_vaddr(struct vm_gk20a *vm,
+ u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
+{
+ u32 ret;
+ /* mask off pde part */
+ addr = addr & ((((u64)1) << vm->mm->pde_stride_shift) - ((u64)1));
+ /* shift over to get pte index. note assumption that pte index
+ * doesn't leak over into the high 32b */
+ ret = (u32)(addr >> gmmu_page_shifts[pgsz_idx]);
+
+ gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
+ return ret;
+}
+
+static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
+ u32 *pte_offset)
+{
+ /* ptes are 8B regardless of pagesize */
+ /* pte space pages are 4KB. so 512 ptes per 4KB page*/
+ *pte_page = i >> 9;
+
+ /* this offset is a pte offset, not a byte offset */
+ *pte_offset = i & ((1<<9)-1);
+
+ gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
+ i, *pte_page, *pte_offset);
+}
+
+
+/*
+ * given a pde index/page table number make sure it has
+ * backing store and if not go ahead allocate it and
+ * record it in the appropriate pde
+ */
+static int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
+ u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
+{
+ int err;
+ struct page_table_gk20a *pte =
+ vm->pdes.ptes[gmmu_pgsz_idx] + i;
+
+ gk20a_dbg_fn("");
+
+ /* if it's already in place it's valid */
+ if (pte->ref)
+ return 0;
+
+ gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
+ gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
+
+ err = zalloc_gmmu_page_table_gk20a(vm, gmmu_pgsz_idx, pte);
+ if (err)
+ return err;
+
+ /* rewrite pde */
+ update_gmmu_pde_locked(vm, i);
+
+ return 0;
+}
+
+static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
+ u64 addr)
+{
+ struct vm_reserved_va_node *va_node;
+ list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
+ if (addr >= va_node->vaddr_start &&
+ addr < (u64)va_node->vaddr_start + (u64)va_node->size)
+ return va_node;
+
+ return NULL;
+}
+
+int gk20a_vm_get_buffers(struct vm_gk20a *vm,
+ struct mapped_buffer_node ***mapped_buffers,
+ int *num_buffers)
+{
+ struct mapped_buffer_node *mapped_buffer;
+ struct mapped_buffer_node **buffer_list;
+ struct rb_node *node;
+ int i = 0;
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ buffer_list = kzalloc(sizeof(*buffer_list) *
+ vm->num_user_mapped_buffers, GFP_KERNEL);
+ if (!buffer_list) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ return -ENOMEM;
+ }
+
+ node = rb_first(&vm->mapped_buffers);
+ while (node) {
+ mapped_buffer =
+ container_of(node, struct mapped_buffer_node, node);
+ if (mapped_buffer->user_mapped) {
+ buffer_list[i] = mapped_buffer;
+ kref_get(&mapped_buffer->ref);
+ i++;
+ }
+ node = rb_next(&mapped_buffer->node);
+ }
+
+ BUG_ON(i != vm->num_user_mapped_buffers);
+
+ *num_buffers = vm->num_user_mapped_buffers;
+ *mapped_buffers = buffer_list;
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ return 0;
+}
+
+static void gk20a_vm_unmap_locked_kref(struct kref *ref)
+{
+ struct mapped_buffer_node *mapped_buffer =
+ container_of(ref, struct mapped_buffer_node, ref);
+ gk20a_vm_unmap_locked(mapped_buffer);
+}
+
+void gk20a_vm_put_buffers(struct vm_gk20a *vm,
+ struct mapped_buffer_node **mapped_buffers,
+ int num_buffers)
+{
+ int i;
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ for (i = 0; i < num_buffers; ++i)
+ kref_put(&mapped_buffers[i]->ref,
+ gk20a_vm_unmap_locked_kref);
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ kfree(mapped_buffers);
+}
+
+static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
+{
+ struct device *d = dev_from_vm(vm);
+ int retries;
+ struct mapped_buffer_node *mapped_buffer;
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
+ if (!mapped_buffer) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
+ return;
+ }
+
+ if (mapped_buffer->flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ retries = 1000;
+ while (retries) {
+ if (atomic_read(&mapped_buffer->ref.refcount) == 1)
+ break;
+ retries--;
+ udelay(50);
+ }
+ if (!retries)
+ gk20a_err(d, "sync-unmap failed on 0x%llx",
+ offset);
+ mutex_lock(&vm->update_gmmu_lock);
+ }
+
+ mapped_buffer->user_mapped--;
+ if (mapped_buffer->user_mapped == 0)
+ vm->num_user_mapped_buffers--;
+ kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
+
+ mutex_unlock(&vm->update_gmmu_lock);
+}
+
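+/*
+ * Carve a virtual address range of 'size' bytes (rounded up to the page
+ * size of 'gmmu_pgsz_idx') out of the VM's page-granular allocator.
+ * Returns the byte offset of the range, or 0 on failure.
+ */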
+static u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
+ u64 size,
+ enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
+{
+ struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
+ int err;
+ u64 offset;
+ u32 start_page_nr = 0, num_pages;
+ u64 gmmu_page_size = gmmu_page_sizes[gmmu_pgsz_idx];
+
+ if (gmmu_pgsz_idx >= ARRAY_SIZE(gmmu_page_sizes)) {
+ dev_warn(dev_from_vm(vm),
+ "invalid page size requested in gk20a vm alloc");
+ return 0;
+ }
+
+ if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
+ dev_warn(dev_from_vm(vm),
+ "unsupportd page size requested");
+ return -EINVAL;
+
+ }
+
+ /* be certain we round up to gmmu_page_size if needed */
+ /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
+ size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
+
+ gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
+ gmmu_page_sizes[gmmu_pgsz_idx]>>10);
+
+ /* The vma allocator represents page accounting. */
+ num_pages = size >> gmmu_page_shifts[gmmu_pgsz_idx];
+
+ err = vma->alloc(vma, &start_page_nr, num_pages);
+
+ if (err) {
+ gk20a_err(dev_from_vm(vm),
+ "%s oom: sz=0x%llx", vma->name, size);
+ return 0;
+ }
+
+ offset = (u64)start_page_nr << gmmu_page_shifts[gmmu_pgsz_idx];
+ gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
+
+ return offset;
+}
+
+static int gk20a_vm_free_va(struct vm_gk20a *vm,
+ u64 offset, u64 size,
+ enum gmmu_pgsz_gk20a pgsz_idx)
+{
+ struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
+ u32 page_size = gmmu_page_sizes[pgsz_idx];
+ u32 page_shift = gmmu_page_shifts[pgsz_idx];
+ u32 start_page_nr, num_pages;
+ int err;
+
+ gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
+ vma->name, offset, size);
+
+ start_page_nr = (u32)(offset >> page_shift);
+ num_pages = (u32)((size + page_size - 1) >> page_shift);
+
+ err = vma->free(vma, start_page_nr, num_pages);
+ if (err) {
+ gk20a_err(dev_from_vm(vm),
+ "not found: offset=0x%llx, sz=0x%llx",
+ offset, size);
+ }
+
+ return err;
+}
+
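+/*
+ * Insert a mapped buffer into the VM's rb-tree, keyed by GPU virtual
+ * address. Duplicate addresses are rejected.
+ */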
+static int insert_mapped_buffer(struct rb_root *root,
+ struct mapped_buffer_node *mapped_buffer)
+{
+ struct rb_node **new_node = &(root->rb_node), *parent = NULL;
+
+ /* Figure out where to put new node */
+ while (*new_node) {
+ struct mapped_buffer_node *cmp_with =
+ container_of(*new_node, struct mapped_buffer_node,
+ node);
+
+ parent = *new_node;
+
+ if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
+ new_node = &((*new_node)->rb_left);
+ else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
+ new_node = &((*new_node)->rb_right);
+ else
+ return -EINVAL; /* no fair dup'ing */
+ }
+
+ /* Add new node and rebalance tree. */
+ rb_link_node(&mapped_buffer->node, parent, new_node);
+ rb_insert_color(&mapped_buffer->node, root);
+
+ return 0;
+}
+
+static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
+ struct rb_root *root, struct dma_buf *dmabuf,
+ u32 kind)
+{
+ struct rb_node *node = rb_first(root);
+ while (node) {
+ struct mapped_buffer_node *mapped_buffer =
+ container_of(node, struct mapped_buffer_node, node);
+ if (mapped_buffer->dmabuf == dmabuf &&
+ kind == mapped_buffer->kind)
+ return mapped_buffer;
+ node = rb_next(&mapped_buffer->node);
+ }
+ return NULL;
+}
+
+static struct mapped_buffer_node *find_mapped_buffer_locked(
+ struct rb_root *root, u64 addr)
+{
+
+ struct rb_node *node = root->rb_node;
+ while (node) {
+ struct mapped_buffer_node *mapped_buffer =
+ container_of(node, struct mapped_buffer_node, node);
+ if (mapped_buffer->addr > addr) /* u64 cmp */
+ node = node->rb_left;
+ else if (mapped_buffer->addr != addr) /* u64 cmp */
+ node = node->rb_right;
+ else
+ return mapped_buffer;
+ }
+ return NULL;
+}
+
+static struct mapped_buffer_node *find_mapped_buffer_range_locked(
+ struct rb_root *root, u64 addr)
+{
+ struct rb_node *node = root->rb_node;
+ while (node) {
+ struct mapped_buffer_node *m =
+ container_of(node, struct mapped_buffer_node, node);
+ if (m->addr <= addr && m->addr + m->size > addr)
+ return m;
+ else if (m->addr > addr) /* u64 cmp */
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+#define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
+
+struct buffer_attrs {
+ struct sg_table *sgt;
+ u64 size;
+ u64 align;
+ u32 ctag_offset;
+ u32 ctag_lines;
+ int pgsz_idx;
+ u8 kind_v;
+ u8 uc_kind_v;
+};
+
+static void gmmu_select_page_size(struct buffer_attrs *bfr)
+{
+ int i;
+ /* choose the biggest first (top->bottom) */
+ for (i = (gmmu_nr_page_sizes-1); i >= 0; i--)
+ if (!(gmmu_page_offset_masks[i] & bfr->align)) {
+ /* would like to add this too but nvmap returns the
+ * original requested size not the allocated size.
+ * (!(gmmu_page_offset_masks[i] & bfr->size)) */
+ bfr->pgsz_idx = i;
+ break;
+ }
+}
+
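+/*
+ * Validate the requested kind, find an uncompressed fallback kind, and
+ * work out how many comptag lines the buffer needs. Compression is only
+ * kept when the 128KB page size is in use; otherwise we silently fall
+ * back to the uncompressed kind.
+ */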
+static int setup_buffer_kind_and_compression(struct device *d,
+ u32 flags,
+ struct buffer_attrs *bfr,
+ enum gmmu_pgsz_gk20a pgsz_idx)
+{
+ bool kind_compressible;
+
+ if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
+ bfr->kind_v = gmmu_pte_kind_pitch_v();
+
+ if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
+ gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
+ return -EINVAL;
+ }
+
+ bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
+ /* find a suitable uncompressed kind if it becomes necessary later */
+ kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
+ if (kind_compressible) {
+ bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
+ if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
+ /* shouldn't happen, but it is worth cross-checking */
+ gk20a_err(d, "comptag kind 0x%x can't be"
+ " downgraded to uncompressed kind",
+ bfr->kind_v);
+ return -EINVAL;
+ }
+ }
+ /* comptags only supported for suitable kinds, 128KB pagesize */
+ if (unlikely(kind_compressible &&
+ (gmmu_page_sizes[pgsz_idx] != 128*1024))) {
+ /*
+ gk20a_warn(d, "comptags specified"
+ " but pagesize being used doesn't support it");*/
+ /* it is safe to fall back to uncompressed as
+ functionality is not harmed */
+ bfr->kind_v = bfr->uc_kind_v;
+ kind_compressible = false;
+ }
+ if (kind_compressible)
+ bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
+ COMP_TAG_LINE_SIZE_SHIFT;
+ else
+ bfr->ctag_lines = 0;
+
+ return 0;
+}
+
+static int validate_fixed_buffer(struct vm_gk20a *vm,
+ struct buffer_attrs *bfr,
+ u64 map_offset)
+{
+ struct device *dev = dev_from_vm(vm);
+ struct vm_reserved_va_node *va_node;
+ struct mapped_buffer_node *buffer;
+
+ if (map_offset & gmmu_page_offset_masks[bfr->pgsz_idx]) {
+ gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
+ map_offset);
+ return -EINVAL;
+ }
+
+ /* find the space reservation */
+ va_node = addr_to_reservation(vm, map_offset);
+ if (!va_node) {
+ gk20a_warn(dev, "fixed offset mapping without space allocation");
+ return -EINVAL;
+ }
+
+ /* check that this mapping does not collide with existing
+ * mappings by checking the overlap between the current
+ * buffer and all other mapped buffers */
+
+ list_for_each_entry(buffer,
+ &va_node->va_buffers_list, va_buffers_list) {
+ s64 begin = max(buffer->addr, map_offset);
+ s64 end = min(buffer->addr +
+ buffer->size, map_offset + bfr->size);
+ if (end - begin > 0) {
+ gk20a_warn(dev, "overlapping buffer map requested");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
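+/*
+ * Core GMMU map helper; called with update_gmmu_lock held. Allocates a VA
+ * when map_offset is 0, makes sure backing page tables exist for every PDE
+ * covered by the range, then writes the PTEs. Returns the GPU virtual
+ * address, or 0 on failure.
+ */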
+static u64 __locked_gmmu_map(struct vm_gk20a *vm,
+ u64 map_offset,
+ struct sg_table *sgt,
+ u64 size,
+ int pgsz_idx,
+ u8 kind_v,
+ u32 ctag_offset,
+ u32 flags,
+ int rw_flag)
+{
+ int err = 0, i = 0;
+ u32 pde_lo, pde_hi;
+ struct device *d = dev_from_vm(vm);
+
+ /* Allocate (or validate when map_offset != 0) the virtual address. */
+ if (!map_offset) {
+ map_offset = gk20a_vm_alloc_va(vm, size,
+ pgsz_idx);
+ if (!map_offset) {
+ gk20a_err(d, "failed to allocate va space");
+ err = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ pde_range_from_vaddr_range(vm,
+ map_offset,
+ map_offset + size - 1,
+ &pde_lo, &pde_hi);
+
+ /* mark the addr range valid (but with 0 phys addr, which will fault) */
+ for (i = pde_lo; i <= pde_hi; i++) {
+ err = validate_gmmu_page_table_gk20a_locked(vm, i,
+ pgsz_idx);
+ if (err) {
+ gk20a_err(d, "failed to validate page table %d: %d",
+ i, err);
+ goto fail;
+ }
+ }
+
+ err = update_gmmu_ptes_locked(vm, pgsz_idx,
+ sgt,
+ map_offset, map_offset + size - 1,
+ kind_v,
+ ctag_offset,
+ flags &
+ NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
+ rw_flag);
+ if (err) {
+ gk20a_err(d, "failed to update ptes on map");
+ goto fail;
+ }
+
+ return map_offset;
+ fail:
+ gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
+ return 0;
+}
+
+static void __locked_gmmu_unmap(struct vm_gk20a *vm,
+ u64 vaddr,
+ u64 size,
+ int pgsz_idx,
+ bool va_allocated,
+ int rw_flag)
+{
+ int err = 0;
+ struct gk20a *g = gk20a_from_vm(vm);
+
+ if (va_allocated) {
+ err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
+ if (err) {
+ dev_err(dev_from_vm(vm),
+ "failed to free va");
+ return;
+ }
+ }
+
+ /* unmap here needs to know the page size we assigned at mapping */
+ err = update_gmmu_ptes_locked(vm,
+ pgsz_idx,
+ 0, /* n/a for unmap */
+ vaddr,
+ vaddr + size - 1,
+ 0, 0, false /* n/a for unmap */,
+ rw_flag);
+ if (err)
+ dev_err(dev_from_vm(vm),
+ "failed to update gmmu ptes on unmap");
+
+ /* detect which if any pdes/ptes can now be released */
+
+ /* flush l2 so any dirty lines are written out *now*.
+ * also as we could potentially be switching this buffer
+ * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
+ * some point in the future we need to invalidate l2. e.g. switching
+ * from a render buffer unmap (here) to later using the same memory
+ * for gmmu ptes. note the positioning of this relative to any smmu
+ * unmapping (below). */
+
+ gk20a_mm_l2_flush(g, true);
+}
+
+static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
+ struct dma_buf *dmabuf,
+ u64 offset_align,
+ u32 flags,
+ int kind,
+ struct sg_table **sgt,
+ bool user_mapped,
+ int rw_flag)
+{
+ struct mapped_buffer_node *mapped_buffer = NULL;
+
+ mapped_buffer =
+ find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
+ dmabuf, kind);
+ if (!mapped_buffer)
+ return 0;
+
+ if (mapped_buffer->flags != flags)
+ return 0;
+
+ if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
+ mapped_buffer->addr != offset_align)
+ return 0;
+
+ BUG_ON(mapped_buffer->vm != vm);
+
+ /* mark the buffer as used */
+ if (user_mapped) {
+ if (mapped_buffer->user_mapped == 0)
+ vm->num_user_mapped_buffers++;
+ mapped_buffer->user_mapped++;
+
+ /* If the mapping comes from user space, we own
+ * the handle ref. Since we reuse an
+ * existing mapping here, we need to give back those
+ * refs once in order not to leak.
+ */
+ if (mapped_buffer->own_mem_ref)
+ dma_buf_put(mapped_buffer->dmabuf);
+ else
+ mapped_buffer->own_mem_ref = true;
+ }
+ kref_get(&mapped_buffer->ref);
+
+ gk20a_dbg(gpu_dbg_map,
+ "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
+ "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
+ "own_mem_ref=%d user_mapped=%d",
+ vm_aspace_id(vm), mapped_buffer->pgsz_idx,
+ mapped_buffer->flags,
+ mapped_buffer->ctag_lines,
+ mapped_buffer->ctag_offset,
+ hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
+ hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
+ lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
+ hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
+ lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
+ mapped_buffer->own_mem_ref, user_mapped);
+
+ if (sgt)
+ *sgt = mapped_buffer->sgt;
+ return mapped_buffer->addr;
+}
+
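+/*
+ * Map a dma-buf into the VM. An existing compatible mapping is reused when
+ * possible; otherwise the buffer is pinned, page size/kind/comptags are
+ * chosen, the GMMU PTEs are written and the mapping is tracked for later
+ * unmap.
+ */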
+u64 gk20a_vm_map(struct vm_gk20a *vm,
+ struct dma_buf *dmabuf,
+ u64 offset_align,
+ u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
+ int kind,
+ struct sg_table **sgt,
+ bool user_mapped,
+ int rw_flag)
+{
+ struct gk20a *g = gk20a_from_vm(vm);
+ struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
+ struct device *d = dev_from_vm(vm);
+ struct mapped_buffer_node *mapped_buffer = NULL;
+ bool inserted = false, va_allocated = false;
+ u32 gmmu_page_size = 0;
+ u64 map_offset = 0;
+ int err = 0;
+ struct buffer_attrs bfr = {0};
+ struct gk20a_comptags comptags;
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ /* check if this buffer is already mapped */
+ map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
+ flags, kind, sgt,
+ user_mapped, rw_flag);
+ if (map_offset) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ return map_offset;
+ }
+
+ /* pin buffer to get phys/iovmm addr */
+ bfr.sgt = gk20a_mm_pin(d, dmabuf);
+ if (IS_ERR(bfr.sgt)) {
+ /* Falling back to physical is actually possible
+ * here in many cases if we use 4K phys pages in the
+ * gmmu. However we have some regions which require
+ * contig regions to work properly (either phys-contig
+ * or contig through smmu io_vaspace). Until we can
+ * track the difference between those two cases we have
+ * to fail the mapping when we run out of SMMU space.
+ */
+ gk20a_warn(d, "oom allocating tracking buffer");
+ goto clean_up;
+ }
+
+ if (sgt)
+ *sgt = bfr.sgt;
+
+ bfr.kind_v = kind;
+ bfr.size = dmabuf->size;
+ bfr.align = 1 << __ffs((u64)sg_dma_address(bfr.sgt->sgl));
+ bfr.pgsz_idx = -1;
+
+ /* If FIXED_OFFSET is set, the page size is determined by whether the
+ * offset falls in the upper VA range. Otherwise, select the page size
+ * according to the buffer's memory alignment. */
+ if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
+ bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
+ gmmu_page_size_big : gmmu_page_size_small;
+ } else {
+ gmmu_select_page_size(&bfr);
+ }
+
+ /* validate/adjust bfr attributes */
+ if (unlikely(bfr.pgsz_idx == -1)) {
+ gk20a_err(d, "unsupported page size detected");
+ goto clean_up;
+ }
+
+ if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
+ bfr.pgsz_idx > gmmu_page_size_big)) {
+ BUG_ON(1);
+ err = -EINVAL;
+ goto clean_up;
+ }
+ gmmu_page_size = gmmu_page_sizes[bfr.pgsz_idx];
+
+ /* Check if we should use a fixed offset for mapping this buffer */
+ if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
+ err = validate_fixed_buffer(vm, &bfr, offset_align);
+ if (err)
+ goto clean_up;
+
+ map_offset = offset_align;
+ va_allocated = false;
+ } else
+ va_allocated = true;
+
+ if (sgt)
+ *sgt = bfr.sgt;
+
+ err = setup_buffer_kind_and_compression(d, flags, &bfr, bfr.pgsz_idx);
+ if (unlikely(err)) {
+ gk20a_err(d, "failure setting up kind and compression");
+ goto clean_up;
+ }
+
+ /* bar1 and pmu vm don't need ctag */
+ if (!vm->enable_ctag)
+ bfr.ctag_lines = 0;
+
+ gk20a_get_comptags(d, dmabuf, &comptags);
+
+ if (bfr.ctag_lines && !comptags.lines) {
+ /* allocate compression resources if needed */
+ err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
+ bfr.ctag_lines);
+ if (err) {
+ /* ok to fall back here if we ran out */
+ /* TBD: we can partially alloc ctags as well... */
+ bfr.ctag_lines = bfr.ctag_offset = 0;
+ bfr.kind_v = bfr.uc_kind_v;
+ } else {
+ gk20a_get_comptags(d, dmabuf, &comptags);
+
+ /* init/clear the ctag buffer */
+ g->ops.ltc.clear_comptags(g,
+ comptags.offset,
+ comptags.offset + comptags.lines - 1);
+ }
+ }
+
+ /* store the comptag info */
+ bfr.ctag_offset = comptags.offset;
+
+ /* update gmmu ptes */
+ map_offset = __locked_gmmu_map(vm, map_offset,
+ bfr.sgt,
+ bfr.size,
+ bfr.pgsz_idx,
+ bfr.kind_v,
+ bfr.ctag_offset,
+ flags, rw_flag);
+ if (!map_offset)
+ goto clean_up;
+
+ gk20a_dbg(gpu_dbg_map,
+ "as=%d pgsz=%d "
+ "kind=0x%x kind_uc=0x%x flags=0x%x "
+ "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
+ vm_aspace_id(vm), gmmu_page_size,
+ bfr.kind_v, bfr.uc_kind_v, flags,
+ bfr.ctag_lines, bfr.ctag_offset,
+ hi32(map_offset), lo32(map_offset),
+ hi32((u64)sg_dma_address(bfr.sgt->sgl)),
+ lo32((u64)sg_dma_address(bfr.sgt->sgl)),
+ hi32((u64)sg_phys(bfr.sgt->sgl)),
+ lo32((u64)sg_phys(bfr.sgt->sgl)));
+
+#if defined(NVHOST_DEBUG)
+ {
+ int i;
+ struct scatterlist *sg = NULL;
+ gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
+ for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i) {
+ u64 da = sg_dma_address(sg);
+ u64 pa = sg_phys(sg);
+ u64 len = sg->length;
+ gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
+ i, hi32(pa), lo32(pa), hi32(da), lo32(da),
+ hi32(len), lo32(len));
+ }
+ }
+#endif
+
+ /* keep track of the buffer for unmapping */
+ /* TBD: check for multiple mapping of same buffer */
+ mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
+ if (!mapped_buffer) {
+ gk20a_warn(d, "oom allocating tracking buffer");
+ goto clean_up;
+ }
+ mapped_buffer->dmabuf = dmabuf;
+ mapped_buffer->sgt = bfr.sgt;
+ mapped_buffer->addr = map_offset;
+ mapped_buffer->size = bfr.size;
+ mapped_buffer->pgsz_idx = bfr.pgsz_idx;
+ mapped_buffer->ctag_offset = bfr.ctag_offset;
+ mapped_buffer->ctag_lines = bfr.ctag_lines;
+ mapped_buffer->vm = vm;
+ mapped_buffer->flags = flags;
+ mapped_buffer->kind = kind;
+ mapped_buffer->va_allocated = va_allocated;
+ mapped_buffer->user_mapped = user_mapped ? 1 : 0;
+ mapped_buffer->own_mem_ref = user_mapped;
+ INIT_LIST_HEAD(&mapped_buffer->unmap_list);
+ INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
+ kref_init(&mapped_buffer->ref);
+
+ err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
+ if (err) {
+ gk20a_err(d, "failed to insert into mapped buffer tree");
+ goto clean_up;
+ }
+ inserted = true;
+ if (user_mapped)
+ vm->num_user_mapped_buffers++;
+
+ gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
+
+ if (!va_allocated) {
+ struct vm_reserved_va_node *va_node;
+
+ /* find the space reservation */
+ va_node = addr_to_reservation(vm, map_offset);
+ list_add_tail(&mapped_buffer->va_buffers_list,
+ &va_node->va_buffers_list);
+ mapped_buffer->va_node = va_node;
+ }
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ /* Invalidate kernel mappings immediately */
+ if (vm_aspace_id(vm) == -1)
+ gk20a_mm_tlb_invalidate(vm);
+
+ return map_offset;
+
+clean_up:
+ if (inserted) {
+ rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
+ if (user_mapped)
+ vm->num_user_mapped_buffers--;
+ }
+ kfree(mapped_buffer);
+ if (va_allocated)
+ gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
+ if (!IS_ERR(bfr.sgt))
+ gk20a_mm_unpin(d, dmabuf, bfr.sgt);
+
+ mutex_unlock(&vm->update_gmmu_lock);
+ gk20a_dbg_info("err=%d\n", err);
+ return 0;
+}
+
+u64 gk20a_gmmu_map(struct vm_gk20a *vm,
+ struct sg_table **sgt,
+ u64 size,
+ u32 flags,
+ int rw_flag)
+{
+ u64 vaddr;
+
+ mutex_lock(&vm->update_gmmu_lock);
+ vaddr = __locked_gmmu_map(vm, 0, /* already mapped? - No */
+ *sgt, /* sg table */
+ size,
+ 0, /* page size index = 0 i.e. SZ_4K */
+ 0, /* kind */
+ 0, /* ctag_offset */
+ flags, rw_flag);
+ mutex_unlock(&vm->update_gmmu_lock);
+ if (!vaddr) {
+ gk20a_err(dev_from_vm(vm), "failed to allocate va space");
+ return 0;
+ }
+
+ /* Invalidate kernel mappings immediately */
+ gk20a_mm_tlb_invalidate(vm);
+
+ return vaddr;
+}
+
+void gk20a_gmmu_unmap(struct vm_gk20a *vm,
+ u64 vaddr,
+ u64 size,
+ int rw_flag)
+{
+ mutex_lock(&vm->update_gmmu_lock);
+ __locked_gmmu_unmap(vm,
+ vaddr,
+ size,
+ 0, /* page size 4K */
+ true, /*va_allocated */
+ rw_flag);
+ mutex_unlock(&vm->update_gmmu_lock);
+}
+
+phys_addr_t gk20a_get_phys_from_iova(struct device *d,
+ u64 dma_addr)
+{
+ phys_addr_t phys;
+ u64 iova;
+
+ struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
+ if (!mapping)
+ return dma_addr;
+
+ iova = dma_addr & PAGE_MASK;
+ phys = iommu_iova_to_phys(mapping->domain, iova);
+ return phys;
+}
+
+/* get sg_table from already allocated buffer */
+int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
+ void *cpuva, u64 iova,
+ size_t size)
+{
+ int err = 0;
+ *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
+ if (!(*sgt)) {
+ dev_err(d, "failed to allocate memory\n");
+ err = -ENOMEM;
+ goto fail;
+ }
+ err = dma_get_sgtable(d, *sgt,
+ cpuva, iova,
+ size);
+ if (err) {
+ dev_err(d, "failed to create sg table\n");
+ goto fail;
+ }
+ sg_dma_address((*sgt)->sgl) = iova;
+
+ return 0;
+ fail:
+ if (*sgt) {
+ kfree(*sgt);
+ *sgt = NULL;
+ }
+ return err;
+}
+
+int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
+ struct page **pages, u64 iova,
+ size_t size)
+{
+ int err = 0;
+ *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
+ if (!(*sgt)) {
+ dev_err(d, "failed to allocate memory\n");
+ err = -ENOMEM;
+ goto fail;
+ }
+ err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
+ if (err) {
+ dev_err(d, "failed to allocate sg_table\n");
+ goto fail;
+ }
+ sg_set_page((*sgt)->sgl, *pages, size, 0);
+ sg_dma_address((*sgt)->sgl) = iova;
+
+ return 0;
+ fail:
+ if (*sgt) {
+ kfree(*sgt);
+ *sgt = NULL;
+ }
+ return err;
+}
+
+void gk20a_free_sgtable(struct sg_table **sgt)
+{
+ sg_free_table(*sgt);
+ kfree(*sgt);
+ *sgt = NULL;
+}
+
+u64 gk20a_mm_iova_addr(struct scatterlist *sgl)
+{
+ u64 result = sg_phys(sgl);
+#ifdef CONFIG_TEGRA_IOMMU_SMMU
+ if (sg_dma_address(sgl) == DMA_ERROR_CODE)
+ result = 0;
+ else if (sg_dma_address(sgl)) {
+ result = sg_dma_address(sgl) |
+ 1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT;
+ }
+#endif
+ return result;
+}
+
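+/*
+ * Walk the PTEs covering [first_vaddr, last_vaddr] and either fill them
+ * from 'sgt' (map) or clear them (unmap, sgt == NULL). Page tables whose
+ * reference count drops to zero are freed and their PDE rewritten.
+ */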
+static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
+ enum gmmu_pgsz_gk20a pgsz_idx,
+ struct sg_table *sgt,
+ u64 first_vaddr, u64 last_vaddr,
+ u8 kind_v, u32 ctag_offset,
+ bool cacheable,
+ int rw_flag)
+{
+ int err;
+ u32 pde_lo, pde_hi, pde_i;
+ struct scatterlist *cur_chunk;
+ unsigned int cur_offset;
+ u32 pte_w[2] = {0, 0}; /* invalid pte */
+ u32 ctag = ctag_offset;
+ u32 ctag_incr;
+ u32 page_size = gmmu_page_sizes[pgsz_idx];
+ u64 addr = 0;
+
+ pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
+ &pde_lo, &pde_hi);
+
+ gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
+ pgsz_idx, pde_lo, pde_hi);
+
+ /* If ctag_offset !=0 add 1 else add 0. The idea is to avoid a branch
+ * below (per-pte). Note: this doesn't work unless page size (when
+ * comptags are active) is 128KB. We have checks elsewhere for that. */
+ ctag_incr = !!ctag_offset;
+
+ if (sgt)
+ cur_chunk = sgt->sgl;
+ else
+ cur_chunk = NULL;
+
+ cur_offset = 0;
+
+ for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
+ u32 pte_lo, pte_hi;
+ u32 pte_cur;
+ void *pte_kv_cur;
+
+ struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;
+
+ if (pde_i == pde_lo)
+ pte_lo = pte_index_from_vaddr(vm, first_vaddr,
+ pgsz_idx);
+ else
+ pte_lo = 0;
+
+ if ((pde_i != pde_hi) && (pde_hi != pde_lo))
+ pte_hi = vm->mm->page_table_sizing[pgsz_idx].num_ptes-1;
+ else
+ pte_hi = pte_index_from_vaddr(vm, last_vaddr,
+ pgsz_idx);
+
+ /* get cpu access to the ptes */
+ err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur,
+ pte->size);
+ if (err) {
+ gk20a_err(dev_from_vm(vm),
+ "couldn't map ptes for update as=%d pte_ref_cnt=%d",
+ vm_aspace_id(vm), pte->ref_cnt);
+ goto clean_up;
+ }
+
+ gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
+ for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
+
+ if (likely(sgt)) {
+ u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
+ if (new_addr) {
+ addr = new_addr;
+ addr += cur_offset;
+ }
+
+ pte_w[0] = gmmu_pte_valid_true_f() |
+ gmmu_pte_address_sys_f(addr
+ >> gmmu_pte_address_shift_v());
+ pte_w[1] = gmmu_pte_aperture_video_memory_f() |
+ gmmu_pte_kind_f(kind_v) |
+ gmmu_pte_comptagline_f(ctag);
+
+ if (rw_flag == gk20a_mem_flag_read_only) {
+ pte_w[0] |= gmmu_pte_read_only_true_f();
+ pte_w[1] |=
+ gmmu_pte_write_disable_true_f();
+ } else if (rw_flag ==
+ gk20a_mem_flag_write_only) {
+ pte_w[1] |=
+ gmmu_pte_read_disable_true_f();
+ }
+
+ if (!cacheable)
+ pte_w[1] |= gmmu_pte_vol_true_f();
+
+ pte->ref_cnt++;
+
+ gk20a_dbg(gpu_dbg_pte,
+ "pte_cur=%d addr=0x%x,%08x kind=%d"
+ " ctag=%d vol=%d refs=%d"
+ " [0x%08x,0x%08x]",
+ pte_cur, hi32(addr), lo32(addr),
+ kind_v, ctag, !cacheable,
+ pte->ref_cnt, pte_w[1], pte_w[0]);
+
+ ctag += ctag_incr;
+ cur_offset += page_size;
+ addr += page_size;
+ while (cur_chunk &&
+ cur_offset >= cur_chunk->length) {
+ cur_offset -= cur_chunk->length;
+ cur_chunk = sg_next(cur_chunk);
+ }
+
+ } else {
+ pte->ref_cnt--;
+ gk20a_dbg(gpu_dbg_pte,
+ "pte_cur=%d ref=%d [0x0,0x0]",
+ pte_cur, pte->ref_cnt);
+ }
+
+ gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
+ gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
+ }
+
+ unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);
+
+ if (pte->ref_cnt == 0) {
+ /* It can make sense to keep around one page table for
+ * each flavor (empty)... in case a new map is coming
+ * right back to alloc (and fill it in) again.
+ * But: deferring unmapping should help with pathologic
+ * unmap/map/unmap/map cases where we'd trigger pte
+ * free/alloc/free/alloc.
+ */
+ free_gmmu_pages(vm, pte->ref, pte->sgt,
+ vm->mm->page_table_sizing[pgsz_idx].order,
+ pte->size);
+ pte->ref = NULL;
+
+ /* rewrite pde */
+ update_gmmu_pde_locked(vm, pde_i);
+ }
+
+ }
+
+ smp_mb();
+ vm->tlb_dirty = true;
+ gk20a_dbg_fn("set tlb dirty");
+
+ return 0;
+
+clean_up:
+ /*TBD: potentially rewrite above to pre-map everything it needs to
+ * as that's the only way it can fail */
+ return err;
+
+}
+
+
+/* for gk20a the "video memory" apertures here are misnomers. */
+static inline u32 big_valid_pde0_bits(u64 pte_addr)
+{
+ u32 pde0_bits =
+ gmmu_pde_aperture_big_video_memory_f() |
+ gmmu_pde_address_big_sys_f(
+ (u32)(pte_addr >> gmmu_pde_address_shift_v()));
+ return pde0_bits;
+}
+static inline u32 small_valid_pde1_bits(u64 pte_addr)
+{
+ u32 pde1_bits =
+ gmmu_pde_aperture_small_video_memory_f() |
+ gmmu_pde_vol_small_true_f() | /* tbd: why? */
+ gmmu_pde_address_small_sys_f(
+ (u32)(pte_addr >> gmmu_pde_address_shift_v()));
+ return pde1_bits;
+}
+
+/* Given the current state of the ptes associated with a pde,
+ determine value and write it out. There's no checking
+ here to determine whether or not a change was actually
+ made. So, superfluous updates will cause unnecessary
+ pde invalidations.
+*/
+static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
+{
+ bool small_valid, big_valid;
+ u64 pte_addr[2] = {0, 0};
+ struct page_table_gk20a *small_pte =
+ vm->pdes.ptes[gmmu_page_size_small] + i;
+ struct page_table_gk20a *big_pte =
+ vm->pdes.ptes[gmmu_page_size_big] + i;
+ u32 pde_v[2] = {0, 0};
+ u32 *pde;
+
+ small_valid = small_pte && small_pte->ref;
+ big_valid = big_pte && big_pte->ref;
+
+ if (small_valid)
+ pte_addr[gmmu_page_size_small] =
+ gk20a_mm_iova_addr(small_pte->sgt->sgl);
+ if (big_valid)
+ pte_addr[gmmu_page_size_big] =
+ gk20a_mm_iova_addr(big_pte->sgt->sgl);
+
+ pde_v[0] = gmmu_pde_size_full_f();
+ pde_v[0] |= big_valid ?
+ big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
+ :
+ (gmmu_pde_aperture_big_invalid_f());
+
+ pde_v[1] |= (small_valid ?
+ small_valid_pde1_bits(pte_addr[gmmu_page_size_small])
+ :
+ (gmmu_pde_aperture_small_invalid_f() |
+ gmmu_pde_vol_small_false_f())
+ )
+ |
+ (big_valid ? (gmmu_pde_vol_big_true_f()) :
+ gmmu_pde_vol_big_false_f());
+
+ pde = pde_from_index(vm, i);
+
+ gk20a_mem_wr32(pde, 0, pde_v[0]);
+ gk20a_mem_wr32(pde, 1, pde_v[1]);
+
+ smp_mb();
+
+ FLUSH_CPU_DCACHE(pde,
+ sg_phys(vm->pdes.sgt->sgl) + (i*gmmu_pde__size_v()),
+ sizeof(u32)*2);
+
+ gk20a_mm_l2_invalidate(vm->mm->g);
+
+ gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
+
+ vm->tlb_dirty = true;
+}
+
+
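+/*
+ * Back a (sparse) range with repeated mappings of a single shared zero
+ * page, allocating that page on first use.
+ */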
+static int gk20a_vm_put_empty(struct vm_gk20a *vm, u64 vaddr,
+ u32 num_pages, u32 pgsz_idx)
+{
+ struct mm_gk20a *mm = vm->mm;
+ struct gk20a *g = mm->g;
+ u32 pgsz = gmmu_page_sizes[pgsz_idx];
+ u32 i;
+ dma_addr_t iova;
+
+ /* allocate the zero page if the va does not already have one */
+ if (!vm->zero_page_cpuva) {
+ int err = 0;
+ vm->zero_page_cpuva = dma_alloc_coherent(&g->dev->dev,
+ mm->big_page_size,
+ &iova,
+ GFP_KERNEL);
+ if (!vm->zero_page_cpuva) {
+ dev_err(&g->dev->dev, "failed to allocate zero page\n");
+ return -ENOMEM;
+ }
+
+ vm->zero_page_iova = iova;
+ err = gk20a_get_sgtable(&g->dev->dev, &vm->zero_page_sgt,
+ vm->zero_page_cpuva, vm->zero_page_iova,
+ mm->big_page_size);
+ if (err) {
+ dma_free_coherent(&g->dev->dev, mm->big_page_size,
+ vm->zero_page_cpuva,
+ vm->zero_page_iova);
+ vm->zero_page_iova = 0;
+ vm->zero_page_cpuva = NULL;
+
+ dev_err(&g->dev->dev, "failed to create sg table for zero page\n");
+ return -ENOMEM;
+ }
+ }
+
+ for (i = 0; i < num_pages; i++) {
+ u64 page_vaddr = __locked_gmmu_map(vm, vaddr,
+ vm->zero_page_sgt, pgsz, pgsz_idx, 0, 0,
+ NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET,
+ gk20a_mem_flag_none);
+
+ if (!page_vaddr) {
+ gk20a_err(dev_from_vm(vm), "failed to remap clean buffers!");
+ goto err_unmap;
+ }
+ vaddr += pgsz;
+ }
+
+ gk20a_mm_l2_flush(mm->g, true);
+
+ return 0;
+
+err_unmap:
+
+ WARN_ON(1);
+ /* something went wrong. unmap pages */
+ while (i--) {
+ vaddr -= pgsz;
+ __locked_gmmu_unmap(vm, vaddr, pgsz, pgsz_idx, 0,
+ gk20a_mem_flag_none);
+ }
+
+ return -EINVAL;
+}
+
+/* NOTE! mapped_buffers lock must be held */
+static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
+{
+ struct vm_gk20a *vm = mapped_buffer->vm;
+
+ if (mapped_buffer->va_node &&
+ mapped_buffer->va_node->sparse) {
+ u64 vaddr = mapped_buffer->addr;
+ u32 pgsz_idx = mapped_buffer->pgsz_idx;
+ u32 num_pages = mapped_buffer->size >>
+ gmmu_page_shifts[pgsz_idx];
+
+ /* there is little we can do if this fails... */
+ gk20a_vm_put_empty(vm, vaddr, num_pages, pgsz_idx);
+
+ } else
+ __locked_gmmu_unmap(vm,
+ mapped_buffer->addr,
+ mapped_buffer->size,
+ mapped_buffer->pgsz_idx,
+ mapped_buffer->va_allocated,
+ gk20a_mem_flag_none);
+
+ gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
+ vm_aspace_id(vm), gmmu_page_sizes[mapped_buffer->pgsz_idx],
+ hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
+ mapped_buffer->own_mem_ref);
+
+ gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
+ mapped_buffer->sgt);
+
+ /* remove from mapped buffer tree and remove list, free */
+ rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
+ if (!list_empty(&mapped_buffer->va_buffers_list))
+ list_del(&mapped_buffer->va_buffers_list);
+
+ /* keep track of mapped buffers */
+ if (mapped_buffer->user_mapped)
+ vm->num_user_mapped_buffers--;
+
+ if (mapped_buffer->own_mem_ref)
+ dma_buf_put(mapped_buffer->dmabuf);
+
+ kfree(mapped_buffer);
+}
+
+void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
+{
+ struct device *d = dev_from_vm(vm);
+ struct mapped_buffer_node *mapped_buffer;
+
+ mutex_lock(&vm->update_gmmu_lock);
+ mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
+ if (!mapped_buffer) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
+ return;
+ }
+ kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
+ mutex_unlock(&vm->update_gmmu_lock);
+}
+
+static void gk20a_vm_remove_support(struct vm_gk20a *vm)
+{
+ struct gk20a *g = vm->mm->g;
+ struct mapped_buffer_node *mapped_buffer;
+ struct vm_reserved_va_node *va_node, *va_node_tmp;
+ struct rb_node *node;
+
+ gk20a_dbg_fn("");
+ mutex_lock(&vm->update_gmmu_lock);
+
+ /* TBD: add a flag here for the unmap code to recognize teardown
+ * and short-circuit any otherwise expensive operations. */
+
+ node = rb_first(&vm->mapped_buffers);
+ while (node) {
+ mapped_buffer =
+ container_of(node, struct mapped_buffer_node, node);
+ gk20a_vm_unmap_locked(mapped_buffer);
+ node = rb_first(&vm->mapped_buffers);
+ }
+
+ /* destroy remaining reserved memory areas */
+ list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
+ reserved_va_list) {
+ list_del(&va_node->reserved_va_list);
+ kfree(va_node);
+ }
+
+ /* TBD: unmapping all buffers above may not actually free
+ * all vm ptes. jettison them here for certain... */
+
+ unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
+ free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0, vm->pdes.size);
+
+ kfree(vm->pdes.ptes[gmmu_page_size_small]);
+ kfree(vm->pdes.ptes[gmmu_page_size_big]);
+ gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
+ gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ /* release zero page if used */
+ if (vm->zero_page_cpuva)
+ dma_free_coherent(&g->dev->dev, vm->mm->big_page_size,
+ vm->zero_page_cpuva, vm->zero_page_iova);
+
+ /* vm is not used anymore. release it. */
+ kfree(vm);
+}
+
+static void gk20a_vm_remove_support_kref(struct kref *ref)
+{
+ struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
+ gk20a_vm_remove_support(vm);
+}
+
+void gk20a_vm_get(struct vm_gk20a *vm)
+{
+ kref_get(&vm->ref);
+}
+
+void gk20a_vm_put(struct vm_gk20a *vm)
+{
+ kref_put(&vm->ref, gk20a_vm_remove_support_kref);
+}
+
+/* address space interfaces for the gk20a module */
+int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
+{
+ struct gk20a_as *as = as_share->as;
+ struct gk20a *g = gk20a_from_as(as);
+ struct mm_gk20a *mm = &g->mm;
+ struct vm_gk20a *vm;
+ u64 vma_size;
+ u32 num_pages, low_hole_pages;
+ char name[32];
+ int err;
+
+ gk20a_dbg_fn("");
+
+ vm = kzalloc(sizeof(*vm), GFP_KERNEL);
+ if (!vm)
+ return -ENOMEM;
+
+ as_share->vm = vm;
+
+ vm->mm = mm;
+ vm->as_share = as_share;
+
+ vm->big_pages = true;
+
+ vm->va_start = mm->pde_stride; /* create a one pde hole */
+ vm->va_limit = mm->channel.size; /* note this means channel.size is
+ really just the max */
+ {
+ u32 pde_lo, pde_hi;
+ pde_range_from_vaddr_range(vm,
+ 0, vm->va_limit-1,
+ &pde_lo, &pde_hi);
+ vm->pdes.num_pdes = pde_hi + 1;
+ }
+
+ vm->pdes.ptes[gmmu_page_size_small] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ vm->pdes.ptes[gmmu_page_size_big] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ if (!(vm->pdes.ptes[gmmu_page_size_small] &&
+ vm->pdes.ptes[gmmu_page_size_big]))
+ return -ENOMEM;
+
+ gk20a_dbg_info("init space for va_limit=0x%llx num_pdes=%d",
+ vm->va_limit, vm->pdes.num_pdes);
+
+ /* allocate the page table directory */
+ err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
+ &vm->pdes.sgt, &vm->pdes.size);
+ if (err)
+ return -ENOMEM;
+
+ err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
+ vm->pdes.size);
+ if (err) {
+ free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
+ vm->pdes.size);
+ return -ENOMEM;
+ }
+ gk20a_dbg(gpu_dbg_pte, "pdes.kv = 0x%p, pdes.phys = 0x%llx",
+ vm->pdes.kv,
+ gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
+ /* we could release vm->pdes.kv but it's only one page... */
+
+
+ /* low-half: alloc small pages */
+ /* high-half: alloc big pages */
+ vma_size = mm->channel.size >> 1;
+
+ snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
+ gmmu_page_sizes[gmmu_page_size_small]>>10);
+ num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]);
+
+ /* num_pages above is without regard to the low-side hole. */
+ low_hole_pages = (vm->va_start >>
+ gmmu_page_shifts[gmmu_page_size_small]);
+
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name,
+ low_hole_pages, /* start */
+ num_pages - low_hole_pages, /* length */
+ 1); /* align */
+
+ snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
+ gmmu_page_sizes[gmmu_page_size_big]>>10);
+
+ num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]);
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name,
+ num_pages, /* start */
+ num_pages, /* length */
+ 1); /* align */
+
+ vm->mapped_buffers = RB_ROOT;
+
+ mutex_init(&vm->update_gmmu_lock);
+ kref_init(&vm->ref);
+ INIT_LIST_HEAD(&vm->reserved_va_list);
+
+ vm->enable_ctag = true;
+
+ return 0;
+}
+
+
+int gk20a_vm_release_share(struct gk20a_as_share *as_share)
+{
+ struct vm_gk20a *vm = as_share->vm;
+
+ gk20a_dbg_fn("");
+
+ vm->as_share = NULL;
+
+ /* put as reference to vm */
+ gk20a_vm_put(vm);
+
+ as_share->vm = NULL;
+
+ return 0;
+}
+
+
+int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
+ struct nvhost_as_alloc_space_args *args)
+
+{
+ int err = -ENOMEM;
+ int pgsz_idx;
+ u32 start_page_nr;
+ struct gk20a_allocator *vma;
+ struct vm_gk20a *vm = as_share->vm;
+ struct vm_reserved_va_node *va_node;
+ u64 vaddr_start = 0;
+
+ gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
+ args->flags, args->page_size, args->pages,
+ args->o_a.offset);
+
+ /* determine pagesz idx */
+ for (pgsz_idx = gmmu_page_size_small;
+ pgsz_idx < gmmu_nr_page_sizes;
+ pgsz_idx++) {
+ if (gmmu_page_sizes[pgsz_idx] == args->page_size)
+ break;
+ }
+
+ if (pgsz_idx >= gmmu_nr_page_sizes) {
+ err = -EINVAL;
+ goto clean_up;
+ }
+
+ va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
+ if (!va_node) {
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE &&
+ pgsz_idx != gmmu_page_size_big) {
+ err = -ENOSYS;
+ kfree(va_node);
+ goto clean_up;
+ }
+
+ start_page_nr = 0;
+ if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
+ start_page_nr = (u32)(args->o_a.offset >>
+ gmmu_page_shifts[pgsz_idx]);
+
+ vma = &vm->vma[pgsz_idx];
+ err = vma->alloc(vma, &start_page_nr, args->pages);
+ if (err) {
+ kfree(va_node);
+ goto clean_up;
+ }
+
+ vaddr_start = (u64)start_page_nr << gmmu_page_shifts[pgsz_idx];
+
+ va_node->vaddr_start = vaddr_start;
+ va_node->size = (u64)args->page_size * (u64)args->pages;
+ va_node->pgsz_idx = pgsz_idx;
+ INIT_LIST_HEAD(&va_node->va_buffers_list);
+ INIT_LIST_HEAD(&va_node->reserved_va_list);
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ /* mark that we need to use sparse mappings here */
+ if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE) {
+ err = gk20a_vm_put_empty(vm, vaddr_start, args->pages,
+ pgsz_idx);
+ if (err) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ vma->free(vma, start_page_nr, args->pages);
+ kfree(va_node);
+ goto clean_up;
+ }
+
+ va_node->sparse = true;
+ }
+
+ list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ args->o_a.offset = vaddr_start;
+
+clean_up:
+ return err;
+}
+
+int gk20a_vm_free_space(struct gk20a_as_share *as_share,
+ struct nvhost_as_free_space_args *args)
+{
+ int err = -ENOMEM;
+ int pgsz_idx;
+ u32 start_page_nr;
+ struct gk20a_allocator *vma;
+ struct vm_gk20a *vm = as_share->vm;
+ struct vm_reserved_va_node *va_node;
+
+ gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
+ args->pages, args->offset);
+
+ /* determine pagesz idx */
+ for (pgsz_idx = gmmu_page_size_small;
+ pgsz_idx < gmmu_nr_page_sizes;
+ pgsz_idx++) {
+ if (gmmu_page_sizes[pgsz_idx] == args->page_size)
+ break;
+ }
+
+ if (pgsz_idx >= gmmu_nr_page_sizes) {
+ err = -EINVAL;
+ goto clean_up;
+ }
+
+ start_page_nr = (u32)(args->offset >>
+ gmmu_page_shifts[pgsz_idx]);
+
+ vma = &vm->vma[pgsz_idx];
+ err = vma->free(vma, start_page_nr, args->pages);
+
+ if (err)
+ goto clean_up;
+
+ mutex_lock(&vm->update_gmmu_lock);
+ va_node = addr_to_reservation(vm, args->offset);
+ if (va_node) {
+ struct mapped_buffer_node *buffer;
+
+ /* there is no need to unallocate the buffers in va. Just
+ * convert them into normal buffers */
+
+ list_for_each_entry(buffer,
+ &va_node->va_buffers_list, va_buffers_list)
+ list_del_init(&buffer->va_buffers_list);
+
+ list_del(&va_node->reserved_va_list);
+
+ /* if this was a sparse mapping, free the va */
+ if (va_node->sparse)
+ __locked_gmmu_unmap(vm,
+ va_node->vaddr_start,
+ va_node->size,
+ va_node->pgsz_idx,
+ false,
+ gk20a_mem_flag_none);
+ kfree(va_node);
+ }
+ mutex_unlock(&vm->update_gmmu_lock);
+
+clean_up:
+ return err;
+}
+
+int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
+ struct channel_gk20a *ch)
+{
+ int err = 0;
+ struct vm_gk20a *vm = as_share->vm;
+
+ gk20a_dbg_fn("");
+
+ ch->vm = vm;
+ err = channel_gk20a_commit_va(ch);
+ if (err)
+ ch->vm = NULL;
+
+ return err;
+}
+
+int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
+{
+ struct gk20a_dmabuf_priv *priv;
+ static DEFINE_MUTEX(priv_lock);
+
+ priv = dma_buf_get_drvdata(dmabuf, dev);
+ if (likely(priv))
+ return 0;
+
+ mutex_lock(&priv_lock);
+ priv = dma_buf_get_drvdata(dmabuf, dev);
+ if (priv)
+ goto priv_exist_or_err;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ priv = ERR_PTR(-ENOMEM);
+ goto priv_exist_or_err;
+ }
+ mutex_init(&priv->lock);
+ dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
+priv_exist_or_err:
+ mutex_unlock(&priv_lock);
+ if (IS_ERR(priv))
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
+{
+ int kind = 0;
+#ifdef CONFIG_TEGRA_NVMAP
+ int err;
+ u64 nvmap_param;
+
+ err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
+ &nvmap_param);
+ kind = err ? kind : nvmap_param;
+#endif
+ return kind;
+}
+
+int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
+ int dmabuf_fd,
+ u64 *offset_align,
+ u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
+ int kind)
+{
+ int err = 0;
+ struct vm_gk20a *vm = as_share->vm;
+ struct dma_buf *dmabuf;
+ u64 ret_va;
+
+ gk20a_dbg_fn("");
+
+ /* get ref to the mem handle (released on unmap_locked) */
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return PTR_ERR(dmabuf);
+
+ err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
+ if (err) {
+ dma_buf_put(dmabuf);
+ return err;
+ }
+
+ if (kind == -1)
+ kind = gk20a_dmabuf_get_kind(dmabuf);
+
+ ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
+ flags, kind, NULL, true,
+ gk20a_mem_flag_none);
+ *offset_align = ret_va;
+ if (!ret_va) {
+ dma_buf_put(dmabuf);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
+{
+ struct vm_gk20a *vm = as_share->vm;
+
+ gk20a_dbg_fn("");
+
+ gk20a_vm_unmap_user(vm, offset);
+ return 0;
+}
+
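+/*
+ * Set up the BAR1 virtual address space: build its page directory and VA
+ * allocators and fill in the BAR1 instance block with the page directory
+ * base and address limit.
+ */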
+int gk20a_init_bar1_vm(struct mm_gk20a *mm)
+{
+ int err;
+ phys_addr_t inst_pa;
+ void *inst_ptr;
+ struct vm_gk20a *vm = &mm->bar1.vm;
+ struct gk20a *g = gk20a_from_mm(mm);
+ struct device *d = dev_from_gk20a(g);
+ struct inst_desc *inst_block = &mm->bar1.inst_block;
+ u64 pde_addr;
+ u32 pde_addr_lo;
+ u32 pde_addr_hi;
+ dma_addr_t iova;
+
+ vm->mm = mm;
+
+ mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
+
+ gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
+
+ vm->va_start = mm->pde_stride * 1;
+ vm->va_limit = mm->bar1.aperture_size;
+
+ {
+ u32 pde_lo, pde_hi;
+ pde_range_from_vaddr_range(vm,
+ 0, vm->va_limit-1,
+ &pde_lo, &pde_hi);
+ vm->pdes.num_pdes = pde_hi + 1;
+ }
+
+ /* bar1 is likely only to ever use/need small page sizes. */
+ /* But just in case, for now... arrange for both.*/
+ vm->pdes.ptes[gmmu_page_size_small] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ vm->pdes.ptes[gmmu_page_size_big] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ if (!(vm->pdes.ptes[gmmu_page_size_small] &&
+ vm->pdes.ptes[gmmu_page_size_big]))
+ return -ENOMEM;
+
+ gk20a_dbg_info("init space for bar1 va_limit=0x%llx num_pdes=%d",
+ vm->va_limit, vm->pdes.num_pdes);
+
+
+ /* allocate the page table directory */
+ err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
+ &vm->pdes.sgt, &vm->pdes.size);
+ if (err)
+ goto clean_up;
+
+ err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
+ vm->pdes.size);
+ if (err) {
+ free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
+ vm->pdes.size);
+ goto clean_up;
+ }
+ gk20a_dbg(gpu_dbg_pte, "bar 1 pdes.kv = 0x%p, pdes.phys = 0x%llx",
+ vm->pdes.kv, gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
+ /* we could release vm->pdes.kv but it's only one page... */
+
+ pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
+ pde_addr_lo = u64_lo32(pde_addr >> 12);
+ pde_addr_hi = u64_hi32(pde_addr);
+
+ gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
+ (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl),
+ pde_addr_lo, pde_addr_hi);
+
+ /* allocate instance mem for bar1 */
+ inst_block->size = ram_in_alloc_size_v();
+ inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
+ &iova, GFP_KERNEL);
+ if (!inst_block->cpuva) {
+ gk20a_err(d, "%s: memory allocation failed\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ inst_block->iova = iova;
+ inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
+ if (!inst_block->cpu_pa) {
+ gk20a_err(d, "%s: failed to get phys address\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ inst_pa = inst_block->cpu_pa;
+ inst_ptr = inst_block->cpuva;
+
+	gk20a_dbg_info("bar1 inst block physical addr = 0x%llx, kv = 0x%p",
+ (u64)inst_pa, inst_ptr);
+
+	memset(inst_ptr, 0, inst_block->size);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
+ ram_in_page_dir_base_target_vid_mem_f() |
+ ram_in_page_dir_base_vol_true_f() |
+ ram_in_page_dir_base_lo_f(pde_addr_lo));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
+ ram_in_page_dir_base_hi_f(pde_addr_hi));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
+ u64_lo32(vm->va_limit) | 0xFFF);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
+ ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
+
+ gk20a_dbg_info("bar1 inst block ptr: %08llx", (u64)inst_pa);
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_bar1",
+ 1,/*start*/
+ (vm->va_limit >> 12) - 1 /* length*/,
+ 1); /* align */
+ /* initialize just in case we try to use it anyway */
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_bar1-unused",
+ 0x0badc0de, /* start */
+ 1, /* length */
+ 1); /* align */
+
+ vm->mapped_buffers = RB_ROOT;
+
+ mutex_init(&vm->update_gmmu_lock);
+ kref_init(&vm->ref);
+ INIT_LIST_HEAD(&vm->reserved_va_list);
+
+ return 0;
+
+clean_up:
+ /* free, etc */
+ if (inst_block->cpuva)
+ dma_free_coherent(d, inst_block->size,
+ inst_block->cpuva, inst_block->iova);
+ inst_block->cpuva = NULL;
+ inst_block->iova = 0;
+ return err;
+}
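+
+/*
+ * Editorial note (illustrative, not part of the original patch): the page
+ * directory base programmed into the instance block above is split exactly
+ * as the two computed halves suggest. For a hypothetical 4K-aligned
+ * pde_addr of 0x1_2345_6000:
+ *
+ *	pde_addr_lo = u64_lo32(pde_addr >> 12) = 0x00123456
+ *	pde_addr_hi = u64_hi32(pde_addr)       = 0x00000001
+ *
+ * which is what ram_in_page_dir_base_lo_f()/_hi_f() are fed above and in
+ * gk20a_init_pmu_vm() below.
+ */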
+
+/* pmu vm, share channel_vm interfaces */
+int gk20a_init_pmu_vm(struct mm_gk20a *mm)
+{
+ int err;
+ phys_addr_t inst_pa;
+ void *inst_ptr;
+ struct vm_gk20a *vm = &mm->pmu.vm;
+ struct gk20a *g = gk20a_from_mm(mm);
+ struct device *d = dev_from_gk20a(g);
+ struct inst_desc *inst_block = &mm->pmu.inst_block;
+ u64 pde_addr;
+ u32 pde_addr_lo;
+ u32 pde_addr_hi;
+ dma_addr_t iova;
+
+ vm->mm = mm;
+
+ mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
+
+ gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
+
+ vm->va_start = GK20A_PMU_VA_START;
+ vm->va_limit = vm->va_start + mm->pmu.aperture_size;
+
+ {
+ u32 pde_lo, pde_hi;
+ pde_range_from_vaddr_range(vm,
+ 0, vm->va_limit-1,
+ &pde_lo, &pde_hi);
+ vm->pdes.num_pdes = pde_hi + 1;
+ }
+
+ /* The pmu is likely only to ever use/need small page sizes. */
+ /* But just in case, for now... arrange for both.*/
+ vm->pdes.ptes[gmmu_page_size_small] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ vm->pdes.ptes[gmmu_page_size_big] =
+ kzalloc(sizeof(struct page_table_gk20a) *
+ vm->pdes.num_pdes, GFP_KERNEL);
+
+ if (!(vm->pdes.ptes[gmmu_page_size_small] &&
+ vm->pdes.ptes[gmmu_page_size_big]))
+ return -ENOMEM;
+
+ gk20a_dbg_info("init space for pmu va_limit=0x%llx num_pdes=%d",
+ vm->va_limit, vm->pdes.num_pdes);
+
+ /* allocate the page table directory */
+ err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
+ &vm->pdes.sgt, &vm->pdes.size);
+ if (err)
+ goto clean_up;
+
+ err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
+ vm->pdes.size);
+ if (err) {
+ free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
+ vm->pdes.size);
+ goto clean_up;
+ }
+ gk20a_dbg_info("pmu pdes phys @ 0x%llx",
+ (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
+ /* we could release vm->pdes.kv but it's only one page... */
+
+ pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
+ pde_addr_lo = u64_lo32(pde_addr >> 12);
+ pde_addr_hi = u64_hi32(pde_addr);
+
+ gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
+ (u64)pde_addr, pde_addr_lo, pde_addr_hi);
+
+ /* allocate instance mem for pmu */
+ inst_block->size = GK20A_PMU_INST_SIZE;
+ inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
+ &iova, GFP_KERNEL);
+ if (!inst_block->cpuva) {
+ gk20a_err(d, "%s: memory allocation failed\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ inst_block->iova = iova;
+ inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
+ if (!inst_block->cpu_pa) {
+ gk20a_err(d, "%s: failed to get phys address\n", __func__);
+ err = -ENOMEM;
+ goto clean_up;
+ }
+
+ inst_pa = inst_block->cpu_pa;
+ inst_ptr = inst_block->cpuva;
+
+ gk20a_dbg_info("pmu inst block physical addr: 0x%llx", (u64)inst_pa);
+
+ memset(inst_ptr, 0, GK20A_PMU_INST_SIZE);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
+ ram_in_page_dir_base_target_vid_mem_f() |
+ ram_in_page_dir_base_vol_true_f() |
+ ram_in_page_dir_base_lo_f(pde_addr_lo));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
+ ram_in_page_dir_base_hi_f(pde_addr_hi));
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
+ u64_lo32(vm->va_limit) | 0xFFF);
+
+ gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
+ ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
+
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_pmu",
+ (vm->va_start >> 12), /* start */
+ (vm->va_limit - vm->va_start) >> 12, /*length*/
+ 1); /* align */
+ /* initialize just in case we try to use it anyway */
+ gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_pmu-unused",
+ 0x0badc0de, /* start */
+ 1, /* length */
+ 1); /* align */
+
+
+ vm->mapped_buffers = RB_ROOT;
+
+ mutex_init(&vm->update_gmmu_lock);
+ kref_init(&vm->ref);
+ INIT_LIST_HEAD(&vm->reserved_va_list);
+
+ return 0;
+
+clean_up:
+ /* free, etc */
+ if (inst_block->cpuva)
+ dma_free_coherent(d, inst_block->size,
+ inst_block->cpuva, inst_block->iova);
+ inst_block->cpuva = NULL;
+ inst_block->iova = 0;
+ return err;
+}
+
+void gk20a_mm_fb_flush(struct gk20a *g)
+{
+ struct mm_gk20a *mm = &g->mm;
+ u32 data;
+ s32 retry = 100;
+
+ gk20a_dbg_fn("");
+
+ mutex_lock(&mm->l2_op_lock);
+
+ g->ops.ltc.elpg_flush(g);
+
+ /* Make sure all previous writes are committed to the L2. There's no
+ guarantee that writes are to DRAM. This will be a sysmembar internal
+ to the L2. */
+ gk20a_writel(g, flush_fb_flush_r(),
+ flush_fb_flush_pending_busy_f());
+
+ do {
+ data = gk20a_readl(g, flush_fb_flush_r());
+
+ if (flush_fb_flush_outstanding_v(data) ==
+ flush_fb_flush_outstanding_true_v() ||
+ flush_fb_flush_pending_v(data) ==
+ flush_fb_flush_pending_busy_v()) {
+ gk20a_dbg_info("fb_flush 0x%x", data);
+ retry--;
+ usleep_range(20, 40);
+ } else
+ break;
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "fb_flush too many retries");
+
+ mutex_unlock(&mm->l2_op_lock);
+}
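+
+/*
+ * Editorial note (not part of the original patch): the poll loop above and
+ * the L2 helpers below share the same shape: re-read the flush register
+ * while its pending/outstanding fields report busy, sleeping 20-40 us per
+ * iteration. With retry = 100 that bounds gk20a_mm_fb_flush() to roughly
+ * 2-4 ms on silicon (the L2 loops use retry = 200, roughly 4-8 ms); on
+ * non-silicon platforms the !tegra_platform_is_silicon() term keeps
+ * polling until the model reports idle.
+ */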
+
+static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
+{
+ u32 data;
+ s32 retry = 200;
+
+ /* Invalidate any clean lines from the L2 so subsequent reads go to
+ DRAM. Dirty lines are not affected by this operation. */
+ gk20a_writel(g, flush_l2_system_invalidate_r(),
+ flush_l2_system_invalidate_pending_busy_f());
+
+ do {
+ data = gk20a_readl(g, flush_l2_system_invalidate_r());
+
+ if (flush_l2_system_invalidate_outstanding_v(data) ==
+ flush_l2_system_invalidate_outstanding_true_v() ||
+ flush_l2_system_invalidate_pending_v(data) ==
+ flush_l2_system_invalidate_pending_busy_v()) {
+ gk20a_dbg_info("l2_system_invalidate 0x%x",
+ data);
+ retry--;
+ usleep_range(20, 40);
+ } else
+ break;
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "l2_system_invalidate too many retries");
+}
+
+void gk20a_mm_l2_invalidate(struct gk20a *g)
+{
+ struct mm_gk20a *mm = &g->mm;
+ mutex_lock(&mm->l2_op_lock);
+ gk20a_mm_l2_invalidate_locked(g);
+ mutex_unlock(&mm->l2_op_lock);
+}
+
+void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
+{
+ struct mm_gk20a *mm = &g->mm;
+ u32 data;
+ s32 retry = 200;
+
+ gk20a_dbg_fn("");
+
+ mutex_lock(&mm->l2_op_lock);
+
+ /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
+ as clean, so subsequent reads might hit in the L2. */
+ gk20a_writel(g, flush_l2_flush_dirty_r(),
+ flush_l2_flush_dirty_pending_busy_f());
+
+ do {
+ data = gk20a_readl(g, flush_l2_flush_dirty_r());
+
+ if (flush_l2_flush_dirty_outstanding_v(data) ==
+ flush_l2_flush_dirty_outstanding_true_v() ||
+ flush_l2_flush_dirty_pending_v(data) ==
+ flush_l2_flush_dirty_pending_busy_v()) {
+ gk20a_dbg_info("l2_flush_dirty 0x%x", data);
+ retry--;
+ usleep_range(20, 40);
+ } else
+ break;
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "l2_flush_dirty too many retries");
+
+ if (invalidate)
+ gk20a_mm_l2_invalidate_locked(g);
+
+ mutex_unlock(&mm->l2_op_lock);
+}
+
+
+int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
+ struct dma_buf **dmabuf,
+ u64 *offset)
+{
+ struct mapped_buffer_node *mapped_buffer;
+
+ gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
+
+ mutex_lock(&vm->update_gmmu_lock);
+
+ mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
+ gpu_va);
+ if (!mapped_buffer) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ return -EINVAL;
+ }
+
+ *dmabuf = mapped_buffer->dmabuf;
+ *offset = gpu_va - mapped_buffer->addr;
+
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ return 0;
+}
+
+void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
+{
+ struct mm_gk20a *mm = vm->mm;
+ struct gk20a *g = gk20a_from_vm(vm);
+ u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->pdes.sgt->sgl) >> 12);
+ u32 data;
+ s32 retry = 200;
+
+ gk20a_dbg_fn("");
+
+	/* pagetables are considered sw state and are preserved across
+	   prepare_poweroff. When gk20a deinit releases those pagetables,
+	   common code in the vm unmap path calls tlb invalidate, which
+	   touches hw. Use the power_on flag to skip tlb invalidation when
+	   gpu power is turned off. */
+
+ if (!g->power_on)
+ return;
+
+ /* No need to invalidate if tlb is clean */
+ mutex_lock(&vm->update_gmmu_lock);
+ if (!vm->tlb_dirty) {
+ mutex_unlock(&vm->update_gmmu_lock);
+ return;
+ }
+ vm->tlb_dirty = false;
+ mutex_unlock(&vm->update_gmmu_lock);
+
+ mutex_lock(&mm->tlb_lock);
+ do {
+ data = gk20a_readl(g, fb_mmu_ctrl_r());
+ if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
+ break;
+ usleep_range(20, 40);
+ retry--;
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "wait mmu fifo space too many retries");
+
+ gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
+ fb_mmu_invalidate_pdb_addr_f(addr_lo) |
+ fb_mmu_invalidate_pdb_aperture_vid_mem_f());
+
+ /* this is a sledgehammer, it would seem */
+ gk20a_writel(g, fb_mmu_invalidate_r(),
+ fb_mmu_invalidate_all_pdb_true_f() |
+ fb_mmu_invalidate_all_va_true_f() |
+ fb_mmu_invalidate_trigger_true_f());
+
+ do {
+ data = gk20a_readl(g, fb_mmu_ctrl_r());
+ if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
+ fb_mmu_ctrl_pri_fifo_empty_false_f())
+ break;
+ retry--;
+ usleep_range(20, 40);
+ } while (retry >= 0 || !tegra_platform_is_silicon());
+
+ if (retry < 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "mmu invalidate too many retries");
+
+ mutex_unlock(&mm->tlb_lock);
+}
+
+int gk20a_mm_suspend(struct gk20a *g)
+{
+ gk20a_dbg_fn("");
+
+ gk20a_mm_fb_flush(g);
+ gk20a_mm_l2_flush(g, true);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+void gk20a_mm_ltc_isr(struct gk20a *g)
+{
+ u32 intr;
+
+ intr = gk20a_readl(g, ltc_ltc0_ltss_intr_r());
+ gk20a_err(dev_from_gk20a(g), "ltc: %08x\n", intr);
+ gk20a_writel(g, ltc_ltc0_ltss_intr_r(), intr);
+}
+
+bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
+{
+ u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
+ return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
+ fb_mmu_debug_ctrl_debug_enabled_v();
+}
+
+static int gk20a_mm_mmu_vpr_info_fetch_wait(struct gk20a *g,
+ const unsigned int msec)
+{
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(msec);
+ while (1) {
+ u32 val;
+
+ val = gk20a_readl(g, fb_mmu_vpr_info_r());
+ if (fb_mmu_vpr_info_fetch_v(val) ==
+ fb_mmu_vpr_info_fetch_false_v())
+ break;
+
+ if (tegra_platform_is_silicon() &&
+ WARN_ON(time_after(jiffies, timeout)))
+ return -ETIME;
+ }
+
+ return 0;
+}
+
+int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g)
+{
+ int ret = 0;
+
+ gk20a_busy_noresume(g->dev);
+ if (!pm_runtime_active(&g->dev->dev))
+ goto fail;
+
+ if (gk20a_mm_mmu_vpr_info_fetch_wait(g, 5)) {
+ ret = -ETIME;
+ goto fail;
+ }
+
+ gk20a_writel(g, fb_mmu_vpr_info_r(),
+ fb_mmu_vpr_info_fetch_true_v());
+
+ ret = gk20a_mm_mmu_vpr_info_fetch_wait(g, 5);
+
+ fail:
+ gk20a_idle(g->dev);
+ return ret;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/mm_gk20a.h b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
new file mode 100644
index 000000000000..23d15c232763
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/mm_gk20a.h
@@ -0,0 +1,464 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/mm_gk20a.h
+ *
+ * GK20A memory management
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __MM_GK20A_H__
+#define __MM_GK20A_H__
+
+#include <linux/scatterlist.h>
+#include <linux/dma-attrs.h>
+#include <linux/iommu.h>
+#include <asm/dma-iommu.h>
+#include "gk20a_allocator.h"
+
+/* This "address bit" in the gmmu ptes (and other gk20a accesses)
+ * signals the address as presented should be translated by the SMMU.
+ * Without this bit present gk20a accesses are *not* translated.
+ */
+/* Hack, get this from manuals somehow... */
+#define NV_MC_SMMU_VADDR_TRANSLATION_BIT 34
+#define NV_MC_SMMU_VADDR_TRANSLATE(x) ((x) | \
+ (1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT))
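+
+/*
+ * Editorial example (not part of the original patch): with bit 34 as the
+ * translation bit,
+ *
+ *	NV_MC_SMMU_VADDR_TRANSLATE(0x1000ULL) == 0x400001000ULL
+ *
+ * i.e. the IOVA handed to the GPU is tagged so the access is routed
+ * through the SMMU instead of going out untranslated.
+ */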
+
+/* For now keep the size relatively small compared to the full
+ * 40b va: 32GB, consisting of two 16GB spaces. */
+#define NV_GMMU_VA_RANGE 35ULL
+#define NV_GMMU_VA_IS_UPPER(x) ((x) >= ((u64)0x1 << (NV_GMMU_VA_RANGE-1)))
+
+struct mem_desc {
+ struct dma_buf *ref;
+ struct sg_table *sgt;
+ u32 size;
+};
+
+struct mem_desc_sub {
+ u32 offset;
+ u32 size;
+};
+
+struct gpfifo_desc {
+ size_t size;
+ u32 entry_num;
+
+ u32 get;
+ u32 put;
+
+ bool wrap;
+
+ u64 iova;
+ struct gpfifo *cpu_va;
+ u64 gpu_va;
+};
+
+struct mmu_desc {
+ void *cpuva;
+ u64 iova;
+ size_t size;
+};
+
+struct inst_desc {
+ u64 iova;
+ void *cpuva;
+ phys_addr_t cpu_pa;
+ size_t size;
+};
+
+struct surface_mem_desc {
+ u64 iova;
+ void *cpuva;
+ struct sg_table *sgt;
+ size_t size;
+};
+
+struct userd_desc {
+ struct sg_table *sgt;
+ u64 iova;
+ void *cpuva;
+ size_t size;
+ u64 gpu_va;
+};
+
+struct runlist_mem_desc {
+ u64 iova;
+ void *cpuva;
+ size_t size;
+};
+
+struct patch_desc {
+ struct page **pages;
+ u64 iova;
+ size_t size;
+ void *cpu_va;
+ u64 gpu_va;
+ u32 data_count;
+};
+
+struct pmu_mem_desc {
+ void *cpuva;
+ u64 iova;
+ u64 pmu_va;
+ size_t size;
+};
+
+struct priv_cmd_queue_mem_desc {
+ u64 base_iova;
+ u32 *base_cpuva;
+ size_t size;
+};
+
+struct zcull_ctx_desc {
+ struct mem_desc mem;
+ u64 gpu_va;
+ u32 ctx_attr;
+ u32 ctx_sw_mode;
+};
+
+struct pm_ctx_desc {
+ struct mem_desc mem;
+ u64 gpu_va;
+ u32 ctx_attr;
+ u32 ctx_sw_mode;
+};
+
+struct gr_ctx_buffer_desc;
+struct platform_device;
+struct gr_ctx_buffer_desc {
+ void (*destroy)(struct platform_device *, struct gr_ctx_buffer_desc *);
+ struct sg_table *sgt;
+ struct page **pages;
+ size_t size;
+ u64 iova;
+ struct dma_attrs attrs;
+ void *priv;
+};
+
+struct gr_ctx_desc {
+ struct page **pages;
+ u64 iova;
+ size_t size;
+ u64 gpu_va;
+};
+
+struct compbit_store_desc {
+	struct page **pages;
+ size_t size;
+ u64 base_iova;
+};
+
+struct page_table_gk20a {
+ /* backing for */
+ /* Either a *page or a *mem_handle */
+ void *ref;
+ /* track mapping cnt on this page table */
+ u32 ref_cnt;
+ struct sg_table *sgt;
+ size_t size;
+};
+
+#ifndef _NVHOST_MEM_MGR_H
+enum gk20a_mem_rw_flag {
+ gk20a_mem_flag_none = 0,
+ gk20a_mem_flag_read_only = 1,
+ gk20a_mem_flag_write_only = 2,
+};
+#endif
+
+enum gmmu_pgsz_gk20a {
+ gmmu_page_size_small = 0,
+ gmmu_page_size_big = 1,
+ gmmu_nr_page_sizes = 2
+};
+
+
+struct page_directory_gk20a {
+ /* backing for */
+ u32 num_pdes;
+ void *kv;
+ /* Either a *page or a *mem_handle */
+ void *ref;
+ struct sg_table *sgt;
+ size_t size;
+ struct page_table_gk20a *ptes[gmmu_nr_page_sizes];
+};
+
+struct priv_cmd_queue {
+ struct priv_cmd_queue_mem_desc mem;
+ u64 base_gpuva; /* gpu_va base */
+ u16 size; /* num of entries in words */
+ u16 put; /* put for priv cmd queue */
+ u16 get; /* get for priv cmd queue */
+ struct list_head free; /* list of pre-allocated free entries */
+ struct list_head head; /* list of used entries */
+};
+
+struct priv_cmd_entry {
+ u32 *ptr;
+ u64 gva;
+ u16 get; /* start of entry in queue */
+ u16 size; /* in words */
+ u32 gp_get; /* gp_get when submitting last priv cmd */
+ u32 gp_put; /* gp_put when submitting last priv cmd */
+ u32 gp_wrap; /* wrap when submitting last priv cmd */
+ bool pre_alloc; /* prealloc entry, free to free list */
+ struct list_head list; /* node for lists */
+};
+
+struct mapped_buffer_node {
+ struct vm_gk20a *vm;
+ struct rb_node node;
+ struct list_head unmap_list;
+ struct list_head va_buffers_list;
+ struct vm_reserved_va_node *va_node;
+ u64 addr;
+ u64 size;
+ struct dma_buf *dmabuf;
+ struct sg_table *sgt;
+ struct kref ref;
+ u32 user_mapped;
+ bool own_mem_ref;
+ u32 pgsz_idx;
+ u32 ctag_offset;
+ u32 ctag_lines;
+ u32 flags;
+ u32 kind;
+ bool va_allocated;
+};
+
+struct vm_reserved_va_node {
+ struct list_head reserved_va_list;
+ struct list_head va_buffers_list;
+ u32 pgsz_idx;
+ u64 vaddr_start;
+ u64 size;
+ bool sparse;
+};
+
+struct vm_gk20a {
+ struct mm_gk20a *mm;
+ struct gk20a_as_share *as_share; /* as_share this represents */
+
+ u64 va_start;
+ u64 va_limit;
+
+ int num_user_mapped_buffers;
+
+ bool big_pages; /* enable large page support */
+ bool enable_ctag;
+ bool tlb_dirty;
+ bool mapped;
+
+ struct kref ref;
+
+ struct mutex update_gmmu_lock;
+
+ struct page_directory_gk20a pdes;
+
+ struct gk20a_allocator vma[gmmu_nr_page_sizes];
+ struct rb_root mapped_buffers;
+
+ struct list_head reserved_va_list;
+
+ dma_addr_t zero_page_iova;
+ void *zero_page_cpuva;
+ struct sg_table *zero_page_sgt;
+};
+
+struct gk20a;
+struct channel_gk20a;
+
+int gk20a_init_mm_support(struct gk20a *g);
+int gk20a_init_mm_setup_sw(struct gk20a *g);
+int gk20a_init_bar1_vm(struct mm_gk20a *mm);
+int gk20a_init_pmu_vm(struct mm_gk20a *mm);
+
+void gk20a_mm_fb_flush(struct gk20a *g);
+void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate);
+void gk20a_mm_l2_invalidate(struct gk20a *g);
+
+struct mm_gk20a {
+ struct gk20a *g;
+
+ u32 compression_page_size;
+ u32 big_page_size;
+ u32 pde_stride;
+ u32 pde_stride_shift;
+
+ struct {
+ u32 order;
+ u32 num_ptes;
+ } page_table_sizing[gmmu_nr_page_sizes];
+
+
+ struct {
+ u64 size;
+ } channel;
+
+ struct {
+ u32 aperture_size;
+ struct vm_gk20a vm;
+ struct inst_desc inst_block;
+ } bar1;
+
+ struct {
+ u32 aperture_size;
+ struct vm_gk20a vm;
+ struct inst_desc inst_block;
+ } pmu;
+
+ struct mutex tlb_lock;
+ struct mutex l2_op_lock;
+
+ void (*remove_support)(struct mm_gk20a *mm);
+ bool sw_ready;
+#ifdef CONFIG_DEBUG_FS
+ u32 ltc_enabled;
+ u32 ltc_enabled_debug;
+#endif
+};
+
+int gk20a_mm_init(struct mm_gk20a *mm);
+
+#define gk20a_from_mm(mm) ((mm)->g)
+#define gk20a_from_vm(vm) ((vm)->mm->g)
+
+#define dev_from_vm(vm) dev_from_gk20a(vm->mm->g)
+
+#define DEFAULT_ALLOC_ALIGNMENT (4*1024)
+
+static inline int bar1_aperture_size_mb_gk20a(void)
+{
+ return 128; /*TBD read this from fuses?*/
+}
+/* max address bits */
+static inline int max_physaddr_bits_gk20a(void)
+{
+ return 40;/*"old" sys physaddr, meaningful? */
+}
+static inline int max_vid_physaddr_bits_gk20a(void)
+{
+ /* "vid phys" is asid/smmu phys?,
+ * i.e. is this the real sys physaddr? */
+ return 37;
+}
+static inline int max_vaddr_bits_gk20a(void)
+{
+ return 40; /* chopped for area? */
+}
+
+#if 0 /*related to addr bits above, concern below TBD on which is accurate */
+#define bar1_instance_block_shift_gk20a() (max_physaddr_bits_gk20a() -\
+ bus_bar1_block_ptr_s())
+#else
+#define bar1_instance_block_shift_gk20a() bus_bar1_block_ptr_shift_v()
+#endif
+
+void gk20a_mm_dump_vm(struct vm_gk20a *vm,
+ u64 va_begin, u64 va_end, char *label);
+
+int gk20a_mm_suspend(struct gk20a *g);
+
+phys_addr_t gk20a_get_phys_from_iova(struct device *d,
+ u64 dma_addr);
+
+int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
+ void *cpuva, u64 iova,
+ size_t size);
+
+int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
+ struct page **pages, u64 iova,
+ size_t size);
+
+void gk20a_free_sgtable(struct sg_table **sgt);
+
+u64 gk20a_mm_iova_addr(struct scatterlist *sgl);
+
+void gk20a_mm_ltc_isr(struct gk20a *g);
+
+bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g);
+
+int gk20a_mm_mmu_vpr_info_fetch(struct gk20a *g);
+
+u64 gk20a_gmmu_map(struct vm_gk20a *vm,
+ struct sg_table **sgt,
+ u64 size,
+ u32 flags,
+ int rw_flag);
+
+void gk20a_gmmu_unmap(struct vm_gk20a *vm,
+ u64 vaddr,
+ u64 size,
+ int rw_flag);
+
+struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf);
+void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
+ struct sg_table *sgt);
+
+u64 gk20a_vm_map(struct vm_gk20a *vm,
+ struct dma_buf *dmabuf,
+ u64 offset_align,
+ u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
+ int kind,
+ struct sg_table **sgt,
+ bool user_mapped,
+ int rw_flag);
+
+/* unmap handle from kernel */
+void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset);
+
+/* get reference to all currently mapped buffers */
+int gk20a_vm_get_buffers(struct vm_gk20a *vm,
+ struct mapped_buffer_node ***mapped_buffers,
+ int *num_buffers);
+
+/* put references on the given buffers */
+void gk20a_vm_put_buffers(struct vm_gk20a *vm,
+ struct mapped_buffer_node **mapped_buffers,
+ int num_buffers);
+
+/* invalidate tlbs for the vm area */
+void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm);
+
+/* find buffer corresponding to va */
+int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
+ struct dma_buf **dmabuf,
+ u64 *offset);
+
+void gk20a_vm_get(struct vm_gk20a *vm);
+void gk20a_vm_put(struct vm_gk20a *vm);
+
+/* vm-as interface */
+struct nvhost_as_alloc_space_args;
+struct nvhost_as_free_space_args;
+int gk20a_vm_alloc_share(struct gk20a_as_share *as_share);
+int gk20a_vm_release_share(struct gk20a_as_share *as_share);
+int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
+ struct nvhost_as_alloc_space_args *args);
+int gk20a_vm_free_space(struct gk20a_as_share *as_share,
+ struct nvhost_as_free_space_args *args);
+int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
+ struct channel_gk20a *ch);
+int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
+ int dmabuf_fd,
+ u64 *offset_align,
+ u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
+ int kind);
+int gk20a_vm_unmap_buffer(struct gk20a_as_share *, u64 offset);
+
+int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev);
+#endif /* __MM_GK20A_H__ */
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a.h b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
new file mode 100644
index 000000000000..09f348cb9f53
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a.h
@@ -0,0 +1,160 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/platform_gk20a.h
+ *
+ * GK20A Platform (SoC) Interface
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _GK20A_PLATFORM_H_
+#define _GK20A_PLATFORM_H_
+
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+
+struct gk20a;
+struct channel_gk20a;
+struct gr_ctx_buffer_desc;
+struct gk20a_scale_profile;
+
+struct gk20a_platform {
+#ifdef CONFIG_TEGRA_GK20A
+ u32 syncpt_base;
+#endif
+ /* Populated by the gk20a driver before probing the platform. */
+ struct gk20a *g;
+
+ /* Should be populated at probe. */
+ bool can_railgate;
+
+ /* Should be populated at probe. */
+ bool has_syncpoints;
+
+ /* Should be populated by probe. */
+ struct dentry *debugfs;
+
+ /* Clock configuration is stored here. Platform probe is responsible
+ * for filling this data. */
+ struct clk *clk[3];
+ int num_clks;
+
+ /* Delay before rail gated */
+ int railgate_delay;
+
+ /* Delay before clock gated */
+ int clockgate_delay;
+
+ /* Initialize the platform interface of the gk20a driver.
+ *
+ * The platform implementation of this function must
+ * - set the power and clocks of the gk20a device to a known
+ * state, and
+ * - populate the gk20a_platform structure (a pointer to the
+ * structure can be obtained by calling gk20a_get_platform).
+ *
+ * After this function is finished, the driver will initialise
+ * pm runtime and genpd based on the platform configuration.
+ */
+ int (*probe)(struct platform_device *dev);
+
+ /* Second stage initialisation - called once all power management
+ * initialisations are done.
+ */
+ int (*late_probe)(struct platform_device *dev);
+
+	/* Called before submitting work to the gpu. The platform may use this
+	 * hook to ensure that any other hw modules that the gpu depends on are
+	 * powered. The platform implementation must reference count calls to
+	 * this hook. */
+ int (*channel_busy)(struct platform_device *dev);
+
+	/* Called after the work on the gpu is completed. The platform may use
+	 * this hook to release power refs to any other hw modules that the gpu
+	 * depends on. The platform implementation must reference count calls
+	 * to this hook. */
+ void (*channel_idle)(struct platform_device *dev);
+
+ /* This function is called to allocate secure memory (memory that the
+ * CPU cannot see). The function should fill the context buffer
+ * descriptor (especially fields destroy, sgt, size).
+ */
+ int (*secure_alloc)(struct platform_device *dev,
+ struct gr_ctx_buffer_desc *desc,
+ size_t size);
+
+ /* Device is going to be suspended */
+ int (*suspend)(struct device *);
+
+ /* Called to turn off the device */
+ int (*railgate)(struct platform_device *dev);
+
+ /* Called to turn on the device */
+ int (*unrailgate)(struct platform_device *dev);
+
+ /* Postscale callback is called after frequency change */
+ void (*postscale)(struct platform_device *pdev,
+ unsigned long freq);
+
+ /* Pre callback is called before frequency change */
+ void (*prescale)(struct platform_device *pdev);
+
+ /* Devfreq governor name. If scaling is enabled, we request
+ * this governor to be used in scaling */
+ const char *devfreq_governor;
+
+ /* Quality of service id. If this is set, the scaling routines
+ * will register a callback to id. Each time we receive a new value,
+ * the postscale callback gets called. */
+ int qos_id;
+
+ /* Called as part of debug dump. If the gpu gets hung, this function
+ * is responsible for delivering all necessary debug data of other
+ * hw units which may interact with the gpu without direct supervision
+ * of the CPU.
+ */
+ void (*dump_platform_dependencies)(struct platform_device *dev);
+};
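+
+/*
+ * Editorial sketch (not part of the original patch): a minimal board file
+ * can get away with just a probe hook and leave the optional callbacks
+ * NULL, exactly as platform_gk20a_generic.c in this series does. The
+ * my_board_* names are hypothetical:
+ *
+ *	struct gk20a_platform my_board_gk20a_platform = {
+ *		.probe = my_board_gk20a_probe,
+ *	};
+ */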
+
+static inline struct gk20a_platform *gk20a_get_platform(
+ struct platform_device *dev)
+{
+ return (struct gk20a_platform *)platform_get_drvdata(dev);
+}
+
+extern struct gk20a_platform gk20a_generic_platform;
+#ifdef CONFIG_TEGRA_GK20A
+extern struct gk20a_platform gk20a_tegra_platform;
+#endif
+
+static inline int gk20a_platform_channel_busy(struct platform_device *dev)
+{
+ struct gk20a_platform *p = gk20a_get_platform(dev);
+ int ret = 0;
+ if (p->channel_busy)
+ ret = p->channel_busy(dev);
+
+ return ret;
+}
+
+static inline void gk20a_platform_channel_idle(struct platform_device *dev)
+{
+ struct gk20a_platform *p = gk20a_get_platform(dev);
+ if (p->channel_idle)
+ p->channel_idle(dev);
+}
+
+static inline bool gk20a_platform_has_syncpoints(struct platform_device *dev)
+{
+ struct gk20a_platform *p = gk20a_get_platform(dev);
+ return p->has_syncpoints;
+}
+
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c
new file mode 100644
index 000000000000..7b750df61751
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c
@@ -0,0 +1,35 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/platform_gk20a_generic.c
+ *
+ * GK20A Generic Platform Interface
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "platform_gk20a.h"
+
+static int gk20a_generic_probe(struct platform_device *dev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(dev);
+
+ /* TODO: Initialize clocks and power */
+ (void)platform;
+
+ return 0;
+}
+
+struct gk20a_platform gk20a_generic_platform = {
+ .probe = gk20a_generic_probe,
+};
diff --git a/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
new file mode 100644
index 000000000000..35658f31c9d8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
@@ -0,0 +1,561 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/platform_gk20a_tegra.c
+ *
+ * GK20A Tegra Platform Interface
+ *
+ * Copyright (c) 2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/tegra-powergate.h>
+#include <linux/platform_data/tegra_edp.h>
+#include <linux/nvhost_ioctl.h>
+#include <linux/dma-buf.h>
+#include <linux/nvmap.h>
+#include <mach/irqs.h>
+#include <mach/pm_domains.h>
+
+#include "../../../arch/arm/mach-tegra/iomap.h"
+
+#include "gk20a.h"
+#include "hal_gk20a.h"
+#include "platform_gk20a.h"
+#include "gk20a_scale.h"
+
+#define TEGRA_GK20A_INTR INT_GPU
+#define TEGRA_GK20A_INTR_NONSTALL INT_GPU_NONSTALL
+
+#define TEGRA_GK20A_SIM_BASE 0x538F0000 /*tbd: get from iomap.h */
+#define TEGRA_GK20A_SIM_SIZE 0x1000 /*tbd: this is a high-side guess */
+
+extern struct device tegra_vpr_dev;
+struct gk20a_platform t132_gk20a_tegra_platform;
+
+struct gk20a_emc_params {
+ long emc_slope;
+ long emc_offset;
+ long emc_dip_slope;
+ long emc_dip_offset;
+ long emc_xmid;
+ bool linear;
+};
+
+/*
+ * 20.12 fixed point arithmetic
+ */
+
+static const int FXFRAC = 12;
+static const int FX_HALF = (1 << 12) / 2;
+
+#define INT_TO_FX(x) ((x) << FXFRAC)
+#define FX_TO_INT(x) ((x) >> FXFRAC)
+
+#define MHZ_TO_HZ(x) ((x) * 1000000)
+#define HZ_TO_MHZ(x) ((x) / 1000000)
+
+int FXMUL(int x, int y)
+{
+ return ((long long) x * (long long) y) >> FXFRAC;
+}
+
+int FXDIV(int x, int y)
+{
+ /* long long div operation not supported, must shift manually. This
+ * would have been
+ *
+ * return (((long long) x) << FXFRAC) / (long long) y;
+ */
+ int pos, t;
+ if (x == 0)
+ return 0;
+
+ /* find largest allowable right shift to numerator, limit to FXFRAC */
+ t = x < 0 ? -x : x;
+ pos = 31 - fls(t); /* fls can't be 32 if x != 0 */
+ if (pos > FXFRAC)
+ pos = FXFRAC;
+
+ y >>= FXFRAC - pos;
+ if (y == 0)
+ return 0x7FFFFFFF; /* overflow, return MAX_FIXED */
+
+ return (x << pos) / y;
+}
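+
+/*
+ * Editorial example (not part of the original patch) of the 20.12 helpers
+ * above, using small hypothetical values:
+ *
+ *	INT_TO_FX(3)                      == 3 << 12 == 12288
+ *	FXMUL(INT_TO_FX(3), INT_TO_FX(2)) == (12288 * 8192) >> 12
+ *	                                  == 24576 == INT_TO_FX(6)
+ *	FXDIV(INT_TO_FX(6), INT_TO_FX(2)) == 12288 == INT_TO_FX(3)
+ *	FX_TO_INT(INT_TO_FX(3))           == 3
+ */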
+
+static int gk20a_tegra_channel_busy(struct platform_device *dev)
+{
+ int ret = 0;
+
+	/* Explicitly turn on the host1x clocks
+	 * - This is needed as the host1x driver sets ignore_children = true
+	 *   to cater to the use case of display clock ON but host1x clock OFF
+	 *   in the OS-Idle-Display-ON case
+	 * - This was easily done in ACM as it only checked the ref count
+	 *   of host1x (or any device for that matter) to be zero before
+	 *   turning off its clock
+	 * - However, runtime PM checks to see if *ANY* child of the device is
+	 *   in the ACTIVE state and, if yes, it doesn't suspend the parent. As
+	 *   a result, the display and host1x clocks remain ON during the
+	 *   OS-Idle-Display-ON case
+	 * - The code below fixes this use case
+	 */
+	if (dev->dev.parent)
+		ret = nvhost_module_busy_ext(
+				to_platform_device(dev->dev.parent));
+
+ return ret;
+}
+
+static void gk20a_tegra_channel_idle(struct platform_device *dev)
+{
+ /* Explicitly turn off the host1x clocks */
+	if (dev->dev.parent)
+		nvhost_module_idle_ext(to_platform_device(dev->dev.parent));
+}
+
+static void gk20a_tegra_secure_destroy(struct platform_device *pdev,
+ struct gr_ctx_buffer_desc *desc)
+{
+ gk20a_free_sgtable(&desc->sgt);
+ dma_free_attrs(&tegra_vpr_dev, desc->size,
+ (void *)(uintptr_t)&desc->iova,
+ desc->iova, &desc->attrs);
+}
+
+static int gk20a_tegra_secure_alloc(struct platform_device *pdev,
+ struct gr_ctx_buffer_desc *desc,
+ size_t size)
+{
+ struct device *dev = &pdev->dev;
+ DEFINE_DMA_ATTRS(attrs);
+ dma_addr_t iova;
+ struct sg_table *sgt;
+ struct page *page;
+ int err = 0;
+
+ dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
+
+ (void)dma_alloc_attrs(&tegra_vpr_dev, size, &iova,
+ GFP_KERNEL, &attrs);
+ if (dma_mapping_error(&tegra_vpr_dev, iova))
+ return -ENOMEM;
+
+ desc->iova = iova;
+ desc->size = size;
+ desc->attrs = attrs;
+ desc->destroy = gk20a_tegra_secure_destroy;
+
+ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+	if (!sgt) {
+		gk20a_err(dev, "failed to allocate memory\n");
+		err = -ENOMEM;
+		goto fail;
+	}
+ err = sg_alloc_table(sgt, 1, GFP_KERNEL);
+ if (err) {
+ gk20a_err(dev, "failed to allocate sg_table\n");
+ goto fail_sgt;
+ }
+ page = phys_to_page(iova);
+ sg_set_page(sgt->sgl, page, size, 0);
+ sg_dma_address(sgt->sgl) = iova;
+
+ desc->sgt = sgt;
+
+ return err;
+
+fail_sgt:
+ kfree(sgt);
+fail:
+ dma_free_attrs(&tegra_vpr_dev, desc->size,
+ (void *)(uintptr_t)&desc->iova,
+ desc->iova, &desc->attrs);
+ return err;
+}
+
+/*
+ * gk20a_tegra_get_emc_rate()
+ *
+ * This function returns the minimum emc clock based on gpu frequency
+ */
+
+long gk20a_tegra_get_emc_rate(struct gk20a_emc_params *emc_params, long freq)
+{
+ long hz;
+
+ freq = INT_TO_FX(HZ_TO_MHZ(freq));
+ hz = FXMUL(freq, emc_params->emc_slope) + emc_params->emc_offset;
+
+ hz -= FXMUL(emc_params->emc_dip_slope,
+ FXMUL(freq - emc_params->emc_xmid,
+ freq - emc_params->emc_xmid)) +
+ emc_params->emc_dip_offset;
+
+ hz = MHZ_TO_HZ(FX_TO_INT(hz + FX_HALF)); /* round to nearest */
+ hz = (hz < 0) ? 0 : hz;
+
+ return hz;
+}
+
+/*
+ * gk20a_tegra_postscale(profile, freq)
+ *
+ * This function sets emc frequency based on current gpu frequency
+ */
+
+static void gk20a_tegra_postscale(struct platform_device *pdev,
+ unsigned long freq)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ struct gk20a_scale_profile *profile = platform->g->scale_profile;
+ struct gk20a_emc_params *emc_params = profile->private_data;
+ struct gk20a *g = get_gk20a(pdev);
+
+ long after = gk20a_clk_get_rate(g);
+ long emc_target = gk20a_tegra_get_emc_rate(emc_params, after);
+
+ clk_set_rate(platform->clk[2], emc_target);
+}
+
+/*
+ * gk20a_tegra_prescale(profile, freq)
+ *
+ * This function informs EDP about changed constraints.
+ */
+
+static void gk20a_tegra_prescale(struct platform_device *pdev)
+{
+ struct gk20a *g = get_gk20a(pdev);
+ u32 avg = 0;
+
+ gk20a_pmu_load_norm(g, &avg);
+ tegra_edp_notify_gpu_load(avg);
+}
+
+/*
+ * gk20a_tegra_calibrate_emc()
+ *
+ * Compute emc scaling parameters
+ *
+ * Remc = S * R3d + O - (Sd * (R3d - Rm)^2 + Od)
+ *
+ * Remc - 3d.emc rate
+ * R3d - 3d.cbus rate
+ * Rm - 3d.cbus 'middle' rate = (max + min)/2
+ * S - emc_slope
+ * O - emc_offset
+ * Sd - emc_dip_slope
+ * Od - emc_dip_offset
+ *
+ * this superposes a quadratic dip centered around the middle 3d
+ * frequency over a linear correlation of 3d.emc to 3d clock
+ * rates.
+ *
+ * S, O are chosen so that the maximum 3d rate produces the
+ * maximum 3d.emc rate exactly, and the minimum 3d rate produces
+ * at least the minimum 3d.emc rate.
+ *
+ * Sd and Od are chosen to produce the largest dip that will
+ * keep 3d.emc frequencies monotonically decreasing with 3d
+ * frequencies. To achieve this, the first derivative of Remc
+ * with respect to R3d should be zero for the minimal 3d rate:
+ *
+ * R'emc = S - 2 * Sd * (R3d - Rm)
+ * R'emc(R3d-min) = 0
+ * S = 2 * Sd * (R3d-min - Rm)
+ * = 2 * Sd * (R3d-min - R3d-max) / 2
+ *
+ * +------------------------------+
+ * | Sd = S / (R3d-min - R3d-max) |
+ * +------------------------------+
+ *
+ * dip = Sd * (R3d - Rm)^2 + Od
+ *
+ * requiring dip(R3d-min) = 0 and dip(R3d-max) = 0 gives
+ *
+ * Sd * (R3d-min - Rm)^2 + Od = 0
+ * Od = -Sd * ((R3d-min - R3d-max) / 2)^2
+ * = -Sd * ((R3d-min - R3d-max)^2) / 4
+ *
+ * +------------------------------+
+ * | Od = (emc-max - emc-min) / 4 |
+ * +------------------------------+
+ *
+ */
+
+void gk20a_tegra_calibrate_emc(struct gk20a_emc_params *emc_params,
+ struct clk *clk_3d, struct clk *clk_3d_emc)
+{
+ long correction;
+ unsigned long max_emc;
+ unsigned long min_emc;
+ unsigned long min_rate_3d;
+ unsigned long max_rate_3d;
+
+ max_emc = clk_round_rate(clk_3d_emc, UINT_MAX);
+ max_emc = INT_TO_FX(HZ_TO_MHZ(max_emc));
+
+ min_emc = clk_round_rate(clk_3d_emc, 0);
+ min_emc = INT_TO_FX(HZ_TO_MHZ(min_emc));
+
+ max_rate_3d = clk_round_rate(clk_3d, UINT_MAX);
+ max_rate_3d = INT_TO_FX(HZ_TO_MHZ(max_rate_3d));
+
+ min_rate_3d = clk_round_rate(clk_3d, 0);
+ min_rate_3d = INT_TO_FX(HZ_TO_MHZ(min_rate_3d));
+
+ emc_params->emc_slope =
+ FXDIV((max_emc - min_emc), (max_rate_3d - min_rate_3d));
+ emc_params->emc_offset = max_emc -
+ FXMUL(emc_params->emc_slope, max_rate_3d);
+ /* Guarantee max 3d rate maps to max emc rate */
+ emc_params->emc_offset += max_emc -
+ (FXMUL(emc_params->emc_slope, max_rate_3d) +
+ emc_params->emc_offset);
+
+ emc_params->emc_dip_offset = (max_emc - min_emc) / 4;
+ emc_params->emc_dip_slope =
+ -FXDIV(emc_params->emc_slope, max_rate_3d - min_rate_3d);
+ emc_params->emc_xmid = (max_rate_3d + min_rate_3d) / 2;
+ correction =
+ emc_params->emc_dip_offset +
+ FXMUL(emc_params->emc_dip_slope,
+ FXMUL(max_rate_3d - emc_params->emc_xmid,
+ max_rate_3d - emc_params->emc_xmid));
+ emc_params->emc_dip_offset -= correction;
+}
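+
+/*
+ * Editorial example (hypothetical numbers, not from the original patch),
+ * ignoring the 20.12 fixed-point scaling for readability: with
+ * R3d-min = 100, R3d-max = 500, emc-min = 200 and emc-max = 800 (MHz),
+ * the calibration above yields
+ *
+ *	S  = (800 - 200) / (500 - 100) = 1.5
+ *	O  = 800 - 1.5 * 500           = 50
+ *	Sd = -1.5 / (500 - 100)        = -0.00375
+ *	Od = (800 - 200) / 4           = 150
+ *
+ * so gk20a_tegra_get_emc_rate() gives Remc(500) = 800 and Remc(100) = 200,
+ * with zero slope at R3d-min as required by the derivation above.
+ */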
+
+/*
+ * gk20a_tegra_railgate()
+ *
+ * Gate (disable) gk20a power rail
+ */
+
+static int gk20a_tegra_railgate(struct platform_device *pdev)
+{
+ if (tegra_powergate_is_powered(TEGRA_POWERGATE_GPU))
+ tegra_powergate_partition(TEGRA_POWERGATE_GPU);
+ return 0;
+}
+
+/*
+ * gk20a_tegra_unrailgate()
+ *
+ * Ungate (enable) gk20a power rail
+ */
+
+static int gk20a_tegra_unrailgate(struct platform_device *pdev)
+{
+ tegra_unpowergate_partition(TEGRA_POWERGATE_GPU);
+ return 0;
+}
+
+struct {
+ char *name;
+ unsigned long default_rate;
+} tegra_gk20a_clocks[] = {
+ {"PLLG_ref", UINT_MAX},
+ {"pwr", 204000000},
+ {"emc", UINT_MAX} };
+
+/*
+ * gk20a_tegra_get_clocks()
+ *
+ * This function finds clocks in tegra platform and populates
+ * the clock information to gk20a platform data.
+ */
+
+static int gk20a_tegra_get_clocks(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = platform_get_drvdata(pdev);
+ char devname[16];
+ int i;
+ int ret = 0;
+
+ snprintf(devname, sizeof(devname),
+		(pdev->id <= 0) ? "tegra_%s" : "tegra_%s.%d",
+ pdev->name, pdev->id);
+
+ platform->num_clks = 0;
+ for (i = 0; i < ARRAY_SIZE(tegra_gk20a_clocks); i++) {
+ long rate = tegra_gk20a_clocks[i].default_rate;
+ struct clk *c;
+
+ c = clk_get_sys(devname, tegra_gk20a_clocks[i].name);
+ if (IS_ERR(c)) {
+ ret = PTR_ERR(c);
+ goto err_get_clock;
+ }
+ rate = clk_round_rate(c, rate);
+ clk_set_rate(c, rate);
+ platform->clk[i] = c;
+ }
+ platform->num_clks = i;
+
+ return 0;
+
+err_get_clock:
+
+ while (i--)
+ clk_put(platform->clk[i]);
+ return ret;
+}
+
+static void gk20a_tegra_scale_init(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(pdev);
+ struct gk20a_scale_profile *profile = platform->g->scale_profile;
+ struct gk20a_emc_params *emc_params;
+
+ if (!profile)
+ return;
+
+ emc_params = kzalloc(sizeof(*emc_params), GFP_KERNEL);
+ if (!emc_params)
+ return;
+
+ gk20a_tegra_calibrate_emc(emc_params, gk20a_clk_get(platform->g),
+ platform->clk[2]);
+
+ profile->private_data = emc_params;
+}
+
+static void gk20a_tegra_debug_dump(struct platform_device *pdev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(pdev);
+ struct gk20a *g = platform->g;
+ nvhost_debug_dump_device(g->dev);
+}
+
+static int gk20a_tegra_probe(struct platform_device *dev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(dev);
+
+ if (tegra_get_chipid() == TEGRA_CHIPID_TEGRA13) {
+ t132_gk20a_tegra_platform.g = platform->g;
+ *platform = t132_gk20a_tegra_platform;
+ }
+
+ gk20a_tegra_get_clocks(dev);
+
+ return 0;
+}
+
+static int gk20a_tegra_late_probe(struct platform_device *dev)
+{
+ struct gk20a_platform *platform = gk20a_get_platform(dev);
+
+ /* Make gk20a power domain a subdomain of mc */
+ tegra_pd_add_sd(&platform->g->pd);
+
+ /* Initialise tegra specific scaling quirks */
+ gk20a_tegra_scale_init(dev);
+
+ return 0;
+}
+
+static int gk20a_tegra_suspend(struct device *dev)
+{
+ tegra_edp_notify_gpu_load(0);
+ return 0;
+}
+
+static struct resource gk20a_tegra_resources[] = {
+ {
+ .start = TEGRA_GK20A_BAR0_BASE,
+ .end = TEGRA_GK20A_BAR0_BASE + TEGRA_GK20A_BAR0_SIZE - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ {
+ .start = TEGRA_GK20A_BAR1_BASE,
+ .end = TEGRA_GK20A_BAR1_BASE + TEGRA_GK20A_BAR1_SIZE - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ { /* Used on ASIM only */
+ .start = TEGRA_GK20A_SIM_BASE,
+ .end = TEGRA_GK20A_SIM_BASE + TEGRA_GK20A_SIM_SIZE - 1,
+ .flags = IORESOURCE_MEM,
+ },
+ {
+ .start = TEGRA_GK20A_INTR,
+ .end = TEGRA_GK20A_INTR,
+ .flags = IORESOURCE_IRQ,
+ },
+ {
+ .start = TEGRA_GK20A_INTR_NONSTALL,
+ .end = TEGRA_GK20A_INTR_NONSTALL,
+ .flags = IORESOURCE_IRQ,
+ },
+};
+
+struct gk20a_platform t132_gk20a_tegra_platform = {
+ .has_syncpoints = true,
+
+ /* power management configuration */
+ .railgate_delay = 500,
+ .clockgate_delay = 50,
+
+ .probe = gk20a_tegra_probe,
+ .late_probe = gk20a_tegra_late_probe,
+
+ /* power management callbacks */
+ .suspend = gk20a_tegra_suspend,
+ .railgate = gk20a_tegra_railgate,
+ .unrailgate = gk20a_tegra_unrailgate,
+
+ /* frequency scaling configuration */
+ .prescale = gk20a_tegra_prescale,
+ .postscale = gk20a_tegra_postscale,
+ .devfreq_governor = "nvhost_podgov",
+ .qos_id = PM_QOS_GPU_FREQ_MIN,
+
+ .channel_busy = gk20a_tegra_channel_busy,
+ .channel_idle = gk20a_tegra_channel_idle,
+ .secure_alloc = gk20a_tegra_secure_alloc,
+ .dump_platform_dependencies = gk20a_tegra_debug_dump,
+};
+
+struct gk20a_platform gk20a_tegra_platform = {
+ .has_syncpoints = true,
+
+ /* power management configuration */
+ .railgate_delay = 500,
+ .clockgate_delay = 50,
+ .can_railgate = true,
+
+ .probe = gk20a_tegra_probe,
+ .late_probe = gk20a_tegra_late_probe,
+
+ /* power management callbacks */
+ .suspend = gk20a_tegra_suspend,
+ .railgate = gk20a_tegra_railgate,
+ .unrailgate = gk20a_tegra_unrailgate,
+
+ /* frequency scaling configuration */
+ .prescale = gk20a_tegra_prescale,
+ .postscale = gk20a_tegra_postscale,
+ .devfreq_governor = "nvhost_podgov",
+ .qos_id = PM_QOS_GPU_FREQ_MIN,
+
+ .channel_busy = gk20a_tegra_channel_busy,
+ .channel_idle = gk20a_tegra_channel_idle,
+ .secure_alloc = gk20a_tegra_secure_alloc,
+ .dump_platform_dependencies = gk20a_tegra_debug_dump,
+};
+
+struct platform_device tegra_gk20a_device = {
+ .name = "gk20a",
+ .resource = gk20a_tegra_resources,
+ .num_resources = ARRAY_SIZE(gk20a_tegra_resources),
+ .dev = {
+ .platform_data = &gk20a_tegra_platform,
+ },
+};
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
new file mode 100644
index 000000000000..a00499a98ab8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
@@ -0,0 +1,3796 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/pmu_gk20a.c
+ *
+ * GK20A PMU (aka. gPMU outside gk20a context)
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/delay.h> /* for mdelay */
+#include <linux/firmware.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/dma-mapping.h>
+
+#include "gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_pwr_gk20a.h"
+#include "hw_top_gk20a.h"
+
+#define GK20A_PMU_UCODE_IMAGE "gpmu_ucode.bin"
+
+#define gk20a_dbg_pmu(fmt, arg...) \
+ gk20a_dbg(gpu_dbg_pmu, fmt, ##arg)
+
+static void pmu_dump_falcon_stats(struct pmu_gk20a *pmu);
+static int gk20a_pmu_get_elpg_residency_gating(struct gk20a *g,
+ u32 *ingating_time, u32 *ungating_time, u32 *gating_cnt);
+static void gk20a_init_pmu_setup_hw2_workqueue(struct work_struct *work);
+static void pmu_save_zbc(struct gk20a *g, u32 entries);
+static void ap_callback_init_and_enable_ctrl(
+ struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 seq_desc, u32 status);
+static int gk20a_pmu_ap_send_command(struct gk20a *g,
+ union pmu_ap_cmd *p_ap_cmd, bool b_block);
+
+static u32 pmu_cmdline_size_v0(struct pmu_gk20a *pmu)
+{
+ return sizeof(struct pmu_cmdline_args_v0);
+}
+
+static u32 pmu_cmdline_size_v1(struct pmu_gk20a *pmu)
+{
+ return sizeof(struct pmu_cmdline_args_v1);
+}
+
+static void set_pmu_cmdline_args_cpufreq_v1(struct pmu_gk20a *pmu, u32 freq)
+{
+ pmu->args_v1.cpu_freq_hz = freq;
+}
+
+static void set_pmu_cmdline_args_cpufreq_v0(struct pmu_gk20a *pmu, u32 freq)
+{
+ pmu->args_v0.cpu_freq_hz = freq;
+}
+
+static void *get_pmu_cmdline_args_ptr_v1(struct pmu_gk20a *pmu)
+{
+ return (void *)(&pmu->args_v1);
+}
+
+static void *get_pmu_cmdline_args_ptr_v0(struct pmu_gk20a *pmu)
+{
+ return (void *)(&pmu->args_v0);
+}
+
+static u32 get_pmu_allocation_size_v1(struct pmu_gk20a *pmu)
+{
+ return sizeof(struct pmu_allocation_v1);
+}
+
+static u32 get_pmu_allocation_size_v0(struct pmu_gk20a *pmu)
+{
+ return sizeof(struct pmu_allocation_v0);
+}
+
+static void set_pmu_allocation_ptr_v1(struct pmu_gk20a *pmu,
+ void **pmu_alloc_ptr, void *assign_ptr)
+{
+ struct pmu_allocation_v1 **pmu_a_ptr =
+ (struct pmu_allocation_v1 **)pmu_alloc_ptr;
+ *pmu_a_ptr = (struct pmu_allocation_v1 *)assign_ptr;
+}
+
+static void set_pmu_allocation_ptr_v0(struct pmu_gk20a *pmu,
+ void **pmu_alloc_ptr, void *assign_ptr)
+{
+ struct pmu_allocation_v0 **pmu_a_ptr =
+ (struct pmu_allocation_v0 **)pmu_alloc_ptr;
+ *pmu_a_ptr = (struct pmu_allocation_v0 *)assign_ptr;
+}
+
+static void pmu_allocation_set_dmem_size_v1(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u16 size)
+{
+ struct pmu_allocation_v1 *pmu_a_ptr =
+ (struct pmu_allocation_v1 *)pmu_alloc_ptr;
+ pmu_a_ptr->alloc.dmem.size = size;
+}
+
+static void pmu_allocation_set_dmem_size_v0(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u16 size)
+{
+ struct pmu_allocation_v0 *pmu_a_ptr =
+ (struct pmu_allocation_v0 *)pmu_alloc_ptr;
+ pmu_a_ptr->alloc.dmem.size = size;
+}
+
+static u16 pmu_allocation_get_dmem_size_v1(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v1 *pmu_a_ptr =
+ (struct pmu_allocation_v1 *)pmu_alloc_ptr;
+ return pmu_a_ptr->alloc.dmem.size;
+}
+
+static u16 pmu_allocation_get_dmem_size_v0(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v0 *pmu_a_ptr =
+ (struct pmu_allocation_v0 *)pmu_alloc_ptr;
+ return pmu_a_ptr->alloc.dmem.size;
+}
+
+static u32 pmu_allocation_get_dmem_offset_v1(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v1 *pmu_a_ptr =
+ (struct pmu_allocation_v1 *)pmu_alloc_ptr;
+ return pmu_a_ptr->alloc.dmem.offset;
+}
+
+static u32 pmu_allocation_get_dmem_offset_v0(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v0 *pmu_a_ptr =
+ (struct pmu_allocation_v0 *)pmu_alloc_ptr;
+ return pmu_a_ptr->alloc.dmem.offset;
+}
+
+static u32 *pmu_allocation_get_dmem_offset_addr_v1(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v1 *pmu_a_ptr =
+ (struct pmu_allocation_v1 *)pmu_alloc_ptr;
+ return &pmu_a_ptr->alloc.dmem.offset;
+}
+
+static u32 *pmu_allocation_get_dmem_offset_addr_v0(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr)
+{
+ struct pmu_allocation_v0 *pmu_a_ptr =
+ (struct pmu_allocation_v0 *)pmu_alloc_ptr;
+ return &pmu_a_ptr->alloc.dmem.offset;
+}
+
+static void pmu_allocation_set_dmem_offset_v1(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u32 offset)
+{
+ struct pmu_allocation_v1 *pmu_a_ptr =
+ (struct pmu_allocation_v1 *)pmu_alloc_ptr;
+ pmu_a_ptr->alloc.dmem.offset = offset;
+}
+
+static void pmu_allocation_set_dmem_offset_v0(struct pmu_gk20a *pmu,
+ void *pmu_alloc_ptr, u32 offset)
+{
+ struct pmu_allocation_v0 *pmu_a_ptr =
+ (struct pmu_allocation_v0 *)pmu_alloc_ptr;
+ pmu_a_ptr->alloc.dmem.offset = offset;
+}
+
+static void *get_pmu_msg_pmu_init_msg_ptr_v1(struct pmu_init_msg *init)
+{
+ return (void *)(&(init->pmu_init_v1));
+}
+
+static u16 get_pmu_init_msg_pmu_sw_mg_off_v1(union pmu_init_msg_pmu *init_msg)
+{
+ struct pmu_init_msg_pmu_v1 *init =
+ (struct pmu_init_msg_pmu_v1 *)(&init_msg->v1);
+ return init->sw_managed_area_offset;
+}
+
+static u16 get_pmu_init_msg_pmu_sw_mg_size_v1(union pmu_init_msg_pmu *init_msg)
+{
+ struct pmu_init_msg_pmu_v1 *init =
+ (struct pmu_init_msg_pmu_v1 *)(&init_msg->v1);
+ return init->sw_managed_area_size;
+}
+
+static void *get_pmu_msg_pmu_init_msg_ptr_v0(struct pmu_init_msg *init)
+{
+ return (void *)(&(init->pmu_init_v0));
+}
+
+static u16 get_pmu_init_msg_pmu_sw_mg_off_v0(union pmu_init_msg_pmu *init_msg)
+{
+ struct pmu_init_msg_pmu_v0 *init =
+ (struct pmu_init_msg_pmu_v0 *)(&init_msg->v0);
+ return init->sw_managed_area_offset;
+}
+
+static u16 get_pmu_init_msg_pmu_sw_mg_size_v0(union pmu_init_msg_pmu *init_msg)
+{
+ struct pmu_init_msg_pmu_v0 *init =
+ (struct pmu_init_msg_pmu_v0 *)(&init_msg->v0);
+ return init->sw_managed_area_size;
+}
+
+static u32 get_pmu_perfmon_cmd_start_size_v1(void)
+{
+ return sizeof(struct pmu_perfmon_cmd_start_v1);
+}
+
+static u32 get_pmu_perfmon_cmd_start_size_v0(void)
+{
+ return sizeof(struct pmu_perfmon_cmd_start_v0);
+}
+
+static int get_perfmon_cmd_start_offsetofvar_v1(
+ enum pmu_perfmon_cmd_start_fields field)
+{
+ switch (field) {
+ case COUNTER_ALLOC:
+ return offsetof(struct pmu_perfmon_cmd_start_v1,
+ counter_alloc);
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int get_perfmon_cmd_start_offsetofvar_v0(
+ enum pmu_perfmon_cmd_start_fields field)
+{
+ switch (field) {
+ case COUNTER_ALLOC:
+ return offsetof(struct pmu_perfmon_cmd_start_v0,
+ counter_alloc);
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static u32 get_pmu_perfmon_cmd_init_size_v1(void)
+{
+ return sizeof(struct pmu_perfmon_cmd_init_v1);
+}
+
+static u32 get_pmu_perfmon_cmd_init_size_v0(void)
+{
+ return sizeof(struct pmu_perfmon_cmd_init_v0);
+}
+
+static int get_perfmon_cmd_init_offsetofvar_v1(
+ enum pmu_perfmon_cmd_start_fields field)
+{
+ switch (field) {
+ case COUNTER_ALLOC:
+ return offsetof(struct pmu_perfmon_cmd_init_v1,
+ counter_alloc);
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int get_perfmon_cmd_init_offsetofvar_v0(
+ enum pmu_perfmon_cmd_start_fields field)
+{
+ switch (field) {
+ case COUNTER_ALLOC:
+ return offsetof(struct pmu_perfmon_cmd_init_v0,
+ counter_alloc);
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void perfmon_start_set_cmd_type_v1(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
+ start->cmd_type = value;
+}
+
+static void perfmon_start_set_cmd_type_v0(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
+ start->cmd_type = value;
+}
+
+static void perfmon_start_set_group_id_v1(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
+ start->group_id = value;
+}
+
+static void perfmon_start_set_group_id_v0(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
+ start->group_id = value;
+}
+
+static void perfmon_start_set_state_id_v1(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
+ start->state_id = value;
+}
+
+static void perfmon_start_set_state_id_v0(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
+ start->state_id = value;
+}
+
+static void perfmon_start_set_flags_v1(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
+ start->flags = value;
+}
+
+static void perfmon_start_set_flags_v0(struct pmu_perfmon_cmd *pc, u8 value)
+{
+ struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
+ start->flags = value;
+}
+
+static u8 perfmon_start_get_flags_v1(struct pmu_perfmon_cmd *pc)
+{
+ struct pmu_perfmon_cmd_start_v1 *start = &pc->start_v1;
+ return start->flags;
+}
+
+static u8 perfmon_start_get_flags_v0(struct pmu_perfmon_cmd *pc)
+{
+ struct pmu_perfmon_cmd_start_v0 *start = &pc->start_v0;
+ return start->flags;
+}
+
+static void perfmon_cmd_init_set_sample_buffer_v1(struct pmu_perfmon_cmd *pc,
+ u16 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->sample_buffer = value;
+}
+
+static void perfmon_cmd_init_set_sample_buffer_v0(struct pmu_perfmon_cmd *pc,
+ u16 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->sample_buffer = value;
+}
+
+static void perfmon_cmd_init_set_dec_cnt_v1(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->to_decrease_count = value;
+}
+
+static void perfmon_cmd_init_set_dec_cnt_v0(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->to_decrease_count = value;
+}
+
+static void perfmon_cmd_init_set_base_cnt_id_v1(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->base_counter_id = value;
+}
+
+static void perfmon_cmd_init_set_base_cnt_id_v0(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->base_counter_id = value;
+}
+
+static void perfmon_cmd_init_set_samp_period_us_v1(struct pmu_perfmon_cmd *pc,
+ u32 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->sample_period_us = value;
+}
+
+static void perfmon_cmd_init_set_samp_period_us_v0(struct pmu_perfmon_cmd *pc,
+ u32 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->sample_period_us = value;
+}
+
+static void perfmon_cmd_init_set_num_cnt_v1(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->num_counters = value;
+}
+
+static void perfmon_cmd_init_set_num_cnt_v0(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->num_counters = value;
+}
+
+static void perfmon_cmd_init_set_mov_avg_v1(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v1 *init = &pc->init_v1;
+ init->samples_in_moving_avg = value;
+}
+
+static void perfmon_cmd_init_set_mov_avg_v0(struct pmu_perfmon_cmd *pc,
+ u8 value)
+{
+ struct pmu_perfmon_cmd_init_v0 *init = &pc->init_v0;
+ init->samples_in_moving_avg = value;
+}
+
+static void get_pmu_init_msg_pmu_queue_params_v0(struct pmu_queue *queue,
+ u32 id, void *pmu_init_msg)
+{
+ struct pmu_init_msg_pmu_v0 *init =
+ (struct pmu_init_msg_pmu_v0 *)pmu_init_msg;
+ queue->index = init->queue_info[id].index;
+ queue->offset = init->queue_info[id].offset;
+ queue->size = init->queue_info[id].size;
+}
+
+static void get_pmu_init_msg_pmu_queue_params_v1(struct pmu_queue *queue,
+ u32 id, void *pmu_init_msg)
+{
+ struct pmu_init_msg_pmu_v1 *init =
+ (struct pmu_init_msg_pmu_v1 *)pmu_init_msg;
+ queue->index = init->queue_info[id].index;
+ queue->offset = init->queue_info[id].offset;
+ queue->size = init->queue_info[id].size;
+}
+
+static void *get_pmu_sequence_in_alloc_ptr_v1(struct pmu_sequence *seq)
+{
+ return (void *)(&seq->in_v1);
+}
+
+static void *get_pmu_sequence_in_alloc_ptr_v0(struct pmu_sequence *seq)
+{
+ return (void *)(&seq->in_v0);
+}
+
+static void *get_pmu_sequence_out_alloc_ptr_v1(struct pmu_sequence *seq)
+{
+ return (void *)(&seq->out_v1);
+}
+
+static void *get_pmu_sequence_out_alloc_ptr_v0(struct pmu_sequence *seq)
+{
+ return (void *)(&seq->out_v0);
+}
+
+static int gk20a_init_pmu(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ switch (pmu->desc->app_version) {
+ case APP_VERSION_1:
+ g->ops.pmu_ver.cmd_id_zbc_table_update = 16;
+ g->ops.pmu_ver.get_pmu_cmdline_args_size =
+ pmu_cmdline_size_v1;
+ g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq =
+ set_pmu_cmdline_args_cpufreq_v1;
+ g->ops.pmu_ver.get_pmu_cmdline_args_ptr =
+ get_pmu_cmdline_args_ptr_v1;
+ g->ops.pmu_ver.get_pmu_allocation_struct_size =
+ get_pmu_allocation_size_v1;
+ g->ops.pmu_ver.set_pmu_allocation_ptr =
+ set_pmu_allocation_ptr_v1;
+ g->ops.pmu_ver.pmu_allocation_set_dmem_size =
+ pmu_allocation_set_dmem_size_v1;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_size =
+ pmu_allocation_get_dmem_size_v1;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_offset =
+ pmu_allocation_get_dmem_offset_v1;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_offset_addr =
+ pmu_allocation_get_dmem_offset_addr_v1;
+ g->ops.pmu_ver.pmu_allocation_set_dmem_offset =
+ pmu_allocation_set_dmem_offset_v1;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params =
+ get_pmu_init_msg_pmu_queue_params_v1;
+ g->ops.pmu_ver.get_pmu_msg_pmu_init_msg_ptr =
+ get_pmu_msg_pmu_init_msg_ptr_v1;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_off =
+ get_pmu_init_msg_pmu_sw_mg_off_v1;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_size =
+ get_pmu_init_msg_pmu_sw_mg_size_v1;
+ g->ops.pmu_ver.get_pmu_perfmon_cmd_start_size =
+ get_pmu_perfmon_cmd_start_size_v1;
+ g->ops.pmu_ver.get_perfmon_cmd_start_offsetofvar =
+ get_perfmon_cmd_start_offsetofvar_v1;
+ g->ops.pmu_ver.perfmon_start_set_cmd_type =
+ perfmon_start_set_cmd_type_v1;
+ g->ops.pmu_ver.perfmon_start_set_group_id =
+ perfmon_start_set_group_id_v1;
+ g->ops.pmu_ver.perfmon_start_set_state_id =
+ perfmon_start_set_state_id_v1;
+ g->ops.pmu_ver.perfmon_start_set_flags =
+ perfmon_start_set_flags_v1;
+ g->ops.pmu_ver.perfmon_start_get_flags =
+ perfmon_start_get_flags_v1;
+ g->ops.pmu_ver.get_pmu_perfmon_cmd_init_size =
+ get_pmu_perfmon_cmd_init_size_v1;
+ g->ops.pmu_ver.get_perfmon_cmd_init_offsetofvar =
+ get_perfmon_cmd_init_offsetofvar_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_sample_buffer =
+ perfmon_cmd_init_set_sample_buffer_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_dec_cnt =
+ perfmon_cmd_init_set_dec_cnt_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_base_cnt_id =
+ perfmon_cmd_init_set_base_cnt_id_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_samp_period_us =
+ perfmon_cmd_init_set_samp_period_us_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_num_cnt =
+ perfmon_cmd_init_set_num_cnt_v1;
+ g->ops.pmu_ver.perfmon_cmd_init_set_mov_avg =
+ perfmon_cmd_init_set_mov_avg_v1;
+ g->ops.pmu_ver.get_pmu_seq_in_a_ptr =
+ get_pmu_sequence_in_alloc_ptr_v1;
+ g->ops.pmu_ver.get_pmu_seq_out_a_ptr =
+ get_pmu_sequence_out_alloc_ptr_v1;
+ break;
+ case APP_VERSION_0:
+ g->ops.pmu_ver.cmd_id_zbc_table_update = 14;
+ g->ops.pmu_ver.get_pmu_cmdline_args_size =
+ pmu_cmdline_size_v0;
+ g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq =
+ set_pmu_cmdline_args_cpufreq_v0;
+ g->ops.pmu_ver.get_pmu_cmdline_args_ptr =
+ get_pmu_cmdline_args_ptr_v0;
+ g->ops.pmu_ver.get_pmu_allocation_struct_size =
+ get_pmu_allocation_size_v0;
+ g->ops.pmu_ver.set_pmu_allocation_ptr =
+ set_pmu_allocation_ptr_v0;
+ g->ops.pmu_ver.pmu_allocation_set_dmem_size =
+ pmu_allocation_set_dmem_size_v0;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_size =
+ pmu_allocation_get_dmem_size_v0;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_offset =
+ pmu_allocation_get_dmem_offset_v0;
+ g->ops.pmu_ver.pmu_allocation_get_dmem_offset_addr =
+ pmu_allocation_get_dmem_offset_addr_v0;
+ g->ops.pmu_ver.pmu_allocation_set_dmem_offset =
+ pmu_allocation_set_dmem_offset_v0;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params =
+ get_pmu_init_msg_pmu_queue_params_v0;
+ g->ops.pmu_ver.get_pmu_msg_pmu_init_msg_ptr =
+ get_pmu_msg_pmu_init_msg_ptr_v0;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_off =
+ get_pmu_init_msg_pmu_sw_mg_off_v0;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_sw_mg_size =
+ get_pmu_init_msg_pmu_sw_mg_size_v0;
+ g->ops.pmu_ver.get_pmu_perfmon_cmd_start_size =
+ get_pmu_perfmon_cmd_start_size_v0;
+ g->ops.pmu_ver.get_perfmon_cmd_start_offsetofvar =
+ get_perfmon_cmd_start_offsetofvar_v0;
+ g->ops.pmu_ver.perfmon_start_set_cmd_type =
+ perfmon_start_set_cmd_type_v0;
+ g->ops.pmu_ver.perfmon_start_set_group_id =
+ perfmon_start_set_group_id_v0;
+ g->ops.pmu_ver.perfmon_start_set_state_id =
+ perfmon_start_set_state_id_v0;
+ g->ops.pmu_ver.perfmon_start_set_flags =
+ perfmon_start_set_flags_v0;
+ g->ops.pmu_ver.perfmon_start_get_flags =
+ perfmon_start_get_flags_v0;
+ g->ops.pmu_ver.get_pmu_perfmon_cmd_init_size =
+ get_pmu_perfmon_cmd_init_size_v0;
+ g->ops.pmu_ver.get_perfmon_cmd_init_offsetofvar =
+ get_perfmon_cmd_init_offsetofvar_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_sample_buffer =
+ perfmon_cmd_init_set_sample_buffer_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_dec_cnt =
+ perfmon_cmd_init_set_dec_cnt_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_base_cnt_id =
+ perfmon_cmd_init_set_base_cnt_id_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_samp_period_us =
+ perfmon_cmd_init_set_samp_period_us_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_num_cnt =
+ perfmon_cmd_init_set_num_cnt_v0;
+ g->ops.pmu_ver.perfmon_cmd_init_set_mov_avg =
+ perfmon_cmd_init_set_mov_avg_v0;
+ g->ops.pmu_ver.get_pmu_seq_in_a_ptr =
+ get_pmu_sequence_in_alloc_ptr_v0;
+ g->ops.pmu_ver.get_pmu_seq_out_a_ptr =
+ get_pmu_sequence_out_alloc_ptr_v0;
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(pmu->g),
+ "PMU code version not supported\n");
+		return -EINVAL;
+ }
+ return 0;
+}
+
+static void pmu_copy_from_dmem(struct pmu_gk20a *pmu,
+ u32 src, u8 *dst, u32 size, u8 port)
+{
+ struct gk20a *g = pmu->g;
+ u32 i, words, bytes;
+ u32 data, addr_mask;
+	u32 *dst_u32 = (u32 *)dst;
+
+ if (size == 0) {
+ gk20a_err(dev_from_gk20a(g),
+ "size is zero");
+ return;
+ }
+
+ if (src & 0x3) {
+ gk20a_err(dev_from_gk20a(g),
+ "src (0x%08x) not 4-byte aligned", src);
+ return;
+ }
+
+ mutex_lock(&pmu->pmu_copy_lock);
+
+ words = size >> 2;
+ bytes = size & 0x3;
+
+ addr_mask = pwr_falcon_dmemc_offs_m() |
+ pwr_falcon_dmemc_blk_m();
+
+ src &= addr_mask;
+
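+	/*
+	 * DMEM is accessed indirectly: program the start offset into the
+	 * DMEMC control register with the auto-increment-on-read bit set,
+	 * then stream 32-bit words out through the DMEMD data port.
+	 */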
+ gk20a_writel(g, pwr_falcon_dmemc_r(port),
+ src | pwr_falcon_dmemc_aincr_f(1));
+
+ for (i = 0; i < words; i++)
+ dst_u32[i] = gk20a_readl(g, pwr_falcon_dmemd_r(port));
+
+ if (bytes > 0) {
+ data = gk20a_readl(g, pwr_falcon_dmemd_r(port));
+ for (i = 0; i < bytes; i++) {
+ dst[(words << 2) + i] = ((u8 *)&data)[i];
+ gk20a_dbg_pmu("read: dst_u8[%d]=0x%08x",
+ i, dst[(words << 2) + i]);
+ }
+ }
+ mutex_unlock(&pmu->pmu_copy_lock);
+ return;
+}
+
+static void pmu_copy_to_dmem(struct pmu_gk20a *pmu,
+ u32 dst, u8 *src, u32 size, u8 port)
+{
+ struct gk20a *g = pmu->g;
+ u32 i, words, bytes;
+ u32 data, addr_mask;
+	u32 *src_u32 = (u32 *)src;
+
+ if (size == 0) {
+ gk20a_err(dev_from_gk20a(g),
+ "size is zero");
+ return;
+ }
+
+ if (dst & 0x3) {
+ gk20a_err(dev_from_gk20a(g),
+ "dst (0x%08x) not 4-byte aligned", dst);
+ return;
+ }
+
+ mutex_lock(&pmu->pmu_copy_lock);
+
+ words = size >> 2;
+ bytes = size & 0x3;
+
+ addr_mask = pwr_falcon_dmemc_offs_m() |
+ pwr_falcon_dmemc_blk_m();
+
+ dst &= addr_mask;
+
+ gk20a_writel(g, pwr_falcon_dmemc_r(port),
+ dst | pwr_falcon_dmemc_aincw_f(1));
+
+ for (i = 0; i < words; i++)
+ gk20a_writel(g, pwr_falcon_dmemd_r(port), src_u32[i]);
+
+ if (bytes > 0) {
+ data = 0;
+ for (i = 0; i < bytes; i++)
+ ((u8 *)&data)[i] = src[(words << 2) + i];
+ gk20a_writel(g, pwr_falcon_dmemd_r(port), data);
+ }
+
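+	/*
+	 * DMEMC auto-increments on every DMEMD write, so reading it back
+	 * gives the final offset; comparing against dst + size catches a
+	 * short or failed copy.
+	 */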
+ data = gk20a_readl(g, pwr_falcon_dmemc_r(port)) & addr_mask;
+ size = ALIGN(size, 4);
+ if (data != dst + size) {
+ gk20a_err(dev_from_gk20a(g),
+ "copy failed. bytes written %d, expected %d",
+ data - dst, size);
+ }
+ mutex_unlock(&pmu->pmu_copy_lock);
+ return;
+}
+
+static int pmu_idle(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(2000);
+ u32 idle_stat;
+
+ /* wait for pmu idle */
+ do {
+ idle_stat = gk20a_readl(g, pwr_falcon_idlestate_r());
+
+ if (pwr_falcon_idlestate_falcon_busy_v(idle_stat) == 0 &&
+ pwr_falcon_idlestate_ext_busy_v(idle_stat) == 0) {
+ break;
+ }
+
+ if (time_after_eq(jiffies, end_jiffies)) {
+ gk20a_err(dev_from_gk20a(g),
+ "timeout waiting pmu idle : 0x%08x",
+ idle_stat);
+ return -EBUSY;
+ }
+ usleep_range(100, 200);
+ } while (1);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+static void pmu_enable_irq(struct pmu_gk20a *pmu, bool enable)
+{
+ struct gk20a *g = pmu->g;
+
+ gk20a_dbg_fn("");
+
+ gk20a_writel(g, mc_intr_mask_0_r(),
+ gk20a_readl(g, mc_intr_mask_0_r()) &
+ ~mc_intr_mask_0_pmu_enabled_f());
+ gk20a_writel(g, mc_intr_mask_1_r(),
+ gk20a_readl(g, mc_intr_mask_1_r()) &
+ ~mc_intr_mask_1_pmu_enabled_f());
+
+ gk20a_writel(g, pwr_falcon_irqmclr_r(),
+ pwr_falcon_irqmclr_gptmr_f(1) |
+ pwr_falcon_irqmclr_wdtmr_f(1) |
+ pwr_falcon_irqmclr_mthd_f(1) |
+ pwr_falcon_irqmclr_ctxsw_f(1) |
+ pwr_falcon_irqmclr_halt_f(1) |
+ pwr_falcon_irqmclr_exterr_f(1) |
+ pwr_falcon_irqmclr_swgen0_f(1) |
+ pwr_falcon_irqmclr_swgen1_f(1) |
+ pwr_falcon_irqmclr_ext_f(0xff));
+
+ if (enable) {
+ /* dest 0=falcon, 1=host; level 0=irq0, 1=irq1 */
+ gk20a_writel(g, pwr_falcon_irqdest_r(),
+ pwr_falcon_irqdest_host_gptmr_f(0) |
+ pwr_falcon_irqdest_host_wdtmr_f(1) |
+ pwr_falcon_irqdest_host_mthd_f(0) |
+ pwr_falcon_irqdest_host_ctxsw_f(0) |
+ pwr_falcon_irqdest_host_halt_f(1) |
+ pwr_falcon_irqdest_host_exterr_f(0) |
+ pwr_falcon_irqdest_host_swgen0_f(1) |
+ pwr_falcon_irqdest_host_swgen1_f(0) |
+ pwr_falcon_irqdest_host_ext_f(0xff) |
+ pwr_falcon_irqdest_target_gptmr_f(1) |
+ pwr_falcon_irqdest_target_wdtmr_f(0) |
+ pwr_falcon_irqdest_target_mthd_f(0) |
+ pwr_falcon_irqdest_target_ctxsw_f(0) |
+ pwr_falcon_irqdest_target_halt_f(0) |
+ pwr_falcon_irqdest_target_exterr_f(0) |
+ pwr_falcon_irqdest_target_swgen0_f(0) |
+ pwr_falcon_irqdest_target_swgen1_f(0) |
+ pwr_falcon_irqdest_target_ext_f(0xff));
+
+ /* 0=disable, 1=enable */
+ gk20a_writel(g, pwr_falcon_irqmset_r(),
+ pwr_falcon_irqmset_gptmr_f(1) |
+ pwr_falcon_irqmset_wdtmr_f(1) |
+ pwr_falcon_irqmset_mthd_f(0) |
+ pwr_falcon_irqmset_ctxsw_f(0) |
+ pwr_falcon_irqmset_halt_f(1) |
+ pwr_falcon_irqmset_exterr_f(1) |
+ pwr_falcon_irqmset_swgen0_f(1) |
+ pwr_falcon_irqmset_swgen1_f(1));
+
+ gk20a_writel(g, mc_intr_mask_0_r(),
+ gk20a_readl(g, mc_intr_mask_0_r()) |
+ mc_intr_mask_0_pmu_enabled_f());
+ }
+
+ gk20a_dbg_fn("done");
+}
+
+static int pmu_enable_hw(struct pmu_gk20a *pmu, bool enable)
+{
+ struct gk20a *g = pmu->g;
+
+ gk20a_dbg_fn("");
+
+ if (enable) {
+ int retries = GR_IDLE_CHECK_MAX / GR_IDLE_CHECK_DEFAULT;
+ gk20a_enable(g, mc_enable_pwr_enabled_f());
+
+ do {
+ u32 w = gk20a_readl(g, pwr_falcon_dmactl_r()) &
+ (pwr_falcon_dmactl_dmem_scrubbing_m() |
+ pwr_falcon_dmactl_imem_scrubbing_m());
+
+ if (!w) {
+ gk20a_dbg_fn("done");
+ return 0;
+ }
+ udelay(GR_IDLE_CHECK_DEFAULT);
+ } while (--retries || !tegra_platform_is_silicon());
+
+ gk20a_disable(g, mc_enable_pwr_enabled_f());
+ gk20a_err(dev_from_gk20a(g), "Falcon mem scrubbing timeout");
+
+ return -ETIMEDOUT;
+ } else {
+ gk20a_disable(g, mc_enable_pwr_enabled_f());
+ return 0;
+ }
+}
+
+static int pmu_enable(struct pmu_gk20a *pmu, bool enable)
+{
+ struct gk20a *g = pmu->g;
+ u32 pmc_enable;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ if (!enable) {
+ pmc_enable = gk20a_readl(g, mc_enable_r());
+ if (mc_enable_pwr_v(pmc_enable) !=
+ mc_enable_pwr_disabled_v()) {
+
+ pmu_enable_irq(pmu, false);
+ pmu_enable_hw(pmu, false);
+ }
+ } else {
+ err = pmu_enable_hw(pmu, true);
+ if (err)
+ return err;
+
+ /* TBD: post reset */
+
+ err = pmu_idle(pmu);
+ if (err)
+ return err;
+
+ pmu_enable_irq(pmu, true);
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+static int pmu_reset(struct pmu_gk20a *pmu)
+{
+ int err;
+
+ err = pmu_idle(pmu);
+ if (err)
+ return err;
+
+ /* TBD: release pmu hw mutex */
+
+ err = pmu_enable(pmu, false);
+ if (err)
+ return err;
+
+ /* TBD: cancel all sequences */
+ /* TBD: init all sequences and state tables */
+ /* TBD: restore pre-init message handler */
+
+ err = pmu_enable(pmu, true);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int pmu_bootstrap(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct gk20a_platform *platform = platform_get_drvdata(g->dev);
+ struct mm_gk20a *mm = &g->mm;
+ struct pmu_ucode_desc *desc = pmu->desc;
+ u64 addr_code, addr_data, addr_load;
+ u32 i, blocks, addr_args;
+
+ gk20a_dbg_fn("");
+
+ gk20a_writel(g, pwr_falcon_itfen_r(),
+ gk20a_readl(g, pwr_falcon_itfen_r()) |
+ pwr_falcon_itfen_ctxen_enable_f());
+ gk20a_writel(g, pwr_pmu_new_instblk_r(),
+ pwr_pmu_new_instblk_ptr_f(
+ mm->pmu.inst_block.cpu_pa >> 12) |
+ pwr_pmu_new_instblk_valid_f(1) |
+ pwr_pmu_new_instblk_target_sys_coh_f());
+
+ /* TBD: load all other surfaces */
+
+ g->ops.pmu_ver.set_pmu_cmdline_args_cpu_freq(pmu,
+ clk_get_rate(platform->clk[1]));
+
+ addr_args = (pwr_falcon_hwcfg_dmem_size_v(
+ gk20a_readl(g, pwr_falcon_hwcfg_r()))
+ << GK20A_PMU_DMEM_BLKSIZE2) -
+ g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu);
+
+ pmu_copy_to_dmem(pmu, addr_args,
+ (u8 *)(g->ops.pmu_ver.get_pmu_cmdline_args_ptr(pmu)),
+ g->ops.pmu_ver.get_pmu_cmdline_args_size(pmu), 0);
+
+ gk20a_writel(g, pwr_falcon_dmemc_r(0),
+ pwr_falcon_dmemc_offs_f(0) |
+ pwr_falcon_dmemc_blk_f(0) |
+ pwr_falcon_dmemc_aincw_f(1));
+
+ addr_code = u64_lo32((pmu->ucode.pmu_va +
+ desc->app_start_offset +
+			desc->app_resident_code_offset) >> 8);
+ addr_data = u64_lo32((pmu->ucode.pmu_va +
+ desc->app_start_offset +
+ desc->app_resident_data_offset) >> 8);
+ addr_load = u64_lo32((pmu->ucode.pmu_va +
+ desc->bootloader_start_offset) >> 8);
+
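+	/*
+	 * Push the bootstrap argument block the bootloader consumes through
+	 * the auto-incrementing DMEM port: DMA index, code and data
+	 * addresses and sizes, the IMEM entry point and finally the DMEM
+	 * offset of the command line arguments written above.
+	 */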
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), GK20A_PMU_DMAIDX_UCODE);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_size);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_code_size);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_imem_entry);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_data);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), desc->app_resident_data_size);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_code);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), 0x1);
+ gk20a_writel(g, pwr_falcon_dmemd_r(0), addr_args);
+
+ gk20a_writel(g, pwr_falcon_dmatrfbase_r(),
+ addr_load - (desc->bootloader_imem_offset >> 8));
+
+ blocks = ((desc->bootloader_size + 0xFF) & ~0xFF) >> 8;
+
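+	/*
+	 * Copy the bootloader into IMEM with the falcon DMA engine, one
+	 * 256-byte block per transfer (both offsets advance by 1 << 8 each
+	 * iteration).
+	 */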
+ for (i = 0; i < blocks; i++) {
+ gk20a_writel(g, pwr_falcon_dmatrfmoffs_r(),
+ desc->bootloader_imem_offset + (i << 8));
+ gk20a_writel(g, pwr_falcon_dmatrffboffs_r(),
+ desc->bootloader_imem_offset + (i << 8));
+ gk20a_writel(g, pwr_falcon_dmatrfcmd_r(),
+ pwr_falcon_dmatrfcmd_imem_f(1) |
+ pwr_falcon_dmatrfcmd_write_f(0) |
+ pwr_falcon_dmatrfcmd_size_f(6) |
+ pwr_falcon_dmatrfcmd_ctxdma_f(GK20A_PMU_DMAIDX_UCODE));
+ }
+
+ gk20a_writel(g, pwr_falcon_bootvec_r(),
+ pwr_falcon_bootvec_vec_f(desc->bootloader_entry_point));
+
+ gk20a_writel(g, pwr_falcon_cpuctl_r(),
+ pwr_falcon_cpuctl_startcpu_f(1));
+
+ gk20a_writel(g, pwr_falcon_os_r(), desc->app_version);
+
+ return 0;
+}
+
+static void pmu_seq_init(struct pmu_gk20a *pmu)
+{
+ u32 i;
+
+ memset(pmu->seq, 0,
+ sizeof(struct pmu_sequence) * PMU_MAX_NUM_SEQUENCES);
+ memset(pmu->pmu_seq_tbl, 0,
+ sizeof(pmu->pmu_seq_tbl));
+
+ for (i = 0; i < PMU_MAX_NUM_SEQUENCES; i++)
+ pmu->seq[i].id = i;
+}
+
+static int pmu_seq_acquire(struct pmu_gk20a *pmu,
+ struct pmu_sequence **pseq)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_sequence *seq;
+ u32 index;
+
+ mutex_lock(&pmu->pmu_seq_lock);
+ index = find_first_zero_bit(pmu->pmu_seq_tbl,
+ sizeof(pmu->pmu_seq_tbl));
+ if (index >= sizeof(pmu->pmu_seq_tbl)) {
+ gk20a_err(dev_from_gk20a(g),
+ "no free sequence available");
+ mutex_unlock(&pmu->pmu_seq_lock);
+ return -EAGAIN;
+ }
+ set_bit(index, pmu->pmu_seq_tbl);
+ mutex_unlock(&pmu->pmu_seq_lock);
+
+ seq = &pmu->seq[index];
+ seq->state = PMU_SEQ_STATE_PENDING;
+
+ *pseq = seq;
+ return 0;
+}
+
+static void pmu_seq_release(struct pmu_gk20a *pmu,
+ struct pmu_sequence *seq)
+{
+ struct gk20a *g = pmu->g;
+ seq->state = PMU_SEQ_STATE_FREE;
+ seq->desc = PMU_INVALID_SEQ_DESC;
+ seq->callback = NULL;
+ seq->cb_params = NULL;
+ seq->msg = NULL;
+ seq->out_payload = NULL;
+ g->ops.pmu_ver.pmu_allocation_set_dmem_size(pmu,
+ g->ops.pmu_ver.get_pmu_seq_in_a_ptr(seq), 0);
+ g->ops.pmu_ver.pmu_allocation_set_dmem_size(pmu,
+ g->ops.pmu_ver.get_pmu_seq_out_a_ptr(seq), 0);
+
+ clear_bit(seq->id, pmu->pmu_seq_tbl);
+}
+
+static int pmu_queue_init(struct pmu_gk20a *pmu,
+ u32 id, union pmu_init_msg_pmu *init)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_queue *queue = &pmu->queue[id];
+ queue->id = id;
+ g->ops.pmu_ver.get_pmu_init_msg_pmu_queue_params(queue, id, init);
+
+ queue->mutex_id = id;
+ mutex_init(&queue->mutex);
+
+ gk20a_dbg_pmu("queue %d: index %d, offset 0x%08x, size 0x%08x",
+ id, queue->index, queue->offset, queue->size);
+
+ return 0;
+}
+
+static int pmu_queue_head(struct pmu_gk20a *pmu, struct pmu_queue *queue,
+ u32 *head, bool set)
+{
+ struct gk20a *g = pmu->g;
+
+ BUG_ON(!head);
+
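+	/* Command queues have per-queue head registers indexed by
+	 * queue->index; the message queue uses the single msgq head. */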
+ if (PMU_IS_COMMAND_QUEUE(queue->id)) {
+
+ if (queue->index >= pwr_pmu_queue_head__size_1_v())
+ return -EINVAL;
+
+ if (!set)
+ *head = pwr_pmu_queue_head_address_v(
+ gk20a_readl(g,
+ pwr_pmu_queue_head_r(queue->index)));
+ else
+ gk20a_writel(g,
+ pwr_pmu_queue_head_r(queue->index),
+ pwr_pmu_queue_head_address_f(*head));
+ } else {
+ if (!set)
+ *head = pwr_pmu_msgq_head_val_v(
+ gk20a_readl(g, pwr_pmu_msgq_head_r()));
+ else
+ gk20a_writel(g,
+ pwr_pmu_msgq_head_r(),
+ pwr_pmu_msgq_head_val_f(*head));
+ }
+
+ return 0;
+}
+
+static int pmu_queue_tail(struct pmu_gk20a *pmu, struct pmu_queue *queue,
+ u32 *tail, bool set)
+{
+ struct gk20a *g = pmu->g;
+
+ BUG_ON(!tail);
+
+ if (PMU_IS_COMMAND_QUEUE(queue->id)) {
+
+ if (queue->index >= pwr_pmu_queue_tail__size_1_v())
+ return -EINVAL;
+
+ if (!set)
+ *tail = pwr_pmu_queue_tail_address_v(
+ gk20a_readl(g,
+ pwr_pmu_queue_tail_r(queue->index)));
+ else
+ gk20a_writel(g,
+ pwr_pmu_queue_tail_r(queue->index),
+ pwr_pmu_queue_tail_address_f(*tail));
+ } else {
+ if (!set)
+ *tail = pwr_pmu_msgq_tail_val_v(
+ gk20a_readl(g, pwr_pmu_msgq_tail_r()));
+ else
+ gk20a_writel(g,
+ pwr_pmu_msgq_tail_r(),
+ pwr_pmu_msgq_tail_val_f(*tail));
+ }
+
+ return 0;
+}
+
+static inline void pmu_queue_read(struct pmu_gk20a *pmu,
+ u32 offset, u8 *dst, u32 size)
+{
+ pmu_copy_from_dmem(pmu, offset, dst, size, 0);
+}
+
+static inline void pmu_queue_write(struct pmu_gk20a *pmu,
+ u32 offset, u8 *src, u32 size)
+{
+ pmu_copy_to_dmem(pmu, offset, src, size, 0);
+}
+
+int pmu_mutex_acquire(struct pmu_gk20a *pmu, u32 id, u32 *token)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_mutex *mutex;
+ u32 data, owner, max_retry;
+
+ if (!pmu->initialized)
+ return 0;
+
+ BUG_ON(!token);
+ BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
+ BUG_ON(id > pmu->mutex_cnt);
+
+ mutex = &pmu->mutex[id];
+
+ owner = pwr_pmu_mutex_value_v(
+ gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
+
+ if (*token != PMU_INVALID_MUTEX_OWNER_ID && *token == owner) {
+ BUG_ON(mutex->ref_cnt == 0);
+ gk20a_dbg_pmu("already acquired by owner : 0x%08x", *token);
+ mutex->ref_cnt++;
+ return 0;
+ }
+
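+	/*
+	 * HW mutex acquire protocol: fetch a token from the mutex id
+	 * register, write it into the mutex register and read it back.
+	 * Ownership is gained only if the read-back matches the token;
+	 * otherwise the token is returned via the release register and the
+	 * sequence is retried.
+	 */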
+ max_retry = 40;
+ do {
+ data = pwr_pmu_mutex_id_value_v(
+ gk20a_readl(g, pwr_pmu_mutex_id_r()));
+ if (data == pwr_pmu_mutex_id_value_init_v() ||
+ data == pwr_pmu_mutex_id_value_not_avail_v()) {
+ gk20a_warn(dev_from_gk20a(g),
+ "fail to generate mutex token: val 0x%08x",
+ owner);
+ usleep_range(20, 40);
+ continue;
+ }
+
+ owner = data;
+ gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
+ pwr_pmu_mutex_value_f(owner));
+
+ data = pwr_pmu_mutex_value_v(
+ gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
+
+ if (owner == data) {
+			mutex->ref_cnt = 1;
+			*token = owner;
+			gk20a_dbg_pmu("mutex acquired: id=%d, token=0x%x",
+				mutex->index, *token);
+			return 0;
+ } else {
+ gk20a_dbg_info("fail to acquire mutex idx=0x%08x",
+ mutex->index);
+
+ data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
+ data = set_field(data,
+ pwr_pmu_mutex_id_release_value_m(),
+ pwr_pmu_mutex_id_release_value_f(owner));
+ gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
+
+ usleep_range(20, 40);
+ continue;
+ }
+ } while (max_retry-- > 0);
+
+ return -EBUSY;
+}
+
+int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_mutex *mutex;
+ u32 owner, data;
+
+ if (!pmu->initialized)
+ return 0;
+
+ BUG_ON(!token);
+ BUG_ON(!PMU_MUTEX_ID_IS_VALID(id));
+ BUG_ON(id > pmu->mutex_cnt);
+
+ mutex = &pmu->mutex[id];
+
+ owner = pwr_pmu_mutex_value_v(
+ gk20a_readl(g, pwr_pmu_mutex_r(mutex->index)));
+
+ if (*token != owner) {
+ gk20a_err(dev_from_gk20a(g),
+			"requester 0x%08x does NOT match owner 0x%08x",
+ *token, owner);
+ return -EINVAL;
+ }
+
+ if (--mutex->ref_cnt == 0) {
+ gk20a_writel(g, pwr_pmu_mutex_r(mutex->index),
+ pwr_pmu_mutex_value_initial_lock_f());
+
+ data = gk20a_readl(g, pwr_pmu_mutex_id_release_r());
+ data = set_field(data, pwr_pmu_mutex_id_release_value_m(),
+ pwr_pmu_mutex_id_release_value_f(owner));
+ gk20a_writel(g, pwr_pmu_mutex_id_release_r(), data);
+
+ gk20a_dbg_pmu("mutex released: id=%d, token=0x%x",
+ mutex->index, *token);
+ }
+
+ return 0;
+}
+
+static int pmu_queue_lock(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue)
+{
+ int err;
+
+ if (PMU_IS_MESSAGE_QUEUE(queue->id))
+ return 0;
+
+ if (PMU_IS_SW_COMMAND_QUEUE(queue->id)) {
+ mutex_lock(&queue->mutex);
+ queue->locked = true;
+ return 0;
+ }
+
+ err = pmu_mutex_acquire(pmu, queue->mutex_id,
+ &queue->mutex_lock);
+ if (err == 0)
+ queue->locked = true;
+
+ return err;
+}
+
+static int pmu_queue_unlock(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue)
+{
+ int err;
+
+ if (PMU_IS_MESSAGE_QUEUE(queue->id))
+ return 0;
+
+ if (PMU_IS_SW_COMMAND_QUEUE(queue->id)) {
+ mutex_unlock(&queue->mutex);
+ queue->locked = false;
+ return 0;
+ }
+
+ if (queue->locked) {
+ err = pmu_mutex_release(pmu, queue->mutex_id,
+ &queue->mutex_lock);
+ if (err == 0)
+ queue->locked = false;
+ }
+
+ return 0;
+}
+
+/* called by pmu_read_message, no lock */
+static bool pmu_queue_is_empty(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue)
+{
+ u32 head, tail;
+
+ pmu_queue_head(pmu, queue, &head, QUEUE_GET);
+ if (queue->opened && queue->oflag == OFLAG_READ)
+ tail = queue->position;
+ else
+ pmu_queue_tail(pmu, queue, &tail, QUEUE_GET);
+
+ return head == tail;
+}
+
+static bool pmu_queue_has_room(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue, u32 size, bool *need_rewind)
+{
+ u32 head, tail, free;
+ bool rewind = false;
+
+ BUG_ON(!queue->locked);
+
+ size = ALIGN(size, QUEUE_ALIGNMENT);
+
+ pmu_queue_head(pmu, queue, &head, QUEUE_GET);
+ pmu_queue_tail(pmu, queue, &tail, QUEUE_GET);
+
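+	/*
+	 * Circular queue accounting: with head at or past tail, the free
+	 * region runs from head to the end of the queue, reserving one
+	 * command header for the rewind marker; if that is not enough the
+	 * write is rewound to the queue start, where the free region is the
+	 * gap between head and tail.
+	 */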
+ if (head >= tail) {
+ free = queue->offset + queue->size - head;
+ free -= PMU_CMD_HDR_SIZE;
+
+ if (size > free) {
+ rewind = true;
+ head = queue->offset;
+ }
+ }
+
+ if (head < tail)
+ free = tail - head - 1;
+
+ if (need_rewind)
+ *need_rewind = rewind;
+
+ return size <= free;
+}
+
+static int pmu_queue_push(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue, void *data, u32 size)
+{
+ gk20a_dbg_fn("");
+
+	if (!queue->opened || queue->oflag != OFLAG_WRITE) {
+ gk20a_err(dev_from_gk20a(pmu->g),
+ "queue not opened for write");
+ return -EINVAL;
+ }
+
+ pmu_queue_write(pmu, queue->position, data, size);
+ queue->position += ALIGN(size, QUEUE_ALIGNMENT);
+ return 0;
+}
+
+static int pmu_queue_pop(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue, void *data, u32 size,
+ u32 *bytes_read)
+{
+ u32 head, tail, used;
+
+ *bytes_read = 0;
+
+	if (!queue->opened || queue->oflag != OFLAG_READ) {
+ gk20a_err(dev_from_gk20a(pmu->g),
+ "queue not opened for read");
+ return -EINVAL;
+ }
+
+ pmu_queue_head(pmu, queue, &head, QUEUE_GET);
+ tail = queue->position;
+
+ if (head == tail)
+ return 0;
+
+ if (head > tail)
+ used = head - tail;
+ else
+ used = queue->offset + queue->size - tail;
+
+ if (size > used) {
+ gk20a_warn(dev_from_gk20a(pmu->g),
+			"requested read size larger than available data");
+ size = used;
+ }
+
+ pmu_queue_read(pmu, tail, data, size);
+ queue->position += ALIGN(size, QUEUE_ALIGNMENT);
+ *bytes_read = size;
+ return 0;
+}
+
+static void pmu_queue_rewind(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue)
+{
+ struct pmu_cmd cmd;
+
+ gk20a_dbg_fn("");
+
+ if (!queue->opened) {
+ gk20a_err(dev_from_gk20a(pmu->g),
+ "queue not opened");
+ return;
+ }
+
+ if (queue->oflag == OFLAG_WRITE) {
+ cmd.hdr.unit_id = PMU_UNIT_REWIND;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE;
+ pmu_queue_push(pmu, queue, &cmd, cmd.hdr.size);
+		gk20a_dbg_pmu("queue %d rewound", queue->id);
+ }
+
+ queue->position = queue->offset;
+ return;
+}
+
+/* open for read and lock the queue */
+static int pmu_queue_open_read(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue)
+{
+ int err;
+
+ err = pmu_queue_lock(pmu, queue);
+ if (err)
+ return err;
+
+ if (queue->opened)
+ BUG();
+
+ pmu_queue_tail(pmu, queue, &queue->position, QUEUE_GET);
+ queue->oflag = OFLAG_READ;
+ queue->opened = true;
+
+ return 0;
+}
+
+/* open for write and lock the queue;
+ * make sure there's enough free space for the write */
+static int pmu_queue_open_write(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue, u32 size)
+{
+ bool rewind = false;
+ int err;
+
+ err = pmu_queue_lock(pmu, queue);
+ if (err)
+ return err;
+
+ if (queue->opened)
+ BUG();
+
+	if (!pmu_queue_has_room(pmu, queue, size, &rewind)) {
+		gk20a_err(dev_from_gk20a(pmu->g), "queue full");
+		pmu_queue_unlock(pmu, queue);
+		return -EAGAIN;
+	}
+
+ pmu_queue_head(pmu, queue, &queue->position, QUEUE_GET);
+ queue->oflag = OFLAG_WRITE;
+ queue->opened = true;
+
+ if (rewind)
+ pmu_queue_rewind(pmu, queue);
+
+ return 0;
+}
+
+/* close and unlock the queue */
+static int pmu_queue_close(struct pmu_gk20a *pmu,
+ struct pmu_queue *queue, bool commit)
+{
+ if (!queue->opened)
+ return 0;
+
+ if (commit) {
+ if (queue->oflag == OFLAG_READ) {
+ pmu_queue_tail(pmu, queue,
+ &queue->position, QUEUE_SET);
+ }
+ else {
+		} else {
+ }
+ }
+
+ queue->opened = false;
+
+ pmu_queue_unlock(pmu, queue);
+
+ return 0;
+}
+
+static void gk20a_save_pmu_sw_state(struct pmu_gk20a *pmu,
+ struct gk20a_pmu_save_state *save)
+{
+ save->seq = pmu->seq;
+ save->next_seq_desc = pmu->next_seq_desc;
+ save->mutex = pmu->mutex;
+ save->mutex_cnt = pmu->mutex_cnt;
+ save->desc = pmu->desc;
+ save->ucode = pmu->ucode;
+ save->elpg_enable = pmu->elpg_enable;
+ save->pg_wq = pmu->pg_wq;
+ save->seq_buf = pmu->seq_buf;
+ save->pg_buf = pmu->pg_buf;
+ save->sw_ready = pmu->sw_ready;
+ save->pg_init = pmu->pg_init;
+}
+
+static void gk20a_restore_pmu_sw_state(struct pmu_gk20a *pmu,
+ struct gk20a_pmu_save_state *save)
+{
+ pmu->seq = save->seq;
+ pmu->next_seq_desc = save->next_seq_desc;
+ pmu->mutex = save->mutex;
+ pmu->mutex_cnt = save->mutex_cnt;
+ pmu->desc = save->desc;
+ pmu->ucode = save->ucode;
+ pmu->elpg_enable = save->elpg_enable;
+ pmu->pg_wq = save->pg_wq;
+ pmu->seq_buf = save->seq_buf;
+ pmu->pg_buf = save->pg_buf;
+ pmu->sw_ready = save->sw_ready;
+ pmu->pg_init = save->pg_init;
+}
+
+void gk20a_remove_pmu_support(struct pmu_gk20a *pmu)
+{
+ struct gk20a_pmu_save_state save;
+
+ gk20a_dbg_fn("");
+
+ gk20a_allocator_destroy(&pmu->dmem);
+
+ /* Save the stuff you don't want to lose */
+ gk20a_save_pmu_sw_state(pmu, &save);
+
+	/* this function is also called by pmu_destroy outside the gk20a
+	 * deinit path that releases the gk20a struct, so zero the whole
+	 * structure here. */
+ memset(pmu, 0, sizeof(struct pmu_gk20a));
+
+ /* Restore stuff you want to keep */
+ gk20a_restore_pmu_sw_state(pmu, &save);
+}
+
+int gk20a_init_pmu_reset_enable_hw(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+
+ gk20a_dbg_fn("");
+
+ pmu_enable_hw(pmu, true);
+
+ return 0;
+}
+
+static void pmu_elpg_enable_allow(struct work_struct *work);
+
+int gk20a_init_pmu_setup_sw(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct mm_gk20a *mm = &g->mm;
+ struct vm_gk20a *vm = &mm->pmu.vm;
+ struct device *d = dev_from_gk20a(g);
+ int i, err = 0;
+ u8 *ptr;
+ void *ucode_ptr;
+ struct sg_table *sgt_pmu_ucode;
+ struct sg_table *sgt_seq_buf;
+ DEFINE_DMA_ATTRS(attrs);
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ if (pmu->sw_ready) {
+ for (i = 0; i < pmu->mutex_cnt; i++) {
+ pmu->mutex[i].id = i;
+ pmu->mutex[i].index = i;
+ }
+ pmu_seq_init(pmu);
+
+ gk20a_dbg_fn("skip init");
+ goto skip_init;
+ }
+
+ /* no infoRom script from vbios? */
+
+ /* TBD: sysmon subtask */
+
+ pmu->mutex_cnt = pwr_pmu_mutex__size_1_v();
+ pmu->mutex = kzalloc(pmu->mutex_cnt *
+ sizeof(struct pmu_mutex), GFP_KERNEL);
+ if (!pmu->mutex) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < pmu->mutex_cnt; i++) {
+ pmu->mutex[i].id = i;
+ pmu->mutex[i].index = i;
+ }
+
+ pmu->seq = kzalloc(PMU_MAX_NUM_SEQUENCES *
+ sizeof(struct pmu_sequence), GFP_KERNEL);
+ if (!pmu->seq) {
+ err = -ENOMEM;
+ goto err_free_mutex;
+ }
+
+ pmu_seq_init(pmu);
+
+ if (!g->pmu_fw) {
+ g->pmu_fw = gk20a_request_firmware(g, GK20A_PMU_UCODE_IMAGE);
+ if (!g->pmu_fw) {
+ gk20a_err(d, "failed to load pmu ucode!!");
+ err = -ENOENT;
+ goto err_free_seq;
+ }
+ }
+
+ gk20a_dbg_fn("firmware loaded");
+
+ pmu->desc = (struct pmu_ucode_desc *)g->pmu_fw->data;
+ pmu->ucode_image = (u32 *)((u8 *)pmu->desc +
+ pmu->desc->descriptor_size);
+
+
+ INIT_DELAYED_WORK(&pmu->elpg_enable, pmu_elpg_enable_allow);
+ INIT_WORK(&pmu->pg_init, gk20a_init_pmu_setup_hw2_workqueue);
+
+ gk20a_init_pmu_vm(mm);
+
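+	/*
+	 * Allocate the ucode image and ZBC sequence buffers in sysmem and
+	 * map them into the PMU's virtual address space; the ucode mapping
+	 * is read-only for the PMU, the sequence buffer is read/write.
+	 */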
+ dma_set_attr(DMA_ATTR_READ_ONLY, &attrs);
+ pmu->ucode.cpuva = dma_alloc_attrs(d, GK20A_PMU_UCODE_SIZE_MAX,
+ &iova,
+ GFP_KERNEL,
+ &attrs);
+ if (!pmu->ucode.cpuva) {
+ gk20a_err(d, "failed to allocate memory\n");
+ err = -ENOMEM;
+ goto err_release_fw;
+ }
+
+ pmu->ucode.iova = iova;
+ pmu->seq_buf.cpuva = dma_alloc_coherent(d, GK20A_PMU_SEQ_BUF_SIZE,
+ &iova,
+ GFP_KERNEL);
+ if (!pmu->seq_buf.cpuva) {
+ gk20a_err(d, "failed to allocate memory\n");
+ err = -ENOMEM;
+ goto err_free_pmu_ucode;
+ }
+
+ pmu->seq_buf.iova = iova;
+ init_waitqueue_head(&pmu->pg_wq);
+
+ err = gk20a_get_sgtable(d, &sgt_pmu_ucode,
+ pmu->ucode.cpuva,
+ pmu->ucode.iova,
+ GK20A_PMU_UCODE_SIZE_MAX);
+ if (err) {
+ gk20a_err(d, "failed to allocate sg table\n");
+ goto err_free_seq_buf;
+ }
+
+ pmu->ucode.pmu_va = gk20a_gmmu_map(vm, &sgt_pmu_ucode,
+ GK20A_PMU_UCODE_SIZE_MAX,
+ 0, /* flags */
+ gk20a_mem_flag_read_only);
+	if (!pmu->ucode.pmu_va) {
+		gk20a_err(d, "failed to map pmu ucode memory!!");
+		err = -ENOMEM;
+		goto err_free_ucode_sgt;
+	}
+
+ err = gk20a_get_sgtable(d, &sgt_seq_buf,
+ pmu->seq_buf.cpuva,
+ pmu->seq_buf.iova,
+ GK20A_PMU_SEQ_BUF_SIZE);
+ if (err) {
+ gk20a_err(d, "failed to allocate sg table\n");
+ goto err_unmap_ucode;
+ }
+
+ pmu->seq_buf.pmu_va = gk20a_gmmu_map(vm, &sgt_seq_buf,
+ GK20A_PMU_SEQ_BUF_SIZE,
+ 0, /* flags */
+ gk20a_mem_flag_none);
+	if (!pmu->seq_buf.pmu_va) {
+		gk20a_err(d, "failed to map pmu seq buffer memory!!");
+		err = -ENOMEM;
+		goto err_free_seq_buf_sgt;
+	}
+
+ ptr = (u8 *)pmu->seq_buf.cpuva;
+	if (!ptr) {
+		gk20a_err(d, "failed to map cpu ptr for zbc buffer");
+		err = -ENOMEM;
+		goto err_unmap_seq_buf;
+	}
+
+ /* TBD: remove this if ZBC save/restore is handled by PMU
+	 * send an empty ZBC sequence for now */
+ ptr[0] = 0x16; /* opcode EXIT */
+ ptr[1] = 0; ptr[2] = 1; ptr[3] = 0;
+ ptr[4] = 0; ptr[5] = 0; ptr[6] = 0; ptr[7] = 0;
+
+ pmu->seq_buf.size = GK20A_PMU_SEQ_BUF_SIZE;
+
+ ucode_ptr = pmu->ucode.cpuva;
+
+ for (i = 0; i < (pmu->desc->app_start_offset +
+ pmu->desc->app_size) >> 2; i++)
+ gk20a_mem_wr32(ucode_ptr, i, pmu->ucode_image[i]);
+
+ gk20a_free_sgtable(&sgt_pmu_ucode);
+ gk20a_free_sgtable(&sgt_seq_buf);
+
+skip_init:
+ mutex_init(&pmu->elpg_mutex);
+ mutex_init(&pmu->isr_mutex);
+ mutex_init(&pmu->pmu_copy_lock);
+ mutex_init(&pmu->pmu_seq_lock);
+
+ pmu->perfmon_counter.index = 3; /* GR & CE2 */
+ pmu->perfmon_counter.group_id = PMU_DOMAIN_GROUP_PSTATE;
+
+ pmu->remove_support = gk20a_remove_pmu_support;
+ err = gk20a_init_pmu(pmu);
+ if (err) {
+ gk20a_err(d, "failed to set function pointers\n");
+ return err;
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+
+ err_unmap_seq_buf:
+ gk20a_gmmu_unmap(vm, pmu->seq_buf.pmu_va,
+ GK20A_PMU_SEQ_BUF_SIZE, gk20a_mem_flag_none);
+ err_free_seq_buf_sgt:
+ gk20a_free_sgtable(&sgt_seq_buf);
+ err_unmap_ucode:
+ gk20a_gmmu_unmap(vm, pmu->ucode.pmu_va,
+ GK20A_PMU_UCODE_SIZE_MAX, gk20a_mem_flag_none);
+ err_free_ucode_sgt:
+ gk20a_free_sgtable(&sgt_pmu_ucode);
+ err_free_seq_buf:
+ dma_free_coherent(d, GK20A_PMU_SEQ_BUF_SIZE,
+ pmu->seq_buf.cpuva, pmu->seq_buf.iova);
+ pmu->seq_buf.cpuva = NULL;
+ pmu->seq_buf.iova = 0;
+ err_free_pmu_ucode:
+ dma_free_attrs(d, GK20A_PMU_UCODE_SIZE_MAX,
+ pmu->ucode.cpuva, pmu->ucode.iova, &attrs);
+ pmu->ucode.cpuva = NULL;
+ pmu->ucode.iova = 0;
+ err_release_fw:
+ release_firmware(g->pmu_fw);
+ err_free_seq:
+ kfree(pmu->seq);
+ err_free_mutex:
+ kfree(pmu->mutex);
+ err:
+ gk20a_dbg_fn("fail");
+ return err;
+}
+
+static void pmu_handle_pg_elpg_msg(struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 handle, u32 status);
+
+static void pmu_handle_pg_buf_config_msg(struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 handle, u32 status)
+{
+ struct pmu_gk20a *pmu = param;
+ struct pmu_pg_msg_eng_buf_stat *eng_buf_stat = &msg->msg.pg.eng_buf_stat;
+
+ gk20a_dbg_fn("");
+
+ if (status != 0) {
+ gk20a_err(dev_from_gk20a(g), "PGENG cmd aborted");
+ /* TBD: disable ELPG */
+ return;
+ }
+
+ if (eng_buf_stat->status == PMU_PG_MSG_ENG_BUF_FAILED) {
+ gk20a_err(dev_from_gk20a(g), "failed to load PGENG buffer");
+ }
+
+ pmu->buf_loaded = (eng_buf_stat->status == PMU_PG_MSG_ENG_BUF_LOADED);
+ wake_up(&pmu->pg_wq);
+}
+
+int gk20a_init_pmu_setup_hw1(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ pmu_reset(pmu);
+
+ /* setup apertures - virtual */
+ gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_UCODE),
+ pwr_fbif_transcfg_mem_type_virtual_f());
+ gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_VIRT),
+ pwr_fbif_transcfg_mem_type_virtual_f());
+ /* setup apertures - physical */
+ gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_VID),
+ pwr_fbif_transcfg_mem_type_physical_f() |
+ pwr_fbif_transcfg_target_local_fb_f());
+ gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_SYS_COH),
+ pwr_fbif_transcfg_mem_type_physical_f() |
+ pwr_fbif_transcfg_target_coherent_sysmem_f());
+ gk20a_writel(g, pwr_fbif_transcfg_r(GK20A_PMU_DMAIDX_PHYS_SYS_NCOH),
+ pwr_fbif_transcfg_mem_type_physical_f() |
+ pwr_fbif_transcfg_target_noncoherent_sysmem_f());
+
+ /* TBD: load pmu ucode */
+ err = pmu_bootstrap(pmu);
+ if (err)
+ return err;
+
+ return 0;
+
+}
+
+static int gk20a_aelpg_init(struct gk20a *g);
+static int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id);
+
+
+static void gk20a_init_pmu_setup_hw2_workqueue(struct work_struct *work)
+{
+ struct pmu_gk20a *pmu = container_of(work, struct pmu_gk20a, pg_init);
+ struct gk20a *g = pmu->g;
+ gk20a_init_pmu_setup_hw2(g);
+}
+
+int gk20a_init_pmu_setup_hw2(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct mm_gk20a *mm = &g->mm;
+ struct vm_gk20a *vm = &mm->pmu.vm;
+ struct device *d = dev_from_gk20a(g);
+ struct pmu_cmd cmd;
+ u32 desc;
+ long remain;
+ int err;
+ bool status;
+ u32 size;
+ struct sg_table *sgt_pg_buf;
+ dma_addr_t iova;
+
+ gk20a_dbg_fn("");
+
+ if (!support_gk20a_pmu())
+ return 0;
+
+ size = 0;
+ err = gr_gk20a_fecs_get_reglist_img_size(g, &size);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to query fecs pg buffer size");
+ return err;
+ }
+
+ if (!pmu->sw_ready) {
+ pmu->pg_buf.cpuva = dma_alloc_coherent(d, size,
+ &iova,
+ GFP_KERNEL);
+ if (!pmu->pg_buf.cpuva) {
+ gk20a_err(d, "failed to allocate memory\n");
+ err = -ENOMEM;
+ goto err;
+ }
+
+ pmu->pg_buf.iova = iova;
+ pmu->pg_buf.size = size;
+
+ err = gk20a_get_sgtable(d, &sgt_pg_buf,
+ pmu->pg_buf.cpuva,
+ pmu->pg_buf.iova,
+ size);
+ if (err) {
+ gk20a_err(d, "failed to create sg table\n");
+ goto err_free_pg_buf;
+ }
+
+ pmu->pg_buf.pmu_va = gk20a_gmmu_map(vm,
+ &sgt_pg_buf,
+ size,
+ 0, /* flags */
+ gk20a_mem_flag_none);
+ if (!pmu->pg_buf.pmu_va) {
+ gk20a_err(d, "failed to map fecs pg buffer");
+ err = -ENOMEM;
+ goto err_free_sgtable;
+ }
+
+ gk20a_free_sgtable(&sgt_pg_buf);
+ }
+
+ /*
+ * This is the actual point at which sw setup is complete, so set the
+ * sw_ready flag here.
+ */
+ pmu->sw_ready = true;
+
+ /* TBD: acquire pmu hw mutex */
+
+ /* TBD: post reset again? */
+
+ /* PMU_INIT message handler will send PG_INIT */
+ remain = wait_event_timeout(
+ pmu->pg_wq,
+ (status = (pmu->elpg_ready &&
+ pmu->stat_dmem_offset != 0 &&
+ pmu->elpg_stat == PMU_ELPG_STAT_OFF)),
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
+ if (status == 0) {
+ gk20a_err(dev_from_gk20a(g),
+ "PG_INIT_ACK failed, remaining timeout : 0x%lx", remain);
+ pmu_dump_falcon_stats(pmu);
+ return -EBUSY;
+ }
+
+ err = gr_gk20a_fecs_set_reglist_bind_inst(g, mm->pmu.inst_block.cpu_pa);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to bind pmu inst to gr");
+ return err;
+ }
+
+ err = gr_gk20a_fecs_set_reglist_virual_addr(g, pmu->pg_buf.pmu_va);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to set pg buffer pmu va");
+ return err;
+ }
+
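+	/*
+	 * Ask the PMU to DMA in the FECS power-gating buffer: the PMU VA is
+	 * split into a 256-byte-aligned base (dma_base) and its low byte
+	 * (dma_offset).
+	 */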
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_eng_buf_load);
+ cmd.cmd.pg.eng_buf_load.cmd_type = PMU_PG_CMD_ID_ENG_BUF_LOAD;
+ cmd.cmd.pg.eng_buf_load.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.eng_buf_load.buf_idx = PMU_PGENG_GR_BUFFER_IDX_FECS;
+ cmd.cmd.pg.eng_buf_load.buf_size = pmu->pg_buf.size;
+ cmd.cmd.pg.eng_buf_load.dma_base = u64_lo32(pmu->pg_buf.pmu_va >> 8);
+ cmd.cmd.pg.eng_buf_load.dma_offset = (u8)(pmu->pg_buf.pmu_va & 0xFF);
+ cmd.cmd.pg.eng_buf_load.dma_idx = PMU_DMAIDX_VIRT;
+
+ pmu->buf_loaded = false;
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
+ pmu_handle_pg_buf_config_msg, pmu, &desc, ~0);
+
+ remain = wait_event_timeout(
+ pmu->pg_wq,
+ pmu->buf_loaded,
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
+ if (!pmu->buf_loaded) {
+ gk20a_err(dev_from_gk20a(g),
+ "PGENG FECS buffer load failed, remaining timeout : 0x%lx",
+ remain);
+ return -EBUSY;
+ }
+
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_eng_buf_load);
+ cmd.cmd.pg.eng_buf_load.cmd_type = PMU_PG_CMD_ID_ENG_BUF_LOAD;
+ cmd.cmd.pg.eng_buf_load.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.eng_buf_load.buf_idx = PMU_PGENG_GR_BUFFER_IDX_ZBC;
+ cmd.cmd.pg.eng_buf_load.buf_size = pmu->seq_buf.size;
+ cmd.cmd.pg.eng_buf_load.dma_base = u64_lo32(pmu->seq_buf.pmu_va >> 8);
+ cmd.cmd.pg.eng_buf_load.dma_offset = (u8)(pmu->seq_buf.pmu_va & 0xFF);
+ cmd.cmd.pg.eng_buf_load.dma_idx = PMU_DMAIDX_VIRT;
+
+ pmu->buf_loaded = false;
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
+ pmu_handle_pg_buf_config_msg, pmu, &desc, ~0);
+
+ remain = wait_event_timeout(
+ pmu->pg_wq,
+ pmu->buf_loaded,
+ msecs_to_jiffies(gk20a_get_gr_idle_timeout(g)));
+ if (!pmu->buf_loaded) {
+ gk20a_err(dev_from_gk20a(g),
+ "PGENG ZBC buffer load failed, remaining timeout 0x%lx",
+ remain);
+ return -EBUSY;
+ }
+
+ /*
+ * FIXME: To enable ELPG, we increase the PMU ext2priv timeout unit to
+ * 7. This prevents PMU stalling on Host register accesses. Once the
+ * cause for this hang is discovered and fixed, this WAR should be
+ * removed.
+ */
+ gk20a_writel(g, 0x10a164, 0x109ff);
+
+ pmu->initialized = true;
+ pmu->zbc_ready = true;
+
+ /* Save zbc table after PMU is initialized. */
+ pmu_save_zbc(g, 0xf);
+
+ /*
+ * We can't guarantee that gr code to enable ELPG will be
+ * invoked, so we explicitly call disable-enable here
+ * to enable elpg.
+ */
+ gk20a_pmu_disable_elpg(g);
+
+ if (g->elpg_enabled)
+ gk20a_pmu_enable_elpg(g);
+
+ udelay(50);
+
+ /* Enable AELPG */
+ if (g->aelpg_enabled) {
+ gk20a_aelpg_init(g);
+ gk20a_aelpg_init_and_enable(g, PMU_AP_CTRL_ID_GRAPHICS);
+ }
+
+ return 0;
+
+ err_free_sgtable:
+ gk20a_free_sgtable(&sgt_pg_buf);
+ err_free_pg_buf:
+ dma_free_coherent(d, size,
+ pmu->pg_buf.cpuva, pmu->pg_buf.iova);
+ pmu->pg_buf.cpuva = NULL;
+ pmu->pg_buf.iova = 0;
+ err:
+ return err;
+}
+
+int gk20a_init_pmu_support(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+	int err;
+
+ gk20a_dbg_fn("");
+
+ if (pmu->initialized)
+ return 0;
+
+ pmu->g = g;
+
+ err = gk20a_init_pmu_reset_enable_hw(g);
+ if (err)
+ return err;
+
+ if (support_gk20a_pmu()) {
+ err = gk20a_init_pmu_setup_sw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_pmu_setup_hw1(g);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+static void pmu_handle_pg_elpg_msg(struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 handle, u32 status)
+{
+ struct pmu_gk20a *pmu = param;
+ struct pmu_pg_msg_elpg_msg *elpg_msg = &msg->msg.pg.elpg_msg;
+
+ gk20a_dbg_fn("");
+
+ if (status != 0) {
+ gk20a_err(dev_from_gk20a(g), "ELPG cmd aborted");
+ /* TBD: disable ELPG */
+ return;
+ }
+
+ switch (elpg_msg->msg) {
+ case PMU_PG_ELPG_MSG_INIT_ACK:
+ gk20a_dbg_pmu("INIT_PG is acknowledged from PMU");
+ pmu->elpg_ready = true;
+ wake_up(&pmu->pg_wq);
+ break;
+ case PMU_PG_ELPG_MSG_ALLOW_ACK:
+ gk20a_dbg_pmu("ALLOW is acknowledged from PMU");
+ pmu->elpg_stat = PMU_ELPG_STAT_ON;
+ wake_up(&pmu->pg_wq);
+ break;
+ case PMU_PG_ELPG_MSG_DISALLOW_ACK:
+ gk20a_dbg_pmu("DISALLOW is acknowledged from PMU");
+ pmu->elpg_stat = PMU_ELPG_STAT_OFF;
+ wake_up(&pmu->pg_wq);
+ break;
+ default:
+ gk20a_err(dev_from_gk20a(g),
+ "unsupported ELPG message : 0x%04x", elpg_msg->msg);
+ }
+
+ return;
+}
+
+static void pmu_handle_pg_stat_msg(struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 handle, u32 status)
+{
+ struct pmu_gk20a *pmu = param;
+
+ gk20a_dbg_fn("");
+
+ if (status != 0) {
+ gk20a_err(dev_from_gk20a(g), "ELPG cmd aborted");
+ /* TBD: disable ELPG */
+ return;
+ }
+
+ switch (msg->msg.pg.stat.sub_msg_id) {
+ case PMU_PG_STAT_MSG_RESP_DMEM_OFFSET:
+ gk20a_dbg_pmu("ALLOC_DMEM_OFFSET is acknowledged from PMU");
+ pmu->stat_dmem_offset = msg->msg.pg.stat.data;
+ wake_up(&pmu->pg_wq);
+ break;
+ default:
+ break;
+ }
+}
+
+static int pmu_init_powergating(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_cmd cmd;
+ u32 seq;
+
+ gk20a_dbg_fn("");
+
+ if (tegra_cpu_is_asim()) {
+ /* TBD: calculate threshold for silicon */
+ gk20a_writel(g, pwr_pmu_pg_idlefilth_r(ENGINE_GR_GK20A),
+ PMU_PG_IDLE_THRESHOLD_SIM);
+ gk20a_writel(g, pwr_pmu_pg_ppuidlefilth_r(ENGINE_GR_GK20A),
+ PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM);
+ } else {
+ /* TBD: calculate threshold for silicon */
+ gk20a_writel(g, pwr_pmu_pg_idlefilth_r(ENGINE_GR_GK20A),
+ PMU_PG_IDLE_THRESHOLD);
+ gk20a_writel(g, pwr_pmu_pg_ppuidlefilth_r(ENGINE_GR_GK20A),
+ PMU_PG_POST_POWERUP_IDLE_THRESHOLD);
+ }
+
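+	/*
+	 * ELPG bring-up sequence: send PG_INIT, ask the PMU to allocate
+	 * DMEM for the power-gating statistics, then issue an explicit
+	 * DISALLOW, since the ucode requires a disallow before the first
+	 * allow.
+	 */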
+ /* init ELPG */
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
+ cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
+ cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_INIT;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
+
+ /* alloc dmem for powergating state log */
+ pmu->stat_dmem_offset = 0;
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_stat);
+ cmd.cmd.pg.stat.cmd_type = PMU_PG_CMD_ID_PG_STAT;
+ cmd.cmd.pg.stat.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.stat.sub_cmd_id = PMU_PG_STAT_CMD_ALLOC_DMEM;
+ cmd.cmd.pg.stat.data = 0;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
+ pmu_handle_pg_stat_msg, pmu, &seq, ~0);
+
+ /* disallow ELPG initially
+ PMU ucode requires a disallow cmd before allow cmd */
+ pmu->elpg_stat = PMU_ELPG_STAT_ON; /* set for wait_event PMU_ELPG_STAT_OFF */
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
+ cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
+ cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_DISALLOW;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
+
+ /* start with elpg disabled until first enable call */
+ pmu->elpg_refcnt = 1;
+
+ return 0;
+}
+
+static int pmu_init_perfmon(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_v *pv = &g->ops.pmu_ver;
+ struct pmu_cmd cmd;
+ struct pmu_payload payload;
+ u32 seq;
+ u32 data;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ pmu->perfmon_ready = 0;
+
+ /* use counter #3 for GR && CE2 busy cycles */
+ gk20a_writel(g, pwr_pmu_idle_mask_r(3),
+ pwr_pmu_idle_mask_gr_enabled_f() |
+ pwr_pmu_idle_mask_ce_2_enabled_f());
+
+ /* disable idle filtering for counters 3 and 6 */
+ data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(3));
+ data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
+ pwr_pmu_idle_ctrl_filter_m(),
+ pwr_pmu_idle_ctrl_value_busy_f() |
+ pwr_pmu_idle_ctrl_filter_disabled_f());
+ gk20a_writel(g, pwr_pmu_idle_ctrl_r(3), data);
+
+ /* use counter #6 for total cycles */
+ data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(6));
+ data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
+ pwr_pmu_idle_ctrl_filter_m(),
+ pwr_pmu_idle_ctrl_value_always_f() |
+ pwr_pmu_idle_ctrl_filter_disabled_f());
+ gk20a_writel(g, pwr_pmu_idle_ctrl_r(6), data);
+
+ /*
+ * We don't want to disturb counters #3 and #6, which are used by
+ * perfmon, so we add wiring also to counters #1 and #2 for
+ * exposing raw counter readings.
+ */
+ gk20a_writel(g, pwr_pmu_idle_mask_r(1),
+ pwr_pmu_idle_mask_gr_enabled_f() |
+ pwr_pmu_idle_mask_ce_2_enabled_f());
+
+ data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(1));
+ data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
+ pwr_pmu_idle_ctrl_filter_m(),
+ pwr_pmu_idle_ctrl_value_busy_f() |
+ pwr_pmu_idle_ctrl_filter_disabled_f());
+ gk20a_writel(g, pwr_pmu_idle_ctrl_r(1), data);
+
+ data = gk20a_readl(g, pwr_pmu_idle_ctrl_r(2));
+ data = set_field(data, pwr_pmu_idle_ctrl_value_m() |
+ pwr_pmu_idle_ctrl_filter_m(),
+ pwr_pmu_idle_ctrl_value_always_f() |
+ pwr_pmu_idle_ctrl_filter_disabled_f());
+ gk20a_writel(g, pwr_pmu_idle_ctrl_r(2), data);
+
+ pmu->sample_buffer = 0;
+ err = pmu->dmem.alloc(&pmu->dmem, &pmu->sample_buffer, 2 * sizeof(u16));
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "failed to allocate perfmon sample buffer");
+ return -ENOMEM;
+ }
+
+ /* init PERFMON */
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PERFMON;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + pv->get_pmu_perfmon_cmd_init_size();
+ cmd.cmd.perfmon.cmd_type = PMU_PERFMON_CMD_ID_INIT;
+ /* buffer to save counter values for pmu perfmon */
+ pv->perfmon_cmd_init_set_sample_buffer(&cmd.cmd.perfmon,
+ (u16)pmu->sample_buffer);
+ /* number of sample periods below lower threshold
+ before pmu triggers perfmon decrease event
+ TBD: = 15 */
+ pv->perfmon_cmd_init_set_dec_cnt(&cmd.cmd.perfmon, 15);
+ /* index of base counter, aka. always ticking counter */
+ pv->perfmon_cmd_init_set_base_cnt_id(&cmd.cmd.perfmon, 6);
+ /* microseconds interval between pmu polls perf counters */
+ pv->perfmon_cmd_init_set_samp_period_us(&cmd.cmd.perfmon, 16700);
+ /* number of perfmon counters
+ counter #3 (GR and CE2) for gk20a */
+ pv->perfmon_cmd_init_set_num_cnt(&cmd.cmd.perfmon, 1);
+ /* moving average window for sample periods
+ TBD: = 3000000 / sample_period_us = 17 */
+ pv->perfmon_cmd_init_set_mov_avg(&cmd.cmd.perfmon, 17);
+
+ memset(&payload, 0, sizeof(struct pmu_payload));
+ payload.in.buf = &pmu->perfmon_counter;
+ payload.in.size = sizeof(struct pmu_perfmon_counter);
+ payload.in.offset = pv->get_perfmon_cmd_init_offsetofvar(COUNTER_ALLOC);
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, &payload, PMU_COMMAND_QUEUE_LPQ,
+ NULL, NULL, &seq, ~0);
+
+ return 0;
+}
+
+static int pmu_process_init_msg(struct pmu_gk20a *pmu,
+ struct pmu_msg *msg)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_v *pv = &g->ops.pmu_ver;
+ union pmu_init_msg_pmu *init;
+ struct pmu_sha1_gid_data gid_data;
+ u32 i, tail = 0;
+
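+	/*
+	 * The PMU posts its init message at the message queue tail. Pull
+	 * the header and body straight out of DMEM, advance the tail
+	 * pointer, then use the payload to set up the queues and the DMEM
+	 * allocator.
+	 */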
+ tail = pwr_pmu_msgq_tail_val_v(
+ gk20a_readl(g, pwr_pmu_msgq_tail_r()));
+
+ pmu_copy_from_dmem(pmu, tail,
+ (u8 *)&msg->hdr, PMU_MSG_HDR_SIZE, 0);
+
+ if (msg->hdr.unit_id != PMU_UNIT_INIT) {
+ gk20a_err(dev_from_gk20a(g),
+ "expecting init msg");
+ return -EINVAL;
+ }
+
+ pmu_copy_from_dmem(pmu, tail + PMU_MSG_HDR_SIZE,
+ (u8 *)&msg->msg, msg->hdr.size - PMU_MSG_HDR_SIZE, 0);
+
+ if (msg->msg.init.msg_type != PMU_INIT_MSG_TYPE_PMU_INIT) {
+ gk20a_err(dev_from_gk20a(g),
+ "expecting init msg");
+ return -EINVAL;
+ }
+
+ tail += ALIGN(msg->hdr.size, PMU_DMEM_ALIGNMENT);
+ gk20a_writel(g, pwr_pmu_msgq_tail_r(),
+ pwr_pmu_msgq_tail_val_f(tail));
+
+ init = pv->get_pmu_msg_pmu_init_msg_ptr(&(msg->msg.init));
+ if (!pmu->gid_info.valid) {
+
+ pmu_copy_from_dmem(pmu,
+ pv->get_pmu_init_msg_pmu_sw_mg_off(init),
+ (u8 *)&gid_data,
+ sizeof(struct pmu_sha1_gid_data), 0);
+
+ pmu->gid_info.valid =
+ (*(u32 *)gid_data.signature == PMU_SHA1_GID_SIGNATURE);
+
+ if (pmu->gid_info.valid) {
+
+ BUG_ON(sizeof(pmu->gid_info.gid) !=
+ sizeof(gid_data.gid));
+
+ memcpy(pmu->gid_info.gid, gid_data.gid,
+ sizeof(pmu->gid_info.gid));
+ }
+ }
+
+ for (i = 0; i < PMU_QUEUE_COUNT; i++)
+ pmu_queue_init(pmu, i, init);
+
+ gk20a_allocator_init(&pmu->dmem, "gk20a_pmu_dmem",
+ pv->get_pmu_init_msg_pmu_sw_mg_off(init),
+ pv->get_pmu_init_msg_pmu_sw_mg_size(init),
+ PMU_DMEM_ALLOC_ALIGNMENT);
+
+ pmu->pmu_ready = true;
+
+ return 0;
+}
+
+static bool pmu_read_message(struct pmu_gk20a *pmu, struct pmu_queue *queue,
+ struct pmu_msg *msg, int *status)
+{
+ struct gk20a *g = pmu->g;
+ u32 read_size, bytes_read;
+ int err;
+
+ *status = 0;
+
+ if (pmu_queue_is_empty(pmu, queue))
+ return false;
+
+ err = pmu_queue_open_read(pmu, queue);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to open queue %d for read", queue->id);
+ *status = err;
+ return false;
+ }
+
+ err = pmu_queue_pop(pmu, queue, &msg->hdr,
+ PMU_MSG_HDR_SIZE, &bytes_read);
+ if (err || bytes_read != PMU_MSG_HDR_SIZE) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to read msg from queue %d", queue->id);
+ *status = err | -EINVAL;
+ goto clean_up;
+ }
+
+ if (msg->hdr.unit_id == PMU_UNIT_REWIND) {
+ pmu_queue_rewind(pmu, queue);
+ /* read again after rewind */
+ err = pmu_queue_pop(pmu, queue, &msg->hdr,
+ PMU_MSG_HDR_SIZE, &bytes_read);
+ if (err || bytes_read != PMU_MSG_HDR_SIZE) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to read msg from queue %d", queue->id);
+ *status = err | -EINVAL;
+ goto clean_up;
+ }
+ }
+
+ if (!PMU_UNIT_ID_IS_VALID(msg->hdr.unit_id)) {
+ gk20a_err(dev_from_gk20a(g),
+ "read invalid unit_id %d from queue %d",
+ msg->hdr.unit_id, queue->id);
+ *status = -EINVAL;
+ goto clean_up;
+ }
+
+ if (msg->hdr.size > PMU_MSG_HDR_SIZE) {
+ read_size = msg->hdr.size - PMU_MSG_HDR_SIZE;
+ err = pmu_queue_pop(pmu, queue, &msg->msg,
+ read_size, &bytes_read);
+ if (err || bytes_read != read_size) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to read msg from queue %d", queue->id);
+ *status = err;
+ goto clean_up;
+ }
+ }
+
+ err = pmu_queue_close(pmu, queue, true);
+ if (err) {
+ gk20a_err(dev_from_gk20a(g),
+ "fail to close queue %d", queue->id);
+ *status = err;
+ return false;
+ }
+
+ return true;
+
+clean_up:
+ err = pmu_queue_close(pmu, queue, false);
+ if (err)
+ gk20a_err(dev_from_gk20a(g),
+ "fail to close queue %d", queue->id);
+ return false;
+}
+
+static int pmu_response_handle(struct pmu_gk20a *pmu,
+ struct pmu_msg *msg)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_sequence *seq;
+ struct pmu_v *pv = &g->ops.pmu_ver;
+ int ret = 0;
+
+ gk20a_dbg_fn("");
+
+ seq = &pmu->seq[msg->hdr.seq_id];
+ if (seq->state != PMU_SEQ_STATE_USED &&
+ seq->state != PMU_SEQ_STATE_CANCELLED) {
+ gk20a_err(dev_from_gk20a(g),
+ "msg for an unknown sequence %d", seq->id);
+ return -EINVAL;
+ }
+
+ if (msg->hdr.unit_id == PMU_UNIT_RC &&
+ msg->msg.rc.msg_type == PMU_RC_MSG_TYPE_UNHANDLED_CMD) {
+ gk20a_err(dev_from_gk20a(g),
+ "unhandled cmd: seq %d", seq->id);
+	} else if (seq->state != PMU_SEQ_STATE_CANCELLED) {
+ if (seq->msg) {
+ if (seq->msg->hdr.size >= msg->hdr.size) {
+ memcpy(seq->msg, msg, msg->hdr.size);
+ if (pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)) != 0) {
+ pmu_copy_from_dmem(pmu,
+ pv->pmu_allocation_get_dmem_offset(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)),
+ seq->out_payload,
+ pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)), 0);
+ }
+ } else {
+ gk20a_err(dev_from_gk20a(g),
+ "sequence %d msg buffer too small",
+ seq->id);
+ }
+ }
+ } else
+ seq->callback = NULL;
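+	/* Free the command's DMEM input/output allocations back to the
+	 * PMU DMEM allocator. */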
+ if (pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_in_a_ptr(seq)) != 0)
+ pmu->dmem.free(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset(pmu,
+ pv->get_pmu_seq_in_a_ptr(seq)),
+ pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_in_a_ptr(seq)));
+ if (pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)) != 0)
+ pmu->dmem.free(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)),
+ pv->pmu_allocation_get_dmem_size(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq)));
+
+ if (seq->callback)
+ seq->callback(g, msg, seq->cb_params, seq->desc, ret);
+
+ pmu_seq_release(pmu, seq);
+
+ /* TBD: notify client waiting for available dmem */
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+}
+
+static int pmu_wait_message_cond(struct pmu_gk20a *pmu, u32 timeout,
+ u32 *var, u32 val);
+
+static void pmu_handle_zbc_msg(struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 handle, u32 status)
+{
+ struct pmu_gk20a *pmu = param;
+ pmu->zbc_save_done = 1;
+}
+
+static void pmu_save_zbc(struct gk20a *g, u32 entries)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_cmd cmd;
+ u32 seq;
+
+ if (!pmu->pmu_ready || !entries || !pmu->zbc_ready)
+ return;
+
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_zbc_cmd);
+ cmd.cmd.zbc.cmd_type = g->ops.pmu_ver.cmd_id_zbc_table_update;
+ cmd.cmd.zbc.entry_mask = ZBC_MASK(entries);
+
+ pmu->zbc_save_done = 0;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ pmu_handle_zbc_msg, pmu, &seq, ~0);
+ pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
+ &pmu->zbc_save_done, 1);
+ if (!pmu->zbc_save_done)
+ gk20a_err(dev_from_gk20a(g), "ZBC save timeout");
+}
+
+void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries)
+{
+ if (g->pmu.zbc_ready)
+ pmu_save_zbc(g, entries);
+}
+
+static int pmu_perfmon_start_sampling(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_v *pv = &g->ops.pmu_ver;
+ struct pmu_cmd cmd;
+ struct pmu_payload payload;
+ u32 current_rate = 0;
+ u32 seq;
+
+ /* PERFMON Start */
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PERFMON;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + pv->get_pmu_perfmon_cmd_start_size();
+ pv->perfmon_start_set_cmd_type(&cmd.cmd.perfmon,
+ PMU_PERFMON_CMD_ID_START);
+ pv->perfmon_start_set_group_id(&cmd.cmd.perfmon,
+ PMU_DOMAIN_GROUP_PSTATE);
+ pv->perfmon_start_set_state_id(&cmd.cmd.perfmon,
+ pmu->perfmon_state_id[PMU_DOMAIN_GROUP_PSTATE]);
+
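+	/* Only request events that can be acted on: at or above the max
+	 * GPC2CLK rate enable decrease events only, at or below the min
+	 * rate enable increase events only. */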
+ current_rate = rate_gpu_to_gpc2clk(gk20a_clk_get_rate(g));
+ if (current_rate >= gpc_pll_params.max_freq)
+ pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
+ PMU_PERFMON_FLAG_ENABLE_DECREASE);
+ else if (current_rate <= gpc_pll_params.min_freq)
+ pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
+ PMU_PERFMON_FLAG_ENABLE_INCREASE);
+ else
+ pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
+ PMU_PERFMON_FLAG_ENABLE_INCREASE |
+ PMU_PERFMON_FLAG_ENABLE_DECREASE);
+
+ pv->perfmon_start_set_flags(&cmd.cmd.perfmon,
+ pv->perfmon_start_get_flags(&cmd.cmd.perfmon) |
+ PMU_PERFMON_FLAG_CLEAR_PREV);
+
+ memset(&payload, 0, sizeof(struct pmu_payload));
+
+ /* TBD: PMU_PERFMON_PCT_TO_INC * 100 */
+ pmu->perfmon_counter.upper_threshold = 3000; /* 30% */
+ /* TBD: PMU_PERFMON_PCT_TO_DEC * 100 */
+ pmu->perfmon_counter.lower_threshold = 1000; /* 10% */
+ pmu->perfmon_counter.valid = true;
+
+ payload.in.buf = &pmu->perfmon_counter;
+ payload.in.size = sizeof(pmu->perfmon_counter);
+ payload.in.offset =
+ pv->get_perfmon_cmd_start_offsetofvar(COUNTER_ALLOC);
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, &payload, PMU_COMMAND_QUEUE_LPQ,
+ NULL, NULL, &seq, ~0);
+
+ return 0;
+}
+
+static int pmu_perfmon_stop_sampling(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_cmd cmd;
+ u32 seq;
+
+ /* PERFMON Stop */
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PERFMON;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_perfmon_cmd_stop);
+ cmd.cmd.perfmon.stop.cmd_type = PMU_PERFMON_CMD_ID_STOP;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_LPQ,
+ NULL, NULL, &seq, ~0);
+ return 0;
+}
+
+static int pmu_handle_perfmon_event(struct pmu_gk20a *pmu,
+ struct pmu_perfmon_msg *msg)
+{
+ struct gk20a *g = pmu->g;
+ u32 rate;
+
+ gk20a_dbg_fn("");
+
+ switch (msg->msg_type) {
+ case PMU_PERFMON_MSG_ID_INCREASE_EVENT:
+ gk20a_dbg_pmu("perfmon increase event: "
+			"state_id %d, group_id %d, pct %d",
+ msg->gen.state_id, msg->gen.group_id, msg->gen.data);
+ /* increase gk20a clock freq by 20% */
+ rate = gk20a_clk_get_rate(g);
+ gk20a_clk_set_rate(g, rate * 6 / 5);
+ break;
+ case PMU_PERFMON_MSG_ID_DECREASE_EVENT:
+ gk20a_dbg_pmu("perfmon decrease event: "
+			"state_id %d, group_id %d, pct %d",
+ msg->gen.state_id, msg->gen.group_id, msg->gen.data);
+		/* reduce gk20a clock freq to 70% of the current rate */
+ rate = gk20a_clk_get_rate(g);
+ gk20a_clk_set_rate(g, (rate / 10) * 7);
+ break;
+ case PMU_PERFMON_MSG_ID_INIT_EVENT:
+ pmu->perfmon_ready = 1;
+ gk20a_dbg_pmu("perfmon init event");
+ break;
+ default:
+ break;
+ }
+
+ /* restart sampling */
+ if (IS_ENABLED(CONFIG_GK20A_PERFMON))
+ return pmu_perfmon_start_sampling(pmu);
+ return 0;
+}
+
+
+static int pmu_handle_event(struct pmu_gk20a *pmu, struct pmu_msg *msg)
+{
+	int err = 0;
+
+ gk20a_dbg_fn("");
+
+ switch (msg->hdr.unit_id) {
+ case PMU_UNIT_PERFMON:
+ err = pmu_handle_perfmon_event(pmu, &msg->msg.perfmon);
+ break;
+ default:
+ break;
+ }
+
+ return err;
+}
+
+static int pmu_process_message(struct pmu_gk20a *pmu)
+{
+ struct pmu_msg msg;
+ int status;
+
+ if (unlikely(!pmu->pmu_ready)) {
+ pmu_process_init_msg(pmu, &msg);
+ pmu_init_powergating(pmu);
+ pmu_init_perfmon(pmu);
+ return 0;
+ }
+
+ while (pmu_read_message(pmu,
+ &pmu->queue[PMU_MESSAGE_QUEUE], &msg, &status)) {
+
+ gk20a_dbg_pmu("read msg hdr: "
+ "unit_id = 0x%08x, size = 0x%08x, "
+ "ctrl_flags = 0x%08x, seq_id = 0x%08x",
+ msg.hdr.unit_id, msg.hdr.size,
+ msg.hdr.ctrl_flags, msg.hdr.seq_id);
+
+ msg.hdr.ctrl_flags &= ~PMU_CMD_FLAGS_PMU_MASK;
+
+ if (msg.hdr.ctrl_flags == PMU_CMD_FLAGS_EVENT) {
+ pmu_handle_event(pmu, &msg);
+ } else {
+ pmu_response_handle(pmu, &msg);
+ }
+ }
+
+ return 0;
+}
+
+static int pmu_wait_message_cond(struct pmu_gk20a *pmu, u32 timeout,
+ u32 *var, u32 val)
+{
+ struct gk20a *g = pmu->g;
+ unsigned long end_jiffies = jiffies + msecs_to_jiffies(timeout);
+ unsigned long delay = GR_IDLE_CHECK_DEFAULT;
+
+ do {
+ if (*var == val)
+ return 0;
+
+ if (gk20a_readl(g, pwr_falcon_irqstat_r()))
+ gk20a_pmu_isr(g);
+
+ usleep_range(delay, delay * 2);
+ delay = min_t(u32, delay << 1, GR_IDLE_CHECK_MAX);
+ } while (time_before(jiffies, end_jiffies) ||
+ !tegra_platform_is_silicon());
+
+ return -ETIMEDOUT;
+}
+
+static void pmu_dump_elpg_stats(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_pg_stats stats;
+
+ pmu_copy_from_dmem(pmu, pmu->stat_dmem_offset,
+ (u8 *)&stats, sizeof(struct pmu_pg_stats), 0);
+
+ gk20a_dbg_pmu("pg_entry_start_timestamp : 0x%016llx",
+ stats.pg_entry_start_timestamp);
+ gk20a_dbg_pmu("pg_exit_start_timestamp : 0x%016llx",
+ stats.pg_exit_start_timestamp);
+ gk20a_dbg_pmu("pg_ingating_start_timestamp : 0x%016llx",
+ stats.pg_ingating_start_timestamp);
+ gk20a_dbg_pmu("pg_ungating_start_timestamp : 0x%016llx",
+ stats.pg_ungating_start_timestamp);
+ gk20a_dbg_pmu("pg_avg_entry_time_us : 0x%08x",
+ stats.pg_avg_entry_time_us);
+ gk20a_dbg_pmu("pg_avg_exit_time_us : 0x%08x",
+ stats.pg_avg_exit_time_us);
+ gk20a_dbg_pmu("pg_ingating_cnt : 0x%08x",
+ stats.pg_ingating_cnt);
+ gk20a_dbg_pmu("pg_ingating_time_us : 0x%08x",
+ stats.pg_ingating_time_us);
+ gk20a_dbg_pmu("pg_ungating_count : 0x%08x",
+ stats.pg_ungating_count);
+	gk20a_dbg_pmu("pg_ungating_time_us : 0x%08x",
+ stats.pg_ungating_time_us);
+ gk20a_dbg_pmu("pg_gating_cnt : 0x%08x",
+ stats.pg_gating_cnt);
+ gk20a_dbg_pmu("pg_gating_deny_cnt : 0x%08x",
+ stats.pg_gating_deny_cnt);
+
+ /*
+ Turn on PG_DEBUG in ucode and locate symbol "ElpgLog" offset
+ in .nm file, e.g. 0x1000066c. use 0x66c.
+ u32 i, val[20];
+ pmu_copy_from_dmem(pmu, 0x66c,
+ (u8 *)val, sizeof(val), 0);
+ gk20a_dbg_pmu("elpg log begin");
+ for (i = 0; i < 20; i++)
+ gk20a_dbg_pmu("0x%08x", val[i]);
+ gk20a_dbg_pmu("elpg log end");
+ */
+
+ gk20a_dbg_pmu("pwr_pmu_idle_mask_supp_r(3): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_mask_supp_r(3)));
+ gk20a_dbg_pmu("pwr_pmu_idle_mask_1_supp_r(3): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_mask_1_supp_r(3)));
+ gk20a_dbg_pmu("pwr_pmu_idle_ctrl_supp_r(3): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_ctrl_supp_r(3)));
+ gk20a_dbg_pmu("pwr_pmu_pg_idle_cnt_r(0): 0x%08x",
+ gk20a_readl(g, pwr_pmu_pg_idle_cnt_r(0)));
+ gk20a_dbg_pmu("pwr_pmu_pg_intren_r(0): 0x%08x",
+ gk20a_readl(g, pwr_pmu_pg_intren_r(0)));
+
+ gk20a_dbg_pmu("pwr_pmu_idle_count_r(3): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_count_r(3)));
+ gk20a_dbg_pmu("pwr_pmu_idle_count_r(4): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_count_r(4)));
+ gk20a_dbg_pmu("pwr_pmu_idle_count_r(7): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_count_r(7)));
+
+ /*
+ TBD: script can't generate those registers correctly
+ gk20a_dbg_pmu("pwr_pmu_idle_status_r(): 0x%08x",
+ gk20a_readl(g, pwr_pmu_idle_status_r()));
+ gk20a_dbg_pmu("pwr_pmu_pg_ctrl_r(): 0x%08x",
+ gk20a_readl(g, pwr_pmu_pg_ctrl_r()));
+ */
+}
+
+static void pmu_dump_falcon_stats(struct pmu_gk20a *pmu)
+{
+ struct gk20a *g = pmu->g;
+ int i;
+
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_os_r : %d",
+ gk20a_readl(g, pwr_falcon_os_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_cpuctl_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_cpuctl_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_idlestate_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_idlestate_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_mailbox0_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_mailbox0_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_mailbox1_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_mailbox1_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqstat_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_irqstat_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqmode_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_irqmode_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqmask_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_irqmask_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_irqdest_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_irqdest_r()));
+
+ for (i = 0; i < pwr_pmu_mailbox__size_1_v(); i++)
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_mailbox_r(%d) : 0x%x",
+ i, gk20a_readl(g, pwr_pmu_mailbox_r(i)));
+
+ for (i = 0; i < pwr_pmu_debug__size_1_v(); i++)
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_debug_r(%d) : 0x%x",
+ i, gk20a_readl(g, pwr_pmu_debug_r(i)));
+
+ for (i = 0; i < 6/*NV_PPWR_FALCON_ICD_IDX_RSTAT__SIZE_1*/; i++) {
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rstat_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(i));
+ gk20a_err(dev_from_gk20a(g), "pmu_rstat (%d) : 0x%x",
+ i, gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+ }
+
+ i = gk20a_readl(g, pwr_pmu_bar0_error_status_r());
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_error_status_r : 0x%x", i);
+ if (i != 0) {
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_addr_r : 0x%x",
+ gk20a_readl(g, pwr_pmu_bar0_addr_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_data_r : 0x%x",
+ gk20a_readl(g, pwr_pmu_bar0_data_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_timeout_r : 0x%x",
+ gk20a_readl(g, pwr_pmu_bar0_timeout_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_ctl_r : 0x%x",
+ gk20a_readl(g, pwr_pmu_bar0_ctl_r()));
+ }
+
+ i = gk20a_readl(g, pwr_pmu_bar0_fecs_error_r());
+ gk20a_err(dev_from_gk20a(g), "pwr_pmu_bar0_fecs_error_r : 0x%x", i);
+
+ i = gk20a_readl(g, pwr_falcon_exterrstat_r());
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_exterrstat_r : 0x%x", i);
+ if (pwr_falcon_exterrstat_valid_v(i) ==
+ pwr_falcon_exterrstat_valid_true_v()) {
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_exterraddr_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_exterraddr_r()));
+ gk20a_err(dev_from_gk20a(g), "top_fs_status_r : 0x%x",
+ gk20a_readl(g, top_fs_status_r()));
+ gk20a_err(dev_from_gk20a(g), "pmc_enable : 0x%x",
+ gk20a_readl(g, mc_enable_r()));
+ }
+
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_engctl_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_engctl_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_curctx_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_curctx_r()));
+ gk20a_err(dev_from_gk20a(g), "pwr_falcon_nxtctx_r : 0x%x",
+ gk20a_readl(g, pwr_falcon_nxtctx_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_IMB));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_IMB : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_DMB));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_DMB : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CSW));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_CSW : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_CTX));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_CTX : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_EXCI));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_EXCI : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ for (i = 0; i < 4; i++) {
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_PC));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_PC : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+
+ gk20a_writel(g, pwr_pmu_falcon_icd_cmd_r(),
+ pwr_pmu_falcon_icd_cmd_opc_rreg_f() |
+ pwr_pmu_falcon_icd_cmd_idx_f(PMU_FALCON_REG_SP));
+ gk20a_err(dev_from_gk20a(g), "PMU_FALCON_REG_SP : 0x%x",
+ gk20a_readl(g, pwr_pmu_falcon_icd_rdata_r()));
+ }
+
+ /* PMU may crash due to FECS crash. Dump FECS status */
+ gk20a_fecs_dump_falcon_stats(g);
+}
+
+void gk20a_pmu_isr(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_queue *queue;
+ u32 intr, mask;
+ bool recheck = false;
+
+ gk20a_dbg_fn("");
+
+ mutex_lock(&pmu->isr_mutex);
+
+ mask = gk20a_readl(g, pwr_falcon_irqmask_r()) &
+ gk20a_readl(g, pwr_falcon_irqdest_r());
+
+ intr = gk20a_readl(g, pwr_falcon_irqstat_r()) & mask;
+
+ gk20a_dbg_pmu("received falcon interrupt: 0x%08x", intr);
+
+ if (!intr) {
+ mutex_unlock(&pmu->isr_mutex);
+ return;
+ }
+
+ if (intr & pwr_falcon_irqstat_halt_true_f()) {
+ gk20a_err(dev_from_gk20a(g),
+ "pmu halt intr not implemented");
+ pmu_dump_falcon_stats(pmu);
+ }
+ if (intr & pwr_falcon_irqstat_exterr_true_f()) {
+ gk20a_err(dev_from_gk20a(g),
+ "pmu exterr intr not implemented. Clearing interrupt.");
+ pmu_dump_falcon_stats(pmu);
+
+ gk20a_writel(g, pwr_falcon_exterrstat_r(),
+ gk20a_readl(g, pwr_falcon_exterrstat_r()) &
+ ~pwr_falcon_exterrstat_valid_m());
+ }
+ if (intr & pwr_falcon_irqstat_swgen0_true_f()) {
+ pmu_process_message(pmu);
+ recheck = true;
+ }
+
+ gk20a_writel(g, pwr_falcon_irqsclr_r(), intr);
+
+ if (recheck) {
+ queue = &pmu->queue[PMU_MESSAGE_QUEUE];
+ if (!pmu_queue_is_empty(pmu, queue))
+ gk20a_writel(g, pwr_falcon_irqsset_r(),
+ pwr_falcon_irqsset_swgen0_set_f());
+ }
+
+ mutex_unlock(&pmu->isr_mutex);
+}
+
+static bool pmu_validate_cmd(struct pmu_gk20a *pmu, struct pmu_cmd *cmd,
+ struct pmu_msg *msg, struct pmu_payload *payload,
+ u32 queue_id)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_queue *queue;
+ u32 in_size, out_size;
+
+ if (!PMU_IS_SW_COMMAND_QUEUE(queue_id))
+ goto invalid_cmd;
+
+ queue = &pmu->queue[queue_id];
+ if (cmd->hdr.size < PMU_CMD_HDR_SIZE)
+ goto invalid_cmd;
+
+ if (cmd->hdr.size > (queue->size >> 1))
+ goto invalid_cmd;
+
+ if (msg != NULL && msg->hdr.size < PMU_MSG_HDR_SIZE)
+ goto invalid_cmd;
+
+ if (!PMU_UNIT_ID_IS_VALID(cmd->hdr.unit_id))
+ goto invalid_cmd;
+
+ if (payload == NULL)
+ return true;
+
+ if (payload->in.buf == NULL && payload->out.buf == NULL)
+ goto invalid_cmd;
+
+ if ((payload->in.buf != NULL && payload->in.size == 0) ||
+ (payload->out.buf != NULL && payload->out.size == 0))
+ goto invalid_cmd;
+
+ in_size = PMU_CMD_HDR_SIZE;
+ if (payload->in.buf) {
+ in_size += payload->in.offset;
+ in_size += g->ops.pmu_ver.get_pmu_allocation_struct_size(pmu);
+ }
+
+ out_size = PMU_CMD_HDR_SIZE;
+ if (payload->out.buf) {
+ out_size += payload->out.offset;
+ out_size += g->ops.pmu_ver.get_pmu_allocation_struct_size(pmu);
+ }
+
+ if (in_size > cmd->hdr.size || out_size > cmd->hdr.size)
+ goto invalid_cmd;
+
+
+ if ((payload->in.offset != 0 && payload->in.buf == NULL) ||
+ (payload->out.offset != 0 && payload->out.buf == NULL))
+ goto invalid_cmd;
+
+ return true;
+
+invalid_cmd:
+	gk20a_err(dev_from_gk20a(g), "invalid pmu cmd :\n"
+		"queue_id=%d,\n"
+		"cmd_size=%d, cmd_unit_id=%d, msg=%p, msg_size=%d,\n"
+		"payload in=%p, in_size=%d, in_offset=%d,\n"
+		"payload out=%p, out_size=%d, out_offset=%d",
+		queue_id, cmd->hdr.size, cmd->hdr.unit_id,
+		msg, msg ? msg->hdr.size : ~0,
+		payload ? &payload->in : NULL,
+		payload ? payload->in.size : 0,
+		payload ? payload->in.offset : 0,
+		payload ? &payload->out : NULL,
+		payload ? payload->out.size : 0,
+		payload ? payload->out.offset : 0);
+
+ return false;
+}
+
+static int pmu_write_cmd(struct pmu_gk20a *pmu, struct pmu_cmd *cmd,
+ u32 queue_id, unsigned long timeout)
+{
+ struct gk20a *g = pmu->g;
+ struct pmu_queue *queue;
+ unsigned long end_jiffies = jiffies +
+ msecs_to_jiffies(timeout);
+ int err;
+
+ gk20a_dbg_fn("");
+
+ queue = &pmu->queue[queue_id];
+
+ do {
+ err = pmu_queue_open_write(pmu, queue, cmd->hdr.size);
+ if (err == -EAGAIN && time_before(jiffies, end_jiffies))
+ usleep_range(1000, 2000);
+ else
+ break;
+ } while (1);
+
+ if (err)
+ goto clean_up;
+
+ pmu_queue_push(pmu, queue, cmd, cmd->hdr.size);
+
+ err = pmu_queue_close(pmu, queue, true);
+
+clean_up:
+ if (err)
+ gk20a_err(dev_from_gk20a(g),
+ "fail to write cmd to queue %d", queue_id);
+ else
+ gk20a_dbg_fn("done");
+
+ return err;
+}
+
+int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd,
+ struct pmu_msg *msg, struct pmu_payload *payload,
+		u32 queue_id, pmu_callback callback, void *cb_param,
+ u32 *seq_desc, unsigned long timeout)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_v *pv = &g->ops.pmu_ver;
+ struct pmu_sequence *seq;
+ void *in = NULL, *out = NULL;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ BUG_ON(!cmd);
+ BUG_ON(!seq_desc);
+ BUG_ON(!pmu->pmu_ready);
+
+ if (!pmu_validate_cmd(pmu, cmd, msg, payload, queue_id))
+ return -EINVAL;
+
+ err = pmu_seq_acquire(pmu, &seq);
+ if (err)
+ return err;
+
+ cmd->hdr.seq_id = seq->id;
+
+ cmd->hdr.ctrl_flags = 0;
+ cmd->hdr.ctrl_flags |= PMU_CMD_FLAGS_STATUS;
+ cmd->hdr.ctrl_flags |= PMU_CMD_FLAGS_INTR;
+
+ seq->callback = callback;
+ seq->cb_params = cb_param;
+ seq->msg = msg;
+ seq->out_payload = NULL;
+ seq->desc = pmu->next_seq_desc++;
+
+ if (payload)
+ seq->out_payload = payload->out.buf;
+
+ *seq_desc = seq->desc;
+
+ if (payload && payload->in.offset != 0) {
+ pv->set_pmu_allocation_ptr(pmu, &in,
+ ((u8 *)&cmd->cmd + payload->in.offset));
+
+ if (payload->in.buf != payload->out.buf)
+ pv->pmu_allocation_set_dmem_size(pmu, in,
+ (u16)payload->in.size);
+ else
+ pv->pmu_allocation_set_dmem_size(pmu, in,
+ (u16)max(payload->in.size, payload->out.size));
+
+ err = pmu->dmem.alloc(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset_addr(pmu, in),
+ pv->pmu_allocation_get_dmem_size(pmu, in));
+ if (err)
+ goto clean_up;
+
+ pmu_copy_to_dmem(pmu, (pv->pmu_allocation_get_dmem_offset(pmu,
+ in)),
+ payload->in.buf, payload->in.size, 0);
+ pv->pmu_allocation_set_dmem_size(pmu,
+ pv->get_pmu_seq_in_a_ptr(seq),
+ pv->pmu_allocation_get_dmem_size(pmu, in));
+ pv->pmu_allocation_set_dmem_offset(pmu,
+ pv->get_pmu_seq_in_a_ptr(seq),
+ pv->pmu_allocation_get_dmem_offset(pmu, in));
+ }
+
+ if (payload && payload->out.offset != 0) {
+ pv->set_pmu_allocation_ptr(pmu, &out,
+ ((u8 *)&cmd->cmd + payload->out.offset));
+ pv->pmu_allocation_set_dmem_size(pmu, out,
+ (u16)payload->out.size);
+
+ if (payload->out.buf != payload->in.buf) {
+ err = pmu->dmem.alloc(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset_addr(pmu, out),
+ pv->pmu_allocation_get_dmem_size(pmu, out));
+ if (err)
+ goto clean_up;
+ } else {
+ BUG_ON(in == NULL);
+ pv->pmu_allocation_set_dmem_offset(pmu, out,
+ pv->pmu_allocation_get_dmem_offset(pmu, in));
+ }
+
+ pv->pmu_allocation_set_dmem_size(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq),
+ pv->pmu_allocation_get_dmem_size(pmu, out));
+ pv->pmu_allocation_set_dmem_offset(pmu,
+ pv->get_pmu_seq_out_a_ptr(seq),
+ pv->pmu_allocation_get_dmem_offset(pmu, out));
+ }
+
+ seq->state = PMU_SEQ_STATE_USED;
+ err = pmu_write_cmd(pmu, cmd, queue_id, timeout);
+ if (err)
+ seq->state = PMU_SEQ_STATE_PENDING;
+
+ gk20a_dbg_fn("done");
+
+ return 0;
+
+clean_up:
+ gk20a_dbg_fn("fail");
+ if (in)
+ pmu->dmem.free(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset(pmu, in),
+ pv->pmu_allocation_get_dmem_size(pmu, in));
+ if (out)
+ pmu->dmem.free(&pmu->dmem,
+ pv->pmu_allocation_get_dmem_offset(pmu, out),
+ pv->pmu_allocation_get_dmem_size(pmu, out));
+
+ pmu_seq_release(pmu, seq);
+ return err;
+}
+
+static int gk20a_pmu_enable_elpg_locked(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_cmd cmd;
+ u32 seq, status;
+
+ gk20a_dbg_fn("");
+
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
+ cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
+ cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_ALLOW;
+
+ /* no need to wait ack for ELPG enable but set pending to sync
+ with follow up ELPG disable */
+ pmu->elpg_stat = PMU_ELPG_STAT_ON_PENDING;
+
+ status = gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
+
+ BUG_ON(status != 0);
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+int gk20a_pmu_enable_elpg(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct gr_gk20a *gr = &g->gr;
+
+ int ret = 0;
+
+ gk20a_dbg_fn("");
+
+ if (!pmu->elpg_ready || !pmu->initialized)
+ goto exit;
+
+ mutex_lock(&pmu->elpg_mutex);
+
+ pmu->elpg_refcnt++;
+ if (pmu->elpg_refcnt <= 0)
+ goto exit_unlock;
+
+ /* something is not right if we end up in following code path */
+ if (unlikely(pmu->elpg_refcnt > 1)) {
+ gk20a_warn(dev_from_gk20a(g),
+ "%s(): possible elpg refcnt mismatch. elpg refcnt=%d",
+ __func__, pmu->elpg_refcnt);
+ WARN_ON(1);
+ }
+
+ /* do NOT enable elpg until golden ctx is created,
+ which is related with the ctx that ELPG save and restore. */
+ if (unlikely(!gr->ctx_vars.golden_image_initialized))
+ goto exit_unlock;
+
+ /* return if ELPG is already on or on_pending or off_on_pending */
+ if (pmu->elpg_stat != PMU_ELPG_STAT_OFF)
+ goto exit_unlock;
+
+ /* if ELPG is not allowed right now, mark that it should be enabled
+ * immediately after it is allowed */
+ if (!pmu->elpg_enable_allow) {
+ pmu->elpg_stat = PMU_ELPG_STAT_OFF_ON_PENDING;
+ goto exit_unlock;
+ }
+
+ ret = gk20a_pmu_enable_elpg_locked(g);
+
+exit_unlock:
+ mutex_unlock(&pmu->elpg_mutex);
+exit:
+ gk20a_dbg_fn("done");
+ return ret;
+}
+
+static void pmu_elpg_enable_allow(struct work_struct *work)
+{
+ struct pmu_gk20a *pmu = container_of(to_delayed_work(work),
+ struct pmu_gk20a, elpg_enable);
+
+ gk20a_dbg_fn("");
+
+ mutex_lock(&pmu->elpg_mutex);
+
+	/* It is ok to enable powergating now */
+ pmu->elpg_enable_allow = true;
+
+ /* do we have pending requests? */
+ if (pmu->elpg_stat == PMU_ELPG_STAT_OFF_ON_PENDING) {
+ pmu->elpg_stat = PMU_ELPG_STAT_OFF;
+ gk20a_pmu_enable_elpg_locked(pmu->g);
+ }
+
+ mutex_unlock(&pmu->elpg_mutex);
+
+ gk20a_dbg_fn("done");
+}
+
+static int gk20a_pmu_disable_elpg_defer_enable(struct gk20a *g, bool enable)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_cmd cmd;
+ u32 seq;
+ int ret = 0;
+
+ gk20a_dbg_fn("");
+
+ if (!pmu->elpg_ready || !pmu->initialized)
+ return 0;
+
+ /* remove the work from queue */
+ cancel_delayed_work_sync(&pmu->elpg_enable);
+
+ mutex_lock(&pmu->elpg_mutex);
+
+ pmu->elpg_refcnt--;
+ if (pmu->elpg_refcnt > 0) {
+ gk20a_warn(dev_from_gk20a(g),
+ "%s(): possible elpg refcnt mismatch. elpg refcnt=%d",
+ __func__, pmu->elpg_refcnt);
+ WARN_ON(1);
+ ret = 0;
+ goto exit_unlock;
+ }
+
+ /* cancel off_on_pending and return */
+ if (pmu->elpg_stat == PMU_ELPG_STAT_OFF_ON_PENDING) {
+ pmu->elpg_stat = PMU_ELPG_STAT_OFF;
+ ret = 0;
+ goto exit_reschedule;
+	} else if (pmu->elpg_stat == PMU_ELPG_STAT_ON_PENDING) {
+		/* wait if on_pending */
+
+ pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
+ &pmu->elpg_stat, PMU_ELPG_STAT_ON);
+
+ if (pmu->elpg_stat != PMU_ELPG_STAT_ON) {
+ gk20a_err(dev_from_gk20a(g),
+ "ELPG_ALLOW_ACK failed, elpg_stat=%d",
+ pmu->elpg_stat);
+ pmu_dump_elpg_stats(pmu);
+ pmu_dump_falcon_stats(pmu);
+ ret = -EBUSY;
+ goto exit_unlock;
+ }
+	} else if (pmu->elpg_stat != PMU_ELPG_STAT_ON) {
+		/* return if ELPG is already off */
+ ret = 0;
+ goto exit_reschedule;
+ }
+
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(struct pmu_pg_cmd_elpg_cmd);
+ cmd.cmd.pg.elpg_cmd.cmd_type = PMU_PG_CMD_ID_ELPG_CMD;
+ cmd.cmd.pg.elpg_cmd.engine_id = ENGINE_GR_GK20A;
+ cmd.cmd.pg.elpg_cmd.cmd = PMU_PG_ELPG_CMD_DISALLOW;
+
+ pmu->elpg_stat = PMU_ELPG_STAT_OFF_PENDING;
+
+ gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ pmu_handle_pg_elpg_msg, pmu, &seq, ~0);
+
+ pmu_wait_message_cond(pmu, gk20a_get_gr_idle_timeout(g),
+ &pmu->elpg_stat, PMU_ELPG_STAT_OFF);
+ if (pmu->elpg_stat != PMU_ELPG_STAT_OFF) {
+ gk20a_err(dev_from_gk20a(g),
+ "ELPG_DISALLOW_ACK failed");
+ pmu_dump_elpg_stats(pmu);
+ pmu_dump_falcon_stats(pmu);
+ ret = -EBUSY;
+ goto exit_unlock;
+ }
+
+exit_reschedule:
+ if (enable) {
+ pmu->elpg_enable_allow = false;
+ schedule_delayed_work(&pmu->elpg_enable,
+ msecs_to_jiffies(PMU_ELPG_ENABLE_ALLOW_DELAY_MSEC));
+ } else
+ pmu->elpg_enable_allow = true;
+
+
+exit_unlock:
+ mutex_unlock(&pmu->elpg_mutex);
+ gk20a_dbg_fn("done");
+ return ret;
+}
+
+int gk20a_pmu_disable_elpg(struct gk20a *g)
+{
+ return gk20a_pmu_disable_elpg_defer_enable(g, true);
+}
+
+int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ int err;
+
+ gk20a_dbg_fn("");
+
+ if (enable)
+ err = pmu_perfmon_start_sampling(pmu);
+ else
+ err = pmu_perfmon_stop_sampling(pmu);
+
+ return err;
+}
+
+int gk20a_pmu_destroy(struct gk20a *g)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ u32 elpg_ingating_time, elpg_ungating_time, gating_cnt;
+
+ gk20a_dbg_fn("");
+
+ if (!support_gk20a_pmu())
+ return 0;
+
+ /* make sure the pending operations are finished before we continue */
+ cancel_delayed_work_sync(&pmu->elpg_enable);
+ cancel_work_sync(&pmu->pg_init);
+
+ gk20a_pmu_get_elpg_residency_gating(g, &elpg_ingating_time,
+ &elpg_ungating_time, &gating_cnt);
+
+ gk20a_pmu_disable_elpg_defer_enable(g, false);
+ pmu->initialized = false;
+
+ /* update the s/w ELPG residency counters */
+ g->pg_ingating_time_us += (u64)elpg_ingating_time;
+ g->pg_ungating_time_us += (u64)elpg_ungating_time;
+ g->pg_gating_cnt += gating_cnt;
+
+ pmu_enable(pmu, false);
+
+ if (pmu->remove_support) {
+ pmu->remove_support(pmu);
+ pmu->remove_support = NULL;
+ }
+
+ gk20a_dbg_fn("done");
+ return 0;
+}
+
+int gk20a_pmu_load_norm(struct gk20a *g, u32 *load)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ u16 _load = 0;
+
+ if (!pmu->perfmon_ready) {
+ *load = 0;
+ return 0;
+ }
+
+ pmu_copy_from_dmem(pmu, pmu->sample_buffer, (u8 *)&_load, 2, 0);
+ *load = _load / 10;
+
+ return 0;
+}
+
+void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
+ u32 *total_cycles)
+{
+ if (!g->power_on) {
+ *busy_cycles = 0;
+ *total_cycles = 0;
+ return;
+ }
+
+ gk20a_busy(g->dev);
+ *busy_cycles = pwr_pmu_idle_count_value_v(
+ gk20a_readl(g, pwr_pmu_idle_count_r(1)));
+ rmb();
+ *total_cycles = pwr_pmu_idle_count_value_v(
+ gk20a_readl(g, pwr_pmu_idle_count_r(2)));
+ gk20a_idle(g->dev);
+}
+
+void gk20a_pmu_reset_load_counters(struct gk20a *g)
+{
+ u32 reg_val = pwr_pmu_idle_count_reset_f(1);
+
+ if (!g->power_on)
+ return;
+
+ gk20a_busy(g->dev);
+ gk20a_writel(g, pwr_pmu_idle_count_r(2), reg_val);
+ wmb();
+ gk20a_writel(g, pwr_pmu_idle_count_r(1), reg_val);
+ gk20a_idle(g->dev);
+}
+
+static int gk20a_pmu_get_elpg_residency_gating(struct gk20a *g,
+ u32 *ingating_time, u32 *ungating_time, u32 *gating_cnt)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ struct pmu_pg_stats stats;
+
+ if (!pmu->initialized) {
+ *ingating_time = 0;
+ *ungating_time = 0;
+ *gating_cnt = 0;
+ return 0;
+ }
+
+ pmu_copy_from_dmem(pmu, pmu->stat_dmem_offset,
+ (u8 *)&stats, sizeof(struct pmu_pg_stats), 0);
+
+ *ingating_time = stats.pg_ingating_time_us;
+ *ungating_time = stats.pg_ungating_time_us;
+ *gating_cnt = stats.pg_gating_cnt;
+
+ return 0;
+}
+
+/* Send an Adaptive Power (AP) related command to PMU */
+static int gk20a_pmu_ap_send_command(struct gk20a *g,
+ union pmu_ap_cmd *p_ap_cmd, bool b_block)
+{
+ struct pmu_gk20a *pmu = &g->pmu;
+ /* FIXME: where is the PG structure defined?? */
+ u32 status = 0;
+ struct pmu_cmd cmd;
+ u32 seq;
+ pmu_callback p_callback = NULL;
+
+ memset(&cmd, 0, sizeof(struct pmu_cmd));
+
+ /* Copy common members */
+ cmd.hdr.unit_id = PMU_UNIT_PG;
+ cmd.hdr.size = PMU_CMD_HDR_SIZE + sizeof(union pmu_ap_cmd);
+
+ cmd.cmd.pg.ap_cmd.cmn.cmd_type = PMU_PG_CMD_ID_AP;
+ cmd.cmd.pg.ap_cmd.cmn.cmd_id = p_ap_cmd->cmn.cmd_id;
+
+ /* Copy other members of command */
+ switch (p_ap_cmd->cmn.cmd_id) {
+ case PMU_AP_CMD_ID_INIT:
+ cmd.cmd.pg.ap_cmd.init.pg_sampling_period_us =
+ p_ap_cmd->init.pg_sampling_period_us;
+ p_callback = ap_callback_init_and_enable_ctrl;
+ break;
+
+ case PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL:
+ cmd.cmd.pg.ap_cmd.init_and_enable_ctrl.ctrl_id =
+ p_ap_cmd->init_and_enable_ctrl.ctrl_id;
+ memcpy(
+ (void *)&(cmd.cmd.pg.ap_cmd.init_and_enable_ctrl.params),
+ (void *)&(p_ap_cmd->init_and_enable_ctrl.params),
+ sizeof(struct pmu_ap_ctrl_init_params));
+
+ p_callback = ap_callback_init_and_enable_ctrl;
+ break;
+
+ case PMU_AP_CMD_ID_ENABLE_CTRL:
+ cmd.cmd.pg.ap_cmd.enable_ctrl.ctrl_id =
+ p_ap_cmd->enable_ctrl.ctrl_id;
+ break;
+
+ case PMU_AP_CMD_ID_DISABLE_CTRL:
+ cmd.cmd.pg.ap_cmd.disable_ctrl.ctrl_id =
+ p_ap_cmd->disable_ctrl.ctrl_id;
+ break;
+
+ case PMU_AP_CMD_ID_KICK_CTRL:
+ cmd.cmd.pg.ap_cmd.kick_ctrl.ctrl_id =
+ p_ap_cmd->kick_ctrl.ctrl_id;
+ cmd.cmd.pg.ap_cmd.kick_ctrl.skip_count =
+ p_ap_cmd->kick_ctrl.skip_count;
+ break;
+
+ default:
+ gk20a_dbg_pmu("%s: Invalid Adaptive Power command %d\n",
+ __func__, p_ap_cmd->cmn.cmd_id);
+ return 0x2f;
+ }
+
+ status = gk20a_pmu_cmd_post(g, &cmd, NULL, NULL, PMU_COMMAND_QUEUE_HPQ,
+ p_callback, pmu, &seq, ~0);
+
+	if (status) {
+ gk20a_dbg_pmu(
+ "%s: Unable to submit Adaptive Power Command %d\n",
+ __func__, p_ap_cmd->cmn.cmd_id);
+ goto err_return;
+ }
+
+ /* TODO: Implement blocking calls (b_block) */
+
+err_return:
+ return status;
+}
+
+static void ap_callback_init_and_enable_ctrl(
+ struct gk20a *g, struct pmu_msg *msg,
+ void *param, u32 seq_desc, u32 status)
+{
+ /* Define p_ap (i.e pointer to pmu_ap structure) */
+ WARN_ON(!msg);
+
+ if (!status) {
+ switch (msg->msg.pg.ap_msg.cmn.msg_id) {
+ case PMU_AP_MSG_ID_INIT_ACK:
+ break;
+
+ default:
+ gk20a_dbg_pmu(
+ "%s: Invalid Adaptive Power Message: %x\n",
+ __func__, msg->msg.pg.ap_msg.cmn.msg_id);
+ break;
+ }
+ }
+}
+
+static int gk20a_aelpg_init(struct gk20a *g)
+{
+ int status = 0;
+
+ /* Remove reliance on app_ctrl field. */
+ union pmu_ap_cmd ap_cmd;
+
+ /* TODO: Check for elpg being ready? */
+ ap_cmd.init.cmd_id = PMU_AP_CMD_ID_INIT;
+ ap_cmd.init.pg_sampling_period_us =
+ APCTRL_SAMPLING_PERIOD_PG_DEFAULT_US;
+
+ status = gk20a_pmu_ap_send_command(g, &ap_cmd, false);
+ return status;
+}
+
+static int gk20a_aelpg_init_and_enable(struct gk20a *g, u8 ctrl_id)
+{
+ int status = 0;
+ union pmu_ap_cmd ap_cmd;
+
+ /* TODO: Probably check if ELPG is ready? */
+
+ ap_cmd.init_and_enable_ctrl.cmd_id = PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL;
+ ap_cmd.init_and_enable_ctrl.ctrl_id = ctrl_id;
+ ap_cmd.init_and_enable_ctrl.params.min_idle_filter_us =
+ APCTRL_MINIMUM_IDLE_FILTER_DEFAULT_US;
+ ap_cmd.init_and_enable_ctrl.params.min_target_saving_us =
+ APCTRL_MINIMUM_TARGET_SAVING_DEFAULT_US;
+ ap_cmd.init_and_enable_ctrl.params.power_break_even_us =
+ APCTRL_POWER_BREAKEVEN_DEFAULT_US;
+ ap_cmd.init_and_enable_ctrl.params.cycles_per_sample_max =
+ APCTRL_CYCLES_PER_SAMPLE_MAX_DEFAULT;
+
+ switch (ctrl_id) {
+ case PMU_AP_CTRL_ID_GRAPHICS:
+ break;
+ default:
+ break;
+ }
+
+ status = gk20a_pmu_ap_send_command(g, &ap_cmd, true);
+ return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static int elpg_residency_show(struct seq_file *s, void *data)
+{
+ struct gk20a *g = s->private;
+ u32 ingating_time = 0;
+ u32 ungating_time = 0;
+ u32 gating_cnt;
+ u64 total_ingating, total_ungating, residency, divisor, dividend;
+
+ /* Don't unnecessarily power on the device */
+ if (g->power_on) {
+ gk20a_busy(g->dev);
+ gk20a_pmu_get_elpg_residency_gating(g, &ingating_time,
+ &ungating_time, &gating_cnt);
+ gk20a_idle(g->dev);
+ }
+ total_ingating = g->pg_ingating_time_us + (u64)ingating_time;
+ total_ungating = g->pg_ungating_time_us + (u64)ungating_time;
+ divisor = total_ingating + total_ungating;
+
+ /* We compute the residency on a scale of 1000 */
+ dividend = total_ingating * 1000;
+
+ if (divisor)
+ residency = div64_u64(dividend, divisor);
+ else
+ residency = 0;
+
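+	/*
+	 * Worked example (editorial illustration, not part of the original
+	 * change): 900 us in ELPG and 100 us out of ELPG give
+	 * residency = (900 * 1000) / (900 + 100) = 900, i.e. 90.0%.
+	 */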
+ seq_printf(s, "Time in ELPG: %llu us\n"
+ "Time out of ELPG: %llu us\n"
+ "ELPG residency ratio: %llu\n",
+ total_ingating, total_ungating, residency);
+ return 0;
+
+}
+
+static int elpg_residency_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, elpg_residency_show, inode->i_private);
+}
+
+static const struct file_operations elpg_residency_fops = {
+ .open = elpg_residency_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int elpg_transitions_show(struct seq_file *s, void *data)
+{
+ struct gk20a *g = s->private;
+ u32 ingating_time, ungating_time, total_gating_cnt;
+ u32 gating_cnt = 0;
+
+ if (g->power_on) {
+ gk20a_busy(g->dev);
+ gk20a_pmu_get_elpg_residency_gating(g, &ingating_time,
+ &ungating_time, &gating_cnt);
+ gk20a_idle(g->dev);
+ }
+ total_gating_cnt = g->pg_gating_cnt + gating_cnt;
+
+ seq_printf(s, "%u\n", total_gating_cnt);
+ return 0;
+
+}
+
+static int elpg_transitions_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, elpg_transitions_show, inode->i_private);
+}
+
+static const struct file_operations elpg_transitions_fops = {
+ .open = elpg_transitions_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int gk20a_pmu_debugfs_init(struct platform_device *dev)
+{
+ struct dentry *d;
+ struct gk20a_platform *platform = platform_get_drvdata(dev);
+ struct gk20a *g = get_gk20a(dev);
+
+ d = debugfs_create_file(
+ "elpg_residency", S_IRUGO|S_IWUSR, platform->debugfs, g,
+ &elpg_residency_fops);
+ if (!d)
+ goto err_out;
+
+ d = debugfs_create_file(
+ "elpg_transitions", S_IRUGO, platform->debugfs, g,
+ &elpg_transitions_fops);
+ if (!d)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ pr_err("%s: Failed to make debugfs node\n", __func__);
+ debugfs_remove_recursive(platform->debugfs);
+ return -ENOMEM;
+}
+#endif
diff --git a/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
new file mode 100644
index 000000000000..c1b8ff1f61b8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
@@ -0,0 +1,1097 @@
+/*
+ * drivers/gpu/nvgpu/gk20a/pmu_gk20a.h
+ *
+ * GK20A PMU (aka. gPMU outside gk20a context)
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __PMU_GK20A_H__
+#define __PMU_GK20A_H__
+
+/* defined by pmu hw spec */
+#define GK20A_PMU_VA_START ((128 * 1024) << 10)
+#define GK20A_PMU_VA_SIZE (512 * 1024 * 1024)
+#define GK20A_PMU_INST_SIZE (4 * 1024)
+#define GK20A_PMU_UCODE_SIZE_MAX (256 * 1024)
+#define GK20A_PMU_SEQ_BUF_SIZE 4096
+
+#define ZBC_MASK(i) (~(~(0) << ((i)+1)) & 0xfffe)
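+/*
+ * Illustrative expansion (editorial note, not part of the original change):
+ * ZBC_MASK(4) = ~(~0 << 5) & 0xfffe = 0x001e, i.e. bits 1..4 are set while
+ * bit 0 is always cleared by the 0xfffe term.
+ */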
+
+/* PMU Command/Message Interfaces for Adaptive Power */
+/* Macro to get Histogram index */
+#define PMU_AP_HISTOGRAM(idx) (idx)
+#define PMU_AP_HISTOGRAM_CONT (4)
+
+/* Total number of histogram bins */
+#define PMU_AP_CFG_HISTOGRAM_BIN_N (16)
+
+/* Mapping between Idle counters and histograms */
+#define PMU_AP_IDLE_MASK_HIST_IDX_0 (2)
+#define PMU_AP_IDLE_MASK_HIST_IDX_1 (3)
+#define PMU_AP_IDLE_MASK_HIST_IDX_2 (5)
+#define PMU_AP_IDLE_MASK_HIST_IDX_3 (6)
+
+
+/* Mapping between AP_CTRLs and Histograms */
+#define PMU_AP_HISTOGRAM_IDX_GRAPHICS (PMU_AP_HISTOGRAM(1))
+
+/* Mapping between AP_CTRLs and Idle counters */
+#define PMU_AP_IDLE_MASK_GRAPHICS (PMU_AP_IDLE_MASK_HIST_IDX_1)
+
+#define APP_VERSION_1 17997577
+#define APP_VERSION_0 16856675
+
+
+enum pmu_perfmon_cmd_start_fields {
+ COUNTER_ALLOC
+};
+
+/* Adaptive Power Controls (AP_CTRL) */
+enum {
+ PMU_AP_CTRL_ID_GRAPHICS = 0x0,
+ /* PMU_AP_CTRL_ID_MS ,*/
+ PMU_AP_CTRL_ID_MAX ,
+};
+
+/* AP_CTRL Statistics */
+struct pmu_ap_ctrl_stat {
+ /*
+ * Represents whether AP is active or not
+	 * TODO: This is NvBool in RM; is that 1 byte or 4 bytes?
+ */
+ u8 b_active;
+
+ /* Idle filter represented by histogram bin index */
+ u8 idle_filter_x;
+ u8 rsvd[2];
+
+ /* Total predicted power saving cycles. */
+ s32 power_saving_h_cycles;
+
+ /* Counts how many times AP gave us -ve power benefits. */
+ u32 bad_decision_count;
+
+ /*
+ * Number of times ap structure needs to skip AP iterations
+ * KICK_CTRL from kernel updates this parameter.
+ */
+ u32 skip_count;
+ u8 bin[PMU_AP_CFG_HISTOGRAM_BIN_N];
+};
+
+/* Parameters initialized by INITn APCTRL command */
+struct pmu_ap_ctrl_init_params {
+ /* Minimum idle filter value in Us */
+ u32 min_idle_filter_us;
+
+ /*
+ * Minimum Targeted Saving in Us. AP will update idle thresholds only
+ * if power saving achieved by updating idle thresholds is greater than
+ * Minimum targeted saving.
+ */
+ u32 min_target_saving_us;
+
+ /* Minimum targeted residency of power feature in Us */
+ u32 power_break_even_us;
+
+ /*
+ * Maximum number of allowed power feature cycles per sample.
+ *
+ * We are allowing at max "pgPerSampleMax" cycles in one iteration of AP
+ * AKA pgPerSampleMax in original algorithm.
+ */
+ u32 cycles_per_sample_max;
+};
+
+/* AP Commands/Message structures */
+
+/*
+ * Structure for Generic AP Commands
+ */
+struct pmu_ap_cmd_common {
+ u8 cmd_type;
+ u16 cmd_id;
+};
+
+/*
+ * Structure for INIT AP command
+ */
+struct pmu_ap_cmd_init {
+ u8 cmd_type;
+ u16 cmd_id;
+ u8 rsvd;
+ u32 pg_sampling_period_us;
+};
+
+/*
+ * Structure for Enable/Disable ApCtrl Commands
+ */
+struct pmu_ap_cmd_enable_ctrl {
+ u8 cmd_type;
+ u16 cmd_id;
+
+ u8 ctrl_id;
+};
+
+struct pmu_ap_cmd_disable_ctrl {
+ u8 cmd_type;
+ u16 cmd_id;
+
+ u8 ctrl_id;
+};
+
+/*
+ * Structure for INIT command
+ */
+struct pmu_ap_cmd_init_ctrl {
+ u8 cmd_type;
+ u16 cmd_id;
+ u8 ctrl_id;
+ struct pmu_ap_ctrl_init_params params;
+};
+
+struct pmu_ap_cmd_init_and_enable_ctrl {
+ u8 cmd_type;
+ u16 cmd_id;
+ u8 ctrl_id;
+ struct pmu_ap_ctrl_init_params params;
+};
+
+/*
+ * Structure for KICK_CTRL command
+ */
+struct pmu_ap_cmd_kick_ctrl {
+ u8 cmd_type;
+ u16 cmd_id;
+ u8 ctrl_id;
+
+ u32 skip_count;
+};
+
+/*
+ * Structure for PARAM command
+ */
+struct pmu_ap_cmd_param {
+ u8 cmd_type;
+ u16 cmd_id;
+ u8 ctrl_id;
+
+ u32 data;
+};
+
+/*
+ * Defines for AP commands
+ */
+enum {
+ PMU_AP_CMD_ID_INIT = 0x0 ,
+ PMU_AP_CMD_ID_INIT_AND_ENABLE_CTRL,
+ PMU_AP_CMD_ID_ENABLE_CTRL ,
+ PMU_AP_CMD_ID_DISABLE_CTRL ,
+ PMU_AP_CMD_ID_KICK_CTRL ,
+};
+
+/*
+ * AP Command
+ */
+union pmu_ap_cmd {
+ u8 cmd_type;
+ struct pmu_ap_cmd_common cmn;
+ struct pmu_ap_cmd_init init;
+ struct pmu_ap_cmd_init_and_enable_ctrl init_and_enable_ctrl;
+ struct pmu_ap_cmd_enable_ctrl enable_ctrl;
+ struct pmu_ap_cmd_disable_ctrl disable_ctrl;
+ struct pmu_ap_cmd_kick_ctrl kick_ctrl;
+};
+
+/*
+ * Structure for generic AP Message
+ */
+struct pmu_ap_msg_common {
+ u8 msg_type;
+ u16 msg_id;
+};
+
+/*
+ * Structure for INIT_ACK Message
+ */
+struct pmu_ap_msg_init_ack {
+ u8 msg_type;
+ u16 msg_id;
+ u8 ctrl_id;
+ u32 stats_dmem_offset;
+};
+
+/*
+ * Defines for AP messages
+ */
+enum {
+ PMU_AP_MSG_ID_INIT_ACK = 0x0,
+};
+
+/*
+ * AP Message
+ */
+union pmu_ap_msg {
+ u8 msg_type;
+ struct pmu_ap_msg_common cmn;
+ struct pmu_ap_msg_init_ack init_ack;
+};
+
+/* Default Sampling Period of AELPG */
+#define APCTRL_SAMPLING_PERIOD_PG_DEFAULT_US (1000000)
+
+/* Default values of APCTRL parameters */
+#define APCTRL_MINIMUM_IDLE_FILTER_DEFAULT_US (100)
+#define APCTRL_MINIMUM_TARGET_SAVING_DEFAULT_US (10000)
+#define APCTRL_POWER_BREAKEVEN_DEFAULT_US (2000)
+#define APCTRL_CYCLES_PER_SAMPLE_MAX_DEFAULT (100)
+
+/*
+ * Disable reason for Adaptive Power Controller
+ */
+enum {
+ APCTRL_DISABLE_REASON_RM_UNLOAD,
+ APCTRL_DISABLE_REASON_RMCTRL,
+};
+
+/*
+ * Adaptive Power Controller
+ */
+struct ap_ctrl {
+ u32 stats_dmem_offset;
+ u32 disable_reason_mask;
+ struct pmu_ap_ctrl_stat stat_cache;
+ u8 b_ready;
+};
+
+/*
+ * Adaptive Power structure
+ *
+ * ap structure provides generic infrastructure to make any power feature
+ * adaptive.
+ */
+struct pmu_ap {
+ u32 supported_mask;
+ struct ap_ctrl ap_ctrl[PMU_AP_CTRL_ID_MAX];
+};
+
+
+enum {
+ GK20A_PMU_DMAIDX_UCODE = 0,
+ GK20A_PMU_DMAIDX_VIRT = 1,
+ GK20A_PMU_DMAIDX_PHYS_VID = 2,
+ GK20A_PMU_DMAIDX_PHYS_SYS_COH = 3,
+ GK20A_PMU_DMAIDX_PHYS_SYS_NCOH = 4,
+ GK20A_PMU_DMAIDX_RSVD = 5,
+ GK20A_PMU_DMAIDX_PELPG = 6,
+ GK20A_PMU_DMAIDX_END = 7
+};
+
+struct pmu_mem_v0 {
+ u32 dma_base;
+ u8 dma_offset;
+ u8 dma_idx;
+};
+
+struct pmu_mem_v1 {
+ u32 dma_base;
+ u8 dma_offset;
+ u8 dma_idx;
+ u16 fb_size;
+};
+
+struct pmu_dmem {
+ u16 size;
+ u32 offset;
+};
+
+/* Make sure size of this structure is a multiple of 4 bytes */
+struct pmu_cmdline_args_v0 {
+ u32 cpu_freq_hz; /* Frequency of the clock driving PMU */
+ u32 falc_trace_size; /* falctrace buffer size (bytes) */
+ u32 falc_trace_dma_base; /* 256-byte block address */
+ u32 falc_trace_dma_idx; /* dmaIdx for DMA operations */
+ struct pmu_mem_v0 gc6_ctx; /* dmem offset of gc6 context */
+};
+
+struct pmu_cmdline_args_v1 {
+ u32 cpu_freq_hz; /* Frequency of the clock driving PMU */
+ u32 falc_trace_size; /* falctrace buffer size (bytes) */
+ u32 falc_trace_dma_base; /* 256-byte block address */
+ u32 falc_trace_dma_idx; /* dmaIdx for DMA operations */
+ u8 secure_mode;
+ struct pmu_mem_v1 gc6_ctx; /* dmem offset of gc6 context */
+};
+
+#define GK20A_PMU_DMEM_BLKSIZE2 8
+
+#define GK20A_PMU_UCODE_NB_MAX_OVERLAY 32
+#define GK20A_PMU_UCODE_NB_MAX_DATE_LENGTH 64
+
+struct pmu_ucode_desc {
+ u32 descriptor_size;
+ u32 image_size;
+ u32 tools_version;
+ u32 app_version;
+ char date[GK20A_PMU_UCODE_NB_MAX_DATE_LENGTH];
+ u32 bootloader_start_offset;
+ u32 bootloader_size;
+ u32 bootloader_imem_offset;
+ u32 bootloader_entry_point;
+ u32 app_start_offset;
+ u32 app_size;
+ u32 app_imem_offset;
+ u32 app_imem_entry;
+ u32 app_dmem_offset;
+ u32 app_resident_code_offset; /* Offset from appStartOffset */
+ u32 app_resident_code_size; /* Exact size of the resident code ( potentially contains CRC inside at the end ) */
+ u32 app_resident_data_offset; /* Offset from appStartOffset */
+	u32 app_resident_data_size;   /* Exact size of the resident data ( potentially contains CRC inside at the end ) */
+ u32 nb_overlays;
+ struct {u32 start; u32 size;} load_ovl[GK20A_PMU_UCODE_NB_MAX_OVERLAY];
+ u32 compressed;
+};
+
+#define PMU_UNIT_REWIND (0x00)
+#define PMU_UNIT_I2C (0x01)
+#define PMU_UNIT_SEQ (0x02)
+#define PMU_UNIT_PG (0x03)
+#define PMU_UNIT_AVAILABLE1 (0x04)
+#define PMU_UNIT_AVAILABLE2 (0x05)
+#define PMU_UNIT_MEM (0x06)
+#define PMU_UNIT_INIT (0x07)
+#define PMU_UNIT_FBBA (0x08)
+#define PMU_UNIT_DIDLE (0x09)
+#define PMU_UNIT_AVAILABLE3 (0x0A)
+#define PMU_UNIT_AVAILABLE4 (0x0B)
+#define PMU_UNIT_HDCP_MAIN (0x0C)
+#define PMU_UNIT_HDCP_V (0x0D)
+#define PMU_UNIT_HDCP_SRM (0x0E)
+#define PMU_UNIT_NVDPS (0x0F)
+#define PMU_UNIT_DEINIT (0x10)
+#define PMU_UNIT_AVAILABLE5 (0x11)
+#define PMU_UNIT_PERFMON (0x12)
+#define PMU_UNIT_FAN (0x13)
+#define PMU_UNIT_PBI (0x14)
+#define PMU_UNIT_ISOBLIT (0x15)
+#define PMU_UNIT_DETACH (0x16)
+#define PMU_UNIT_DISP (0x17)
+#define PMU_UNIT_HDCP (0x18)
+#define PMU_UNIT_REGCACHE (0x19)
+#define PMU_UNIT_SYSMON (0x1A)
+#define PMU_UNIT_THERM (0x1B)
+#define PMU_UNIT_PMGR (0x1C)
+#define PMU_UNIT_PERF (0x1D)
+#define PMU_UNIT_PCM (0x1E)
+#define PMU_UNIT_RC (0x1F)
+#define PMU_UNIT_NULL (0x20)
+#define PMU_UNIT_LOGGER (0x21)
+#define PMU_UNIT_SMBPBI (0x22)
+#define PMU_UNIT_END (0x23)
+
+#define PMU_UNIT_TEST_START (0xFE)
+#define PMU_UNIT_END_SIM (0xFF)
+#define PMU_UNIT_TEST_END (0xFF)
+
+#define PMU_UNIT_ID_IS_VALID(id) \
+ (((id) < PMU_UNIT_END) || ((id) >= PMU_UNIT_TEST_START))
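+/*
+ * Illustrative checks (editorial note, not part of the original change):
+ * PMU_UNIT_PERFMON (0x12) is valid via (id < PMU_UNIT_END), 0x40 fails both
+ * tests, and PMU_UNIT_END_SIM (0xFF) is accepted via the
+ * (id >= PMU_UNIT_TEST_START) range.
+ */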
+
+#define PMU_DMEM_ALLOC_ALIGNMENT (32)
+#define PMU_DMEM_ALIGNMENT (4)
+
+#define PMU_CMD_FLAGS_PMU_MASK (0xF0)
+
+#define PMU_CMD_FLAGS_STATUS BIT(0)
+#define PMU_CMD_FLAGS_INTR BIT(1)
+#define PMU_CMD_FLAGS_EVENT BIT(2)
+#define PMU_CMD_FLAGS_WATERMARK BIT(3)
+
+struct pmu_hdr {
+ u8 unit_id;
+ u8 size;
+ u8 ctrl_flags;
+ u8 seq_id;
+};
+#define PMU_MSG_HDR_SIZE sizeof(struct pmu_hdr)
+#define PMU_CMD_HDR_SIZE sizeof(struct pmu_hdr)
+
+#define PMU_QUEUE_COUNT 5
+
+struct pmu_allocation_v0 {
+ u8 pad[3];
+ u8 fb_mem_use;
+ struct {
+ struct pmu_dmem dmem;
+ struct pmu_mem_v0 fb;
+ } alloc;
+};
+
+struct pmu_allocation_v1 {
+ struct {
+ struct pmu_dmem dmem;
+ struct pmu_mem_v1 fb;
+ } alloc;
+};
+
+enum {
+ PMU_INIT_MSG_TYPE_PMU_INIT = 0,
+};
+
+struct pmu_init_msg_pmu_v0 {
+ u8 msg_type;
+ u8 pad;
+
+ struct {
+ u16 size;
+ u16 offset;
+ u8 index;
+ u8 pad;
+ } queue_info[PMU_QUEUE_COUNT];
+
+ u16 sw_managed_area_offset;
+ u16 sw_managed_area_size;
+};
+
+struct pmu_init_msg_pmu_v1 {
+ u8 msg_type;
+ u8 pad;
+ u16 os_debug_entry_point;
+
+ struct {
+ u16 size;
+ u16 offset;
+ u8 index;
+ u8 pad;
+ } queue_info[PMU_QUEUE_COUNT];
+
+ u16 sw_managed_area_offset;
+ u16 sw_managed_area_size;
+};
+
+union pmu_init_msg_pmu {
+ struct pmu_init_msg_pmu_v0 v0;
+ struct pmu_init_msg_pmu_v1 v1;
+};
+
+struct pmu_init_msg {
+ union {
+ u8 msg_type;
+ struct pmu_init_msg_pmu_v1 pmu_init_v1;
+ struct pmu_init_msg_pmu_v0 pmu_init_v0;
+ };
+};
+
+enum {
+ PMU_PG_ELPG_MSG_INIT_ACK,
+ PMU_PG_ELPG_MSG_DISALLOW_ACK,
+ PMU_PG_ELPG_MSG_ALLOW_ACK,
+ PMU_PG_ELPG_MSG_FREEZE_ACK,
+ PMU_PG_ELPG_MSG_FREEZE_ABORT,
+ PMU_PG_ELPG_MSG_UNFREEZE_ACK,
+};
+
+struct pmu_pg_msg_elpg_msg {
+ u8 msg_type;
+ u8 engine_id;
+ u16 msg;
+};
+
+enum {
+ PMU_PG_STAT_MSG_RESP_DMEM_OFFSET = 0,
+};
+
+struct pmu_pg_msg_stat {
+ u8 msg_type;
+ u8 engine_id;
+ u16 sub_msg_id;
+ u32 data;
+};
+
+enum {
+ PMU_PG_MSG_ENG_BUF_LOADED,
+ PMU_PG_MSG_ENG_BUF_UNLOADED,
+ PMU_PG_MSG_ENG_BUF_FAILED,
+};
+
+struct pmu_pg_msg_eng_buf_stat {
+ u8 msg_type;
+ u8 engine_id;
+ u8 buf_idx;
+ u8 status;
+};
+
+struct pmu_pg_msg {
+ union {
+ u8 msg_type;
+ struct pmu_pg_msg_elpg_msg elpg_msg;
+ struct pmu_pg_msg_stat stat;
+ struct pmu_pg_msg_eng_buf_stat eng_buf_stat;
+ /* TBD: other pg messages */
+ union pmu_ap_msg ap_msg;
+ };
+};
+
+enum {
+ PMU_RC_MSG_TYPE_UNHANDLED_CMD = 0,
+};
+
+struct pmu_rc_msg_unhandled_cmd {
+ u8 msg_type;
+ u8 unit_id;
+};
+
+struct pmu_rc_msg {
+ u8 msg_type;
+ struct pmu_rc_msg_unhandled_cmd unhandled_cmd;
+};
+
+enum {
+ PMU_PG_CMD_ID_ELPG_CMD = 0,
+ PMU_PG_CMD_ID_ENG_BUF_LOAD,
+ PMU_PG_CMD_ID_ENG_BUF_UNLOAD,
+ PMU_PG_CMD_ID_PG_STAT,
+ PMU_PG_CMD_ID_PG_LOG_INIT,
+ PMU_PG_CMD_ID_PG_LOG_FLUSH,
+ PMU_PG_CMD_ID_PG_PARAM,
+ PMU_PG_CMD_ID_ELPG_INIT,
+ PMU_PG_CMD_ID_ELPG_POLL_CTXSAVE,
+ PMU_PG_CMD_ID_ELPG_ABORT_POLL,
+ PMU_PG_CMD_ID_ELPG_PWR_UP,
+ PMU_PG_CMD_ID_ELPG_DISALLOW,
+ PMU_PG_CMD_ID_ELPG_ALLOW,
+ PMU_PG_CMD_ID_AP,
+ RM_PMU_PG_CMD_ID_PSI,
+ RM_PMU_PG_CMD_ID_CG,
+ PMU_PG_CMD_ID_ZBC_TABLE_UPDATE,
+ PMU_PG_CMD_ID_PWR_RAIL_GATE_DISABLE = 0x20,
+ PMU_PG_CMD_ID_PWR_RAIL_GATE_ENABLE,
+ PMU_PG_CMD_ID_PWR_RAIL_SMU_MSG_DISABLE
+};
+
+enum {
+ PMU_PG_ELPG_CMD_INIT,
+ PMU_PG_ELPG_CMD_DISALLOW,
+ PMU_PG_ELPG_CMD_ALLOW,
+ PMU_PG_ELPG_CMD_FREEZE,
+ PMU_PG_ELPG_CMD_UNFREEZE,
+};
+
+struct pmu_pg_cmd_elpg_cmd {
+ u8 cmd_type;
+ u8 engine_id;
+ u16 cmd;
+};
+
+struct pmu_pg_cmd_eng_buf_load {
+ u8 cmd_type;
+ u8 engine_id;
+ u8 buf_idx;
+ u8 pad;
+ u16 buf_size;
+ u32 dma_base;
+ u8 dma_offset;
+ u8 dma_idx;
+};
+
+enum {
+ PMU_PG_STAT_CMD_ALLOC_DMEM = 0,
+};
+
+struct pmu_pg_cmd_stat {
+ u8 cmd_type;
+ u8 engine_id;
+ u16 sub_cmd_id;
+ u32 data;
+};
+
+struct pmu_pg_cmd {
+ union {
+ u8 cmd_type;
+ struct pmu_pg_cmd_elpg_cmd elpg_cmd;
+ struct pmu_pg_cmd_eng_buf_load eng_buf_load;
+ struct pmu_pg_cmd_stat stat;
+ /* TBD: other pg commands */
+ union pmu_ap_cmd ap_cmd;
+ };
+};
+
+/* PERFMON */
+#define PMU_DOMAIN_GROUP_PSTATE 0
+#define PMU_DOMAIN_GROUP_GPC2CLK 1
+#define PMU_DOMAIN_GROUP_NUM 2
+
+/* TBD: smart strategy */
+#define PMU_PERFMON_PCT_TO_INC 58
+#define PMU_PERFMON_PCT_TO_DEC 23
+
+struct pmu_perfmon_counter {
+ u8 index;
+ u8 flags;
+ u8 group_id;
+ u8 valid;
+ u16 upper_threshold; /* units of 0.01% */
+ u16 lower_threshold; /* units of 0.01% */
+};
+
+#define PMU_PERFMON_FLAG_ENABLE_INCREASE (0x00000001)
+#define PMU_PERFMON_FLAG_ENABLE_DECREASE (0x00000002)
+#define PMU_PERFMON_FLAG_CLEAR_PREV (0x00000004)
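+/*
+ * Illustrative use (editorial note, not part of the original change):
+ * pmu_perfmon_start_sampling() sets only ENABLE_DECREASE at the maximum
+ * gpc2clk rate, only ENABLE_INCREASE at the minimum rate, both otherwise,
+ * and always ORs in CLEAR_PREV.
+ */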
+
+/* PERFMON CMD */
+enum {
+ PMU_PERFMON_CMD_ID_START = 0,
+ PMU_PERFMON_CMD_ID_STOP = 1,
+ PMU_PERFMON_CMD_ID_INIT = 2
+};
+
+struct pmu_perfmon_cmd_start_v1 {
+ u8 cmd_type;
+ u8 group_id;
+ u8 state_id;
+ u8 flags;
+ struct pmu_allocation_v1 counter_alloc;
+};
+
+struct pmu_perfmon_cmd_start_v0 {
+ u8 cmd_type;
+ u8 group_id;
+ u8 state_id;
+ u8 flags;
+ struct pmu_allocation_v0 counter_alloc;
+};
+
+struct pmu_perfmon_cmd_stop {
+ u8 cmd_type;
+};
+
+struct pmu_perfmon_cmd_init_v1 {
+ u8 cmd_type;
+ u8 to_decrease_count;
+ u8 base_counter_id;
+ u32 sample_period_us;
+ struct pmu_allocation_v1 counter_alloc;
+ u8 num_counters;
+ u8 samples_in_moving_avg;
+ u16 sample_buffer;
+};
+
+struct pmu_perfmon_cmd_init_v0 {
+ u8 cmd_type;
+ u8 to_decrease_count;
+ u8 base_counter_id;
+ u32 sample_period_us;
+ struct pmu_allocation_v0 counter_alloc;
+ u8 num_counters;
+ u8 samples_in_moving_avg;
+ u16 sample_buffer;
+};
+
+struct pmu_perfmon_cmd {
+ union {
+ u8 cmd_type;
+ struct pmu_perfmon_cmd_start_v0 start_v0;
+ struct pmu_perfmon_cmd_start_v1 start_v1;
+ struct pmu_perfmon_cmd_stop stop;
+ struct pmu_perfmon_cmd_init_v0 init_v0;
+ struct pmu_perfmon_cmd_init_v1 init_v1;
+ };
+};
+
+struct pmu_zbc_cmd {
+ u8 cmd_type;
+ u8 pad;
+ u16 entry_mask;
+};
+
+/* PERFMON MSG */
+enum {
+ PMU_PERFMON_MSG_ID_INCREASE_EVENT = 0,
+ PMU_PERFMON_MSG_ID_DECREASE_EVENT = 1,
+ PMU_PERFMON_MSG_ID_INIT_EVENT = 2,
+ PMU_PERFMON_MSG_ID_ACK = 3
+};
+
+struct pmu_perfmon_msg_generic {
+ u8 msg_type;
+ u8 state_id;
+ u8 group_id;
+ u8 data;
+};
+
+struct pmu_perfmon_msg {
+ union {
+ u8 msg_type;
+ struct pmu_perfmon_msg_generic gen;
+ };
+};
+
+
+struct pmu_cmd {
+ struct pmu_hdr hdr;
+ union {
+ struct pmu_perfmon_cmd perfmon;
+ struct pmu_pg_cmd pg;
+ struct pmu_zbc_cmd zbc;
+ } cmd;
+};
+
+struct pmu_msg {
+ struct pmu_hdr hdr;
+ union {
+ struct pmu_init_msg init;
+ struct pmu_perfmon_msg perfmon;
+ struct pmu_pg_msg pg;
+ struct pmu_rc_msg rc;
+ } msg;
+};
+
+#define PMU_SHA1_GID_SIGNATURE 0xA7C66AD2
+#define PMU_SHA1_GID_SIGNATURE_SIZE 4
+
+#define PMU_SHA1_GID_SIZE 16
+
+struct pmu_sha1_gid {
+ bool valid;
+ u8 gid[PMU_SHA1_GID_SIZE];
+};
+
+struct pmu_sha1_gid_data {
+ u8 signature[PMU_SHA1_GID_SIGNATURE_SIZE];
+ u8 gid[PMU_SHA1_GID_SIZE];
+};
+
+#define PMU_COMMAND_QUEUE_HPQ 0 /* write by sw, read by pmu, protected by sw mutex lock */
+#define PMU_COMMAND_QUEUE_LPQ 1 /* write by sw, read by pmu, protected by sw mutex lock */
+#define PMU_COMMAND_QUEUE_BIOS 2 /* read/write by sw/hw, protected by hw pmu mutex, id = 2 */
+#define PMU_COMMAND_QUEUE_SMI 3 /* read/write by sw/hw, protected by hw pmu mutex, id = 3 */
+#define PMU_MESSAGE_QUEUE 4 /* write by pmu, read by sw, accessed by interrupt handler, no lock */
+#define PMU_QUEUE_COUNT 5
+
+enum {
+ PMU_MUTEX_ID_RSVD1 = 0 ,
+ PMU_MUTEX_ID_GPUSER ,
+ PMU_MUTEX_ID_QUEUE_BIOS ,
+ PMU_MUTEX_ID_QUEUE_SMI ,
+ PMU_MUTEX_ID_GPMUTEX ,
+ PMU_MUTEX_ID_I2C ,
+ PMU_MUTEX_ID_RMLOCK ,
+ PMU_MUTEX_ID_MSGBOX ,
+ PMU_MUTEX_ID_FIFO ,
+ PMU_MUTEX_ID_PG ,
+ PMU_MUTEX_ID_GR ,
+ PMU_MUTEX_ID_CLK ,
+ PMU_MUTEX_ID_RSVD6 ,
+ PMU_MUTEX_ID_RSVD7 ,
+ PMU_MUTEX_ID_RSVD8 ,
+ PMU_MUTEX_ID_RSVD9 ,
+ PMU_MUTEX_ID_INVALID
+};
+
+#define PMU_IS_COMMAND_QUEUE(id) \
+ ((id) < PMU_MESSAGE_QUEUE)
+
+#define PMU_IS_SW_COMMAND_QUEUE(id) \
+ (((id) == PMU_COMMAND_QUEUE_HPQ) || \
+ ((id) == PMU_COMMAND_QUEUE_LPQ))
+
+#define PMU_IS_MESSAGE_QUEUE(id) \
+ ((id) == PMU_MESSAGE_QUEUE)
+
+enum {
+ OFLAG_READ = 0,
+ OFLAG_WRITE
+};
+
+#define QUEUE_SET (true)
+#define QUEUE_GET (false)
+
+#define QUEUE_ALIGNMENT (4)
+
+#define PMU_PGENG_GR_BUFFER_IDX_INIT (0)
+#define PMU_PGENG_GR_BUFFER_IDX_ZBC (1)
+#define PMU_PGENG_GR_BUFFER_IDX_FECS (2)
+
+enum {
+ PMU_DMAIDX_UCODE = 0,
+ PMU_DMAIDX_VIRT = 1,
+ PMU_DMAIDX_PHYS_VID = 2,
+ PMU_DMAIDX_PHYS_SYS_COH = 3,
+ PMU_DMAIDX_PHYS_SYS_NCOH = 4,
+ PMU_DMAIDX_RSVD = 5,
+ PMU_DMAIDX_PELPG = 6,
+ PMU_DMAIDX_END = 7
+};
+
+struct pmu_gk20a;
+struct pmu_queue;
+
+struct pmu_queue {
+
+ /* used by hw, for BIOS/SMI queue */
+ u32 mutex_id;
+ u32 mutex_lock;
+ /* used by sw, for LPQ/HPQ queue */
+ struct mutex mutex;
+
+ /* current write position */
+ u32 position;
+ /* physical dmem offset where this queue begins */
+ u32 offset;
+ /* logical queue identifier */
+ u32 id;
+ /* physical queue index */
+ u32 index;
+ /* in bytes */
+ u32 size;
+
+ /* open-flag */
+ u32 oflag;
+ bool opened; /* opened implies locked */
+ bool locked; /* check free space after setting locked but before setting opened */
+};
+
+
+#define PMU_MUTEX_ID_IS_VALID(id) \
+ ((id) < PMU_MUTEX_ID_INVALID)
+
+#define PMU_INVALID_MUTEX_OWNER_ID (0)
+
+struct pmu_mutex {
+ u32 id;
+ u32 index;
+ u32 ref_cnt;
+};
+
+#define PMU_MAX_NUM_SEQUENCES (256)
+#define PMU_SEQ_BIT_SHIFT (5)
+#define PMU_SEQ_TBL_SIZE \
+ (PMU_MAX_NUM_SEQUENCES >> PMU_SEQ_BIT_SHIFT)
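+/*
+ * Illustrative sizing (editorial note, not part of the original change):
+ * with PMU_MAX_NUM_SEQUENCES = 256 and a shift of 5 (assuming 32-bit bitmap
+ * words), pmu_seq_tbl spans 256 >> 5 = 8 unsigned longs, one bit per
+ * sequence.
+ */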
+
+#define PMU_INVALID_SEQ_DESC (~0)
+
+enum {
+ PMU_SEQ_STATE_FREE = 0,
+ PMU_SEQ_STATE_PENDING,
+ PMU_SEQ_STATE_USED,
+ PMU_SEQ_STATE_CANCELLED
+};
+
+struct pmu_payload {
+ struct {
+ void *buf;
+ u32 offset;
+ u32 size;
+ } in, out;
+};
+
+typedef void (*pmu_callback)(struct gk20a *, struct pmu_msg *, void *, u32,
+ u32);
+
+struct pmu_sequence {
+ u8 id;
+ u32 state;
+ u32 desc;
+ struct pmu_msg *msg;
+ union {
+ struct pmu_allocation_v0 in_v0;
+ struct pmu_allocation_v1 in_v1;
+ };
+ union {
+ struct pmu_allocation_v0 out_v0;
+ struct pmu_allocation_v1 out_v1;
+ };
+ u8 *out_payload;
+ pmu_callback callback;
+	void *cb_params;
+};
+
+struct pmu_pg_stats {
+ u64 pg_entry_start_timestamp;
+ u64 pg_ingating_start_timestamp;
+ u64 pg_exit_start_timestamp;
+ u64 pg_ungating_start_timestamp;
+ u32 pg_avg_entry_time_us;
+ u32 pg_ingating_cnt;
+ u32 pg_ingating_time_us;
+ u32 pg_avg_exit_time_us;
+ u32 pg_ungating_count;
+ u32 pg_ungating_time_us;
+ u32 pg_gating_cnt;
+ u32 pg_gating_deny_cnt;
+};
+
+#define PMU_PG_IDLE_THRESHOLD_SIM 1000
+#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD_SIM 4000000
+/* TBD: QT or else ? */
+#define PMU_PG_IDLE_THRESHOLD 15000
+#define PMU_PG_POST_POWERUP_IDLE_THRESHOLD 1000000
+
+/* state transition :
+ OFF => [OFF_ON_PENDING optional] => ON_PENDING => ON => OFF
+ ON => OFF is always synchronized */
+#define PMU_ELPG_STAT_OFF 0 /* elpg is off */
+#define PMU_ELPG_STAT_ON 1 /* elpg is on */
+#define PMU_ELPG_STAT_ON_PENDING 2 /* elpg is off, ALLOW cmd has been sent, wait for ack */
+#define PMU_ELPG_STAT_OFF_PENDING 3 /* elpg is on, DISALLOW cmd has been sent, wait for ack */
+#define PMU_ELPG_STAT_OFF_ON_PENDING 4 /* elpg is off, caller has requested on, but ALLOW
+ cmd hasn't been sent due to ENABLE_ALLOW delay */
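+/*
+ * Illustrative sequence (editorial note, not part of the original change):
+ * gk20a_pmu_enable_elpg() moves OFF -> ON_PENDING and posts an ALLOW cmd;
+ * the ELPG message handler is expected to move ON_PENDING -> ON on the
+ * ALLOW ack. gk20a_pmu_disable_elpg() moves ON -> OFF_PENDING, posts a
+ * DISALLOW cmd and waits for the handler to reach OFF.
+ */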
+
+/* Falcon Register index */
+#define PMU_FALCON_REG_R0 (0)
+#define PMU_FALCON_REG_R1 (1)
+#define PMU_FALCON_REG_R2 (2)
+#define PMU_FALCON_REG_R3 (3)
+#define PMU_FALCON_REG_R4 (4)
+#define PMU_FALCON_REG_R5 (5)
+#define PMU_FALCON_REG_R6 (6)
+#define PMU_FALCON_REG_R7 (7)
+#define PMU_FALCON_REG_R8 (8)
+#define PMU_FALCON_REG_R9 (9)
+#define PMU_FALCON_REG_R10 (10)
+#define PMU_FALCON_REG_R11 (11)
+#define PMU_FALCON_REG_R12 (12)
+#define PMU_FALCON_REG_R13 (13)
+#define PMU_FALCON_REG_R14 (14)
+#define PMU_FALCON_REG_R15 (15)
+#define PMU_FALCON_REG_IV0 (16)
+#define PMU_FALCON_REG_IV1 (17)
+#define PMU_FALCON_REG_UNDEFINED (18)
+#define PMU_FALCON_REG_EV (19)
+#define PMU_FALCON_REG_SP (20)
+#define PMU_FALCON_REG_PC (21)
+#define PMU_FALCON_REG_IMB (22)
+#define PMU_FALCON_REG_DMB (23)
+#define PMU_FALCON_REG_CSW (24)
+#define PMU_FALCON_REG_CCR (25)
+#define PMU_FALCON_REG_SEC (26)
+#define PMU_FALCON_REG_CTX (27)
+#define PMU_FALCON_REG_EXCI (28)
+#define PMU_FALCON_REG_RSVD0 (29)
+#define PMU_FALCON_REG_RSVD1 (30)
+#define PMU_FALCON_REG_RSVD2 (31)
+#define PMU_FALCON_REG_SIZE (32)
+
+struct pmu_gk20a {
+
+ struct gk20a *g;
+
+ struct pmu_ucode_desc *desc;
+ struct pmu_mem_desc ucode;
+
+ struct pmu_mem_desc pg_buf;
+ /* TBD: remove this if ZBC seq is fixed */
+ struct pmu_mem_desc seq_buf;
+ bool buf_loaded;
+
+ struct pmu_sha1_gid gid_info;
+
+ struct pmu_queue queue[PMU_QUEUE_COUNT];
+
+ struct pmu_sequence *seq;
+ unsigned long pmu_seq_tbl[PMU_SEQ_TBL_SIZE];
+ u32 next_seq_desc;
+
+ struct pmu_mutex *mutex;
+ u32 mutex_cnt;
+
+ struct mutex pmu_copy_lock;
+ struct mutex pmu_seq_lock;
+
+ struct gk20a_allocator dmem;
+
+ u32 *ucode_image;
+ bool pmu_ready;
+
+ u32 zbc_save_done;
+
+ u32 stat_dmem_offset;
+
+ bool elpg_ready;
+ u32 elpg_stat;
+ wait_queue_head_t pg_wq;
+
+#define PMU_ELPG_ENABLE_ALLOW_DELAY_MSEC 1 /* msec */
+ struct delayed_work elpg_enable; /* deferred elpg enable */
+ struct work_struct pg_init;
+ bool elpg_enable_allow; /* true after init, false after disable, true after delay */
+ struct mutex elpg_mutex; /* protect elpg enable/disable */
+ int elpg_refcnt; /* disable -1, enable +1, <=0 elpg disabled, > 0 elpg enabled */
+
+ struct pmu_perfmon_counter perfmon_counter;
+ u32 perfmon_state_id[PMU_DOMAIN_GROUP_NUM];
+
+ bool initialized;
+
+ void (*remove_support)(struct pmu_gk20a *pmu);
+ bool sw_ready;
+ bool perfmon_ready;
+
+ u32 sample_buffer;
+
+ struct mutex isr_mutex;
+ bool zbc_ready;
+ union {
+ struct pmu_cmdline_args_v0 args_v0;
+ struct pmu_cmdline_args_v1 args_v1;
+ };
+};
+
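+/*
+ * Snapshot of PMU bookkeeping that is presumably preserved across a PMU
+ * teardown/re-init cycle (e.g. around railgating). The exact caller is not
+ * shown in this file; the field list simply mirrors the corresponding
+ * members of struct pmu_gk20a above.
+ */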
+struct gk20a_pmu_save_state {
+ struct pmu_sequence *seq;
+ u32 next_seq_desc;
+ struct pmu_mutex *mutex;
+ u32 mutex_cnt;
+ struct pmu_ucode_desc *desc;
+ struct pmu_mem_desc ucode;
+ struct pmu_mem_desc seq_buf;
+ struct pmu_mem_desc pg_buf;
+ struct delayed_work elpg_enable;
+ wait_queue_head_t pg_wq;
+ bool sw_ready;
+ struct work_struct pg_init;
+};
+
+int gk20a_init_pmu_support(struct gk20a *g);
+int gk20a_init_pmu_setup_hw2(struct gk20a *g);
+
+void gk20a_pmu_isr(struct gk20a *g);
+
+/* send a cmd to pmu */
+int gk20a_pmu_cmd_post(struct gk20a *g, struct pmu_cmd *cmd, struct pmu_msg *msg,
+ struct pmu_payload *payload, u32 queue_id,
+ pmu_callback callback, void* cb_param,
+ u32 *seq_desc, unsigned long timeout);
+
+int gk20a_pmu_enable_elpg(struct gk20a *g);
+int gk20a_pmu_disable_elpg(struct gk20a *g);
+
+void gk20a_pmu_save_zbc(struct gk20a *g, u32 entries);
+
+int gk20a_pmu_perfmon_enable(struct gk20a *g, bool enable);
+
+int pmu_mutex_acquire(struct pmu_gk20a *pmu, u32 id, u32 *token);
+int pmu_mutex_release(struct pmu_gk20a *pmu, u32 id, u32 *token);
+int gk20a_pmu_destroy(struct gk20a *g);
+int gk20a_pmu_load_norm(struct gk20a *g, u32 *load);
+int gk20a_pmu_debugfs_init(struct platform_device *dev);
+void gk20a_pmu_reset_load_counters(struct gk20a *g);
+void gk20a_pmu_get_load_counters(struct gk20a *g, u32 *busy_cycles,
+ u32 *total_cycles);
+
+#endif /*__PMU_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c
new file mode 100644
index 000000000000..aea1a80bbcad
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.c
@@ -0,0 +1,91 @@
+/*
+ * GK20A priv ring
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/delay.h> /* for usleep_range */
+
+#include "gk20a.h"
+#include "hw_mc_gk20a.h"
+#include "hw_pri_ringmaster_gk20a.h"
+#include "hw_pri_ringstation_sys_gk20a.h"
+#include "hw_trim_gk20a.h"
+
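+/*
+ * Bring the PRIV ring out of reset: clear the gpc2clk bypass divider,
+ * reset the priv ring unit through MC, then issue a ringmaster command and
+ * program the system ring station decode config. The 0x4 / 0x2 writes use
+ * raw values; their decoded meaning is not spelled out in this file.
+ */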
+void gk20a_reset_priv_ring(struct gk20a *g)
+{
+ u32 data;
+
+ if (tegra_platform_is_linsim())
+ return;
+
+ data = gk20a_readl(g, trim_sys_gpc2clk_out_r());
+ data = set_field(data,
+ trim_sys_gpc2clk_out_bypdiv_m(),
+ trim_sys_gpc2clk_out_bypdiv_f(0));
+ gk20a_writel(g, trim_sys_gpc2clk_out_r(), data);
+
+ gk20a_reset(g, mc_enable_priv_ring_enabled_f());
+
+ gk20a_writel(g, pri_ringmaster_command_r(), 0x4);
+
+ gk20a_writel(g, pri_ringstation_sys_decode_config_r(), 0x2);
+
+ gk20a_readl(g, pri_ringstation_sys_decode_config_r());
+}
+
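+/*
+ * PRIV ringmaster interrupt handler: log both status registers, reset the
+ * ring if any of the low three status bits are set, then ack the interrupt
+ * through the ringmaster command register and poll (bounded by the retry
+ * count) until the command field reads back as "no command".
+ */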
+void gk20a_priv_ring_isr(struct gk20a *g)
+{
+ u32 status0, status1;
+ u32 cmd;
+ s32 retry = 100;
+
+ if (tegra_platform_is_linsim())
+ return;
+
+ status0 = gk20a_readl(g, pri_ringmaster_intr_status0_r());
+ status1 = gk20a_readl(g, pri_ringmaster_intr_status1_r());
+
+ gk20a_dbg_info("ringmaster intr status0: 0x%08x,"
+ "status1: 0x%08x", status0, status1);
+
+ if (status0 & (0x1 | 0x2 | 0x4)) {
+ gk20a_reset_priv_ring(g);
+ }
+
+ cmd = gk20a_readl(g, pri_ringmaster_command_r());
+ cmd = set_field(cmd, pri_ringmaster_command_cmd_m(),
+ pri_ringmaster_command_cmd_ack_interrupt_f());
+ gk20a_writel(g, pri_ringmaster_command_r(), cmd);
+
+ do {
+ cmd = pri_ringmaster_command_cmd_v(
+ gk20a_readl(g, pri_ringmaster_command_r()));
+ usleep_range(20, 40);
+ } while (cmd != pri_ringmaster_command_cmd_no_cmd_v() && --retry);
+
+ if (retry <= 0)
+ gk20a_warn(dev_from_gk20a(g),
+ "priv ringmaster cmd ack too many retries");
+
+ status0 = gk20a_readl(g, pri_ringmaster_intr_status0_r());
+ status1 = gk20a_readl(g, pri_ringmaster_intr_status1_r());
+
+ gk20a_dbg_info("ringmaster intr status0: 0x%08x,"
+ " status1: 0x%08x", status0, status1);
+}
+
diff --git a/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h
new file mode 100644
index 000000000000..cb9d49c7be07
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/priv_ring_gk20a.h
@@ -0,0 +1,27 @@
+/*
+ * drivers/video/tegra/host/gk20a/priv_ring_gk20a.h
+ *
+ * GK20A PRIV ringmaster
+ *
+ * Copyright (c) 2011-2012, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __PRIV_RING_GK20A_H__
+#define __PRIV_RING_GK20A_H__
+
+void gk20a_reset_priv_ring(struct gk20a *g);
+void gk20a_priv_ring_isr(struct gk20a *g);
+
+#endif /*__PRIV_RING_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.c b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c
new file mode 100644
index 000000000000..4a115fb10fac
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.c
@@ -0,0 +1,704 @@
+/*
+ *
+ * Tegra GK20A GPU Debugger Driver Register Ops
+ *
+ * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bsearch.h>
+#include <linux/nvhost_dbg_gpu_ioctl.h>
+
+#include "gk20a.h"
+#include "gr_gk20a.h"
+#include "dbg_gpu_gk20a.h"
+#include "regops_gk20a.h"
+
+
+
+struct regop_offset_range {
+ u32 base:24;
+ u32 count:8;
+};
+
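+/*
+ * A whitelist entry packs a 24-bit base byte offset and a count of
+ * consecutive 32-bit registers. The comparator below is used with
+ * bsearch(), so the range tables must stay sorted by base offset.
+ */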
+static int regop_bsearch_range_cmp(const void *pkey, const void *pelem)
+{
+ u32 key = *(u32 *)pkey;
+ struct regop_offset_range *prange = (struct regop_offset_range *)pelem;
+ if (key < prange->base)
+ return -1;
+ else if (prange->base <= key && key < (prange->base +
+ (prange->count * 4)))
+ return 0;
+ return 1;
+}
+
+static inline bool linear_search(u32 offset, const u32 *list, int size)
+{
+ int i;
+ for (i = 0; i < size; i++)
+ if (list[i] == offset)
+ return true;
+ return false;
+}
+
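+/*
+ * BAR0 ranges the debugger interface may touch for global (non-context)
+ * register ops; each entry is { base, number of consecutive 32-bit regs }.
+ */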
+static const struct regop_offset_range gk20a_global_whitelist_ranges[] = {
+ { 0x000004f0, 1 },
+ { 0x00001a00, 3 },
+ { 0x0000259c, 1 },
+ { 0x0000280c, 1 },
+ { 0x00009400, 1 },
+ { 0x00009410, 1 },
+ { 0x00020200, 1 },
+ { 0x00022430, 7 },
+ { 0x00022548, 1 },
+ { 0x00100c18, 3 },
+ { 0x00100c84, 1 },
+ { 0x00100cc4, 1 },
+ { 0x00106640, 1 },
+ { 0x0010a0a8, 1 },
+ { 0x0010a4f0, 1 },
+ { 0x0010e064, 1 },
+ { 0x0010e164, 1 },
+ { 0x0010e490, 1 },
+ { 0x00110100, 1 },
+ { 0x00140028, 1 },
+ { 0x001408dc, 1 },
+ { 0x00140a5c, 1 },
+ { 0x001410dc, 1 },
+ { 0x0014125c, 1 },
+ { 0x0017e028, 1 },
+ { 0x0017e8dc, 1 },
+ { 0x0017ea5c, 1 },
+ { 0x0017f0dc, 1 },
+ { 0x0017f25c, 1 },
+ { 0x00180000, 68 },
+ { 0x00180200, 68 },
+ { 0x001a0000, 68 },
+ { 0x001b0000, 68 },
+ { 0x001b0200, 68 },
+ { 0x001b0400, 68 },
+ { 0x001b0600, 68 },
+ { 0x001b4000, 3 },
+ { 0x001b4010, 3 },
+ { 0x001b4020, 3 },
+ { 0x001b4040, 3 },
+ { 0x001b4050, 3 },
+ { 0x001b4060, 16 },
+ { 0x001b40a4, 1 },
+ { 0x001b4100, 6 },
+ { 0x001b4124, 2 },
+ { 0x001b8000, 7 },
+ { 0x001bc000, 7 },
+ { 0x001be000, 7 },
+ { 0x00400500, 1 },
+ { 0x00400700, 1 },
+ { 0x0040415c, 1 },
+ { 0x00405850, 1 },
+ { 0x00405908, 1 },
+ { 0x00405b40, 1 },
+ { 0x00405b50, 1 },
+ { 0x00406024, 1 },
+ { 0x00407010, 1 },
+ { 0x00407808, 1 },
+ { 0x0040803c, 1 },
+ { 0x0040880c, 1 },
+ { 0x00408910, 1 },
+ { 0x00408984, 1 },
+ { 0x004090a8, 1 },
+ { 0x004098a0, 1 },
+ { 0x0041000c, 1 },
+ { 0x00410110, 1 },
+ { 0x00410184, 1 },
+ { 0x00418384, 1 },
+ { 0x004184a0, 1 },
+ { 0x00418604, 1 },
+ { 0x00418680, 1 },
+ { 0x00418714, 1 },
+ { 0x0041881c, 1 },
+ { 0x004188c8, 2 },
+ { 0x00418b04, 1 },
+ { 0x00418c04, 1 },
+ { 0x00418c64, 2 },
+ { 0x00418c88, 1 },
+ { 0x00418cb4, 2 },
+ { 0x00418d00, 1 },
+ { 0x00418d28, 2 },
+ { 0x00418e08, 1 },
+ { 0x00418e1c, 2 },
+ { 0x00418f08, 1 },
+ { 0x00418f20, 2 },
+ { 0x00419000, 1 },
+ { 0x0041900c, 1 },
+ { 0x00419018, 1 },
+ { 0x00419854, 1 },
+ { 0x00419ab0, 1 },
+ { 0x00419ab8, 3 },
+ { 0x00419ac8, 1 },
+ { 0x00419c0c, 1 },
+ { 0x00419c8c, 3 },
+ { 0x00419ca8, 1 },
+ { 0x00419d08, 2 },
+ { 0x00419e00, 1 },
+ { 0x00419e0c, 1 },
+ { 0x00419e14, 2 },
+ { 0x00419e24, 2 },
+ { 0x00419e34, 2 },
+ { 0x00419e44, 4 },
+ { 0x00419ea4, 1 },
+ { 0x00419eb0, 1 },
+ { 0x0041a0a0, 1 },
+ { 0x0041a0a8, 1 },
+ { 0x0041a17c, 1 },
+ { 0x0041a890, 2 },
+ { 0x0041a8a0, 3 },
+ { 0x0041a8b0, 2 },
+ { 0x0041b014, 1 },
+ { 0x0041b0a0, 1 },
+ { 0x0041b0cc, 1 },
+ { 0x0041b0e8, 2 },
+ { 0x0041b1dc, 1 },
+ { 0x0041b1f8, 2 },
+ { 0x0041be14, 1 },
+ { 0x0041bea0, 1 },
+ { 0x0041becc, 1 },
+ { 0x0041bee8, 2 },
+ { 0x0041bfdc, 1 },
+ { 0x0041bff8, 2 },
+ { 0x0041c054, 1 },
+ { 0x0041c2b0, 1 },
+ { 0x0041c2b8, 3 },
+ { 0x0041c2c8, 1 },
+ { 0x0041c40c, 1 },
+ { 0x0041c48c, 3 },
+ { 0x0041c4a8, 1 },
+ { 0x0041c508, 2 },
+ { 0x0041c600, 1 },
+ { 0x0041c60c, 1 },
+ { 0x0041c614, 2 },
+ { 0x0041c624, 2 },
+ { 0x0041c634, 2 },
+ { 0x0041c644, 4 },
+ { 0x0041c6a4, 1 },
+ { 0x0041c6b0, 1 },
+ { 0x00500384, 1 },
+ { 0x005004a0, 1 },
+ { 0x00500604, 1 },
+ { 0x00500680, 1 },
+ { 0x00500714, 1 },
+ { 0x0050081c, 1 },
+ { 0x005008c8, 2 },
+ { 0x00500b04, 1 },
+ { 0x00500c04, 1 },
+ { 0x00500c64, 2 },
+ { 0x00500c88, 1 },
+ { 0x00500cb4, 2 },
+ { 0x00500d00, 1 },
+ { 0x00500d28, 2 },
+ { 0x00500e08, 1 },
+ { 0x00500e1c, 2 },
+ { 0x00500f08, 1 },
+ { 0x00500f20, 2 },
+ { 0x00501000, 1 },
+ { 0x0050100c, 1 },
+ { 0x00501018, 1 },
+ { 0x00501854, 1 },
+ { 0x00501ab0, 1 },
+ { 0x00501ab8, 3 },
+ { 0x00501ac8, 1 },
+ { 0x00501c0c, 1 },
+ { 0x00501c8c, 3 },
+ { 0x00501ca8, 1 },
+ { 0x00501d08, 2 },
+ { 0x00501e00, 1 },
+ { 0x00501e0c, 1 },
+ { 0x00501e14, 2 },
+ { 0x00501e24, 2 },
+ { 0x00501e34, 2 },
+ { 0x00501e44, 4 },
+ { 0x00501ea4, 1 },
+ { 0x00501eb0, 1 },
+ { 0x005020a0, 1 },
+ { 0x005020a8, 1 },
+ { 0x0050217c, 1 },
+ { 0x00502890, 2 },
+ { 0x005028a0, 3 },
+ { 0x005028b0, 2 },
+ { 0x00503014, 1 },
+ { 0x005030a0, 1 },
+ { 0x005030cc, 1 },
+ { 0x005030e8, 2 },
+ { 0x005031dc, 1 },
+ { 0x005031f8, 2 },
+ { 0x00503e14, 1 },
+ { 0x00503ea0, 1 },
+ { 0x00503ecc, 1 },
+ { 0x00503ee8, 2 },
+ { 0x00503fdc, 1 },
+ { 0x00503ff8, 2 },
+ { 0x00504054, 1 },
+ { 0x005042b0, 1 },
+ { 0x005042b8, 3 },
+ { 0x005042c8, 1 },
+ { 0x0050440c, 1 },
+ { 0x0050448c, 3 },
+ { 0x005044a8, 1 },
+ { 0x00504508, 2 },
+ { 0x00504600, 1 },
+ { 0x0050460c, 1 },
+ { 0x00504614, 2 },
+ { 0x00504624, 2 },
+ { 0x00504634, 2 },
+ { 0x00504644, 4 },
+ { 0x005046a4, 1 },
+ { 0x005046b0, 1 },
+};
+static const u32 gk20a_global_whitelist_ranges_count =
+ ARRAY_SIZE(gk20a_global_whitelist_ranges);
+
+/* context */
+
+static const struct regop_offset_range gk20a_context_whitelist_ranges[] = {
+ { 0x0000280c, 1 },
+ { 0x00100cc4, 1 },
+ { 0x00400500, 1 },
+ { 0x00405b40, 1 },
+ { 0x00419000, 1 },
+ { 0x00419c8c, 3 },
+ { 0x00419d08, 2 },
+ { 0x00419e04, 3 },
+ { 0x00419e14, 2 },
+ { 0x00419e24, 2 },
+ { 0x00419e34, 2 },
+ { 0x00419e44, 4 },
+ { 0x00419e58, 6 },
+ { 0x00419e84, 5 },
+ { 0x00419ea4, 1 },
+ { 0x00419eac, 2 },
+ { 0x00419f30, 8 },
+ { 0x0041c48c, 3 },
+ { 0x0041c508, 2 },
+ { 0x0041c604, 3 },
+ { 0x0041c614, 2 },
+ { 0x0041c624, 2 },
+ { 0x0041c634, 2 },
+ { 0x0041c644, 4 },
+ { 0x0041c658, 6 },
+ { 0x0041c684, 5 },
+ { 0x0041c6a4, 1 },
+ { 0x0041c6ac, 2 },
+ { 0x0041c730, 8 },
+ { 0x00501000, 1 },
+ { 0x00501c8c, 3 },
+ { 0x00501d08, 2 },
+ { 0x00501e04, 3 },
+ { 0x00501e14, 2 },
+ { 0x00501e24, 2 },
+ { 0x00501e34, 2 },
+ { 0x00501e44, 4 },
+ { 0x00501e58, 6 },
+ { 0x00501e84, 5 },
+ { 0x00501ea4, 1 },
+ { 0x00501eac, 2 },
+ { 0x00501f30, 8 },
+ { 0x0050448c, 3 },
+ { 0x00504508, 2 },
+ { 0x00504604, 3 },
+ { 0x00504614, 2 },
+ { 0x00504624, 2 },
+ { 0x00504634, 2 },
+ { 0x00504644, 4 },
+ { 0x00504658, 6 },
+ { 0x00504684, 5 },
+ { 0x005046a4, 1 },
+ { 0x005046ac, 2 },
+ { 0x00504730, 8 },
+};
+static const u32 gk20a_context_whitelist_ranges_count =
+ ARRAY_SIZE(gk20a_context_whitelist_ranges);
+
+/* runcontrol */
+static const u32 gk20a_runcontrol_whitelist[] = {
+ 0x00419e10,
+ 0x0041c610,
+ 0x00501e10,
+ 0x00504610,
+};
+static const u32 gk20a_runcontrol_whitelist_count =
+ ARRAY_SIZE(gk20a_runcontrol_whitelist);
+
+static const struct regop_offset_range gk20a_runcontrol_whitelist_ranges[] = {
+ { 0x00419e10, 1 },
+ { 0x0041c610, 1 },
+ { 0x00501e10, 1 },
+ { 0x00504610, 1 },
+};
+static const u32 gk20a_runcontrol_whitelist_ranges_count =
+ ARRAY_SIZE(gk20a_runcontrol_whitelist_ranges);
+
+
+/* quad ctl */
+static const u32 gk20a_qctl_whitelist[] = {
+ 0x00504670,
+ 0x00504674,
+ 0x00504678,
+ 0x0050467c,
+ 0x00504680,
+ 0x00504730,
+ 0x00504734,
+ 0x00504738,
+ 0x0050473c,
+};
+static const u32 gk20a_qctl_whitelist_count =
+ ARRAY_SIZE(gk20a_qctl_whitelist);
+
+static const struct regop_offset_range gk20a_qctl_whitelist_ranges[] = {
+ { 0x00504670, 1 },
+ { 0x00504730, 4 },
+};
+static const u32 gk20a_qctl_whitelist_ranges_count =
+ ARRAY_SIZE(gk20a_qctl_whitelist_ranges);
+
+
+static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
+ u32 *ctx_rd_count, u32 *ctx_wr_count,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u32 op_count);
+
+
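+/*
+ * Execute a batch of register ops for a debug session: every op is
+ * validated first, global ops are then performed directly via BAR0
+ * reads/writes, and any context ops are handed off in a single call to
+ * gr_gk20a_exec_ctx_ops().
+ */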
+int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops)
+{
+ int err = 0, i;
+ struct channel_gk20a *ch = NULL;
+ struct gk20a *g = dbg_s->g;
+ /*struct gr_gk20a *gr = &g->gr;*/
+ u32 data32_lo = 0, data32_hi = 0;
+ u32 ctx_rd_count = 0, ctx_wr_count = 0;
+ bool skip_read_lo, skip_read_hi;
+ bool ok;
+
+ gk20a_dbg(gpu_dbg_fn | gpu_dbg_gpu_dbg, "");
+
+ ch = dbg_s->ch;
+
+ ok = validate_reg_ops(dbg_s,
+ &ctx_rd_count, &ctx_wr_count,
+ ops, num_ops);
+ if (!ok) {
+ dev_err(dbg_s->dev, "invalid op(s)");
+ err = -EINVAL;
+ /* each op has its own err/status */
+ goto clean_up;
+ }
+
+ for (i = 0; i < num_ops; i++) {
+ /* if it isn't global then it is done in the ctx ops... */
+ if (ops[i].type != REGOP(TYPE_GLOBAL))
+ continue;
+
+ switch (ops[i].op) {
+
+ case REGOP(READ_32):
+ ops[i].value_hi = 0;
+ ops[i].value_lo = gk20a_readl(g, ops[i].offset);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "read_32 0x%08x from 0x%08x",
+ ops[i].value_lo, ops[i].offset);
+
+ break;
+
+ case REGOP(READ_64):
+ ops[i].value_lo = gk20a_readl(g, ops[i].offset);
+ ops[i].value_hi =
+ gk20a_readl(g, ops[i].offset + 4);
+
+ gk20a_dbg(gpu_dbg_gpu_dbg, "read_64 0x%08x:%08x from 0x%08x",
+ ops[i].value_hi, ops[i].value_lo,
+ ops[i].offset);
+ break;
+
+ case REGOP(WRITE_32):
+ case REGOP(WRITE_64):
+ /* some of this appears wonky/unnecessary but
+ we've kept it for compat with existing
+ debugger code. just in case... */
+ skip_read_lo = skip_read_hi = false;
+ if (ops[i].and_n_mask_lo == ~(u32)0) {
+ data32_lo = ops[i].value_lo;
+ skip_read_lo = true;
+ }
+
+ if ((ops[i].op == REGOP(WRITE_64)) &&
+ (ops[i].and_n_mask_hi == ~(u32)0)) {
+ data32_hi = ops[i].value_hi;
+ skip_read_hi = true;
+ }
+
+ /* read first 32bits */
+ if (unlikely(skip_read_lo == false)) {
+ data32_lo = gk20a_readl(g, ops[i].offset);
+ data32_lo &= ~ops[i].and_n_mask_lo;
+ data32_lo |= ops[i].value_lo;
+ }
+
+ /* if desired, read second 32bits */
+ if ((ops[i].op == REGOP(WRITE_64)) &&
+ !skip_read_hi) {
+ data32_hi = gk20a_readl(g, ops[i].offset + 4);
+ data32_hi &= ~ops[i].and_n_mask_hi;
+ data32_hi |= ops[i].value_hi;
+ }
+
+ /* now update first 32bits */
+ gk20a_writel(g, ops[i].offset, data32_lo);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
+ data32_lo, ops[i].offset);
+ /* if desired, update second 32bits */
+ if (ops[i].op == REGOP(WRITE_64)) {
+ gk20a_writel(g, ops[i].offset + 4, data32_hi);
+ gk20a_dbg(gpu_dbg_gpu_dbg, "Wrote 0x%08x to 0x%08x ",
+ data32_hi, ops[i].offset + 4);
+
+ }
+
+
+ break;
+
+ /* shouldn't happen as we've already screened */
+ default:
+ BUG();
+ err = -EINVAL;
+ goto clean_up;
+ break;
+ }
+ }
+
+ if (ctx_wr_count | ctx_rd_count) {
+ err = gr_gk20a_exec_ctx_ops(ch, ops, num_ops,
+ ctx_wr_count, ctx_rd_count);
+ if (err) {
+ dev_warn(dbg_s->dev,
+ "failed to perform ctx ops\n");
+ goto clean_up;
+ }
+ }
+
+ clean_up:
+ gk20a_dbg(gpu_dbg_gpu_dbg, "ret=%d", err);
+ return err;
+
+}
+
+
+static int validate_reg_op_info(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *op)
+{
+ int err = 0;
+
+ op->status = REGOP(STATUS_SUCCESS);
+
+ switch (op->op) {
+ case REGOP(READ_32):
+ case REGOP(READ_64):
+ case REGOP(WRITE_32):
+ case REGOP(WRITE_64):
+ break;
+ default:
+ op->status |= REGOP(STATUS_UNSUPPORTED_OP);
+ /*gk20a_err(dbg_s->dev, "Invalid regops op %d!", op->op);*/
+ err = -EINVAL;
+ break;
+ }
+
+ switch (op->type) {
+ case REGOP(TYPE_GLOBAL):
+ case REGOP(TYPE_GR_CTX):
+ case REGOP(TYPE_GR_CTX_TPC):
+ case REGOP(TYPE_GR_CTX_SM):
+ case REGOP(TYPE_GR_CTX_CROP):
+ case REGOP(TYPE_GR_CTX_ZROP):
+ case REGOP(TYPE_GR_CTX_QUAD):
+ break;
+ /*
+ case NVHOST_DBG_GPU_REG_OP_TYPE_FB:
+ */
+ default:
+ op->status |= REGOP(STATUS_INVALID_TYPE);
+ /*gk20a_err(dbg_s->dev, "Invalid regops type %d!", op->type);*/
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
+
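+/*
+ * Whitelist policy: global ops are checked against the global range table
+ * and, when a channel is bound to a non-profiler session, also against the
+ * per-context and runcontrol lists; context ops require a bound channel;
+ * quad-control ops use their own small linear list.
+ */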
+static bool check_whitelists(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *op, u32 offset)
+{
+ bool valid = false;
+
+ if (op->type == REGOP(TYPE_GLOBAL)) {
+ /* search global list */
+ valid = !!bsearch(&offset,
+ gk20a_global_whitelist_ranges,
+ gk20a_global_whitelist_ranges_count,
+ sizeof(*gk20a_global_whitelist_ranges),
+ regop_bsearch_range_cmp);
+
+ /* if debug session and channel is bound search context list */
+ if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
+ /* binary search context list */
+ valid = !!bsearch(&offset,
+ gk20a_context_whitelist_ranges,
+ gk20a_context_whitelist_ranges_count,
+ sizeof(*gk20a_context_whitelist_ranges),
+ regop_bsearch_range_cmp);
+ }
+
+ /* if debug session and channel is bound search runcontrol list */
+ if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
+ valid = linear_search(offset,
+ gk20a_runcontrol_whitelist,
+ gk20a_runcontrol_whitelist_count);
+ }
+ } else if (op->type == REGOP(TYPE_GR_CTX)) {
+ /* it's a context-relative op */
+ if (!dbg_s->ch) {
+ gk20a_err(dbg_s->dev, "can't perform ctx regop unless bound");
+ op->status = REGOP(STATUS_UNSUPPORTED_OP);
+ return false;
+ }
+
+ /* binary search context list */
+ valid = !!bsearch(&offset,
+ gk20a_context_whitelist_ranges,
+ gk20a_context_whitelist_ranges_count,
+ sizeof(*gk20a_context_whitelist_ranges),
+ regop_bsearch_range_cmp);
+
+ /* if debug session and channel is bound search runcontrol list */
+ if ((!valid) && (!dbg_s->is_profiler && dbg_s->ch)) {
+ valid = linear_search(offset,
+ gk20a_runcontrol_whitelist,
+ gk20a_runcontrol_whitelist_count);
+ }
+
+ } else if (op->type == REGOP(TYPE_GR_CTX_QUAD)) {
+ valid = linear_search(offset,
+ gk20a_qctl_whitelist,
+ gk20a_qctl_whitelist_count);
+ }
+
+ return valid;
+}
+
+/* note: the op here has already been through validate_reg_op_info */
+static int validate_reg_op_offset(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *op)
+{
+ int err;
+ u32 buf_offset_lo, buf_offset_addr, num_offsets, offset;
+ bool valid = false;
+
+ op->status = 0;
+ offset = op->offset;
+
+ /* support only 24-bit 4-byte aligned offsets */
+ if (offset & 0xFF000003) {
+ gk20a_err(dbg_s->dev, "invalid regop offset: 0x%x\n", offset);
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ return -EINVAL;
+ }
+
+ valid = check_whitelists(dbg_s, op, offset);
+ if ((op->op == REGOP(READ_64) || op->op == REGOP(WRITE_64)) && valid)
+ valid = check_whitelists(dbg_s, op, offset + 4);
+
+ if (valid && (op->type != REGOP(TYPE_GLOBAL))) {
+ err = gr_gk20a_get_ctx_buffer_offsets(dbg_s->g,
+ op->offset,
+ 1,
+ &buf_offset_lo,
+ &buf_offset_addr,
+ &num_offsets,
+ op->type == REGOP(TYPE_GR_CTX_QUAD),
+ op->quad);
+ if (err) {
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ return -EINVAL;
+ }
+ if (!buf_offset_lo) {
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ return -EINVAL;
+ }
+ }
+
+ if (!valid) {
+ gk20a_err(dbg_s->dev, "invalid regop offset: 0x%x\n", offset);
+ op->status |= REGOP(STATUS_INVALID_OFFSET);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
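+/*
+ * Validate op type and offset for every entry and, as a side effect, tally
+ * how many context reads and writes the batch contains so the caller knows
+ * whether gr_gk20a_exec_ctx_ops() needs to run.
+ */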
+static bool validate_reg_ops(struct dbg_session_gk20a *dbg_s,
+ u32 *ctx_rd_count, u32 *ctx_wr_count,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u32 op_count)
+{
+ u32 i;
+ int err;
+ bool ok = true;
+
+ /* keep going until the end so every op can get
+ * a separate error code if needed */
+ for (i = 0; i < op_count; i++) {
+
+ err = validate_reg_op_info(dbg_s, &ops[i]);
+ ok &= !err;
+
+ if (reg_op_is_gr_ctx(ops[i].type)) {
+ if (reg_op_is_read(ops[i].op))
+ (*ctx_rd_count)++;
+ else
+ (*ctx_wr_count)++;
+ }
+
+ err = validate_reg_op_offset(dbg_s, &ops[i]);
+ ok &= !err;
+ }
+
+ gk20a_dbg(gpu_dbg_gpu_dbg, "ctx_wrs:%d ctx_rds:%d\n",
+ *ctx_wr_count, *ctx_rd_count);
+
+ return ok;
+}
+
+/* exported for tools like cyclestats, etc */
+bool is_bar0_global_offset_whitelisted_gk20a(u32 offset)
+{
+ bool valid = !!bsearch(&offset,
+ gk20a_global_whitelist_ranges,
+ gk20a_global_whitelist_ranges_count,
+ sizeof(*gk20a_global_whitelist_ranges),
+ regop_bsearch_range_cmp);
+ return valid;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/regops_gk20a.h b/drivers/gpu/nvgpu/gk20a/regops_gk20a.h
new file mode 100644
index 000000000000..23b4865b8db8
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/regops_gk20a.h
@@ -0,0 +1,47 @@
+/*
+ *
+ * Tegra GK20A GPU Debugger Driver Register Ops
+ *
+ * Copyright (c) 2013-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __REGOPS_GK20A_H_
+#define __REGOPS_GK20A_H_
+
+int exec_regops_gk20a(struct dbg_session_gk20a *dbg_s,
+ struct nvhost_dbg_gpu_reg_op *ops,
+ u64 num_ops);
+
+/* turn seriously unwieldy names -> something shorter */
+#define REGOP(x) NVHOST_DBG_GPU_REG_OP_##x
+
+
+static inline bool reg_op_is_gr_ctx(u8 type)
+{
+ return type == REGOP(TYPE_GR_CTX) ||
+ type == REGOP(TYPE_GR_CTX_TPC) ||
+ type == REGOP(TYPE_GR_CTX_SM) ||
+ type == REGOP(TYPE_GR_CTX_CROP) ||
+ type == REGOP(TYPE_GR_CTX_ZROP) ||
+ type == REGOP(TYPE_GR_CTX_QUAD);
+}
+static inline bool reg_op_is_read(u8 op)
+{
+ return op == REGOP(READ_32) ||
+ op == REGOP(READ_64);
+}
+
+bool is_bar0_global_offset_whitelisted_gk20a(u32 offset);
+
+#endif /* __REGOPS_GK20A_H_ */
diff --git a/drivers/gpu/nvgpu/gk20a/sim_gk20a.h b/drivers/gpu/nvgpu/gk20a/sim_gk20a.h
new file mode 100644
index 000000000000..5fc8006e202b
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/sim_gk20a.h
@@ -0,0 +1,62 @@
+/*
+ * drivers/video/tegra/host/gk20a/sim_gk20a.h
+ *
+ * GK20A sim support
+ *
+ * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef __SIM_GK20A_H__
+#define __SIM_GK20A_H__
+
+
+struct gk20a;
+struct sim_gk20a {
+ struct gk20a *g;
+ struct resource *reg_mem;
+ void __iomem *regs;
+ struct {
+ struct page *page;
+ void *kvaddr;
+ phys_addr_t phys;
+ } send_bfr, recv_bfr, msg_bfr;
+ u32 send_ring_put;
+ u32 recv_ring_get;
+ u32 recv_ring_put;
+ u32 sequence_base;
+ void (*remove_support)(struct sim_gk20a *);
+};
+
+
+int gk20a_sim_esc_read(struct gk20a *g, char *path, u32 index,
+ u32 count, u32 *data);
+
+static inline int gk20a_sim_esc_read_no_sim(struct gk20a *g, char *p,
+ u32 i, u32 c, u32 *d)
+{
+ *d = ~(u32)0;
+ return -1;
+}
+
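+/*
+ * On ASIM platforms this escapes a 32-bit read to the simulator; otherwise
+ * it falls back to the stub above, which returns -1 and all-ones data.
+ */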
+static inline int gk20a_sim_esc_readl(struct gk20a *g, char *p, u32 i, u32 *d)
+{
+ if (tegra_cpu_is_asim())
+ return gk20a_sim_esc_read(g, p, i, sizeof(u32), d);
+
+ return gk20a_sim_esc_read_no_sim(g, p, i, sizeof(u32), d);
+}
+
+
+#endif /*__SIM_GK20A_H__*/
diff --git a/drivers/gpu/nvgpu/gk20a/therm_gk20a.c b/drivers/gpu/nvgpu/gk20a/therm_gk20a.c
new file mode 100644
index 000000000000..da9119798c1f
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/therm_gk20a.c
@@ -0,0 +1,142 @@
+/*
+ * drivers/video/tegra/host/gk20a/therm_gk20a.c
+ *
+ * GK20A Therm
+ *
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "gk20a.h"
+#include "hw_chiplet_pwr_gk20a.h"
+#include "hw_gr_gk20a.h"
+#include "hw_therm_gk20a.h"
+
+static int gk20a_init_therm_reset_enable_hw(struct gk20a *g)
+{
+ return 0;
+}
+
+static int gk20a_init_therm_setup_sw(struct gk20a *g)
+{
+ return 0;
+}
+
+static int gk20a_init_therm_setup_hw(struct gk20a *g)
+{
+ /* program NV_THERM registers */
+ gk20a_writel(g, therm_use_a_r(), NV_THERM_USE_A_INIT);
+ gk20a_writel(g, therm_evt_ext_therm_0_r(),
+ NV_THERM_EVT_EXT_THERM_0_INIT);
+ gk20a_writel(g, therm_evt_ext_therm_1_r(),
+ NV_THERM_EVT_EXT_THERM_1_INIT);
+ gk20a_writel(g, therm_evt_ext_therm_2_r(),
+ NV_THERM_EVT_EXT_THERM_2_INIT);
+
+/*
+ u32 data;
+
+ data = gk20a_readl(g, gr_gpcs_tpcs_l1c_cfg_r());
+ data = set_field(data, gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_m(),
+ gr_gpcs_tpcs_l1c_cfg_blkactivity_enable_enable_f());
+ gk20a_writel(g, gr_gpcs_tpcs_l1c_cfg_r(), data);
+
+ data = gk20a_readl(g, gr_gpcs_tpcs_l1c_pm_r());
+ data = set_field(data, gr_gpcs_tpcs_l1c_pm_enable_m(),
+ gr_gpcs_tpcs_l1c_pm_enable_enable_f());
+ gk20a_writel(g, gr_gpcs_tpcs_l1c_pm_r(), data);
+
+ data = gk20a_readl(g, gr_gpcs_tpcs_sm_pm_ctrl_r());
+ data = set_field(data, gr_gpcs_tpcs_sm_pm_ctrl_core_enable_m(),
+ gr_gpcs_tpcs_sm_pm_ctrl_core_enable_enable_f());
+ data = set_field(data, gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_m(),
+ gr_gpcs_tpcs_sm_pm_ctrl_qctl_enable_enable_f());
+ gk20a_writel(g, gr_gpcs_tpcs_sm_pm_ctrl_r(), data);
+
+ data = gk20a_readl(g, gr_gpcs_tpcs_sm_halfctl_ctrl_r());
+ data = set_field(data, gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_m(),
+ gr_gpcs_tpcs_sm_halfctl_ctrl_sctl_blkactivity_enable_enable_f());
+ gk20a_writel(g, gr_gpcs_tpcs_sm_halfctl_ctrl_r(), data);
+
+ data = gk20a_readl(g, gr_gpcs_tpcs_sm_debug_sfe_control_r());
+ data = set_field(data, gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_m(),
+ gr_gpcs_tpcs_sm_debug_sfe_control_blkactivity_enable_enable_f());
+ gk20a_writel(g, gr_gpcs_tpcs_sm_debug_sfe_control_r(), data);
+
+ gk20a_writel(g, therm_peakpower_config6_r(0),
+ therm_peakpower_config6_trigger_cfg_1h_intr_f() |
+ therm_peakpower_config6_trigger_cfg_1l_intr_f());
+
+ gk20a_writel(g, chiplet_pwr_gpcs_config_1_r(),
+ chiplet_pwr_gpcs_config_1_ba_enable_yes_f());
+ gk20a_writel(g, chiplet_pwr_fbps_config_1_r(),
+ chiplet_pwr_fbps_config_1_ba_enable_yes_f());
+
+ data = gk20a_readl(g, therm_config1_r());
+ data = set_field(data, therm_config1_ba_enable_m(),
+ therm_config1_ba_enable_yes_f());
+ gk20a_writel(g, therm_config1_r(), data);
+
+ gk20a_writel(g, gr_gpcs_tpcs_sm_power_throttle_r(), 0x441a);
+
+ gk20a_writel(g, therm_weight_1_r(), 0xd3);
+ gk20a_writel(g, chiplet_pwr_gpcs_weight_6_r(), 0x7d);
+ gk20a_writel(g, chiplet_pwr_gpcs_weight_7_r(), 0xff);
+ gk20a_writel(g, chiplet_pwr_fbps_weight_0_r(), 0x13000000);
+ gk20a_writel(g, chiplet_pwr_fbps_weight_1_r(), 0x19);
+
+ gk20a_writel(g, therm_peakpower_config8_r(0), 0x8);
+ gk20a_writel(g, therm_peakpower_config9_r(0), 0x0);
+
+ gk20a_writel(g, therm_evt_ba_w0_t1h_r(), 0x100);
+
+ gk20a_writel(g, therm_use_a_r(), therm_use_a_ba_w0_t1h_yes_f());
+
+ gk20a_writel(g, therm_peakpower_config1_r(0),
+ therm_peakpower_config1_window_period_2m_f() |
+ therm_peakpower_config1_ba_sum_shift_20_f() |
+ therm_peakpower_config1_window_en_enabled_f());
+
+ gk20a_writel(g, therm_peakpower_config2_r(0),
+ therm_peakpower_config2_ba_threshold_1h_val_f(1) |
+ therm_peakpower_config2_ba_threshold_1h_en_enabled_f());
+
+ gk20a_writel(g, therm_peakpower_config4_r(0),
+ therm_peakpower_config4_ba_threshold_1l_val_f(1) |
+ therm_peakpower_config4_ba_threshold_1l_en_enabled_f());
+*/
+ return 0;
+}
+
+int gk20a_init_therm_support(struct gk20a *g)
+{
+ int err;
+
+ gk20a_dbg_fn("");
+
+ err = gk20a_init_therm_reset_enable_hw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_therm_setup_sw(g);
+ if (err)
+ return err;
+
+ err = gk20a_init_therm_setup_hw(g);
+ if (err)
+ return err;
+
+ return err;
+}
diff --git a/drivers/gpu/nvgpu/gk20a/therm_gk20a.h b/drivers/gpu/nvgpu/gk20a/therm_gk20a.h
new file mode 100644
index 000000000000..3f67ee124429
--- /dev/null
+++ b/drivers/gpu/nvgpu/gk20a/therm_gk20a.h
@@ -0,0 +1,33 @@
+/*
+ * drivers/video/tegra/host/gk20a/therm_gk20a.h
+ *
+ * GK20A Therm
+ *
+ * Copyright (c) 2011 - 2012, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _NVHOST_THERM_GK20A_H_
+#define _NVHOST_THERM_GK20A_H_
+
+/* priority for EXT_THERM_0 event set to highest */
+#define NV_THERM_EVT_EXT_THERM_0_INIT 0x3000100
+#define NV_THERM_EVT_EXT_THERM_1_INIT 0x2000200
+#define NV_THERM_EVT_EXT_THERM_2_INIT 0x1000300
+/* configures the thermal events that may cause clock slowdown */
+#define NV_THERM_USE_A_INIT 0x7
+
+int gk20a_init_therm_support(struct gk20a *g);
+
+#endif /* _NVHOST_THERM_GK20A_H_ */