6 files changed, 248 insertions, 62 deletions
diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
index 37c8157e116e..c8c823953a34 100644
--- a/arch/arm/mm/cache-v7.S
+++ b/arch/arm/mm/cache-v7.S
@@ -18,27 +18,28 @@
 #include "proc-macros.S"
 
 /*
- *	v7_flush_dcache_all()
+ *	v7_op_dcache_all op
  *
- *	Flush the whole D-cache.
+ *	op=c14, Flush the whole D-cache.
+ *	op=c10, Clean the whole D-cache.
  *
  *	Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
  *
  *	- mm    - mm_struct describing address space
  */
-ENTRY(v7_flush_dcache_all)
+.macro v7_op_dcache_all op			@ op=c10 clean, op=c14 flush
 	dmb					@ ensure ordering with previous memory accesses
 	mrc	p15, 1, r0, c0, c0, 1		@ read clidr
 	ands	r3, r0, #0x7000000		@ extract loc from clidr
 	mov	r3, r3, lsr #23			@ left align loc bit field
-	beq	finished			@ if loc is 0, then no need to clean
+	beq	1005f				@ if loc is 0, then no need to clean
 	mov	r10, #0				@ start clean at cache level 0
-loop1:
+1001:
 	add	r2, r10, r10, lsr #1		@ work out 3x current cache level
 	mov	r1, r0, lsr r2			@ extract cache type bits from clidr
 	and	r1, r1, #7			@ mask of the bits for current cache only
 	cmp	r1, #2				@ see what cache we have at this level
-	blt	skip				@ skip if no cache, or just i-cache
+	blt	1004f				@ skip if no cache, or just i-cache
 	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
 	isb					@ isb to sych the new cssr&csidr
 	mrc	p15, 1, r1, c0, c0, 0		@ read the new csidr
@@ -49,32 +50,40 @@ loop1:
 	clz	r5, r4				@ find bit position of way size increment
 	ldr	r7, =0x7fff
 	ands	r7, r7, r1, lsr #13		@ extract max number of the index size
-loop2:
+1002:
 	mov	r9, r4				@ create working copy of max way size
-loop3:
+1003:
  ARM(	orr	r11, r10, r9, lsl r5	)	@ factor way and cache number into r11
  THUMB(	lsl	r6, r9, r5		)
  THUMB(	orr	r11, r10, r6		)	@ factor way and cache number into r11
  ARM(	orr	r11, r11, r7, lsl r2	)	@ factor index number into r11
  THUMB(	lsl	r6, r7, r2		)
  THUMB(	orr	r11, r11, r6		)	@ factor index number into r11
-	mcr	p15, 0, r11, c7, c14, 2		@ clean & invalidate by set/way
+	mcr	p15, 0, r11, c7, \op, 2		@ op=c10/c14, clean/flush by set/way
 	subs	r9, r9, #1			@ decrement the way
-	bge	loop3
+	bge	1003b
 	subs	r7, r7, #1			@ decrement the index
-	bge	loop2
-skip:
+	bge	1002b
+1004:
 	add	r10, r10, #2			@ increment cache number
 	cmp	r3, r10
-	bgt	loop1
-finished:
+	bgt	1001b
+1005:
 	mov	r10, #0				@ swith back to cache level 0
 	mcr	p15, 2, r10, c0, c0, 0		@ select current cache level in cssr
 	dsb
 	isb
 	mov	pc, lr
+.endm
+
+ENTRY(v7_flush_dcache_all)
+	v7_op_dcache_all c14			@ flush; the macro ends in mov pc, lr
 ENDPROC(v7_flush_dcache_all)
 
+ENTRY(v7_clean_dcache_all)
+	v7_op_dcache_all c10			@ clean; the macro ends in mov pc, lr
+ENDPROC(v7_clean_dcache_all)
+
 /*
  *	v7_flush_cache_all()
  *
@@ -102,6 +111,24 @@ ENTRY(v7_flush_kern_cache_all)
 ENDPROC(v7_flush_kern_cache_all)
 
 /*
+ *	v7_clean_kern_cache_all() - clean whole D-cache, then invalidate I-cache
+ */
+ENTRY(v7_clean_kern_cache_all)
+ ARM(	stmfd	sp!, {r4-r5, r7, r9-r11, lr}	)	@ save regs the clean corrupts
+ THUMB(	stmfd	sp!, {r4-r7, r9-r11, lr}	)	@ Thumb variant also uses r6
+	bl	v7_clean_dcache_all		@ clean the D-cache by set/way
+	mov	r0, #0				@ r0 is the operand of the mcr below
+#ifdef CONFIG_SMP
+	mcr	p15, 0, r0, c7, c1, 0		@ invalidate I-cache inner shareable
+#else
+	mcr	p15, 0, r0, c7, c5, 0		@ I+BTB cache invalidate
+#endif
+ ARM(	ldmfd	sp!, {r4-r5, r7, r9-r11, lr}	)	@ restore what was saved above
+ THUMB(	ldmfd	sp!, {r4-r7, r9-r11, lr}	)
+	mov	pc, lr				@ return to caller
+ENDPROC(v7_clean_kern_cache_all)
+
+/*
  *	v7_flush_cache_all()
  *
  *	Flush all TLB entries in a particular address space
diff --git a/drivers/video/tegra/nvmap/nvmap_common.h b/drivers/video/tegra/nvmap/nvmap_common.h
new file mode 100644
index 000000000000..20d27fa955bb
--- /dev/null
+++ b/drivers/video/tegra/nvmap/nvmap_common.h
@@ -0,0 +1,36 @@
+/*
+ * drivers/video/tegra/nvmap/nvmap_common.h
+ *
+ * GPU memory management driver for Tegra
+ *
+ * Copyright (c) 2011, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+extern void v7_flush_kern_cache_all(void *);	/* arch/arm/mm/cache-v7.S */
+extern void v7_clean_kern_cache_all(void *);	/* arch/arm/mm/cache-v7.S */
+
+#define FLUSH_CLEAN_BY_SET_WAY_THRESHOLD (8 * PAGE_SIZE) /* whole-cache cutoff */
+
+static inline void inner_flush_cache_all(void)
+{
+	on_each_cpu(v7_flush_kern_cache_all, NULL, 1);	/* every CPU; 1 = wait */
+}
+
+static inline void inner_clean_cache_all(void)
+{
+	on_each_cpu(v7_clean_kern_cache_all, NULL, 1);	/* every CPU; 1 = wait */
+}
diff --git a/drivers/video/tegra/nvmap/nvmap_dev.c b/drivers/video/tegra/nvmap/nvmap_dev.c
index ed97228d0d63..5d63dbc4fbc9 100644
--- a/drivers/video/tegra/nvmap/nvmap_dev.c
+++ b/drivers/video/tegra/nvmap/nvmap_dev.c
@@ -3,7 +3,7 @@
  *
  * User-space interface to nvmap
  *
- * Copyright (c) 2010, NVIDIA Corporation.
+ * Copyright (c) 2011, NVIDIA Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -44,6 +44,7 @@
 #include "nvmap.h"
 #include "nvmap_ioctl.h"
 #include "nvmap_mru.h"
+#include "nvmap_common.h"
 
 #define NVMAP_NUM_PTES		64
 #define NVMAP_CARVEOUT_KILLER_RETRY_TIME 100 /* msecs */
@@ -250,8 +251,30 @@ unsigned long nvmap_carveout_usage(struct nvmap_client *c,
 	return 0;
 }
 
-static int nvmap_flush_heap_block(struct nvmap_client *client,
-				  struct nvmap_heap_block *block, size_t len)
+/*
+ * This routine is used to flush the carveout memory from cache.
+ * Why is a cache flush needed for carveout? Consider the case where a piece of
+ * carveout is allocated as cached and then released. If the same memory is
+ * later allocated for an uncached request without being flushed from the
+ * cache, the client might pass it to a H/W engine, which could start modifying
+ * the memory. As it was cached earlier, some portion of it might still be in
+ * the cache. While the cpu reads/writes other memory, the cached portion of
+ * this memory might get flushed back to main memory and would cause
+ * corruption if that happens after the H/W has written data to memory.
+ *
+ * But flushing out the memory blindly on each carveout allocation is redundant.
+ *
+ * In order to optimize the carveout buffer cache flushes, the following
+ * strategy is used.
+ *
+ * The whole carveout is flushed out from cache during its initialization.
+ * During allocation, carveout buffers are not flushed from cache.
+ * During deallocation, carveout buffers are flushed if they were allocated as
+ * cached. If they were allocated as uncached/writecombined, no cache flush is
+ * needed; just draining store buffers is enough.
+ */
+int nvmap_flush_heap_block(struct nvmap_client *client,
+	struct nvmap_heap_block *block, size_t len, unsigned int prot)
 {
 	pte_t **pte;
 	void *addr;
@@ -259,7 +282,17 @@ static int nvmap_flush_heap_block(struct nvmap_client *client,
 	unsigned long phys = block->base;
 	unsigned long end = block->base + len;
 
-	pte = nvmap_alloc_pte(client->dev, &addr);
+	if (prot == NVMAP_HANDLE_UNCACHEABLE || prot == NVMAP_HANDLE_WRITE_COMBINE)
+		goto out;
+
+	if ( len >= FLUSH_CLEAN_BY_SET_WAY_THRESHOLD ) {
+		inner_flush_cache_all();
+		if (prot != NVMAP_HANDLE_INNER_CACHEABLE)
+			outer_flush_range(block->base, block->base + len);
+		goto out;
+	}
+
+	pte = nvmap_alloc_pte((client ? client->dev : nvmap_dev), &addr);
 	if (IS_ERR(pte))
 		return PTR_ERR(pte);
 
@@ -277,9 +310,12 @@ static int nvmap_flush_heap_block(struct nvmap_client *client,
 		phys = next;
 	}
 
-	outer_flush_range(block->base, block->base + len);
+	if (prot != NVMAP_HANDLE_INNER_CACHEABLE)
+		outer_flush_range(block->base, block->base + len);
 
-	nvmap_free_pte(client->dev, pte);
+	nvmap_free_pte((client ? client->dev: nvmap_dev), pte);
+out:
+	wmb();
 	return 0;
 }
 
@@ -421,13 +457,6 @@ struct nvmap_heap_block *do_nvmap_carveout_alloc(struct nvmap_client *client,
 		block = nvmap_heap_alloc(co_heap->carveout, len,
 					align, prot, handle);
 		if (block) {
-			/* flush any stale data that may be left in the
-			 * cache at the block's address, since the new
-			 * block may be mapped uncached */
-			if (nvmap_flush_heap_block(client, block, len)) {
-				nvmap_heap_free(block);
-				block = NULL;
-			}
 			return block;
 		}
 	}
diff --git a/drivers/video/tegra/nvmap/nvmap_handle.c b/drivers/video/tegra/nvmap/nvmap_handle.c
index dc3be30ca2f5..a9150a36cf2a 100644
--- a/drivers/video/tegra/nvmap/nvmap_handle.c
+++ b/drivers/video/tegra/nvmap/nvmap_handle.c
@@ -37,6 +37,7 @@
 
 #include "nvmap.h"
 #include "nvmap_mru.h"
+#include "nvmap_common.h"
 
 #define NVMAP_SECURE_HEAPS	(NVMAP_HEAP_CARVEOUT_IRAM | NVMAP_HEAP_IOVMM)
 #ifdef CONFIG_NVMAP_HIGHMEM_ONLY
@@ -107,7 +108,8 @@ out:
 
 extern void __flush_dcache_page(struct address_space *, struct page *);
 
-static struct page *nvmap_alloc_pages_exact(gfp_t gfp, size_t size)
+static struct page *nvmap_alloc_pages_exact(gfp_t gfp,
+	size_t size, bool flush_inner)
 {
 	struct page *page, *p, *e;
 	unsigned int order;
@@ -127,8 +129,10 @@ static struct page *nvmap_alloc_pages_exact(gfp_t gfp, size_t size)
 		__free_page(p);
 
 	e = page + (size >> PAGE_SHIFT);
-	for (p = page; p < e; p++)
-		__flush_dcache_page(page_mapping(p), p);
+	if (flush_inner) {
+		for (p = page; p < e; p++)
+			__flush_dcache_page(page_mapping(p), p);
+	}
 
 	base = page_to_phys(page);
 	outer_flush_range(base, base + size);
@@ -143,6 +147,7 @@ static int handle_page_alloc(struct nvmap_client *client,
 	pgprot_t prot;
 	unsigned int i = 0;
 	struct page **pages;
+	bool flush_inner = true;
 
 	pages = altalloc(nr_page * sizeof(*pages));
 	if (!pages)
@@ -155,10 +160,14 @@ static int handle_page_alloc(struct nvmap_client *client,
 		contiguous = true;
 #endif
 
+	if (size >= FLUSH_CLEAN_BY_SET_WAY_THRESHOLD) {
+		inner_flush_cache_all();
+		flush_inner = false;
+	}
 	h->pgalloc.area = NULL;
 	if (contiguous) {
 		struct page *page;
-		page = nvmap_alloc_pages_exact(GFP_NVMAP, size);
+		page = nvmap_alloc_pages_exact(GFP_NVMAP, size, flush_inner);
 		if (!page)
 			goto fail;
 
@@ -167,7 +176,8 @@ static int handle_page_alloc(struct nvmap_client *client,
 
 	} else {
 		for (i = 0; i < nr_page; i++) {
-			pages[i] = nvmap_alloc_pages_exact(GFP_NVMAP, PAGE_SIZE);
+			pages[i] = nvmap_alloc_pages_exact(GFP_NVMAP, PAGE_SIZE,
+				flush_inner);
 			if (!pages[i])
 				goto fail;
 		}
@@ -193,6 +203,7 @@ fail:
 	while (i--)
 		__free_page(pages[i]);
 	altfree(pages, nr_page * sizeof(*pages));
+	wmb();
 	return -ENOMEM;
 }
 
diff --git a/drivers/video/tegra/nvmap/nvmap_heap.c b/drivers/video/tegra/nvmap/nvmap_heap.c
index c920048db82b..a0a574d78944 100644
--- a/drivers/video/tegra/nvmap/nvmap_heap.c
+++ b/drivers/video/tegra/nvmap/nvmap_heap.c
@@ -3,7 +3,7 @@
  *
  * GPU heap allocator.
  *
- * Copyright (c) 2010, NVIDIA Corporation.
+ * Copyright (c) 2011, NVIDIA Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -31,6 +31,7 @@
 #include <mach/nvmap.h>
 #include "nvmap.h"
 #include "nvmap_heap.h"
+#include "nvmap_common.h"
 
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
@@ -887,6 +888,9 @@ struct nvmap_heap_block *nvmap_heap_alloc(struct nvmap_heap *h, size_t len,
 
 struct nvmap_heap *nvmap_block_to_heap(struct nvmap_heap_block *b)
 {
+	struct buddy_heap *bh = NULL;
+	struct nvmap_heap *h;
+
 	if (b->type == BLOCK_BUDDY) {
 		struct buddy_block *bb;
 		bb = container_of(b, struct buddy_block, block);
@@ -898,17 +902,24 @@ struct nvmap_heap *nvmap_block_to_heap(struct nvmap_heap_block *b)
 	}
 }
 
+int nvmap_flush_heap_block(struct nvmap_client *client,
+	struct nvmap_heap_block *block, size_t len, unsigned int prot);
+
 /* nvmap_heap_free: frees block b*/
 void nvmap_heap_free(struct nvmap_heap_block *b)
 {
 	struct buddy_heap *bh = NULL;
 	struct nvmap_heap *h = nvmap_block_to_heap(b);
+	struct list_block *lb;
 
 	mutex_lock(&h->lock);
 	if (b->type == BLOCK_BUDDY)
 		bh = do_buddy_free(b);
-	else
+	else {
+		lb = container_of(b, struct list_block, block);
+		nvmap_flush_heap_block(NULL, b, lb->size, lb->mem_prot);
 		do_heap_free(b);
+	}
 
 	if (bh) {
 		list_del(&bh->buddy_list);
@@ -1008,6 +1019,10 @@ struct nvmap_heap *nvmap_heap_create(struct device *parent, const char *name,
 	l->orig_addr = base;
 	list_add_tail(&l->free_list, &h->free_list);
 	list_add_tail(&l->all_list, &h->all_list);
+
+	inner_flush_cache_all();
+	outer_flush_range(base, base + len);
+	wmb();
 	return h;
 
 fail_register:
diff --git a/drivers/video/tegra/nvmap/nvmap_ioctl.c b/drivers/video/tegra/nvmap/nvmap_ioctl.c
index fb8c5ff00bdd..fc367c89ad45 100644
--- a/drivers/video/tegra/nvmap/nvmap_ioctl.c
+++ b/drivers/video/tegra/nvmap/nvmap_ioctl.c
@@ -3,7 +3,7 @@
  *
  * User-space interface to nvmap
  *
- * Copyright (c) 2010, NVIDIA Corporation.
+ * Copyright (c) 2011, NVIDIA Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,6 +35,7 @@
 
 #include "nvmap_ioctl.h"
 #include "nvmap.h"
+#include "nvmap_common.h"
 
 static ssize_t rw_handle(struct nvmap_client *client, struct nvmap_handle *h,
 			 int is_read, unsigned long h_offs,
@@ -477,10 +478,91 @@ int nvmap_ioctl_free(struct file *filp, unsigned long arg)
 	return 0;
 }
 
+static void inner_cache_maint(unsigned int op, void *vaddr, size_t size)
+{
+	if (op == NVMAP_CACHE_OP_WB_INV)	/* flush [vaddr, vaddr + size) */
+		dmac_flush_range(vaddr, vaddr + size);
+	else if (op == NVMAP_CACHE_OP_INV)	/* handled as a from-device map */
+		dmac_map_area(vaddr, size, DMA_FROM_DEVICE);
+	else					/* any other op: to-device map */
+		dmac_map_area(vaddr, size, DMA_TO_DEVICE);
+}
+
+static void outer_cache_maint(unsigned int op, unsigned long paddr, size_t size)
+{
+	if (op == NVMAP_CACHE_OP_WB_INV)	/* flush [paddr, paddr + size) */
+		outer_flush_range(paddr, paddr + size);
+	else if (op == NVMAP_CACHE_OP_INV)	/* invalidate only */
+		outer_inv_range(paddr, paddr + size);
+	else					/* any other op: clean */
+		outer_clean_range(paddr, paddr + size);
+}
+
+static void heap_page_cache_maint(struct nvmap_client *client,
+	struct nvmap_handle *h, unsigned long start, unsigned long end,
+	unsigned int op, bool inner, bool outer, pte_t **pte,
+	unsigned long kaddr, pgprot_t prot)
+{
+	struct page *page;
+	unsigned long paddr;	/* physical address of the current piece */
+	unsigned long next;	/* end of the current piece */
+	unsigned long off;	/* offset of start within its page */
+	size_t size;		/* length of the current piece */
+
+	while (start < end) {	/* one piece per page touched by [start, end) */
+		page = h->pgalloc.pages[start >> PAGE_SHIFT];
+		next = min(((start + PAGE_SIZE) & PAGE_MASK), end);
+		off = start & ~PAGE_MASK;
+		size = next - start;
+		paddr = page_to_phys(page) + off;
+
+		if (inner) {	/* inner maintenance goes through a kaddr mapping */
+			void *vaddr = (void *)kaddr + off;
+			BUG_ON(!pte);	/* inner == true requires pte and kaddr */
+			BUG_ON(!kaddr);
+			set_pte_at(&init_mm, kaddr, *pte,	/* map page at kaddr */
+				pfn_pte(__phys_to_pfn(paddr), prot));
+			flush_tlb_kernel_page(kaddr);
+			inner_cache_maint(op, vaddr, size);
+		}
+
+		if (outer)	/* outer maintenance is done on the physical address */
+			outer_cache_maint(op, paddr, size);
+		start = next;
+	}
+}
+
+/*
+ * Fast path for ranges >= FLUSH_CLEAN_BY_SET_WAY_THRESHOLD: whole inner cache
+ * clean/flush, then outer maintenance by range. Returns true if handled.
+ */
+static bool fast_cache_maint(struct nvmap_client *client, struct nvmap_handle *h,
+	unsigned long start, unsigned long end, unsigned int op)
+{
+	if (op == NVMAP_CACHE_OP_INV ||
+	    (end - start) < FLUSH_CLEAN_BY_SET_WAY_THRESHOLD)
+		return false;	/* invalidate-only or small: caller goes by range */
+
+	if (op == NVMAP_CACHE_OP_WB_INV)
+		inner_flush_cache_all();
+	else if (op == NVMAP_CACHE_OP_WB)
+		inner_clean_cache_all();
+
+	if (h->flags == NVMAP_HANDLE_INNER_CACHEABLE)
+		return true;	/* inner-cacheable handle: no outer maintenance */
+
+	if (h->heap_pgalloc) {
+		heap_page_cache_maint(client, h, start, end, op,
+				false, true, NULL, 0, 0);
+	} else {	/* carveout: rebase the offsets onto the carveout */
+		outer_cache_maint(op, start + h->carveout->base, end - start);
+	}
+	return true;
+}
+
 static int cache_maint(struct nvmap_client *client, struct nvmap_handle *h,
 		       unsigned long start, unsigned long end, unsigned int op)
 {
-	enum dma_data_direction dir;
 	pgprot_t prot;
 	pte_t **pte = NULL;
 	unsigned long kaddr;
@@ -501,26 +583,8 @@ static int cache_maint(struct nvmap_client *client, struct nvmap_handle *h,
 	    start == end)
 		goto out;
 
-	if (WARN_ON_ONCE(op == NVMAP_CACHE_OP_WB_INV))
-		dir = DMA_BIDIRECTIONAL;
-	else if (op == NVMAP_CACHE_OP_WB)
-		dir = DMA_TO_DEVICE;
-	else
-		dir = DMA_FROM_DEVICE;
-
-	if (h->heap_pgalloc) {
-		while (start < end) {
-			unsigned long next = (start + PAGE_SIZE) & PAGE_MASK;
-			struct page *page;
-
-			page = h->pgalloc.pages[start >> PAGE_SHIFT];
-			next = min(next, end);
-			__dma_page_cpu_to_dev(page, start & ~PAGE_MASK,
-					      next - start, dir);
-			start = next;
-		}
+	if (fast_cache_maint(client, h, start, end, op))
 		goto out;
-	}
 
 	prot = nvmap_pgprot(h, pgprot_kernel);
 	pte = nvmap_alloc_pte(client->dev, (void **)&kaddr);
@@ -530,6 +594,13 @@ static int cache_maint(struct nvmap_client *client, struct nvmap_handle *h,
 		goto out;
 	}
 
+	if (h->heap_pgalloc) {
+		heap_page_cache_maint(client, h, start, end, op, true,
+			(h->flags == NVMAP_HANDLE_INNER_CACHEABLE) ? false : true,
+			pte, kaddr, prot);
+		goto out;
+	}
+
 	if (start > h->size || end > h->size) {
 		nvmap_warn(client, "cache maintenance outside handle\n");
 		return -EINVAL;
@@ -552,16 +623,13 @@ static int cache_maint(struct nvmap_client *client, struct nvmap_handle *h,
 			   pfn_pte(__phys_to_pfn(loop), prot));
 		flush_tlb_kernel_page(kaddr);
 
-		dmac_map_area(base, next - loop, dir);
+		inner_cache_maint(op, base, next - loop);
 		loop = next;
 	}
 
-	if (h->flags != NVMAP_HANDLE_INNER_CACHEABLE) {
-		if (dir != DMA_FROM_DEVICE)
-			outer_clean_range(start, end);
-		else
-			outer_inv_range(start, end);
-	}
+	if (h->flags != NVMAP_HANDLE_INNER_CACHEABLE)
+		outer_cache_maint(op, start, end - start);
+
 	/* unlock carveout */
 	nvmap_usecount_dec(h);