From a086ff98ad0bb996241037689188caf394f6c633 Mon Sep 17 00:00:00 2001 From: Max Krummenacher Date: Fri, 12 Jun 2015 13:27:55 +0200 Subject: [PATCH] memcpy: don't use optimized for VFP/NEON versions Tests with the tinymembench tool on a Colibri T30 show the performance of the standard arm based memcpy to be around 2 times faster than __memcpy_neon or __memcpy_vfp. Tests on Apalis iMX6 confirm this, although running only around 1.3 times faster. --- sysdeps/arm/armv7/multiarch/Makefile | 3 --- sysdeps/arm/armv7/multiarch/ifunc-impl-list.c | 18 ------------------ sysdeps/arm/armv7/multiarch/memcpy.S | 17 ----------------- sysdeps/arm/armv7/multiarch/memcpy_neon.S | 9 --------- sysdeps/arm/armv7/multiarch/memcpy_vfp.S | 7 ------- 5 files changed, 54 deletions(-) diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile index e834cc9..e69de29 100644 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ b/sysdeps/arm/armv7/multiarch/Makefile @@ -1,3 +0,0 @@ -ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp -endif diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c index 2515418..322eae6 100644 --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c +++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c @@ -31,25 +31,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, { size_t i = 0; - bool use_neon = true; -#ifdef __ARM_NEON__ -# define __memcpy_neon memcpy -#else - use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; -#endif - -#ifndef __ARM_NEON__ - bool use_vfp = true; -# ifdef __SOFTFP__ - use_vfp = (GLRO(dl_hwcap) & HWCAP_ARM_VFP) != 0; -# endif -#endif - IFUNC_IMPL (i, name, memcpy, - IFUNC_IMPL_ADD (array, i, memcpy, use_neon, __memcpy_neon) -#ifndef __ARM_NEON__ - IFUNC_IMPL_ADD (array, i, memcpy, use_vfp, __memcpy_vfp) -#endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); return i; diff --git a/sysdeps/arm/armv7/multiarch/memcpy.S b/sysdeps/arm/armv7/multiarch/memcpy.S index c4f4e80..9ee4d73 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy.S +++ b/sysdeps/arm/armv7/multiarch/memcpy.S @@ -23,37 +23,20 @@ #include #if IS_IN (libc) -/* Under __ARM_NEON__, memcpy_neon.S defines the name memcpy. */ -# ifndef __ARM_NEON__ .text ENTRY(memcpy) .type memcpy, %gnu_indirect_function -# ifdef __SOFTFP__ ldr r1, .Lmemcpy_arm - tst r0, #HWCAP_ARM_VFP - ldrne r1, .Lmemcpy_vfp -# else - ldr r1, .Lmemcpy_vfp -# endif - tst r0, #HWCAP_ARM_NEON - ldrne r1, .Lmemcpy_neon 1: add r0, r1, pc DO_RET(lr) -# ifdef __SOFTFP__ .Lmemcpy_arm: .long C_SYMBOL_NAME(__memcpy_arm) - 1b - PC_OFS -# endif -.Lmemcpy_neon: - .long C_SYMBOL_NAME(__memcpy_neon) - 1b - PC_OFS -.Lmemcpy_vfp: - .long C_SYMBOL_NAME(__memcpy_vfp) - 1b - PC_OFS END(memcpy) libc_hidden_builtin_def (memcpy) -#endif /* Not __ARM_NEON__. */ /* These versions of memcpy are defined not to clobber any VFP or NEON registers so they must always call the ARM variant of the memcpy code. */ diff --git a/sysdeps/arm/armv7/multiarch/memcpy_neon.S b/sysdeps/arm/armv7/multiarch/memcpy_neon.S index e60d1cc..e69de29 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy_neon.S +++ b/sysdeps/arm/armv7/multiarch/memcpy_neon.S @@ -1,9 +0,0 @@ -#ifdef __ARM_NEON__ -/* Under __ARM_NEON__, this file defines memcpy directly. */ -libc_hidden_builtin_def (memcpy) -#else -# define memcpy __memcpy_neon -#endif - -#define MEMCPY_NEON -#include "memcpy_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S index e008c04..e69de29 100644 --- a/sysdeps/arm/armv7/multiarch/memcpy_vfp.S +++ b/sysdeps/arm/armv7/multiarch/memcpy_vfp.S @@ -1,7 +0,0 @@ -/* Under __ARM_NEON__, memcpy_neon.S defines memcpy directly - and the __memcpy_vfp code will never be used. */ -#ifndef __ARM_NEON__ -# define MEMCPY_VFP -# define memcpy __memcpy_vfp -# include "memcpy_impl.S" -#endif -- 1.9.3