summaryrefslogtreecommitdiff
path: root/sound
diff options
context:
space:
mode:
authorNicolin Chen <Guangyu.Chen@freescale.com>2014-03-06 19:04:30 +0800
committerNicolin Chen <Guangyu.Chen@freescale.com>2014-03-27 19:18:56 +0800
commitaca6d0c4c96d658021bda4b5a4e454076e27e9f2 (patch)
treee48ceed0fab62bfb2a455d486b00362452fd2dde /sound
parent563931094bf096da2ce6f2cc40387f9e726b3342 (diff)
ENGR00305624-1 ASoC: imx-hdmi-dma: Use neon data copy function
Use neon data copy function as default to improve data copy performance so that we can prevent some noise issue happening to HDMI audio due to the performance issue. Acked-by: Wang Shengjiu <b02247@freescale.com> Signed-off-by: Nicolin Chen <Guangyu.Chen@freescale.com>
Diffstat (limited to 'sound')
-rw-r--r--sound/soc/fsl/Makefile4
-rw-r--r--sound/soc/fsl/hdmi_pcm.S246
-rw-r--r--sound/soc/fsl/imx-hdmi-dma.c51
3 files changed, 300 insertions, 1 deletions
diff --git a/sound/soc/fsl/Makefile b/sound/soc/fsl/Makefile
index 171788ecc7a6..a04afcf18f22 100644
--- a/sound/soc/fsl/Makefile
+++ b/sound/soc/fsl/Makefile
@@ -45,7 +45,7 @@ obj-$(CONFIG_SND_SOC_IMX_AUDMUX) += snd-soc-imx-audmux.o
obj-$(CONFIG_SND_SOC_IMX_PCM_FIQ) += imx-pcm-fiq.o
obj-$(CONFIG_SND_SOC_IMX_PCM_DMA) += imx-pcm-dma.o
-obj-$(CONFIG_SND_SOC_IMX_HDMI_DMA) += imx-hdmi-dma.o
+obj-$(CONFIG_SND_SOC_IMX_HDMI_DMA) += imx-hdmi-dma.o hdmi_pcm.o
# i.MX Machine Support
snd-soc-eukrea-tlv320-objs := eukrea-tlv320.o
@@ -71,3 +71,5 @@ obj-$(CONFIG_SND_SOC_IMX_SPDIF) += snd-soc-imx-spdif.o
obj-$(CONFIG_SND_SOC_IMX_MC13783) += snd-soc-imx-mc13783.o
obj-$(CONFIG_SND_SOC_IMX_HDMI) += snd-soc-imx-hdmi.o
obj-$(CONFIG_SND_SOC_IMX_SI476X) += snd-soc-imx-si476x.o
+
+AFLAGS_hdmi_pcm.o := -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mfloat-abi=softfp
diff --git a/sound/soc/fsl/hdmi_pcm.S b/sound/soc/fsl/hdmi_pcm.S
new file mode 100644
index 000000000000..d8d95fd8f42f
--- /dev/null
+++ b/sound/soc/fsl/hdmi_pcm.S
@@ -0,0 +1,246 @@
+/**
+ * Copyright (C) 2010-2014 Freescale Semiconductor, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+.section .text
+
+.global hdmi_dma_copy_16_neon_lut
+.global hdmi_dma_copy_16_neon_fast
+.global hdmi_dma_copy_24_neon_lut
+.global hdmi_dma_copy_24_neon_fast
+
+
+/**
+ * hdmi_dma_copy_16_neon_lut
+ * Convert pcm sample to iec sample. Pcm sample is 16 bits.
+ * Frame index's between 0 and 47 inclusively. Channel count can be 1, 2, 4, 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_16_neon_lut(unsigned short *src, unsigned int *dst,
+ * int samples, unsigned char *lookup_table);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM16 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ * lookup_table Preconstructed header table. Channels interleaved.
+ */
+
+hdmi_dma_copy_16_neon_lut:
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d6, r12
+
+hdmi_dma_copy_16_neon_lut_start:
+
+ /* get 8 samples to q0 */
+ vld1.16 {d0, d1}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q1, q0 /* count of 1s */
+ vpadd.i8 d2, d2, d3 /* only care about the LST in every element */
+ vand d2, d2, d6 /* clear other bits while keep the least bit */
+ vshl.u8 d2, d2, #3 /* bit p: d2 = d2 << 3 */
+
+ /* get packet header */
+ vld1.8 {d5}, [r3]!
+ veor d4, d5, d2 /* xor bit c */
+
+ /* store: (d4 << 16 | q0) << 8 */
+ vmovl.u8 q2, d4 /* expand from char to short */
+ vzip.16 q0, q2
+ vshl.u32 q0, q0, #8
+ vshl.u32 q1, q2, #8
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_16_neon_lut_start
+
+ mov pc, lr
+
+/**
+ * hdmi_dma_copy_16_neon_fast
+ * Convert pcm sample to iec sample. Pcm sample is 16 bits.
+ * Frame index's between 48 and 191 inclusively.
+ * Channel count can be 1, 2, 4 or 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_16_neon_fast(unsigned short *src,
+ * unsigned int *dst, int samples);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM16 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ */
+
+hdmi_dma_copy_16_neon_fast:
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d6, r12
+
+hdmi_dma_copy_16_neon_fast_start:
+ /* get 8 samples to q0 */
+ vld1.16 {d0, d1}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q1, q0 /* count of 1s */
+ vpadd.i8 d2, d2, d3
+ vand d2, d2, d6 /* clear other bits while keep the LST */
+ /* finally we construct packet header */
+ vshl.u8 d4, d2, #3 /* bit p: d2 = d2 << 3 */
+
+ /* get packet header: always 0 */
+
+ /* store: (d4 << 16 | q0) << 8 */
+ vmovl.u8 q2, d4 /* expand from char to short */
+ vzip.16 q0, q2
+ vshl.u32 q0, q0, #8
+ vshl.u32 q1, q2, #8
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_16_neon_fast_start
+
+ mov pc, lr
+
+
+
+/**
+ * hdmi_dma_copy_24_neon_lut
+ * Convert pcm sample to iec sample. Pcm sample is 24 bits.
+ * Frame index's between 0 and 47 inclusively. Channel count can be 1, 2, 4, 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_24_neon_lut(unsigned int *src, unsigned int *dst,
+ * int samples, unsigned char *lookup_table);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM24 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ * lookup_table Preconstructed header table. Channels interleaved.
+ */
+
+hdmi_dma_copy_24_neon_lut:
+ vpush {d8}
+
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d8, r12
+
+hdmi_dma_copy_24_neon_lut_start:
+
+ /* get 8 samples to q0 and q1 */
+ vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q2, q0 /* count of 1s */
+ vpadd.i8 d4, d4, d5 /* only care about the LSB in every element */
+ vcnt.8 q3, q1
+ vpadd.i8 d6, d6, d7
+ vpadd.i8 d4, d4, d6 /* d4: contains xor result and other dirty bits */
+ vand d4, d4, d8 /* clear other bits while keep the least bit */
+ vshl.u8 d4, d4, #3 /* bit p: d4 = d4 << 3 */
+
+ /* get packet header */
+ vld1.8 {d5}, [r3]!/* d5: original header */
+ veor d5, d5, d4 /* fix bit p */
+
+ /* store: (d5 << 24 | q0) */
+ vmovl.u8 q3, d5 /* expand from char to short */
+ vmovl.u16 q2, d6 /* expand from short to int */
+ vmovl.u16 q3, d7
+ vshl.u32 q2, q2, #24
+ vshl.u32 q3, q3, #24
+ vorr q0, q0, q2
+ vorr q1, q1, q3
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_24_neon_lut_start
+
+ vpop {d8}
+ mov pc, lr
+
+/**
+ * hdmi_dma_copy_24_neon_fast
+ * Convert pcm sample to iec sample. Pcm sample is 24 bits.
+ * Frame index's between 48 and 191 inclusively.
+ * Channel count can be 1, 2, 4 or 8.
+ * Frame count should be multipliable by 4, and Sample count by 8.
+ *
+ * C Prototype
+ * void hdmi_dma_copy_24_neon_fast(unsigned int *src,
+ * unsigned int *dst, int samples);
+ * Return value
+ * None
+ * Parameters
+ * src Source PCM24 samples
+ * dst Dest buffer to store pcm with header
+ * samples Contains sample count (=frame_count * channel_count)
+ */
+
+hdmi_dma_copy_24_neon_fast:
+ vpush {d8}
+
+ mov r12, #1 /* construct vector(1) */
+ vdup.8 d8, r12
+
+hdmi_dma_copy_24_neon_fast_start:
+ /* get 8 samples to q0 and q1 */
+ vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */
+
+ /* pld [r1, #(64*4)] */
+
+ /* xor every bit */
+ vcnt.8 q2, q0 /* count of 1s */
+ vpadd.i8 d4, d4, d5 /* only care about the LSB in every element */
+ vcnt.8 q3, q1
+ vpadd.i8 d6, d6, d7
+ vpadd.i8 d4, d4, d6 /* d4: contains xor result and other dirty bits */
+ vand d4, d4, d8 /* clear other bits while keep the least bit */
+ vshl.u8 d4, d4, #3 /* bit p: d4 = d4 << 3 */
+
+ /* store: (d4 << 24 | q0) */
+ vmovl.u8 q3, d4 /* expand from char to short */
+ vmovl.u16 q2, d6 /* expand from short to int */
+ vmovl.u16 q3, d7
+ vshl.u32 q2, q2, #24
+ vshl.u32 q3, q3, #24
+ vorr q0, q0, q2
+ vorr q1, q1, q3
+ vst1.32 {d0, d1, d2, d3}, [r1]!
+
+ /* decrease sample count */
+ subs r2, r2, #8
+ bne hdmi_dma_copy_24_neon_fast_start
+
+ vpop {d8}
+ mov pc, lr
diff --git a/sound/soc/fsl/imx-hdmi-dma.c b/sound/soc/fsl/imx-hdmi-dma.c
index a0c6406969c9..8f3e79845fa6 100644
--- a/sound/soc/fsl/imx-hdmi-dma.c
+++ b/sound/soc/fsl/imx-hdmi-dma.c
@@ -79,6 +79,17 @@ struct hdmi_dma_priv {
/* max 8 channels supported; channels are interleaved */
static u8 g_packet_head_table[48 * 8];
+void hdmi_dma_copy_16_neon_lut(unsigned short *src, unsigned int *dst,
+ int samples, unsigned char *lookup_table);
+void hdmi_dma_copy_16_neon_fast(unsigned short *src, unsigned int *dst,
+ int samples);
+void hdmi_dma_copy_24_neon_lut(unsigned int *src, unsigned int *dst,
+ int samples, unsigned char *lookup_table);
+void hdmi_dma_copy_24_neon_fast(unsigned int *src, unsigned int *dst,
+ int samples);
+static void hdmi_dma_irq_enable(struct hdmi_dma_priv *priv);
+static void hdmi_dma_irq_disable(struct hdmi_dma_priv *priv);
+
union hdmi_audio_header_t iec_header;
EXPORT_SYMBOL(iec_header);
@@ -275,6 +286,7 @@ static void init_table(int channels)
}
}
+#ifdef HDMI_DMA_NO_NEON
/* Optimization for IEC head */
static void hdmi_dma_copy_16_c_lut(u16 *src, u32 *dst, int samples,
u8 *lookup_table)
@@ -356,6 +368,45 @@ static void hdmi_dma_copy_16(u16 *src, u32 *dst, int framecnt, int channelcnt)
dst += samples;
}
}
+#else
+/* NEON optimization for IEC head*/
+
+/**
+ * Convert pcm samples to iec samples suitable for HDMI transfer.
+ * PCM sample is 16 bits length.
+ * Frame index always starts from 0.
+ * Channel count can be 1, 2, 4, 6, or 8
+ * Sample count (frame_count * channel_count) is multipliable by 8.
+ */
+static void hdmi_dma_copy_16(u16 *src, u32 *dst, int framecount, int channelcount)
+{
+ /* split input frames into 192-frame each */
+ int i, count_in_192 = (framecount + 191) / 192;
+
+ for (i = 0; i < count_in_192; i++) {
+ int count, samples;
+
+ /* handles frame index [0, 48) */
+ count = (framecount < 48) ? framecount : 48;
+ samples = count * channelcount;
+ hdmi_dma_copy_16_neon_lut(src, dst, samples, g_packet_head_table);
+ framecount -= count;
+ if (framecount == 0)
+ break;
+
+ src += samples;
+ dst += samples;
+
+ /* handles frame index [48, 192) */
+ count = (framecount < 192 - 48) ? framecount : (192 - 48);
+ samples = count * channelcount;
+ hdmi_dma_copy_16_neon_fast(src, dst, samples);
+ framecount -= count;
+ src += samples;
+ dst += samples;
+ }
+}
+#endif
static void hdmi_dma_mmap_copy(struct snd_pcm_substream *substream,
int offset, int count)