| /** |
| * Copyright (C) 2010-2014 Freescale Semiconductor, Inc. All Rights Reserved. |
| * |
| * This program is free software; you can redistribute it and/or modify |
| * it under the terms of the GNU General Public License as published by |
| * the Free Software Foundation; either version 2 of the License, or |
| * (at your option) any later version. |
| |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| * GNU General Public License for more details. |
| |
| * You should have received a copy of the GNU General Public License along |
| * with this program; if not, write to the Free Software Foundation, Inc., |
| * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
| */ |
| |
| /* Everything below is emitted into the code section. */ |
| .section .text |
| |
| /* |
| * Exported entry points: PCM16 and PCM24 converters, each in a variant |
| * that reads headers from a lookup table ("lut") and one that does not ("fast"). |
| */ |
| .global hdmi_dma_copy_16_neon_lut |
| .global hdmi_dma_copy_16_neon_fast |
| .global hdmi_dma_copy_24_neon_lut |
| .global hdmi_dma_copy_24_neon_fast |
| |
| |
| /** |
| * hdmi_dma_copy_16_neon_lut |
| * Convert pcm samples to iec samples. Pcm samples are 16 bits wide. |
| * The header of every sample is read from a lookup table and its bit p |
| * (bit 3) is toggled by the parity of the sample bits. |
| * Frame index is between 0 and 47 inclusive. Channel count can be 1, 2, 4, 8. |
| * Frame count should be a multiple of 4, and sample count a multiple of 8. |
| * |
| * C Prototype |
| * void hdmi_dma_copy_16_neon_lut(unsigned short *src, unsigned int *dst, |
| * int samples, unsigned char *lookup_table); |
| * Return value |
| * None |
| * Parameters |
| * src (r0) Source PCM16 samples |
| * dst (r1) Dest buffer to store pcm with header, one 32-bit word per sample |
| * samples (r2) Sample count (=frame_count * channel_count). Nothing is |
| * done when it is <= 0. Samples are handled 8 at a time, so any |
| * other count is effectively rounded up to a multiple of 8. |
| * lookup_table (r3) Preconstructed header table, one byte per sample. |
| * Channels interleaved. |
| * Output word |
| * ((lookup_table[i] ^ (parity(src[i]) << 3)) << 24) | (src[i] << 8) |
| * Clobbers |
| * r0-r3 (pointers advanced, counter consumed), q0-q2, d6, condition flags |
| */ |
| |
| hdmi_dma_copy_16_neon_lut: |
|         cmp     r2, #0                  /* empty or negative count: nothing to convert */ |
|         ble     hdmi_dma_copy_16_neon_lut_exit |
| |
|         vmov.i8 d6, #1                  /* construct vector(1): mask for the parity bit */ |
| |
| hdmi_dma_copy_16_neon_lut_start: |
| |
|         /* get 8 samples to q0 */ |
|         vld1.16 {d0, d1}, [r0]!         /* TODO: aligned */ |
| |
|         /* pld [r1, #(64*4)] */ |
| |
|         /* xor every bit of each sample */ |
|         vcnt.8  q1, q0                  /* count of 1s in every byte */ |
|         vpadd.i8 d2, d2, d3             /* per-sample count; only its LSB matters */ |
|         vand    d2, d2, d6              /* clear other bits while keeping the least bit */ |
|         vshl.u8 d2, d2, #3              /* bit p: d2 = d2 << 3 */ |
| |
|         /* get packet header */ |
|         vld1.8  {d5}, [r3]! |
|         veor    d4, d5, d2              /* fold bit p into the table header */ |
| |
|         /* store: (d4 << 16 | q0) << 8 */ |
|         vmovl.u8 q2, d4                 /* expand from char to short */ |
|         vzip.16 q0, q2                  /* pair each sample with its header halfword */ |
|         vshl.u32 q0, q0, #8 |
|         vshl.u32 q1, q2, #8 |
|         vst1.32 {d0, d1, d2, d3}, [r1]! |
| |
|         /* decrease sample count; bgt also stops on a count that is not a multiple of 8 */ |
|         subs    r2, r2, #8 |
|         bgt     hdmi_dma_copy_16_neon_lut_start |
| |
| hdmi_dma_copy_16_neon_lut_exit: |
|         bx      lr                      /* return to caller */ |
| |
| /** |
| * hdmi_dma_copy_16_neon_fast |
| * Convert pcm samples to iec samples. Pcm samples are 16 bits wide. |
| * No lookup table is used: the header of every sample is 0 except for |
| * bit p (bit 3), which is the parity of the sample bits. |
| * Frame index is between 48 and 191 inclusive. |
| * Channel count can be 1, 2, 4 or 8. |
| * Frame count should be a multiple of 4, and sample count a multiple of 8. |
| * |
| * C Prototype |
| * void hdmi_dma_copy_16_neon_fast(unsigned short *src, |
| * unsigned int *dst, int samples); |
| * Return value |
| * None |
| * Parameters |
| * src (r0) Source PCM16 samples |
| * dst (r1) Dest buffer to store pcm with header, one 32-bit word per sample |
| * samples (r2) Sample count (=frame_count * channel_count). Nothing is |
| * done when it is <= 0. Samples are handled 8 at a time, so any |
| * other count is effectively rounded up to a multiple of 8. |
| * Output word |
| * ((parity(src[i]) << 3) << 24) | (src[i] << 8) |
| * Clobbers |
| * r0-r2 (pointers advanced, counter consumed), q0-q2, d6, condition flags |
| */ |
| |
| hdmi_dma_copy_16_neon_fast: |
|         cmp     r2, #0                  /* empty or negative count: nothing to convert */ |
|         ble     hdmi_dma_copy_16_neon_fast_exit |
| |
|         vmov.i8 d6, #1                  /* construct vector(1): mask for the parity bit */ |
| |
| hdmi_dma_copy_16_neon_fast_start: |
|         /* get 8 samples to q0 */ |
|         vld1.16 {d0, d1}, [r0]!         /* TODO: aligned */ |
| |
|         /* pld [r1, #(64*4)] */ |
| |
|         /* xor every bit of each sample */ |
|         vcnt.8  q1, q0                  /* count of 1s in every byte */ |
|         vpadd.i8 d2, d2, d3             /* per-sample count; only its LSB matters */ |
|         vand    d2, d2, d6              /* clear other bits while keeping the LSB */ |
|         /* finally we construct packet header */ |
|         vshl.u8 d4, d2, #3              /* bit p: d4 = d2 << 3 */ |
| |
|         /* rest of the packet header: always 0 */ |
| |
|         /* store: (d4 << 16 | q0) << 8 */ |
|         vmovl.u8 q2, d4                 /* expand from char to short */ |
|         vzip.16 q0, q2                  /* pair each sample with its header halfword */ |
|         vshl.u32 q0, q0, #8 |
|         vshl.u32 q1, q2, #8 |
|         vst1.32 {d0, d1, d2, d3}, [r1]! |
| |
|         /* decrease sample count; bgt also stops on a count that is not a multiple of 8 */ |
|         subs    r2, r2, #8 |
|         bgt     hdmi_dma_copy_16_neon_fast_start |
| |
| hdmi_dma_copy_16_neon_fast_exit: |
|         bx      lr                      /* return to caller */ |
| |
| |
| |
| /** |
| * hdmi_dma_copy_24_neon_lut |
| * Convert pcm samples to iec samples. Pcm samples are 24 bits wide, one per |
| * 32-bit source word. |
| * The header of every sample is read from a lookup table and its bit p |
| * (bit 3) is toggled by the parity of the source word. |
| * Frame index is between 0 and 47 inclusive. Channel count can be 1, 2, 4, 8. |
| * Frame count should be a multiple of 4, and sample count a multiple of 8. |
| * |
| * NOTE(review): the parity is taken over all 32 bits of each source word and |
| * the header is OR-ed into bits 31:24, so bits 31:24 of every source word are |
| * presumably expected to be zero - confirm against the caller. |
| * |
| * C Prototype |
| * void hdmi_dma_copy_24_neon_lut(unsigned int *src, unsigned int *dst, |
| * int samples, unsigned char *lookup_table); |
| * Return value |
| * None |
| * Parameters |
| * src (r0) Source PCM24 samples |
| * dst (r1) Dest buffer to store pcm with header, one 32-bit word per sample |
| * samples (r2) Sample count (=frame_count * channel_count). Nothing is |
| * done when it is <= 0. Samples are handled 8 at a time, so any |
| * other count is effectively rounded up to a multiple of 8. |
| * lookup_table (r3) Preconstructed header table, one byte per sample. |
| * Channels interleaved. |
| * Output word |
| * ((lookup_table[i] ^ (parity(src[i]) << 3)) << 24) | src[i] |
| * Clobbers |
| * r0-r3 (pointers advanced, counter consumed), q0-q3, condition flags. |
| * d8 is used as well but saved and restored on the stack. |
| */ |
| |
| hdmi_dma_copy_24_neon_lut: |
|         cmp     r2, #0                  /* empty or negative count: leave before touching the stack */ |
|         ble     hdmi_dma_copy_24_neon_lut_exit |
| |
|         vpush   {d8}                    /* d8 must be preserved for the caller */ |
| |
|         vmov.i8 d8, #1                  /* construct vector(1): mask for the parity bit */ |
| |
| hdmi_dma_copy_24_neon_lut_start: |
| |
|         /* get 8 samples to q0 and q1 */ |
|         vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */ |
| |
|         /* pld [r1, #(64*4)] */ |
| |
|         /* xor every bit of each source word */ |
|         vcnt.8  q2, q0                  /* count of 1s in every byte */ |
|         vpadd.i8 d4, d4, d5             /* only care about the LSB in every element */ |
|         vcnt.8  q3, q1 |
|         vpadd.i8 d6, d6, d7 |
|         vpadd.i8 d4, d4, d6             /* d4: contains xor result and other dirty bits */ |
|         vand    d4, d4, d8              /* clear other bits while keeping the least bit */ |
|         vshl.u8 d4, d4, #3              /* bit p: d4 = d4 << 3 */ |
| |
|         /* get packet header */ |
|         vld1.8  {d5}, [r3]!             /* d5: original header */ |
|         veor    d5, d5, d4              /* fix bit p */ |
| |
|         /* store: (d5 << 24 | q0) */ |
|         vmovl.u8 q3, d5                 /* expand from char to short */ |
|         vmovl.u16 q2, d6                /* expand from short to int */ |
|         vmovl.u16 q3, d7 |
|         vshl.u32 q2, q2, #24 |
|         vshl.u32 q3, q3, #24 |
|         vorr    q0, q0, q2 |
|         vorr    q1, q1, q3 |
|         vst1.32 {d0, d1, d2, d3}, [r1]! |
| |
|         /* decrease sample count; bgt also stops on a count that is not a multiple of 8 */ |
|         subs    r2, r2, #8 |
|         bgt     hdmi_dma_copy_24_neon_lut_start |
| |
|         vpop    {d8} |
| hdmi_dma_copy_24_neon_lut_exit: |
|         bx      lr                      /* return to caller */ |
| |
| /** |
| * hdmi_dma_copy_24_neon_fast |
| * Convert pcm samples to iec samples. Pcm samples are 24 bits wide, one per |
| * 32-bit source word. |
| * No lookup table is used: the header of every sample is 0 except for |
| * bit p (bit 3), which is the parity of the source word. |
| * Frame index is between 48 and 191 inclusive. |
| * Channel count can be 1, 2, 4 or 8. |
| * Frame count should be a multiple of 4, and sample count a multiple of 8. |
| * |
| * NOTE(review): the parity is taken over all 32 bits of each source word and |
| * the header is OR-ed into bits 31:24, so bits 31:24 of every source word are |
| * presumably expected to be zero - confirm against the caller. |
| * |
| * C Prototype |
| * void hdmi_dma_copy_24_neon_fast(unsigned int *src, |
| * unsigned int *dst, int samples); |
| * Return value |
| * None |
| * Parameters |
| * src (r0) Source PCM24 samples |
| * dst (r1) Dest buffer to store pcm with header, one 32-bit word per sample |
| * samples (r2) Sample count (=frame_count * channel_count). Nothing is |
| * done when it is <= 0. Samples are handled 8 at a time, so any |
| * other count is effectively rounded up to a multiple of 8. |
| * Output word |
| * ((parity(src[i]) << 3) << 24) | src[i] |
| * Clobbers |
| * r0-r2 (pointers advanced, counter consumed), q0-q3, condition flags. |
| * d8 is used as well but saved and restored on the stack. |
| */ |
| |
| hdmi_dma_copy_24_neon_fast: |
|         cmp     r2, #0                  /* empty or negative count: leave before touching the stack */ |
|         ble     hdmi_dma_copy_24_neon_fast_exit |
| |
|         vpush   {d8}                    /* d8 must be preserved for the caller */ |
| |
|         vmov.i8 d8, #1                  /* construct vector(1): mask for the parity bit */ |
| |
| hdmi_dma_copy_24_neon_fast_start: |
|         /* get 8 samples to q0 and q1 */ |
|         vld1.32 {d0, d1, d2, d3}, [r0]! /* TODO: aligned */ |
| |
|         /* pld [r1, #(64*4)] */ |
| |
|         /* xor every bit of each source word */ |
|         vcnt.8  q2, q0                  /* count of 1s in every byte */ |
|         vpadd.i8 d4, d4, d5             /* only care about the LSB in every element */ |
|         vcnt.8  q3, q1 |
|         vpadd.i8 d6, d6, d7 |
|         vpadd.i8 d4, d4, d6             /* d4: contains xor result and other dirty bits */ |
|         vand    d4, d4, d8              /* clear other bits while keeping the least bit */ |
|         vshl.u8 d4, d4, #3              /* bit p: d4 = d4 << 3 */ |
| |
|         /* store: (d4 << 24 | q0) */ |
|         vmovl.u8 q3, d4                 /* expand from char to short */ |
|         vmovl.u16 q2, d6                /* expand from short to int */ |
|         vmovl.u16 q3, d7 |
|         vshl.u32 q2, q2, #24 |
|         vshl.u32 q3, q3, #24 |
|         vorr    q0, q0, q2 |
|         vorr    q1, q1, q3 |
|         vst1.32 {d0, d1, d2, d3}, [r1]! |
| |
|         /* decrease sample count; bgt also stops on a count that is not a multiple of 8 */ |
|         subs    r2, r2, #8 |
|         bgt     hdmi_dma_copy_24_neon_fast_start |
| |
|         vpop    {d8} |
| hdmi_dma_copy_24_neon_fast_exit: |
|         bx      lr                      /* return to caller */ |