root/trunk/libffado/src/libutil/ByteSwap.h

Revision 847, 4.0 kB (checked in by ppalmers, 15 years ago)

use SSE2 for byteswaps (20% faster than ntohl())

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #ifndef __FFADO_BYTESWAP__
25 #define __FFADO_BYTESWAP__
26
27 #include <netinet/in.h>
28 #include <assert.h>
29
30 // to check for SSE etc...
31 #include "config.h"
32
33 #include <stdio.h>
34
35 #ifdef __SSE2__
36 #include <emmintrin.h>
37 #warning SSE2 build
38
39 static inline void
40 byteSwapToBus(quadlet_t *data, unsigned int nb_elements)
41 {
42     // Work input until data reaches 16 byte alignment
43     while ((((unsigned long)data) & 0xF) && nb_elements > 0) {
44         *data = htonl(*data);
45         data++;
46         nb_elements--;
47     }
48     assert((((unsigned long)data) & 0xF) == 0);
49
50     // now do the SSE based conversion
51     // we have to go from [A B C D] to [D C B A]
52     // where A, B, C, D are bytes
53     //
54     // the algorithm is:
55     // 1) [A B C D] => [B A D C]
56     // 2) [B A D C] => [D C B A]
57     //
58     // i.e. first do a 2x(2x8bit) swap
59     // then a 2x16bit swap
60    
61     __m128i v;
62     while(nb_elements >= 4) {
63         // prefetch the data for the next round
64          __builtin_prefetch(data+128, 0, 0);
65
66         // load the data into the vector unit
67         v = _mm_load_si128((__m128i*)data);
68         // do first swap
69         v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); //swap it
70         // do second swap
71         v = _mm_or_si128( _mm_slli_epi32( v, 16 ), _mm_srli_epi32( v, 16 ) ); //swap it
72         // store result
73         _mm_store_si128 ((__m128i*)data, v);
74        
75         data += 4;
76         nb_elements -= 4;
77     }
78
79     // and do the remaining ones
80     while (nb_elements > 0) {
81         *data = htonl(*data);
82         data++;
83         nb_elements--;
84     }
85 }
86
87 static inline void
88 byteSwapFromBus(quadlet_t *data, unsigned int nb_elements)
89 {
90     // Work input until data reaches 16 byte alignment
91     while ((((unsigned long)data) & 0xF) && nb_elements > 0) {
92         *data = htonl(*data);
93         data++;
94         nb_elements--;
95     }
96     assert((((unsigned long)data) & 0xF) == 0);
97
98     // now do the SSE based conversion
99     // we have to go from [A B C D] to [D C B A]
100     // where A, B, C, D are bytes
101     //
102     // the algorithm is:
103     // 1) [A B C D] => [B A D C]
104     // 2) [B A D C] => [D C B A]
105     //
106     // i.e. first do a 2x(2x8bit) swap
107     // then a 2x16bit swap
108    
109     __m128i v;
110     while(nb_elements >= 4) {
111         // prefetch the data for the next round
112          __builtin_prefetch(data+128, 0, 0);
113
114         // load the data into the vector unit
115         v = _mm_load_si128((__m128i*)data);
116         // do first swap
117         v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); //swap it
118         // do second swap
119         v = _mm_or_si128( _mm_slli_epi32( v, 16 ), _mm_srli_epi32( v, 16 ) ); //swap it
120         // store result
121         _mm_store_si128 ((__m128i*)data, v);
122        
123         data += 4;
124         nb_elements -= 4;
125     }
126
127     // and do the remaining ones
128     while (nb_elements > 0) {
129         *data = htonl(*data);
130         data++;
131         nb_elements--;
132     }
133 }
134
135 #else
136
137 static inline void
138 byteSwapToBus(quadlet_t *data, unsigned int nb_elements)
139 {
140     unsigned int i=0;
141     for(; i<nb_elements; i++) {
142         *data = htonl(*data);
143         data++;
144     }
145 }
146
147 static inline void
148 byteSwapFromBus(quadlet_t *data, unsigned int nb_elements)
149 {
150     unsigned int i=0;
151     for(; i<nb_elements; i++) {
152         *data = ntohl(*data);
153         data++;
154     }
155 }
156
157 #endif
158
159 #endif
Note: See TracBrowser for help on using the browser.