root/trunk/libffado/src/libutil/ByteSwap.h

Revision 849, 4.3 kB (checked in by ppalmers, 16 years ago)

temp commit of some SSE/memory optimizations

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #ifndef __FFADO_BYTESWAP__
25 #define __FFADO_BYTESWAP__
26
27 #include <netinet/in.h>
28 #include <endian.h>
29 #include <assert.h>
30
31 // to check for SSE etc...
32 #include "config.h"
33
34 #include <stdio.h>
35
36 #if __BYTE_ORDER == __BIG_ENDIAN
37
38 // no-op for big endian machines
39 static inline void
40 byteSwapToBus(quadlet_t *data, unsigned int nb_elements)
41 {
42     return;
43 }
44
45 static inline void
46 byteSwapFromBus(quadlet_t *data, unsigned int nb_elements)
47 {
48     return;
49 }
50
51 #else
52
53 #ifdef __SSE2__
54 #include <emmintrin.h>
55 #warning SSE2 build
56
57 //static inline void
58 void
59 byteSwapToBus(quadlet_t *data, unsigned int nb_elements)
60 {
61     // Work input until data reaches 16 byte alignment
62     while ((((unsigned long)data) & 0xF) && nb_elements > 0) {
63         *data = htonl(*data);
64         data++;
65         nb_elements--;
66     }
67     assert((((unsigned long)data) & 0xF) == 0);
68
69     // now do the SSE based conversion
70     // we have to go from [A B C D] to [D C B A]
71     // where A, B, C, D are bytes
72     //
73     // the algorithm is:
74     // 1) [A B C D] => [B A D C]
75     // 2) [B A D C] => [D C B A]
76     //
77     // i.e. first do a 2x(2x8bit) swap
78     // then a 2x16bit swap
79    
80     __m128i v;
81     while(nb_elements >= 4) {
82         // prefetch the data for the next round
83          __builtin_prefetch(data+128, 0, 0);
84
85         // load the data into the vector unit
86         v = _mm_load_si128((__m128i*)data);
87         // do first swap
88         v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); //swap it
89         // do second swap
90         v = _mm_or_si128( _mm_slli_epi32( v, 16 ), _mm_srli_epi32( v, 16 ) ); //swap it
91         // store result
92         _mm_store_si128 ((__m128i*)data, v);
93        
94         data += 4;
95         nb_elements -= 4;
96     }
97
98     // and do the remaining ones
99     while (nb_elements > 0) {
100         *data = htonl(*data);
101         data++;
102         nb_elements--;
103     }
104 }
105
106 //static inline void
107 void
108 byteSwapFromBus(quadlet_t *data, unsigned int nb_elements)
109 {
110     // Work input until data reaches 16 byte alignment
111     while ((((unsigned long)data) & 0xF) && nb_elements > 0) {
112         *data = htonl(*data);
113         data++;
114         nb_elements--;
115     }
116     assert((((unsigned long)data) & 0xF) == 0);
117
118     // now do the SSE based conversion
119     // we have to go from [A B C D] to [D C B A]
120     // where A, B, C, D are bytes
121     //
122     // the algorithm is:
123     // 1) [A B C D] => [B A D C]
124     // 2) [B A D C] => [D C B A]
125     //
126     // i.e. first do a 2x(2x8bit) swap
127     // then a 2x16bit swap
128    
129     __m128i v;
130     while(nb_elements >= 4) {
131         // load the data into the vector unit
132         v = _mm_load_si128((__m128i*)data);
133         // do first swap
134         v = _mm_or_si128( _mm_slli_epi16( v, 8 ), _mm_srli_epi16( v, 8 ) ); //swap it
135         // do second swap
136         v = _mm_or_si128( _mm_slli_epi32( v, 16 ), _mm_srli_epi32( v, 16 ) ); //swap it
137         // store result
138         _mm_store_si128 ((__m128i*)data, v);
139        
140         data += 4;
141         nb_elements -= 4;
142     }
143
144     // and do the remaining ones
145     while (nb_elements > 0) {
146         *data = htonl(*data);
147         data++;
148         nb_elements--;
149     }
150 }
151
152 #else
153
154 static inline void
155 byteSwapToBus(quadlet_t *data, unsigned int nb_elements)
156 {
157     unsigned int i=0;
158     for(; i<nb_elements; i++) {
159         *data = htonl(*data);
160         data++;
161     }
162 }
163
164 static inline void
165 byteSwapFromBus(quadlet_t *data, unsigned int nb_elements)
166 {
167     unsigned int i=0;
168     for(; i<nb_elements; i++) {
169         *data = ntohl(*data);
170         data++;
171     }
172 }
173
174 #endif // sse2
175
176 #endif // byte order
177
178 #endif // h
Note: See TracBrowser for help on using the browser.