root/trunk/libffado/src/libstreaming/amdtp/AmdtpTransmitStreamProcessor.cpp

Revision 856, 37.5 kB (checked in by ppalmers, 16 years ago)

revert to old structure and implement SSE routines in there. This avoids memory accesses hence speeding things up

Line 
1 /*
2  * Copyright (C) 2005-2007 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 3 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #include "config.h"
25 #include "AmdtpTransmitStreamProcessor.h"
26 #include "AmdtpPort.h"
27 #include "../StreamProcessorManager.h"
28 #include "devicemanager.h"
29
30 #include "libutil/Time.h"
31
32 #include "libieee1394/ieee1394service.h"
33 #include "libieee1394/IsoHandlerManager.h"
34 #include "libieee1394/cycletimer.h"
35
36 #include <netinet/in.h>
37 #include <assert.h>
38
39 #define AMDTP_FLOAT_MULTIPLIER 2147483392.0
40
41 namespace Streaming
42 {
43
44 /* transmit */
/**
 * @brief Construct an AMDTP stream processor for the transmit direction.
 *
 * The data block counter and the audio/MIDI port counts all start at zero.
 *
 * @param parent    device this stream processor belongs to (forwarded to the
 *                  StreamProcessor base class)
 * @param dimension stored in m_dimension; the header code writes it into the
 *                  'dbs' field of each packet and the encoders use it as the
 *                  quadlet stride between consecutive events
 */
AmdtpTransmitStreamProcessor::AmdtpTransmitStreamProcessor(FFADODevice &parent, int dimension)
        : StreamProcessor(parent, ePT_Transmit)
        , m_dimension( dimension )
        , m_dbc( 0 )
        , m_nb_audio_ports( 0 )
        , m_nb_midi_ports( 0 )
{}
52
/**
 * @brief Prepare the header of the packet for the given cycle and decide
 *        what kind of packet (if any) should be sent.
 *
 * The common header fields, *tag and *sy are always written. The timestamp
 * of the buffer head is taken as the presentation time of the next block of
 * frames; the transmit time is that timestamp minus
 * AMDTP_TRANSMIT_TRANSFER_DELAY. Both are compared (in cycles) with 'cycle'.
 *
 * @param data       packet buffer, interpreted as a struct iec61883_packet
 * @param length     receives the packet length, but only when a data packet
 *                   header is filled in (see fillDataPacketHeader)
 * @param tag        receives IEC61883_TAG_WITH_CIP
 * @param sy         receives 0
 * @param cycle      cycle this packet is being generated for
 * @param dropped    when non-zero, an extra debug line is printed
 * @param max_length not used by this function
 *
 * @return eCRV_XRun        fewer than m_syt_interval frames are buffered and
 *                          the presentation time is too close, or enough
 *                          frames are buffered but the transmit cycle has
 *                          already passed
 *         eCRV_Again       fewer than m_syt_interval frames are buffered but
 *                          there is still time before presentation
 *         eCRV_Packet      the data packet header was filled in
 *         eCRV_Defer       same as eCRV_Packet, but fewer than
 *                          2*m_syt_interval frames were buffered
 *         eCRV_EmptyPacket the block is too early to be transmitted
 *         eCRV_Invalid     fall-through; every branch above returns first
 */
enum StreamProcessor::eChildReturnValue
AmdtpTransmitStreamProcessor::generatePacketHeader (
    unsigned char *data, unsigned int *length,
    unsigned char *tag, unsigned char *sy,
    int cycle, unsigned int dropped, unsigned int max_length )
{
    __builtin_prefetch(data, 1, 0); // prefetch events for write, no temporal locality
    struct iec61883_packet *packet = (struct iec61883_packet *)data;
    /* Our node ID can change after a bus reset, so it is best to fetch
    * our node ID for each packet. */
    packet->sid = m_local_node_id;

    packet->dbs = m_dimension;
    packet->fn = 0;
    packet->qpc = 0;
    packet->sph = 0;
    packet->reserved = 0;
    packet->dbc = m_dbc;
    packet->eoh1 = 2;
    packet->fmt = IEC61883_FMT_AMDTP;

    *tag = IEC61883_TAG_WITH_CIP;
    *sy = 0;

    signed int fc;
    uint64_t presentation_time;
    unsigned int presentation_cycle;
    int cycles_until_presentation;

    uint64_t transmit_at_time;
    unsigned int transmit_at_cycle;
    int cycles_until_transmit;

    debugOutput ( DEBUG_LEVEL_ULTRA_VERBOSE, "Try for cycle %d\n", cycle );
    // check whether the packet buffer has packets for us to send.
    // the base timestamp is the one of the next sample in the buffer;
    // fc receives the number of frames currently in the buffer
    ffado_timestamp_t ts_head_tmp;
    m_data_buffer->getBufferHeadTimestamp ( &ts_head_tmp, &fc ); // thread safe

    // the timestamp gives us the time at which we want the sample block
    // to be output by the device
    presentation_time = ( uint64_t ) ts_head_tmp;
    m_last_timestamp = presentation_time;

    // now we calculate the time when we have to transmit the sample block
    transmit_at_time = substractTicks ( presentation_time, AMDTP_TRANSMIT_TRANSFER_DELAY );

    // calculate the cycle this block should be presented in
    // (this is just a virtual calculation since at that time it should
    //  already be in the device's buffer)
    presentation_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( presentation_time ) );

    // calculate the cycle this block should be transmitted in
    transmit_at_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( transmit_at_time ) );

    // we can check whether this cycle is within the 'window' we have
    // to send this packet.
    // first calculate the number of cycles left before presentation time
    cycles_until_presentation = diffCycles ( presentation_cycle, cycle );

    // then calculate the number of cycles left before the cycle in which
    // the block should be transmitted (negative when that cycle has passed)
    cycles_until_transmit = diffCycles ( transmit_at_cycle, cycle );

    if (dropped) {
        debugOutput ( DEBUG_LEVEL_VERBOSE,
                    "Gen HDR: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
                    cycle,
                    transmit_at_cycle, cycles_until_transmit,
                    transmit_at_time, ( unsigned int ) TICKS_TO_CYCLES ( transmit_at_time ),
                    presentation_time, ( unsigned int ) TICKS_TO_CYCLES ( presentation_time ) );
    }
    // two different options:
    // 1) there are not enough frames for one packet
    //      => determine whether this is a problem, since we might still
    //         have some time to send it
    // 2) there are enough frames
    //      => determine whether we have to send them in this packet
    if ( fc < ( signed int ) m_syt_interval )
    {
        // not enough frames in the buffer,

        // we can still postpone the queueing of the packets
        // if we are far enough ahead of the presentation time
        if ( cycles_until_presentation <= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION )
        {
            debugOutput ( DEBUG_LEVEL_VERBOSE,
                        "Insufficient frames (P): N=%02d, CY=%04u, TC=%04u, CUT=%04d\n",
                        fc, cycle, transmit_at_cycle, cycles_until_transmit );
            // we are too late
            return eCRV_XRun;
        }
        else
        {
            unsigned int now_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( m_1394service.getCycleTimerTicks() ) );

            // NOTE(review): the same message is emitted twice, once as
            // verbose debug output and once as a warning
            debugOutput ( DEBUG_LEVEL_VERBOSE,
                        "Insufficient frames (NP): N=%02d, CY=%04u, TC=%04u, CUT=%04d, NOW=%04d\n",
                        fc, cycle, transmit_at_cycle, cycles_until_transmit, now_cycle );
            debugWarning("Insufficient frames (NP): N=%02d, CY=%04u, TC=%04u, CUT=%04d, NOW=%04d\n",
                         fc, cycle, transmit_at_cycle, cycles_until_transmit, now_cycle );

            // there is still time left to send the packet
            // we want the system to give this packet another go at a later time instant
            return eCRV_Again; // note that the raw1394 again system doesn't work as expected

            // we could wait here for a certain time before trying again. However, this
            // is not going to work since we then block the iterator thread, hence also
            // the receiving code, meaning that we are not processing received packets,
            // and hence there is no progression in the number of frames available.

            // for example:
            // SleepRelativeUsec(125); // one cycle
            // goto try_block_of_frames;

            // or more advanced, calculate how many cycles we are ahead of 'now' and
            // base the sleep on that.

            // note that this requires that there is one thread for each IsoHandler,
            // otherwise we're in the deadlock described above.
        }
    }
    else
    {
        // there are enough frames, so check the time they are intended for
        // all frames have a certain 'time window' in which they can be sent
        // this corresponds to the range of the timestamp mechanism:
        // we can send a packet 15 cycles in advance of the 'presentation time'
        // in theory we can send the packet up till one cycle before the presentation time,
        // however this is not very smart.

        // There are 3 options:
        // 1) the frame block is too early
        //      => send an empty packet
        // 2) the frame block is within the window
        //      => send it
        // 3) the frame block is too late
        //      => discard (and raise xrun?)
        //         get next block of frames and repeat

        if(cycles_until_transmit < 0)
        {
            // we are too late
            debugOutput(DEBUG_LEVEL_VERBOSE,
                        "Too late: CY=%04u, TC=%04u, CUT=%04d, TSP=%011llu (%04u)\n",
                        cycle,
                        transmit_at_cycle, cycles_until_transmit,
                        presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time) );
            //debugShowBackLogLines(200);
//             // however, if we can send this sufficiently before the presentation
//             // time, it could be harmless.
//             // NOTE: dangerous since the device has no way of reporting that it didn't get
//             //       this packet on time.
//             if(cycles_until_presentation >= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION)
//             {
//                 // we are not that late and can still try to transmit the packet
//                 m_dbc += fillDataPacketHeader(packet, length, m_last_timestamp);
//                 return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
//             }
//             else   // definitely too late
//             {
                return eCRV_XRun;
//             }
        }
        else if(cycles_until_transmit <= AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY)
        {
            // it's time to send the packet; the data block counter advances
            // by the number of events that fillDataPacketHeader reports.
            // eCRV_Defer is returned instead of eCRV_Packet when fewer than
            // two packets worth of frames are buffered.
            m_dbc += fillDataPacketHeader(packet, length, m_last_timestamp);
            return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
        }
        else
        {
            debugOutput ( DEBUG_LEVEL_VERY_VERBOSE,
                        "Too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
                        cycle,
                        transmit_at_cycle, cycles_until_transmit,
                        transmit_at_time, ( unsigned int ) TICKS_TO_CYCLES ( transmit_at_time ),
                        presentation_time, ( unsigned int ) TICKS_TO_CYCLES ( presentation_time ) );
#ifdef DEBUG
            if ( cycles_until_transmit > AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY + 1 )
            {
                debugOutput ( DEBUG_LEVEL_VERY_VERBOSE,
                            "Way too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
                            cycle,
                            transmit_at_cycle, cycles_until_transmit,
                            transmit_at_time, ( unsigned int ) TICKS_TO_CYCLES ( transmit_at_time ),
                            presentation_time, ( unsigned int ) TICKS_TO_CYCLES ( presentation_time ) );
            }
#endif
            // we are too early, send only an empty packet
            return eCRV_EmptyPacket;
        }
    }
    // not reached: every branch above returns
    return eCRV_Invalid;
}
249
250 enum StreamProcessor::eChildReturnValue
251 AmdtpTransmitStreamProcessor::generatePacketData (
252     unsigned char *data, unsigned int *length,
253     unsigned char *tag, unsigned char *sy,
254     int cycle, unsigned int dropped, unsigned int max_length )
255 {
256     if ( m_data_buffer->readFrames ( m_syt_interval, ( char * ) ( data + 8 ) ) )
257     {
258         debugOutput ( DEBUG_LEVEL_ULTRA_VERBOSE, "XMIT DATA (cy %04d): TSP=%011llu (%04u)\n",
259                     cycle, m_last_timestamp, ( unsigned int ) TICKS_TO_CYCLES ( m_last_timestamp ) );
260         return eCRV_OK;
261     }
262     else return eCRV_XRun;
263
264 }
265
266 enum StreamProcessor::eChildReturnValue
267 AmdtpTransmitStreamProcessor::generateSilentPacketHeader (
268     unsigned char *data, unsigned int *length,
269     unsigned char *tag, unsigned char *sy,
270     int cycle, unsigned int dropped, unsigned int max_length )
271 {
272     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
273     debugOutput ( DEBUG_LEVEL_ULTRA_VERBOSE, "XMIT NONE (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
274                 cycle, m_last_timestamp, ( unsigned int ) TICKS_TO_CYCLES ( m_last_timestamp ) );
275
276     /* Our node ID can change after a bus reset, so it is best to fetch
277     * our node ID for each packet. */
278     packet->sid = m_local_node_id;
279
280     packet->dbs = m_dimension;
281     packet->fn = 0;
282     packet->qpc = 0;
283     packet->sph = 0;
284     packet->reserved = 0;
285     packet->dbc = m_dbc;
286     packet->eoh1 = 2;
287     packet->fmt = IEC61883_FMT_AMDTP;
288
289     *tag = IEC61883_TAG_WITH_CIP;
290     *sy = 0;
291
292     m_dbc += fillNoDataPacketHeader ( packet, length );
293     return eCRV_OK;
294 }
295
/**
 * @brief Payload stage for a packet without audio data.
 *
 * Nothing is written to the packet here and none of the parameters are used.
 *
 * @return always eCRV_OK
 */
enum StreamProcessor::eChildReturnValue
AmdtpTransmitStreamProcessor::generateSilentPacketData (
    unsigned char *data, unsigned int *length,
    unsigned char *tag, unsigned char *sy,
    int cycle, unsigned int dropped, unsigned int max_length )
{
    return eCRV_OK; // no need to do anything
}
304
305 unsigned int AmdtpTransmitStreamProcessor::fillDataPacketHeader (
306     struct iec61883_packet *packet, unsigned int* length,
307     uint32_t ts )
308 {
309
310     packet->fdf = m_fdf;
311
312     // convert the timestamp to SYT format
313     uint16_t timestamp_SYT = TICKS_TO_SYT ( ts );
314     packet->syt = ntohs ( timestamp_SYT );
315
316     *length = m_syt_interval*sizeof ( quadlet_t ) *m_dimension + 8;
317
318     return m_syt_interval;
319 }
320
321 unsigned int AmdtpTransmitStreamProcessor::fillNoDataPacketHeader (
322     struct iec61883_packet *packet, unsigned int* length )
323 {
324
325     // no-data packets have syt=0xFFFF
326     // and have the usual amount of events as dummy data (?)
327     packet->fdf = IEC61883_FDF_NODATA;
328     packet->syt = 0xffff;
329
330     // FIXME: either make this a setting or choose
331     bool send_payload=true;
332     if ( send_payload )
333     {
334         // this means no-data packets with payload (DICE doesn't like that)
335         *length = 2*sizeof ( quadlet_t ) + m_syt_interval * m_dimension * sizeof ( quadlet_t );
336         return m_syt_interval;
337     }
338     else
339     {
340         // dbc is not incremented
341         // this means no-data packets without payload
342         *length = 2*sizeof ( quadlet_t );
343         return 0;
344     }
345 }
346
347 unsigned int
348 AmdtpTransmitStreamProcessor::getSytInterval() {
349     switch (m_StreamProcessorManager.getNominalRate()) {
350         case 32000:
351         case 44100:
352         case 48000:
353             return 8;
354         case 88200:
355         case 96000:
356             return 16;
357         case 176400:
358         case 192000:
359             return 32;
360         default:
361             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
362             return 0;
363     }
364 }
365 unsigned int
366 AmdtpTransmitStreamProcessor::getFDF() {
367     switch (m_StreamProcessorManager.getNominalRate()) {
368         case 32000: return IEC61883_FDF_SFC_32KHZ;
369         case 44100: return IEC61883_FDF_SFC_44K1HZ;
370         case 48000: return IEC61883_FDF_SFC_48KHZ;
371         case 88200: return IEC61883_FDF_SFC_88K2HZ;
372         case 96000: return IEC61883_FDF_SFC_96KHZ;
373         case 176400: return IEC61883_FDF_SFC_176K4HZ;
374         case 192000: return IEC61883_FDF_SFC_192KHZ;
375         default:
376             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
377             return 0;
378     }
379 }
380
381 bool AmdtpTransmitStreamProcessor::prepareChild()
382 {
383     debugOutput ( DEBUG_LEVEL_VERBOSE, "Preparing (%p)...\n", this );
384     m_syt_interval = getSytInterval();
385     m_fdf = getFDF();
386
387     iec61883_cip_init (
388         &m_cip_status,
389         IEC61883_FMT_AMDTP,
390         m_fdf,
391         m_StreamProcessorManager.getNominalRate(),
392         m_dimension,
393         m_syt_interval );
394
395     if (!initPortCache()) {
396         debugError("Could not init port cache\n");
397         return false;
398     }
399
400     return true;
401 }
402
403 /*
404 * compose the event streams for the packets from the port buffers
405 */
406 bool AmdtpTransmitStreamProcessor::processWriteBlock ( char *data,
407         unsigned int nevents, unsigned int offset )
408 {
409     // update the variable parts of the cache
410     updatePortCache();
411
412     // encode audio data
413     switch(m_StreamProcessorManager.getAudioDataType()) {
414         case StreamProcessorManager::eADT_Int24:
415             encodeAudioPortsInt24((quadlet_t *)data, offset, nevents);
416             break;
417         case StreamProcessorManager::eADT_Float:
418             encodeAudioPortsFloat((quadlet_t *)data, offset, nevents);
419             break;
420     }
421
422     // do midi ports
423     encodeMidiPorts((quadlet_t *)data, offset, nevents);
424     return true;
425 }
426
427 bool
428 AmdtpTransmitStreamProcessor::transmitSilenceBlock(
429     char *data, unsigned int nevents, unsigned int offset)
430 {
431     // no need to update the port cache when transmitting silence since
432     // no dynamic values are used to do so.
433     encodeAudioPortsSilence((quadlet_t *)data, offset, nevents);
434     encodeMidiPortsSilence((quadlet_t *)data, offset, nevents);
435     return true;
436 }
437
438 /**
439  * @brief encodes all audio ports in the cache to events (silent data)
440  * @param data
441  * @param offset
442  * @param nevents
443  */
444 void
445 AmdtpTransmitStreamProcessor::encodeAudioPortsSilence(quadlet_t *data,
446                                                       unsigned int offset,
447                                                       unsigned int nevents)
448 {
449     unsigned int j;
450     quadlet_t *target_event;
451     unsigned int i;
452
453     for (i = 0; i < m_nb_audio_ports; i++) {
454         target_event = (quadlet_t *)(data + i);
455
456         for (j = 0;j < nevents; j += 1)
457         {
458             *target_event = 0x00000040;
459             target_event += m_dimension;
460         }
461     }
462 }
463
464 #ifdef __SSE2__
465 //#if 0
466 #include <emmintrin.h>
467 #warning SSE2 build
468
469 /**
470  * @brief mux all audio ports to events
471  * @param data
472  * @param offset
473  * @param nevents
474  */
475 void
476 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
477                                                     unsigned int offset,
478                                                     unsigned int nevents)
479 {
480     unsigned int j;
481     quadlet_t *target_event;
482     unsigned int i;
483
484     float * client_buffers[4];
485     float tmp_values[4] __attribute__ ((aligned (16)));
486     uint32_t tmp_values_int[4] __attribute__ ((aligned (16)));
487
488     // prepare the scratch buffer
489     assert(m_scratch_buffer_size_bytes > nevents * 4);
490     memset(m_scratch_buffer, 0, nevents * 4);
491
492     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
493     const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER);
494
495     // this assumes that audio ports are sorted by position,
496     // and that there are no gaps
497     for (i = 0; i < m_nb_audio_ports-4; i += 4) {
498         struct _MBLA_port_cache *p;
499
500         // get the port buffers
501         for (j=0; j<4; j++) {
502             p = &(m_audio_ports.at(i+j));
503             if(p->buffer && p->enabled) {
504                 client_buffers[j] = (float *) p->buffer;
505                 client_buffers[j] += offset;
506             } else {
507                 // if a port is disabled or has no valid
508                 // buffer, use the scratch buffer (all zero's)
509                 client_buffers[j] = (float *) m_scratch_buffer;
510             }
511         }
512
513         // the base event for this position
514         target_event = (quadlet_t *)(data + i);
515
516         // process the events
517         for (j=0;j < nevents; j += 1)
518         {
519             // read the values
520             tmp_values[0] = *(client_buffers[0]);
521             tmp_values[1] = *(client_buffers[1]);
522             tmp_values[2] = *(client_buffers[2]);
523             tmp_values[3] = *(client_buffers[3]);
524
525             // now do the SSE based conversion/labeling
526             __m128 v_float = *((__m128*)tmp_values);
527             __m128i *target = (__m128i*)target_event;
528             __m128i v_int;
529
530             // multiply
531             v_float = _mm_mul_ps(v_float, mult);
532             // convert to signed integer
533             v_int = _mm_cvttps_epi32( v_float );
534             // shift right 8 bits
535             v_int = _mm_srli_epi32( v_int, 8 );
536             // label it
537             v_int = _mm_or_si128( v_int, label );
538
539             // do endian conversion (SSE is always little endian)
540             // do first swap
541             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
542             // do second swap
543             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
544
545             // store the packed int
546             // (target misalignment is assumed since we don't know the m_dimension)
547             _mm_storeu_si128 (target, v_int);
548
549             // increment the buffer pointers
550             client_buffers[0]++;
551             client_buffers[1]++;
552             client_buffers[2]++;
553             client_buffers[3]++;
554
555             // go to next target event position
556             target_event += m_dimension;
557         }
558     }
559
560     // do remaining ports
561     // NOTE: these can be time-SSE'd
562     for (; i < m_nb_audio_ports; i++) {
563         struct _MBLA_port_cache &p = m_audio_ports.at(i);
564         target_event = (quadlet_t *)(data + i);
565         assert(nevents + offset <= p.buffer_size );
566
567         if(p.buffer && p.enabled) {
568             float *buffer = (float *)(p.buffer);
569             buffer += offset;
570    
571             for (j = 0;j < nevents; j += 4)
572             {
573                 // read the values
574                 tmp_values[0] = *buffer;
575                 buffer++;
576                 tmp_values[1] = *buffer;
577                 buffer++;
578                 tmp_values[2] = *buffer;
579                 buffer++;
580                 tmp_values[3] = *buffer;
581                 buffer++;
582    
583                 // now do the SSE based conversion/labeling
584                 __m128 v_float = *((__m128*)tmp_values);
585                 __m128i v_int;
586    
587                 // multiply
588                 v_float = _mm_mul_ps(v_float, mult);
589                 // convert to signed integer
590                 v_int = _mm_cvttps_epi32( v_float );
591                 // shift right 8 bits
592                 v_int = _mm_srli_epi32( v_int, 8 );
593                 // label it
594                 v_int = _mm_or_si128( v_int, label );
595    
596                 // do endian conversion (SSE is always little endian)
597                 // do first swap
598                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
599                 // do second swap
600                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
601
602                 // store the packed int
603                 _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int);
604
605                 // increment the buffer pointers
606                 *target_event = tmp_values_int[0];
607                 target_event += m_dimension;
608                 *target_event = tmp_values_int[1];
609                 target_event += m_dimension;
610                 *target_event = tmp_values_int[2];
611                 target_event += m_dimension;
612                 *target_event = tmp_values_int[3];
613                 target_event += m_dimension;
614             }
615
616             // do the remainder of the events
617             for(;j < nevents; j += 1) {
618                 float *in = (float *)buffer;
619                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
620                 unsigned int tmp = ((int) v);
621                 tmp = ( tmp >> 8 ) | 0x40000000;
622                 *target_event = htonl((quadlet_t)tmp);
623                 buffer++;
624                 target_event += m_dimension;
625             }
626
627         } else {
628             for (j = 0;j < nevents; j += 1)
629             {
630                 // hardcoded byte swapped
631                 *target_event = 0x00000040;
632                 target_event += m_dimension;
633             }
634         }
635     }
636 }
637
638
639 /**
640  * @brief mux all audio ports to events
641  * @param data
642  * @param offset
643  * @param nevents
644  */
645 void
646 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
647                                                     unsigned int offset,
648                                                     unsigned int nevents)
649 {
650     unsigned int j;
651     quadlet_t *target_event;
652     unsigned int i;
653
654     uint32_t *client_buffers[4];
655     uint32_t tmp_values[4] __attribute__ ((aligned (16)));
656
657     // prepare the scratch buffer
658     assert(m_scratch_buffer_size_bytes > nevents * 4);
659     memset(m_scratch_buffer, 0, nevents * 4);
660
661     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
662     const __m128i mask  = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
663
664     // this assumes that audio ports are sorted by position,
665     // and that there are no gaps
666     for (i = 0; i < m_nb_audio_ports-4; i += 4) {
667         struct _MBLA_port_cache *p;
668
669         // get the port buffers
670         for (j=0; j<4; j++) {
671             p = &(m_audio_ports.at(i+j));
672             if(p->buffer && p->enabled) {
673                 client_buffers[j] = (uint32_t *) p->buffer;
674                 client_buffers[j] += offset;
675             } else {
676                 // if a port is disabled or has no valid
677                 // buffer, use the scratch buffer (all zero's)
678                 client_buffers[j] = (uint32_t *) m_scratch_buffer;
679             }
680         }
681
682         // the base event for this position
683         target_event = (quadlet_t *)(data + i);
684
685         // process the events
686         for (j=0;j < nevents; j += 1)
687         {
688             // read the values
689             tmp_values[0] = *(client_buffers[0]);
690             tmp_values[1] = *(client_buffers[1]);
691             tmp_values[2] = *(client_buffers[2]);
692             tmp_values[3] = *(client_buffers[3]);
693
694             // now do the SSE based conversion/labeling
695             __m128i *target = (__m128i*)target_event;
696             __m128i v_int = *((__m128i*)tmp_values);;
697
698             // mask
699             v_int = _mm_and_si128( v_int, mask );
700             // label it
701             v_int = _mm_or_si128( v_int, label );
702
703             // do endian conversion (SSE is always little endian)
704             // do first swap
705             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
706             // do second swap
707             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
708
709             // store the packed int
710             // (target misalignment is assumed since we don't know the m_dimension)
711             _mm_storeu_si128 (target, v_int);
712
713             // increment the buffer pointers
714             client_buffers[0]++;
715             client_buffers[1]++;
716             client_buffers[2]++;
717             client_buffers[3]++;
718
719             // go to next target event position
720             target_event += m_dimension;
721         }
722     }
723
724     // do remaining ports
725     // NOTE: these can be time-SSE'd
726     for (; i < m_nb_audio_ports; i++) {
727         struct _MBLA_port_cache &p = m_audio_ports.at(i);
728         target_event = (quadlet_t *)(data + i);
729         assert(nevents + offset <= p.buffer_size );
730
731         if(p.buffer && p.enabled) {
732             uint32_t *buffer = (uint32_t *)(p.buffer);
733             buffer += offset;
734    
735             for (j = 0;j < nevents; j += 4)
736             {
737                 // read the values
738                 tmp_values[0] = *buffer;
739                 buffer++;
740                 tmp_values[1] = *buffer;
741                 buffer++;
742                 tmp_values[2] = *buffer;
743                 buffer++;
744                 tmp_values[3] = *buffer;
745                 buffer++;
746
747                 // now do the SSE based conversion/labeling
748                 __m128i v_int = *((__m128i*)tmp_values);;
749
750                 // mask
751                 v_int = _mm_and_si128( v_int, mask );
752                 // label it
753                 v_int = _mm_or_si128( v_int, label );
754
755                 // do endian conversion (SSE is always little endian)
756                 // do first swap
757                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
758                 // do second swap
759                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
760
761                 // store the packed int
762                 _mm_store_si128 ((__m128i *)(&tmp_values), v_int);
763
764                 // increment the buffer pointers
765                 *target_event = tmp_values[0];
766                 target_event += m_dimension;
767                 *target_event = tmp_values[1];
768                 target_event += m_dimension;
769                 *target_event = tmp_values[2];
770                 target_event += m_dimension;
771                 *target_event = tmp_values[3];
772                 target_event += m_dimension;
773             }
774
775             // do the remainder of the events
776             for(;j < nevents; j += 1) {
777                 uint32_t in = (uint32_t)(*buffer);
778                 *target_event = htonl((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
779                 buffer++;
780                 target_event += m_dimension;
781             }
782
783         } else {
784             for (j = 0;j < nevents; j += 1)
785             {
786                 // hardcoded byte swapped
787                 *target_event = 0x00000040;
788                 target_event += m_dimension;
789             }
790         }
791     }
792 }
793
794 #else
795
796 /**
797  * @brief mux all audio ports to events
798  * @param data
799  * @param offset
800  * @param nevents
801  */
802 void
803 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
804                                                     unsigned int offset,
805                                                     unsigned int nevents)
806 {
807     unsigned int j;
808     quadlet_t *target_event;
809     unsigned int i;
810
811     for (i = 0; i < m_nb_audio_ports; i++) {
812         struct _MBLA_port_cache &p = m_audio_ports.at(i);
813         target_event = (quadlet_t *)(data + i);
814         assert(nevents + offset <= p.buffer_size );
815
816         if(p.buffer && p.enabled) {
817             quadlet_t *buffer = (quadlet_t *)(p.buffer);
818             buffer += offset;
819    
820             for (j = 0;j < nevents; j += 1)
821             {
822                 uint32_t in = (uint32_t)(*buffer);
823                 *target_event = htonl((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
824                 buffer++;
825                 target_event += m_dimension;
826             }
827         } else {
828             for (j = 0;j < nevents; j += 1)
829             {
830                 *target_event = 0x00000040;
831                 target_event += m_dimension;
832             }
833         }
834     }
835 }
836
837 /**
838  * @brief mux all audio ports to events
839  * @param data
840  * @param offset
841  * @param nevents
842  */
843 void
844 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
845                                                     unsigned int offset,
846                                                     unsigned int nevents)
847 {
848     unsigned int j;
849     quadlet_t *target_event;
850     unsigned int i;
851
852     for (i = 0; i < m_nb_audio_ports; i++) {
853         struct _MBLA_port_cache &p = m_audio_ports.at(i);
854         target_event = (quadlet_t *)(data + i);
855         assert(nevents + offset <= p.buffer_size );
856
857         if(p.buffer && p.enabled) {
858             quadlet_t *buffer = (quadlet_t *)(p.buffer);
859             buffer += offset;
860    
861             for (j = 0;j < nevents; j += 1)
862             {
863                 float *in = (float *)buffer;
864                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
865                 unsigned int tmp = ((int) v);
866                 tmp = ( tmp >> 8 ) | 0x40000000;
867                 *target_event = htonl((quadlet_t)tmp);
868                 buffer++;
869                 target_event += m_dimension;
870             }
871         } else {
872             for (j = 0;j < nevents; j += 1)
873             {
874                 // hardcoded little endian
875                 *target_event = 0x00000040;
876                 target_event += m_dimension;
877             }
878         }
879     }
880 }
881 #endif
882
883 /**
884  * @brief encodes all midi ports in the cache to events (silence)
885  * @param data
886  * @param offset
887  * @param nevents
888  */
889 void
890 AmdtpTransmitStreamProcessor::encodeMidiPortsSilence(quadlet_t *data,
891                                                      unsigned int offset,
892                                                      unsigned int nevents)
893 {
894     quadlet_t *target_event;
895     unsigned int i,j;
896
897     for (i = 0; i < m_nb_midi_ports; i++) {
898         struct _MIDI_port_cache &p = m_midi_ports.at(i);
899
900         for (j = p.location;j < nevents; j += 8) {
901             target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
902             *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
903         }
904     }
905 }
906
907 /**
908  * @brief encodes all midi ports in the cache to events
909  * @param data
910  * @param offset
911  * @param nevents
912  */
913 void
914 AmdtpTransmitStreamProcessor::encodeMidiPorts(quadlet_t *data,
915                                               unsigned int offset,
916                                               unsigned int nevents)
917 {
918     quadlet_t *target_event;
919     unsigned int i,j;
920
921     for (i = 0; i < m_nb_midi_ports; i++) {
922         struct _MIDI_port_cache &p = m_midi_ports.at(i);
923         if (p.buffer && p.enabled) {
924             uint32_t *buffer = (quadlet_t *)(p.buffer);
925             buffer += offset;
926
927             for (j = p.location;j < nevents; j += 8) {
928                 target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
929
930                 if ( *buffer & 0xFF000000 )   // we can send a byte
931                 {
932                     quadlet_t tmpval;
933                     tmpval = ((*buffer)<<16) & 0x00FF0000;
934                     tmpval = IEC61883_AM824_SET_LABEL(tmpval, IEC61883_AM824_LABEL_MIDI_1X);
935                     *target_event = htonl(tmpval);
936
937 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "MIDI port %s, pos=%u, loc=%u, nevents=%u, dim=%d\n",
938 //                                p.port->getName().c_str(), p.position, p.location, nevents, m_dimension );
939 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "base=%p, target=%p, value=%08X\n",
940 //                                data, target_event, tmpval );
941                 } else {
942                     // can't send a byte, either because there is no byte,
943                     // or because this would exceed the maximum rate
944                     // FIXME: this can be ifdef optimized since it's a constant
945                     *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
946                 }
947                 buffer+=8;
948             }
949         } else {
950             for (j = p.location;j < nevents; j += 8) {
951                 target_event = (quadlet_t *)(data + ((j * m_dimension) + p.position));
952                 __builtin_prefetch(target_event, 1, 0); // prefetch events for write, no temporal locality
953                 *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
954             }
955         }
956     }
957 }
958
959 bool
960 AmdtpTransmitStreamProcessor::initPortCache() {
961     // make use of the fact that audio ports are the first ports in
962     // the cluster as per AMDTP. so we can sort the ports by position
963     // and have very efficient lookups:
964     // m_float_ports.at(i).buffer -> audio stream i buffer
965     // for midi ports we simply cache all port info since they are (usually) not
966     // that numerous
967     m_nb_audio_ports = 0;
968     m_audio_ports.clear();
969    
970     m_nb_midi_ports = 0;
971     m_midi_ports.clear();
972    
973     for(PortVectorIterator it = m_Ports.begin();
974         it != m_Ports.end();
975         ++it )
976     {
977         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
978         assert(pinfo); // this should not fail!!
979
980         switch( pinfo->getFormat() )
981         {
982             case AmdtpPortInfo::E_MBLA:
983                 m_nb_audio_ports++;
984                 break;
985             case AmdtpPortInfo::E_SPDIF: // still unimplemented
986                 break;
987             case AmdtpPortInfo::E_Midi:
988                 m_nb_midi_ports++;
989                 break;
990             default: // ignore
991                 break;
992         }
993     }
994
995     unsigned int idx;
996     for (idx = 0; idx < m_nb_audio_ports; idx++) {
997         for(PortVectorIterator it = m_Ports.begin();
998             it != m_Ports.end();
999             ++it )
1000         {
1001             AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1002             debugOutput(DEBUG_LEVEL_VERY_VERBOSE, "idx %u: looking at port %s at position %u\n",
1003                                               idx, (*it)->getName().c_str(), pinfo->getPosition());
1004             if(pinfo->getPosition() == idx) {
1005                 struct _MBLA_port_cache p;
1006                 p.port = dynamic_cast<AmdtpAudioPort *>(*it);
1007                 if(p.port == NULL) {
1008                     debugError("Port is not an AmdtpAudioPort!\n");
1009                     return false;
1010                 }
1011                 p.buffer = NULL; // to be filled by updatePortCache
1012                 #ifdef DEBUG
1013                 p.buffer_size = (*it)->getBufferSize();
1014                 #endif
1015
1016                 m_audio_ports.push_back(p);
1017                 debugOutput(DEBUG_LEVEL_VERBOSE, "Cached port %s at position %u\n",
1018                                                  p.port->getName().c_str(), idx);
1019                 goto next_index;
1020             }
1021         }
1022         debugError("No MBLA port found for position %d\n", idx);
1023         return false;
1024 next_index:
1025         continue;
1026     }
1027
1028     for(PortVectorIterator it = m_Ports.begin();
1029         it != m_Ports.end();
1030         ++it )
1031     {
1032         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1033         debugOutput(DEBUG_LEVEL_VERY_VERBOSE, "idx %u: looking at port %s at position %u, location %u\n",
1034                                         idx, (*it)->getName().c_str(), pinfo->getPosition(), pinfo->getLocation());
1035         if ((*it)->getPortType() == Port::E_Midi) {
1036             struct _MIDI_port_cache p;
1037             p.port = dynamic_cast<AmdtpMidiPort *>(*it);
1038             if(p.port == NULL) {
1039                 debugError("Port is not an AmdtpMidiPort!\n");
1040                 return false;
1041             }
1042             p.position = pinfo->getPosition();
1043             p.location = pinfo->getLocation();
1044             p.buffer = NULL; // to be filled by updatePortCache
1045             #ifdef DEBUG
1046             p.buffer_size = (*it)->getBufferSize();
1047             #endif
1048
1049             m_midi_ports.push_back(p);
1050             debugOutput(DEBUG_LEVEL_VERBOSE, "Cached port %s at position %u, location %u\n",
1051                                             p.port->getName().c_str(), p.position, p.location);
1052         }
1053     }
1054
1055     return true;
1056 }
1057
1058 void
1059 AmdtpTransmitStreamProcessor::updatePortCache() {
1060     unsigned int idx;
1061     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1062         struct _MBLA_port_cache& p = m_audio_ports.at(idx);
1063         AmdtpAudioPort *port = p.port;
1064         p.buffer = port->getBufferAddress();
1065         p.enabled = !port->isDisabled();
1066     }
1067     for (idx = 0; idx < m_nb_midi_ports; idx++) {
1068         struct _MIDI_port_cache& p = m_midi_ports.at(idx);
1069         AmdtpMidiPort *port = p.port;
1070         p.buffer = port->getBufferAddress();
1071         p.enabled = !port->isDisabled();
1072     }
1073 }
1074
1075 } // end of namespace Streaming
Note: See TracBrowser for help on using the browser.