root/trunk/libffado/src/libstreaming/amdtp/AmdtpTransmitStreamProcessor.cpp

Revision 887, 40.4 kB (checked in by ppalmers, 16 years ago)

implement clipping for float data

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 2 of the License, or
12  * (at your option) version 3 of the License.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #include "config.h"
25 #include "AmdtpTransmitStreamProcessor.h"
26 #include "AmdtpPort.h"
27 #include "../StreamProcessorManager.h"
28 #include "devicemanager.h"
29
30 #include "libutil/Time.h"
31
32 #include "libieee1394/ieee1394service.h"
33 #include "libieee1394/IsoHandlerManager.h"
34 #include "libieee1394/cycletimer.h"
35
36 #include <netinet/in.h>
37 #include <assert.h>
38
// Scale factor applied to float samples before they are truncated to
// integers: 2147483392 == 0x7FFFFF00 == (2^23 - 1) << 8. The float encoder
// in this file shifts the converted integer right by 8 bits afterwards,
// which leaves a 24 bit sample below the label byte.
39 #define AMDTP_FLOAT_MULTIPLIER 2147483392.0
40
41 namespace Streaming
42 {
43
44 /* transmit */
45 AmdtpTransmitStreamProcessor::AmdtpTransmitStreamProcessor(FFADODevice &parent, int dimension)
46         : StreamProcessor(parent, ePT_Transmit)
47         , m_dimension( dimension )
48         , m_dbc( 0 )
49         , m_nb_audio_ports( 0 )
50         , m_nb_midi_ports( 0 )
51 {}
52
53 enum StreamProcessor::eChildReturnValue
54 AmdtpTransmitStreamProcessor::generatePacketHeader (
55     unsigned char *data, unsigned int *length,
56     unsigned char *tag, unsigned char *sy,
57     int cycle, unsigned int dropped, unsigned int max_length )
58 {
59     __builtin_prefetch(data, 1, 0); // prefetch events for write, no temporal locality
60     struct iec61883_packet *packet = (struct iec61883_packet *)data;
61     /* Our node ID can change after a bus reset, so it is best to fetch
62     * our node ID for each packet. */
63     packet->sid = m_local_node_id;
64
65     packet->dbs = m_dimension;
66     packet->fn = 0;
67     packet->qpc = 0;
68     packet->sph = 0;
69     packet->reserved = 0;
70     packet->dbc = m_dbc;
71     packet->eoh1 = 2;
72     packet->fmt = IEC61883_FMT_AMDTP;
73
74     *tag = IEC61883_TAG_WITH_CIP;
75     *sy = 0;
76
77     signed int fc;
78     uint64_t presentation_time;
79     unsigned int presentation_cycle;
80     int cycles_until_presentation;
81
82     uint64_t transmit_at_time;
83     unsigned int transmit_at_cycle;
84     int cycles_until_transmit;
85
86     debugOutputExtreme( DEBUG_LEVEL_ULTRA_VERBOSE,
87                         "Try for cycle %d\n", cycle );
88     // check whether the packet buffer has packets for us to send.
89     // the base timestamp is the one of the next sample in the buffer
90     ffado_timestamp_t ts_head_tmp;
91     m_data_buffer->getBufferHeadTimestamp ( &ts_head_tmp, &fc ); // thread safe
92
93     // the timestamp gives us the time at which we want the sample block
94     // to be output by the device
95     presentation_time = ( uint64_t ) ts_head_tmp;
96     m_last_timestamp = presentation_time;
97
98     // now we calculate the time when we have to transmit the sample block
99     transmit_at_time = substractTicks ( presentation_time, AMDTP_TRANSMIT_TRANSFER_DELAY );
100
101     // calculate the cycle this block should be presented in
102     // (this is just a virtual calculation since at that time it should
103     //  already be in the device's buffer)
104     presentation_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( presentation_time ) );
105
106     // calculate the cycle this block should be transmitted in
107     transmit_at_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( transmit_at_time ) );
108
109     // we can check whether this cycle is within the 'window' we have
110     // to send this packet.
111     // first calculate the number of cycles left before presentation time
112     cycles_until_presentation = diffCycles ( presentation_cycle, cycle );
113
114     // we can check whether this cycle is within the 'window' we have
115     // to send this packet.
116     // first calculate the number of cycles left before presentation time
117     cycles_until_transmit = diffCycles ( transmit_at_cycle, cycle );
118
119     if (dropped) {
120         debugOutput( DEBUG_LEVEL_VERBOSE,
121                      "Gen HDR: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
122                      cycle,
123                      transmit_at_cycle, cycles_until_transmit,
124                      transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
125                      presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time) );
126     }
127     // two different options:
128     // 1) there are not enough frames for one packet
129     //      => determine wether this is a problem, since we might still
130     //         have some time to send it
131     // 2) there are enough packets
132     //      => determine whether we have to send them in this packet
133     if ( fc < ( signed int ) m_syt_interval )
134     {
135         // not enough frames in the buffer,
136
137         // we can still postpone the queueing of the packets
138         // if we are far enough ahead of the presentation time
139         if ( cycles_until_presentation <= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION )
140         {
141             debugOutput( DEBUG_LEVEL_NORMAL,
142                          "Insufficient frames (P): N=%02d, CY=%04u, TC=%04u, CUT=%04d\n",
143                          fc, cycle, transmit_at_cycle, cycles_until_transmit );
144             // we are too late
145             return eCRV_XRun;
146         }
147         else
148         {
149             unsigned int now_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( m_1394service.getCycleTimerTicks() ) );
150
151             debugOutputExtreme(DEBUG_LEVEL_VERBOSE,
152                                "Insufficient frames (NP): N=%02d, CY=%04u, TC=%04u, CUT=%04d, NOW=%04d\n",
153                                fc, cycle, transmit_at_cycle, cycles_until_transmit, now_cycle );
154
155             // there is still time left to send the packet
156             // we want the system to give this packet another go at a later time instant
157             return eCRV_Again; // note that the raw1394 again system doesn't work as expected
158
159             // we could wait here for a certain time before trying again. However, this
160             // is not going to work since we then block the iterator thread, hence also
161             // the receiving code, meaning that we are not processing received packets,
162             // and hence there is no progression in the number of frames available.
163
164             // for example:
165             // SleepRelativeUsec(125); // one cycle
166             // goto try_block_of_frames;
167
168             // or more advanced, calculate how many cycles we are ahead of 'now' and
169             // base the sleep on that.
170
171             // note that this requires that there is one thread for each IsoHandler,
172             // otherwise we're in the deadlock described above.
173         }
174     }
175     else
176     {
177         // there are enough frames, so check the time they are intended for
178         // all frames have a certain 'time window' in which they can be sent
179         // this corresponds to the range of the timestamp mechanism:
180         // we can send a packet 15 cycles in advance of the 'presentation time'
181         // in theory we can send the packet up till one cycle before the presentation time,
182         // however this is not very smart.
183
184         // There are 3 options:
185         // 1) the frame block is too early
186         //      => send an empty packet
187         // 2) the frame block is within the window
188         //      => send it
189         // 3) the frame block is too late
190         //      => discard (and raise xrun?)
191         //         get next block of frames and repeat
192
193         if(cycles_until_transmit < 0)
194         {
195             // we are too late
196             debugOutput(DEBUG_LEVEL_NORMAL,
197                         "Too late: CY=%04u, TC=%04u, CUT=%04d, TSP=%011llu (%04u)\n",
198                         cycle,
199                         transmit_at_cycle, cycles_until_transmit,
200                         presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time) );
201             //debugShowBackLogLines(200);
202 //             // however, if we can send this sufficiently before the presentation
203 //             // time, it could be harmless.
204 //             // NOTE: dangerous since the device has no way of reporting that it didn't get
205 //             //       this packet on time.
206 //             if(cycles_until_presentation >= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION)
207 //             {
208 //                 // we are not that late and can still try to transmit the packet
209 //                 m_dbc += fillDataPacketHeader(packet, length, m_last_timestamp);
210 //                 return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
211 //             }
212 //             else   // definitely too late
213 //             {
214                 return eCRV_XRun;
215 //             }
216         }
217         else if(cycles_until_transmit <= AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY)
218         {
219             // it's time send the packet
220             m_dbc += fillDataPacketHeader(packet, length, m_last_timestamp);
221             return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
222         }
223         else
224         {
225             debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
226                                "Too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
227                                cycle,
228                                transmit_at_cycle, cycles_until_transmit,
229                                transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
230                                presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
231 #ifdef DEBUG
232             if ( cycles_until_transmit > AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY + 1 )
233             {
234                 debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
235                                    "Way too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
236                                    cycle,
237                                    transmit_at_cycle, cycles_until_transmit,
238                                    transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
239                                    presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
240             }
241 #endif
242             // we are too early, send only an empty packet
243             return eCRV_EmptyPacket;
244         }
245     }
246     return eCRV_Invalid;
247 }
248
249 enum StreamProcessor::eChildReturnValue
250 AmdtpTransmitStreamProcessor::generatePacketData (
251     unsigned char *data, unsigned int *length,
252     unsigned char *tag, unsigned char *sy,
253     int cycle, unsigned int dropped, unsigned int max_length )
254 {
255     if ( m_data_buffer->readFrames ( m_syt_interval, ( char * ) ( data + 8 ) ) )
256     {
257         debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
258                            "XMIT DATA (cy %04d): TSP=%011llu (%04u)\n",
259                            cycle, m_last_timestamp, (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
260         return eCRV_OK;
261     }
262     else return eCRV_XRun;
263
264 }
265
266 enum StreamProcessor::eChildReturnValue
267 AmdtpTransmitStreamProcessor::generateSilentPacketHeader (
268     unsigned char *data, unsigned int *length,
269     unsigned char *tag, unsigned char *sy,
270     int cycle, unsigned int dropped, unsigned int max_length )
271 {
272     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
273     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
274                        "XMIT SILENT (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
275                        cycle, m_last_timestamp, (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
276
277     packet->sid = m_local_node_id;
278
279     packet->dbs = m_dimension;
280     packet->fn = 0;
281     packet->qpc = 0;
282     packet->sph = 0;
283     packet->reserved = 0;
284     packet->dbc = m_dbc;
285     packet->eoh1 = 2;
286     packet->fmt = IEC61883_FMT_AMDTP;
287
288     *tag = IEC61883_TAG_WITH_CIP;
289     *sy = 0;
290
291     m_dbc += fillNoDataPacketHeader ( packet, length );
292     return eCRV_Packet;
293 }
294
295 enum StreamProcessor::eChildReturnValue
296 AmdtpTransmitStreamProcessor::generateSilentPacketData (
297     unsigned char *data, unsigned int *length,
298     unsigned char *tag, unsigned char *sy,
299     int cycle, unsigned int dropped, unsigned int max_length )
300 {
301     return eCRV_OK; // no need to do anything
302 }
303
304 enum StreamProcessor::eChildReturnValue
305 AmdtpTransmitStreamProcessor::generateEmptyPacketHeader (
306     unsigned char *data, unsigned int *length,
307     unsigned char *tag, unsigned char *sy,
308     int cycle, unsigned int dropped, unsigned int max_length )
309 {
310     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
311     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
312                        "XMIT EMPTY (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
313                        cycle, m_last_timestamp, (unsigned int)TICKS_TO_CYCLES(m_last_timestamp) );
314     packet->sid = m_local_node_id;
315
316     packet->dbs = m_dimension;
317     packet->fn = 0;
318     packet->qpc = 0;
319     packet->sph = 0;
320     packet->reserved = 0;
321     packet->dbc = m_dbc;
322     packet->eoh1 = 2;
323     packet->fmt = IEC61883_FMT_AMDTP;
324
325     *tag = IEC61883_TAG_WITH_CIP;
326     *sy = 0;
327
328     m_dbc += fillNoDataPacketHeader ( packet, length );
329     return eCRV_OK;
330 }
331
332 enum StreamProcessor::eChildReturnValue
333 AmdtpTransmitStreamProcessor::generateEmptyPacketData (
334     unsigned char *data, unsigned int *length,
335     unsigned char *tag, unsigned char *sy,
336     int cycle, unsigned int dropped, unsigned int max_length )
337 {
338     return eCRV_OK; // no need to do anything
339 }
340
341 unsigned int AmdtpTransmitStreamProcessor::fillDataPacketHeader (
342     struct iec61883_packet *packet, unsigned int* length,
343     uint32_t ts )
344 {
345
346     packet->fdf = m_fdf;
347
348     // convert the timestamp to SYT format
349     uint16_t timestamp_SYT = TICKS_TO_SYT ( ts );
350     packet->syt = ntohs ( timestamp_SYT );
351
352     *length = m_syt_interval*sizeof ( quadlet_t ) *m_dimension + 8;
353
354     return m_syt_interval;
355 }
356
357 unsigned int AmdtpTransmitStreamProcessor::fillNoDataPacketHeader (
358     struct iec61883_packet *packet, unsigned int* length )
359 {
360
361     // no-data packets have syt=0xFFFF
362     // and have the usual amount of events as dummy data (?)
363     packet->fdf = IEC61883_FDF_NODATA;
364     packet->syt = 0xffff;
365
366     // FIXME: either make this a setting or choose
367     bool send_payload=true;
368     if ( send_payload )
369     {
370         // this means no-data packets with payload (DICE doesn't like that)
371         *length = 2*sizeof ( quadlet_t ) + m_syt_interval * m_dimension * sizeof ( quadlet_t );
372         return m_syt_interval;
373     }
374     else
375     {
376         // dbc is not incremented
377         // this means no-data packets without payload
378         *length = 2*sizeof ( quadlet_t );
379         return 0;
380     }
381 }
382
383 unsigned int
384 AmdtpTransmitStreamProcessor::getSytInterval() {
385     switch (m_StreamProcessorManager.getNominalRate()) {
386         case 32000:
387         case 44100:
388         case 48000:
389             return 8;
390         case 88200:
391         case 96000:
392             return 16;
393         case 176400:
394         case 192000:
395             return 32;
396         default:
397             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
398             return 0;
399     }
400 }
401 unsigned int
402 AmdtpTransmitStreamProcessor::getFDF() {
403     switch (m_StreamProcessorManager.getNominalRate()) {
404         case 32000: return IEC61883_FDF_SFC_32KHZ;
405         case 44100: return IEC61883_FDF_SFC_44K1HZ;
406         case 48000: return IEC61883_FDF_SFC_48KHZ;
407         case 88200: return IEC61883_FDF_SFC_88K2HZ;
408         case 96000: return IEC61883_FDF_SFC_96KHZ;
409         case 176400: return IEC61883_FDF_SFC_176K4HZ;
410         case 192000: return IEC61883_FDF_SFC_192KHZ;
411         default:
412             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
413             return 0;
414     }
415 }
416
417 bool AmdtpTransmitStreamProcessor::prepareChild()
418 {
419     debugOutput ( DEBUG_LEVEL_VERBOSE, "Preparing (%p)...\n", this );
420     m_syt_interval = getSytInterval();
421     m_fdf = getFDF();
422
423     iec61883_cip_init (
424         &m_cip_status,
425         IEC61883_FMT_AMDTP,
426         m_fdf,
427         m_StreamProcessorManager.getNominalRate(),
428         m_dimension,
429         m_syt_interval );
430
431     if (!initPortCache()) {
432         debugError("Could not init port cache\n");
433         return false;
434     }
435
436     return true;
437 }
438
439 /*
440 * compose the event streams for the packets from the port buffers
441 */
442 bool AmdtpTransmitStreamProcessor::processWriteBlock ( char *data,
443         unsigned int nevents, unsigned int offset )
444 {
445     // update the variable parts of the cache
446     updatePortCache();
447
448     // encode audio data
449     switch(m_StreamProcessorManager.getAudioDataType()) {
450         case StreamProcessorManager::eADT_Int24:
451             encodeAudioPortsInt24((quadlet_t *)data, offset, nevents);
452             break;
453         case StreamProcessorManager::eADT_Float:
454             encodeAudioPortsFloat((quadlet_t *)data, offset, nevents);
455             break;
456     }
457
458     // do midi ports
459     encodeMidiPorts((quadlet_t *)data, offset, nevents);
460     return true;
461 }
462
463 bool
464 AmdtpTransmitStreamProcessor::transmitSilenceBlock(
465     char *data, unsigned int nevents, unsigned int offset)
466 {
467     // no need to update the port cache when transmitting silence since
468     // no dynamic values are used to do so.
469     encodeAudioPortsSilence((quadlet_t *)data, offset, nevents);
470     encodeMidiPortsSilence((quadlet_t *)data, offset, nevents);
471     return true;
472 }
473
474 /**
475  * @brief encodes all audio ports in the cache to events (silent data)
476  * @param data
477  * @param offset
478  * @param nevents
479  */
480 void
481 AmdtpTransmitStreamProcessor::encodeAudioPortsSilence(quadlet_t *data,
482                                                       unsigned int offset,
483                                                       unsigned int nevents)
484 {
485     unsigned int j;
486     quadlet_t *target_event;
487     unsigned int i;
488
489     for (i = 0; i < m_nb_audio_ports; i++) {
490         target_event = (quadlet_t *)(data + i);
491
492         for (j = 0;j < nevents; j += 1)
493         {
494             *target_event = 0x00000040;
495             target_event += m_dimension;
496         }
497     }
498 }
499
500 #ifdef __SSE2__
501 //#if 0
502 #include <emmintrin.h>
503 #warning SSE2 build
504
505 /**
506  * @brief mux all audio ports to events
507  * @param data
508  * @param offset
509  * @param nevents
510  */
511 void
512 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
513                                                     unsigned int offset,
514                                                     unsigned int nevents)
515 {
516     unsigned int j;
517     quadlet_t *target_event;
518     unsigned int i;
519
520     float * client_buffers[4];
521     float tmp_values[4] __attribute__ ((aligned (16)));
522     uint32_t tmp_values_int[4] __attribute__ ((aligned (16)));
523
524     // prepare the scratch buffer
525     assert(m_scratch_buffer_size_bytes > nevents * 4);
526     memset(m_scratch_buffer, 0, nevents * 4);
527
528     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
529     const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER);
530
531 #if AMDTP_CLIP_FLOATS
532     const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
533     const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
534 #endif
535
536     // this assumes that audio ports are sorted by position,
537     // and that there are no gaps
538     for (i = 0; i < m_nb_audio_ports-4; i += 4) {
539         struct _MBLA_port_cache *p;
540
541         // get the port buffers
542         for (j=0; j<4; j++) {
543             p = &(m_audio_ports.at(i+j));
544             if(p->buffer && p->enabled) {
545                 client_buffers[j] = (float *) p->buffer;
546                 client_buffers[j] += offset;
547             } else {
548                 // if a port is disabled or has no valid
549                 // buffer, use the scratch buffer (all zero's)
550                 client_buffers[j] = (float *) m_scratch_buffer;
551             }
552         }
553
554         // the base event for this position
555         target_event = (quadlet_t *)(data + i);
556
557         // process the events
558         for (j=0;j < nevents; j += 1)
559         {
560             // read the values
561             tmp_values[0] = *(client_buffers[0]);
562             tmp_values[1] = *(client_buffers[1]);
563             tmp_values[2] = *(client_buffers[2]);
564             tmp_values[3] = *(client_buffers[3]);
565
566             // now do the SSE based conversion/labeling
567             __m128 v_float = *((__m128*)tmp_values);
568             __m128i *target = (__m128i*)target_event;
569             __m128i v_int;
570
571             // clip
572 #if AMDTP_CLIP_FLOATS
573             // implement sample<min?min:sample
574             // and sample>max?max:sample
575             // we use separate masks since that allows the
576             // compiler/cpu to do more out-of-order optimization
577
578             // is any of the pieces less than the minimum?
579             // or larger than the maximum?
580             __m128 mask1 = _mm_cmplt_ps(v_float, v_min);
581             __m128 mask2 = _mm_cmpgt_ps(v_float, v_max);
582             // clip the values that need to be clipped
583             // pass the values that don't
584             v_float = _mm_or_ps(_mm_andnot_ps(mask1, v_float), _mm_and_ps(mask1, v_min));
585             v_float = _mm_or_ps(_mm_andnot_ps(mask2, v_float), _mm_and_ps(mask2, v_max));
586 #endif
587
588             // multiply
589             v_float = _mm_mul_ps(v_float, mult);
590             // convert to signed integer
591             v_int = _mm_cvttps_epi32( v_float );
592             // shift right 8 bits
593             v_int = _mm_srli_epi32( v_int, 8 );
594             // label it
595             v_int = _mm_or_si128( v_int, label );
596
597             // do endian conversion (SSE is always little endian)
598             // do first swap
599             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
600             // do second swap
601             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
602
603             // store the packed int
604             // (target misalignment is assumed since we don't know the m_dimension)
605             _mm_storeu_si128 (target, v_int);
606
607             // increment the buffer pointers
608             client_buffers[0]++;
609             client_buffers[1]++;
610             client_buffers[2]++;
611             client_buffers[3]++;
612
613             // go to next target event position
614             target_event += m_dimension;
615         }
616     }
617
618     // do remaining ports
619     // NOTE: these can be time-SSE'd
620     for (; i < m_nb_audio_ports; i++) {
621         struct _MBLA_port_cache &p = m_audio_ports.at(i);
622         target_event = (quadlet_t *)(data + i);
623         assert(nevents + offset <= p.buffer_size );
624
625         if(p.buffer && p.enabled) {
626             float *buffer = (float *)(p.buffer);
627             buffer += offset;
628    
629             for (j = 0;j < nevents; j += 4)
630             {
631                 // read the values
632                 tmp_values[0] = *buffer;
633                 buffer++;
634                 tmp_values[1] = *buffer;
635                 buffer++;
636                 tmp_values[2] = *buffer;
637                 buffer++;
638                 tmp_values[3] = *buffer;
639                 buffer++;
640
641                 // now do the SSE based conversion/labeling
642                 __m128 v_float = *((__m128*)tmp_values);
643                 __m128i v_int;
644
645 #if AMDTP_CLIP_FLOATS
646                 // implement sample<min?min:sample
647                 // and sample>max?max:sample
648                 // we use separate masks since that allows the
649                 // compiler/cpu to do more out-of-order optimization
650
651                 // is any of the pieces less than the minimum?
652                 // or larger than the maximum?
653                 __m128 mask1 = _mm_cmplt_ps(v_float, v_min);
654                 __m128 mask2 = _mm_cmpgt_ps(v_float, v_max);
655                 // clip the values that need to be clipped
656                 // pass the values that don't
657                 v_float = _mm_or_ps(_mm_andnot_ps(mask1, v_float), _mm_and_ps(mask1, v_min));
658                 v_float = _mm_or_ps(_mm_andnot_ps(mask2, v_float), _mm_and_ps(mask2, v_max));
659 #endif
660
661                 // multiply
662                 v_float = _mm_mul_ps(v_float, mult);
663                 // convert to signed integer
664                 v_int = _mm_cvttps_epi32( v_float );
665                 // shift right 8 bits
666                 v_int = _mm_srli_epi32( v_int, 8 );
667                 // label it
668                 v_int = _mm_or_si128( v_int, label );
669    
670                 // do endian conversion (SSE is always little endian)
671                 // do first swap
672                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
673                 // do second swap
674                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
675
676                 // store the packed int
677                 _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int);
678
679                 // increment the buffer pointers
680                 *target_event = tmp_values_int[0];
681                 target_event += m_dimension;
682                 *target_event = tmp_values_int[1];
683                 target_event += m_dimension;
684                 *target_event = tmp_values_int[2];
685                 target_event += m_dimension;
686                 *target_event = tmp_values_int[3];
687                 target_event += m_dimension;
688             }
689
690             // do the remainder of the events
691             for(;j < nevents; j += 1) {
692                 float *in = (float *)buffer;
693 #if AMDTP_CLIP_FLOATS
694                 if(*in > 1.0) *in=1.0;
695                 if(*in < -1.0) *in=-1.0;
696 #endif
697                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
698                 unsigned int tmp = ((int) v);
699                 tmp = ( tmp >> 8 ) | 0x40000000;
700                 *target_event = htonl((quadlet_t)tmp);
701                 buffer++;
702                 target_event += m_dimension;
703             }
704
705         } else {
706             for (j = 0;j < nevents; j += 1)
707             {
708                 // hardcoded byte swapped
709                 *target_event = 0x00000040;
710                 target_event += m_dimension;
711             }
712         }
713     }
714 }
715
716
717 /**
718  * @brief mux all audio ports to events
719  * @param data
720  * @param offset
721  * @param nevents
722  */
723 void
724 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
725                                                     unsigned int offset,
726                                                     unsigned int nevents)
727 {
728     unsigned int j;
729     quadlet_t *target_event;
730     unsigned int i;
731
732     uint32_t *client_buffers[4];
733     uint32_t tmp_values[4] __attribute__ ((aligned (16)));
734
735     // prepare the scratch buffer
736     assert(m_scratch_buffer_size_bytes > nevents * 4);
737     memset(m_scratch_buffer, 0, nevents * 4);
738
739     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
740     const __m128i mask  = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
741
742     // this assumes that audio ports are sorted by position,
743     // and that there are no gaps
744     for (i = 0; i < m_nb_audio_ports-4; i += 4) {
745         struct _MBLA_port_cache *p;
746
747         // get the port buffers
748         for (j=0; j<4; j++) {
749             p = &(m_audio_ports.at(i+j));
750             if(p->buffer && p->enabled) {
751                 client_buffers[j] = (uint32_t *) p->buffer;
752                 client_buffers[j] += offset;
753             } else {
754                 // if a port is disabled or has no valid
755                 // buffer, use the scratch buffer (all zero's)
756                 client_buffers[j] = (uint32_t *) m_scratch_buffer;
757             }
758         }
759
760         // the base event for this position
761         target_event = (quadlet_t *)(data + i);
762
763         // process the events
764         for (j=0;j < nevents; j += 1)
765         {
766             // read the values
767             tmp_values[0] = *(client_buffers[0]);
768             tmp_values[1] = *(client_buffers[1]);
769             tmp_values[2] = *(client_buffers[2]);
770             tmp_values[3] = *(client_buffers[3]);
771
772             // now do the SSE based conversion/labeling
773             __m128i *target = (__m128i*)target_event;
774             __m128i v_int = *((__m128i*)tmp_values);;
775
776             // mask
777             v_int = _mm_and_si128( v_int, mask );
778             // label it
779             v_int = _mm_or_si128( v_int, label );
780
781             // do endian conversion (SSE is always little endian)
782             // do first swap
783             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
784             // do second swap
785             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
786
787             // store the packed int
788             // (target misalignment is assumed since we don't know the m_dimension)
789             _mm_storeu_si128 (target, v_int);
790
791             // increment the buffer pointers
792             client_buffers[0]++;
793             client_buffers[1]++;
794             client_buffers[2]++;
795             client_buffers[3]++;
796
797             // go to next target event position
798             target_event += m_dimension;
799         }
800     }
801
802     // do remaining ports
803     // NOTE: these can be time-SSE'd
804     for (; i < m_nb_audio_ports; i++) {
805         struct _MBLA_port_cache &p = m_audio_ports.at(i);
806         target_event = (quadlet_t *)(data + i);
807         assert(nevents + offset <= p.buffer_size );
808
809         if(p.buffer && p.enabled) {
810             uint32_t *buffer = (uint32_t *)(p.buffer);
811             buffer += offset;
812    
813             for (j = 0;j < nevents; j += 4)
814             {
815                 // read the values
816                 tmp_values[0] = *buffer;
817                 buffer++;
818                 tmp_values[1] = *buffer;
819                 buffer++;
820                 tmp_values[2] = *buffer;
821                 buffer++;
822                 tmp_values[3] = *buffer;
823                 buffer++;
824
825                 // now do the SSE based conversion/labeling
826                 __m128i v_int = *((__m128i*)tmp_values);;
827
828                 // mask
829                 v_int = _mm_and_si128( v_int, mask );
830                 // label it
831                 v_int = _mm_or_si128( v_int, label );
832
833                 // do endian conversion (SSE is always little endian)
834                 // do first swap
835                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
836                 // do second swap
837                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
838
839                 // store the packed int
840                 _mm_store_si128 ((__m128i *)(&tmp_values), v_int);
841
842                 // increment the buffer pointers
843                 *target_event = tmp_values[0];
844                 target_event += m_dimension;
845                 *target_event = tmp_values[1];
846                 target_event += m_dimension;
847                 *target_event = tmp_values[2];
848                 target_event += m_dimension;
849                 *target_event = tmp_values[3];
850                 target_event += m_dimension;
851             }
852
853             // do the remainder of the events
854             for(;j < nevents; j += 1) {
855                 uint32_t in = (uint32_t)(*buffer);
856                 *target_event = htonl((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
857                 buffer++;
858                 target_event += m_dimension;
859             }
860
861         } else {
862             for (j = 0;j < nevents; j += 1)
863             {
864                 // hardcoded byte swapped
865                 *target_event = 0x00000040;
866                 target_event += m_dimension;
867             }
868         }
869     }
870 }
871
872 #else
873
874 /**
875  * @brief mux all audio ports to events
876  * @param data
877  * @param offset
878  * @param nevents
879  */
880 void
881 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
882                                                     unsigned int offset,
883                                                     unsigned int nevents)
884 {
885     unsigned int j;
886     quadlet_t *target_event;
887     unsigned int i;
888
889     for (i = 0; i < m_nb_audio_ports; i++) {
890         struct _MBLA_port_cache &p = m_audio_ports.at(i);
891         target_event = (quadlet_t *)(data + i);
892         assert(nevents + offset <= p.buffer_size );
893
894         if(p.buffer && p.enabled) {
895             quadlet_t *buffer = (quadlet_t *)(p.buffer);
896             buffer += offset;
897    
898             for (j = 0;j < nevents; j += 1)
899             {
900                 uint32_t in = (uint32_t)(*buffer);
901                 *target_event = htonl((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
902                 buffer++;
903                 target_event += m_dimension;
904             }
905         } else {
906             for (j = 0;j < nevents; j += 1)
907             {
908                 *target_event = 0x00000040;
909                 target_event += m_dimension;
910             }
911         }
912     }
913 }
914
915 /**
916  * @brief mux all audio ports to events
917  * @param data
918  * @param offset
919  * @param nevents
920  */
921 void
922 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
923                                                     unsigned int offset,
924                                                     unsigned int nevents)
925 {
926     unsigned int j;
927     quadlet_t *target_event;
928     unsigned int i;
929
930     for (i = 0; i < m_nb_audio_ports; i++) {
931         struct _MBLA_port_cache &p = m_audio_ports.at(i);
932         target_event = (quadlet_t *)(data + i);
933         assert(nevents + offset <= p.buffer_size );
934
935         if(p.buffer && p.enabled) {
936             quadlet_t *buffer = (quadlet_t *)(p.buffer);
937             buffer += offset;
938    
939             for (j = 0;j < nevents; j += 1)
940             {
941                 float *in = (float *)buffer;
942 #if AMDTP_CLIP_FLOATS
943                 if(*in > 1.0) *in=1.0;
944                 if(*in < -1.0) *in=-1.0;
945 #endif
946                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
947                 unsigned int tmp = ((int) v);
948                 tmp = ( tmp >> 8 ) | 0x40000000;
949                 *target_event = htonl((quadlet_t)tmp);
950                 buffer++;
951                 target_event += m_dimension;
952             }
953         } else {
954             for (j = 0;j < nevents; j += 1)
955             {
956                 // hardcoded little endian
957                 *target_event = 0x00000040;
958                 target_event += m_dimension;
959             }
960         }
961     }
962 }
963 #endif
964
965 /**
966  * @brief encodes all midi ports in the cache to events (silence)
967  * @param data
968  * @param offset
969  * @param nevents
970  */
971 void
972 AmdtpTransmitStreamProcessor::encodeMidiPortsSilence(quadlet_t *data,
973                                                      unsigned int offset,
974                                                      unsigned int nevents)
975 {
976     quadlet_t *target_event;
977     unsigned int i,j;
978
979     for (i = 0; i < m_nb_midi_ports; i++) {
980         struct _MIDI_port_cache &p = m_midi_ports.at(i);
981
982         for (j = p.location;j < nevents; j += 8) {
983             target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
984             *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
985         }
986     }
987 }
988
989 /**
990  * @brief encodes all midi ports in the cache to events
991  * @param data
992  * @param offset
993  * @param nevents
994  */
995 void
996 AmdtpTransmitStreamProcessor::encodeMidiPorts(quadlet_t *data,
997                                               unsigned int offset,
998                                               unsigned int nevents)
999 {
1000     quadlet_t *target_event;
1001     unsigned int i,j;
1002
1003     for (i = 0; i < m_nb_midi_ports; i++) {
1004         struct _MIDI_port_cache &p = m_midi_ports.at(i);
1005         if (p.buffer && p.enabled) {
1006             uint32_t *buffer = (quadlet_t *)(p.buffer);
1007             buffer += offset;
1008
1009             for (j = p.location;j < nevents; j += 8) {
1010                 target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
1011
1012                 if ( *buffer & 0xFF000000 )   // we can send a byte
1013                 {
1014                     quadlet_t tmpval;
1015                     tmpval = ((*buffer)<<16) & 0x00FF0000;
1016                     tmpval = IEC61883_AM824_SET_LABEL(tmpval, IEC61883_AM824_LABEL_MIDI_1X);
1017                     *target_event = htonl(tmpval);
1018
1019 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "MIDI port %s, pos=%u, loc=%u, nevents=%u, dim=%d\n",
1020 //                                p.port->getName().c_str(), p.position, p.location, nevents, m_dimension );
1021 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "base=%p, target=%p, value=%08X\n",
1022 //                                data, target_event, tmpval );
1023                 } else {
1024                     // can't send a byte, either because there is no byte,
1025                     // or because this would exceed the maximum rate
1026                     // FIXME: this can be ifdef optimized since it's a constant
1027                     *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1028                 }
1029                 buffer+=8;
1030             }
1031         } else {
1032             for (j = p.location;j < nevents; j += 8) {
1033                 target_event = (quadlet_t *)(data + ((j * m_dimension) + p.position));
1034                 __builtin_prefetch(target_event, 1, 0); // prefetch events for write, no temporal locality
1035                 *target_event = htonl(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1036             }
1037         }
1038     }
1039 }
1040
1041 bool
1042 AmdtpTransmitStreamProcessor::initPortCache() {
1043     // make use of the fact that audio ports are the first ports in
1044     // the cluster as per AMDTP. so we can sort the ports by position
1045     // and have very efficient lookups:
1046     // m_float_ports.at(i).buffer -> audio stream i buffer
1047     // for midi ports we simply cache all port info since they are (usually) not
1048     // that numerous
1049     m_nb_audio_ports = 0;
1050     m_audio_ports.clear();
1051    
1052     m_nb_midi_ports = 0;
1053     m_midi_ports.clear();
1054    
1055     for(PortVectorIterator it = m_Ports.begin();
1056         it != m_Ports.end();
1057         ++it )
1058     {
1059         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1060         assert(pinfo); // this should not fail!!
1061
1062         switch( pinfo->getFormat() )
1063         {
1064             case AmdtpPortInfo::E_MBLA:
1065                 m_nb_audio_ports++;
1066                 break;
1067             case AmdtpPortInfo::E_SPDIF: // still unimplemented
1068                 break;
1069             case AmdtpPortInfo::E_Midi:
1070                 m_nb_midi_ports++;
1071                 break;
1072             default: // ignore
1073                 break;
1074         }
1075     }
1076
1077     unsigned int idx;
1078     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1079         for(PortVectorIterator it = m_Ports.begin();
1080             it != m_Ports.end();
1081             ++it )
1082         {
1083             AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1084             debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1085                         "idx %u: looking at port %s at position %u\n",
1086                         idx, (*it)->getName().c_str(), pinfo->getPosition());
1087             if(pinfo->getPosition() == idx) {
1088                 struct _MBLA_port_cache p;
1089                 p.port = dynamic_cast<AmdtpAudioPort *>(*it);
1090                 if(p.port == NULL) {
1091                     debugError("Port is not an AmdtpAudioPort!\n");
1092                     return false;
1093                 }
1094                 p.buffer = NULL; // to be filled by updatePortCache
1095                 #ifdef DEBUG
1096                 p.buffer_size = (*it)->getBufferSize();
1097                 #endif
1098
1099                 m_audio_ports.push_back(p);
1100                 debugOutput(DEBUG_LEVEL_VERBOSE,
1101                             "Cached port %s at position %u\n",
1102                             p.port->getName().c_str(), idx);
1103                 goto next_index;
1104             }
1105         }
1106         debugError("No MBLA port found for position %d\n", idx);
1107         return false;
1108 next_index:
1109         continue;
1110     }
1111
1112     for(PortVectorIterator it = m_Ports.begin();
1113         it != m_Ports.end();
1114         ++it )
1115     {
1116         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1117         debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1118                     "idx %u: looking at port %s at position %u, location %u\n",
1119                     idx, (*it)->getName().c_str(), pinfo->getPosition(), pinfo->getLocation());
1120         if ((*it)->getPortType() == Port::E_Midi) {
1121             struct _MIDI_port_cache p;
1122             p.port = dynamic_cast<AmdtpMidiPort *>(*it);
1123             if(p.port == NULL) {
1124                 debugError("Port is not an AmdtpMidiPort!\n");
1125                 return false;
1126             }
1127             p.position = pinfo->getPosition();
1128             p.location = pinfo->getLocation();
1129             p.buffer = NULL; // to be filled by updatePortCache
1130             #ifdef DEBUG
1131             p.buffer_size = (*it)->getBufferSize();
1132             #endif
1133
1134             m_midi_ports.push_back(p);
1135             debugOutput(DEBUG_LEVEL_VERBOSE,
1136                         "Cached port %s at position %u, location %u\n",
1137                         p.port->getName().c_str(), p.position, p.location);
1138         }
1139     }
1140
1141     return true;
1142 }
1143
1144 void
1145 AmdtpTransmitStreamProcessor::updatePortCache() {
1146     unsigned int idx;
1147     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1148         struct _MBLA_port_cache& p = m_audio_ports.at(idx);
1149         AmdtpAudioPort *port = p.port;
1150         p.buffer = port->getBufferAddress();
1151         p.enabled = !port->isDisabled();
1152     }
1153     for (idx = 0; idx < m_nb_midi_ports; idx++) {
1154         struct _MIDI_port_cache& p = m_midi_ports.at(idx);
1155         AmdtpMidiPort *port = p.port;
1156         p.buffer = port->getBufferAddress();
1157         p.enabled = !port->isDisabled();
1158     }
1159 }
1160
1161 } // end of namespace Streaming
Note: See TracBrowser for help on using the browser.