root/branches/libffado-2.0/src/libstreaming/amdtp/AmdtpTransmitStreamProcessor.cpp

Revision 1379, 41.2 kB (checked in by ppalmers, 15 years ago)

add branch prediction aids

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 2 of the License, or
12  * (at your option) version 3 of the License.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #include "config.h"
25 #include "AmdtpTransmitStreamProcessor.h"
26 #include "AmdtpPort.h"
27 #include "../StreamProcessorManager.h"
28 #include "devicemanager.h"
29
30 #include "libutil/Time.h"
31 #include "libutil/float_cast.h"
32
33 #include "libieee1394/ieee1394service.h"
34 #include "libieee1394/IsoHandlerManager.h"
35 #include "libieee1394/cycletimer.h"
36
37 #include "libutil/ByteSwap.h"
38 #include <assert.h>
39 #include <cstring>
40
// Branch prediction hints for GCC-compatible compilers.
// The condition is normalised to 0/1 with '!!' so that any truthy value
// (including pointers and integers other than 1) matches the expected value.
#define likely(x)   __builtin_expect(!!(x),1)
#define unlikely(x) __builtin_expect(!!(x),0)

// Scale factor (2^23 - 1) applied to float samples before they are
// truncated to the 24-bit integer sample carried in an event.
#define AMDTP_FLOAT_MULTIPLIER (1.0f * ((1<<23) - 1))
45 namespace Streaming
46 {
47
48 /* transmit */
49 AmdtpTransmitStreamProcessor::AmdtpTransmitStreamProcessor(FFADODevice &parent, int dimension)
50         : StreamProcessor(parent, ePT_Transmit)
51         , m_dimension( dimension )
52         , m_dbc( 0 )
53 #if AMDTP_ALLOW_PAYLOAD_IN_NODATA_XMIT
54         , m_send_nodata_payload ( AMDTP_SEND_PAYLOAD_IN_NODATA_XMIT_BY_DEFAULT )
55 #endif
56         , m_nb_audio_ports( 0 )
57         , m_nb_midi_ports( 0 )
58 {}
59
60 enum StreamProcessor::eChildReturnValue
61 AmdtpTransmitStreamProcessor::generatePacketHeader (
62     unsigned char *data, unsigned int *length,
63     unsigned char *tag, unsigned char *sy,
64     uint32_t pkt_ctr )
65 {
66     __builtin_prefetch(data, 1, 0); // prefetch events for write, no temporal locality
67     struct iec61883_packet *packet = (struct iec61883_packet *)data;
68     /* Our node ID can change after a bus reset, so it is best to fetch
69     * our node ID for each packet. */
70     packet->sid = m_local_node_id;
71
72     packet->dbs = m_dimension;
73     packet->fn = 0;
74     packet->qpc = 0;
75     packet->sph = 0;
76     packet->reserved = 0;
77     packet->dbc = m_dbc;
78     packet->eoh1 = 2;
79     packet->fmt = IEC61883_FMT_AMDTP;
80
81     *tag = IEC61883_TAG_WITH_CIP;
82     *sy = 0;
83
84     signed int fc;
85     uint64_t presentation_time;
86     unsigned int presentation_cycle;
87     int cycles_until_presentation;
88
89     uint64_t transmit_at_time;
90     unsigned int transmit_at_cycle;
91     int cycles_until_transmit;
92
93     debugOutputExtreme( DEBUG_LEVEL_ULTRA_VERBOSE,
94                         "Try for cycle %d\n", CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
95     // check whether the packet buffer has packets for us to send.
96     // the base timestamp is the one of the next sample in the buffer
97     ffado_timestamp_t ts_head_tmp;
98     m_data_buffer->getBufferHeadTimestamp( &ts_head_tmp, &fc ); // thread safe
99
100     // the timestamp gives us the time at which we want the sample block
101     // to be output by the device
102     presentation_time = ( uint64_t ) ts_head_tmp;
103
104     // now we calculate the time when we have to transmit the sample block
105     transmit_at_time = substractTicks( presentation_time, AMDTP_TRANSMIT_TRANSFER_DELAY );
106
107     // calculate the cycle this block should be presented in
108     // (this is just a virtual calculation since at that time it should
109     //  already be in the device's buffer)
110     presentation_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( presentation_time ) );
111
112     // calculate the cycle this block should be transmitted in
113     transmit_at_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( transmit_at_time ) );
114
115     // we can check whether this cycle is within the 'window' we have
116     // to send this packet.
117     // first calculate the number of cycles left before presentation time
118     cycles_until_presentation = diffCycles ( presentation_cycle, CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
119
120     // we can check whether this cycle is within the 'window' we have
121     // to send this packet.
122     // first calculate the number of cycles left before presentation time
123     cycles_until_transmit = diffCycles ( transmit_at_cycle, CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
124
125     // two different options:
126     // 1) there are not enough frames for one packet
127     //      => determine wether this is a problem, since we might still
128     //         have some time to send it
129     // 2) there are enough packets
130     //      => determine whether we have to send them in this packet
131     if ( fc < ( signed int ) m_syt_interval )
132     {
133         // not enough frames in the buffer,
134
135         // we can still postpone the queueing of the packets
136         // if we are far enough ahead of the presentation time
137         if ( cycles_until_presentation <= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION )
138         {
139             debugOutput( DEBUG_LEVEL_NORMAL,
140                          "Insufficient frames (P): N=%02d, CY=%04u, TC=%04u, CUT=%04d\n",
141                          fc, CYCLE_TIMER_GET_CYCLES(pkt_ctr),
142                          transmit_at_cycle, cycles_until_transmit );
143             // we are too late
144             return eCRV_XRun;
145         }
146         else
147         {
148             #if DEBUG_EXTREME
149             unsigned int now_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( m_1394service.getCycleTimerTicks() ) );
150
151             debugOutputExtreme(DEBUG_LEVEL_VERBOSE,
152                                "Insufficient frames (NP): N=%02d, CY=%04u, TC=%04u, CUT=%04d, NOW=%04d\n",
153                                fc, CYCLE_TIMER_GET_CYCLES(pkt_ctr),
154                                transmit_at_cycle, cycles_until_transmit, now_cycle );
155             #endif
156
157             // there is still time left to send the packet
158             // we want the system to give this packet another go at a later time instant
159             return eCRV_Again; // note that the raw1394 again system doesn't work as expected
160
161             // we could wait here for a certain time before trying again. However, this
162             // is not going to work since we then block the iterator thread, hence also
163             // the receiving code, meaning that we are not processing received packets,
164             // and hence there is no progression in the number of frames available.
165
166             // for example:
167             // SleepRelativeUsec(125); // one cycle
168             // goto try_block_of_frames;
169
170             // or more advanced, calculate how many cycles we are ahead of 'now' and
171             // base the sleep on that.
172
173             // note that this requires that there is one thread for each IsoHandler,
174             // otherwise we're in the deadlock described above.
175         }
176     }
177     else
178     {
179         // there are enough frames, so check the time they are intended for
180         // all frames have a certain 'time window' in which they can be sent
181         // this corresponds to the range of the timestamp mechanism:
182         // we can send a packet 15 cycles in advance of the 'presentation time'
183         // in theory we can send the packet up till one cycle before the presentation time,
184         // however this is not very smart.
185
186         // There are 3 options:
187         // 1) the frame block is too early
188         //      => send an empty packet
189         // 2) the frame block is within the window
190         //      => send it
191         // 3) the frame block is too late
192         //      => discard (and raise xrun?)
193         //         get next block of frames and repeat
194
195         if(cycles_until_transmit < 0)
196         {
197             // we are too late
198             debugOutput(DEBUG_LEVEL_VERBOSE,
199                         "Too late: CY=%04u, TC=%04u, CUT=%04d, TSP=%011llu (%04u)\n",
200                         CYCLE_TIMER_GET_CYCLES(pkt_ctr),
201                         transmit_at_cycle, cycles_until_transmit,
202                         presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time) );
203             //debugShowBackLogLines(200);
204             // however, if we can send this sufficiently before the presentation
205             // time, it could be harmless.
206             // NOTE: dangerous since the device has no way of reporting that it didn't get
207             //       this packet on time.
208             if(cycles_until_presentation >= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION)
209             {
210                 // we are not that late and can still try to transmit the packet
211                 m_dbc += fillDataPacketHeader(packet, length, presentation_time);
212                 m_last_timestamp = presentation_time;
213                 return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
214             }
215             else   // definitely too late
216             {
217                 return eCRV_XRun;
218             }
219         }
220         else if(cycles_until_transmit <= AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY)
221         {
222             // it's time send the packet
223             m_dbc += fillDataPacketHeader(packet, length, presentation_time);
224             m_last_timestamp = presentation_time;
225
226             // for timestamp tracing
227             debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
228                                "XMIT PKT: TSP= %011llu (%04u) (%04u) (%04u)\n",
229                                presentation_time,
230                                (unsigned int)CYCLE_TIMER_GET_CYCLES(pkt_ctr),
231                                presentation_cycle, transmit_at_cycle);
232
233             return (fc < (signed)(m_syt_interval) ? eCRV_Defer : eCRV_Packet);
234         }
235         else
236         {
237             debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
238                                "Too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
239                                CYCLE_TIMER_GET_CYCLES(pkt_ctr),
240                                transmit_at_cycle, cycles_until_transmit,
241                                transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
242                                presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
243 #ifdef DEBUG
244             if ( cycles_until_transmit > AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY + 1 )
245             {
246                 debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
247                                    "Way too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
248                                    CYCLE_TIMER_GET_CYCLES(pkt_ctr),
249                                    transmit_at_cycle, cycles_until_transmit,
250                                    transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
251                                    presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
252             }
253 #endif
254             // we are too early, send only an empty packet
255             return eCRV_EmptyPacket;
256         }
257     }
258     return eCRV_Invalid;
259 }
260
261 enum StreamProcessor::eChildReturnValue
262 AmdtpTransmitStreamProcessor::generatePacketData (
263     unsigned char *data, unsigned int *length )
264 {
265     if (m_data_buffer->readFrames(m_syt_interval, (char *)(data + 8)))
266     {
267         debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
268                            "XMIT DATA: TSP= %011llu (%04u)\n",
269                            m_last_timestamp,
270                            (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
271         return eCRV_OK;
272     }
273     else return eCRV_XRun;
274 }
275
276 enum StreamProcessor::eChildReturnValue
277 AmdtpTransmitStreamProcessor::generateSilentPacketHeader (
278     unsigned char *data, unsigned int *length,
279     unsigned char *tag, unsigned char *sy,
280     uint32_t pkt_ctr )
281 {
282     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
283     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
284                        "XMIT SILENT (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
285                        CYCLE_TIMER_GET_CYCLES(pkt_ctr), m_last_timestamp,
286                        (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
287
288     packet->sid = m_local_node_id;
289
290     packet->dbs = m_dimension;
291     packet->fn = 0;
292     packet->qpc = 0;
293     packet->sph = 0;
294     packet->reserved = 0;
295     packet->dbc = m_dbc;
296     packet->eoh1 = 2;
297     packet->fmt = IEC61883_FMT_AMDTP;
298
299     *tag = IEC61883_TAG_WITH_CIP;
300     *sy = 0;
301
302     m_dbc += fillNoDataPacketHeader(packet, length);
303     return eCRV_Packet;
304 }
305
306 enum StreamProcessor::eChildReturnValue
307 AmdtpTransmitStreamProcessor::generateSilentPacketData (
308     unsigned char *data, unsigned int *length )
309 {
310     return eCRV_OK; // no need to do anything
311 }
312
313 enum StreamProcessor::eChildReturnValue
314 AmdtpTransmitStreamProcessor::generateEmptyPacketHeader (
315     unsigned char *data, unsigned int *length,
316     unsigned char *tag, unsigned char *sy,
317     uint32_t pkt_ctr )
318 {
319     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
320     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
321                        "XMIT EMPTY (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
322                        CYCLE_TIMER_GET_CYCLES(pkt_ctr), m_last_timestamp,
323                        (unsigned int)TICKS_TO_CYCLES(m_last_timestamp) );
324     packet->sid = m_local_node_id;
325
326     packet->dbs = m_dimension;
327     packet->fn = 0;
328     packet->qpc = 0;
329     packet->sph = 0;
330     packet->reserved = 0;
331     packet->dbc = m_dbc;
332     packet->eoh1 = 2;
333     packet->fmt = IEC61883_FMT_AMDTP;
334
335     *tag = IEC61883_TAG_WITH_CIP;
336     *sy = 0;
337
338     m_dbc += fillNoDataPacketHeader(packet, length);
339     return eCRV_OK;
340 }
341
342 enum StreamProcessor::eChildReturnValue
343 AmdtpTransmitStreamProcessor::generateEmptyPacketData (
344     unsigned char *data, unsigned int *length )
345 {
346     return eCRV_OK; // no need to do anything
347 }
348
349 unsigned int AmdtpTransmitStreamProcessor::fillDataPacketHeader (
350     struct iec61883_packet *packet, unsigned int* length,
351     uint32_t ts )
352 {
353
354     packet->fdf = m_fdf;
355
356     // convert the timestamp to SYT format
357     uint16_t timestamp_SYT = TICKS_TO_SYT ( ts );
358     packet->syt = CondSwapToBus16 ( timestamp_SYT );
359
360     // FIXME: use a precomputed value here
361     *length = m_syt_interval*sizeof ( quadlet_t ) *m_dimension + 8;
362
363     return m_syt_interval;
364 }
365
366 unsigned int AmdtpTransmitStreamProcessor::fillNoDataPacketHeader (
367     struct iec61883_packet *packet, unsigned int* length )
368 {
369     // no-data packets have syt=0xFFFF
370     // and (can) have the usual amount of events as dummy data
371     // DBC is not increased
372     packet->fdf = IEC61883_FDF_NODATA;
373     packet->syt = 0xffff;
374
375 #if AMDTP_ALLOW_PAYLOAD_IN_NODATA_XMIT
376     if ( m_send_nodata_payload )
377     { // no-data packets with payload (NOTE: DICE-II doesn't like that)
378         *length = 2*sizeof ( quadlet_t ) + m_syt_interval * m_dimension * sizeof ( quadlet_t );
379         return m_syt_interval;
380     } else { // no-data packets without payload
381         *length = 2*sizeof ( quadlet_t );
382         return 0;
383     }
384 #else
385     // no-data packets without payload
386     *length = 2*sizeof ( quadlet_t );
387     return 0;
388 #endif
389 }
390
391 unsigned int
392 AmdtpTransmitStreamProcessor::getSytInterval() {
393     switch (m_StreamProcessorManager.getNominalRate()) {
394         case 32000:
395         case 44100:
396         case 48000:
397             return 8;
398         case 88200:
399         case 96000:
400             return 16;
401         case 176400:
402         case 192000:
403             return 32;
404         default:
405             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
406             return 0;
407     }
408 }
409
410 unsigned int
411 AmdtpTransmitStreamProcessor::getAveragePacketSize()
412 {
413     // in one second we have 8000 packets
414     // containing FRAMERATE frames of m_dimension quadlets
415     // so 8000 packet headers + FRAMERATE*m_dimension quadlets
416     unsigned int one_second = 8000 * 2 * sizeof(quadlet_t) + m_StreamProcessorManager.getNominalRate() * m_dimension * sizeof(quadlet_t);
417     return one_second / 8000;
418 }
419
420 unsigned int
421 AmdtpTransmitStreamProcessor::getFDF() {
422     switch (m_StreamProcessorManager.getNominalRate()) {
423         case 32000: return IEC61883_FDF_SFC_32KHZ;
424         case 44100: return IEC61883_FDF_SFC_44K1HZ;
425         case 48000: return IEC61883_FDF_SFC_48KHZ;
426         case 88200: return IEC61883_FDF_SFC_88K2HZ;
427         case 96000: return IEC61883_FDF_SFC_96KHZ;
428         case 176400: return IEC61883_FDF_SFC_176K4HZ;
429         case 192000: return IEC61883_FDF_SFC_192KHZ;
430         default:
431             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
432             return 0;
433     }
434 }
435
436 bool AmdtpTransmitStreamProcessor::prepareChild()
437 {
438     debugOutput ( DEBUG_LEVEL_VERBOSE, "Preparing (%p)...\n", this );
439     m_syt_interval = getSytInterval();
440     m_fdf = getFDF();
441
442     iec61883_cip_init (
443         &m_cip_status,
444         IEC61883_FMT_AMDTP,
445         m_fdf,
446         m_StreamProcessorManager.getNominalRate(),
447         m_dimension,
448         m_syt_interval );
449
450     if (!initPortCache()) {
451         debugError("Could not init port cache\n");
452         return false;
453     }
454
455     return true;
456 }
457
458 /*
459 * compose the event streams for the packets from the port buffers
460 */
461 bool AmdtpTransmitStreamProcessor::processWriteBlock ( char *data,
462         unsigned int nevents, unsigned int offset )
463 {
464     // update the variable parts of the cache
465     updatePortCache();
466
467     // encode audio data
468     switch(m_StreamProcessorManager.getAudioDataType()) {
469         case StreamProcessorManager::eADT_Int24:
470             encodeAudioPortsInt24((quadlet_t *)data, offset, nevents);
471             break;
472         case StreamProcessorManager::eADT_Float:
473             encodeAudioPortsFloat((quadlet_t *)data, offset, nevents);
474             break;
475     }
476
477     // do midi ports
478     encodeMidiPorts((quadlet_t *)data, offset, nevents);
479     return true;
480 }
481
482 bool
483 AmdtpTransmitStreamProcessor::transmitSilenceBlock(
484     char *data, unsigned int nevents, unsigned int offset)
485 {
486     // no need to update the port cache when transmitting silence since
487     // no dynamic values are used to do so.
488     encodeAudioPortsSilence((quadlet_t *)data, offset, nevents);
489     encodeMidiPortsSilence((quadlet_t *)data, offset, nevents);
490     return true;
491 }
492
493 /**
494  * @brief encodes all audio ports in the cache to events (silent data)
495  * @param data
496  * @param offset
497  * @param nevents
498  */
499 void
500 AmdtpTransmitStreamProcessor::encodeAudioPortsSilence(quadlet_t *data,
501                                                       unsigned int offset,
502                                                       unsigned int nevents)
503 {
504     unsigned int j;
505     quadlet_t *target_event;
506     int i;
507
508     for (i = 0; i < m_nb_audio_ports; i++) {
509         target_event = (quadlet_t *)(data + i);
510
511         for (j = 0;j < nevents; j += 1)
512         {
513             *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
514             target_event += m_dimension;
515         }
516     }
517 }
518
519 #ifdef __SSE2__
520 #include <emmintrin.h>
521 #warning SSE2 build
522
523 /**
524  * @brief mux all audio ports to events
525  * @param data
526  * @param offset
527  * @param nevents
528  */
529 void
530 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
531                                                     unsigned int offset,
532                                                     unsigned int nevents)
533 {
534     unsigned int j;
535     quadlet_t *target_event;
536     int i;
537
538     float * client_buffers[4];
539     float tmp_values[4] __attribute__ ((aligned (16)));
540     uint32_t tmp_values_int[4] __attribute__ ((aligned (16)));
541
542     // prepare the scratch buffer
543     assert(m_scratch_buffer_size_bytes > nevents * 4);
544     memset(m_scratch_buffer, 0, nevents * 4);
545
546     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
547     const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
548     const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER);
549
550 #if AMDTP_CLIP_FLOATS
551     const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
552     const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
553 #endif
554
555     // this assumes that audio ports are sorted by position,
556     // and that there are no gaps
557     for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) {
558         struct _MBLA_port_cache *p;
559
560         // get the port buffers
561         for (j=0; j<4; j++) {
562             p = &(m_audio_ports.at(i+j));
563             if(likely(p->buffer && p->enabled)) {
564                 client_buffers[j] = (float *) p->buffer;
565                 client_buffers[j] += offset;
566             } else {
567                 // if a port is disabled or has no valid
568                 // buffer, use the scratch buffer (all zero's)
569                 client_buffers[j] = (float *) m_scratch_buffer;
570             }
571         }
572
573         // the base event for this position
574         target_event = (quadlet_t *)(data + i);
575         // process the events
576         for (j=0;j < nevents; j += 1)
577         {
578             // read the values
579             tmp_values[0] = *(client_buffers[0]);
580             tmp_values[1] = *(client_buffers[1]);
581             tmp_values[2] = *(client_buffers[2]);
582             tmp_values[3] = *(client_buffers[3]);
583
584             // now do the SSE based conversion/labeling
585             __m128 v_float = *((__m128*)tmp_values);
586             __m128i *target = (__m128i*)target_event;
587             __m128i v_int;
588
589             // clip
590 #if AMDTP_CLIP_FLOATS
591             // do SSE clipping
592             v_float = _mm_max_ps(v_float, v_min);
593             v_float = _mm_min_ps(v_float, v_max);
594 #endif
595
596             // multiply
597             v_float = _mm_mul_ps(v_float, mult);
598             // convert to signed integer
599             v_int = _mm_cvttps_epi32( v_float );
600             // mask
601             v_int = _mm_and_si128( v_int, mask );
602             // label it
603             v_int = _mm_or_si128( v_int, label );
604
605             // do endian conversion (SSE is always little endian)
606             // do first swap
607             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
608             // do second swap
609             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
610             // store the packed int
611             // (target misalignment is assumed since we don't know the m_dimension)
612             _mm_storeu_si128 (target, v_int);
613
614             // increment the buffer pointers
615             client_buffers[0]++;
616             client_buffers[1]++;
617             client_buffers[2]++;
618             client_buffers[3]++;
619
620             // go to next target event position
621             target_event += m_dimension;
622         }
623     }
624
625     // do remaining ports
626     // NOTE: these can be time-SSE'd
627     for (; i < (int)m_nb_audio_ports; i++) {
628         struct _MBLA_port_cache &p = m_audio_ports.at(i);
629         target_event = (quadlet_t *)(data + i);
630         assert(nevents + offset <= p.buffer_size );
631
632         if(likely(p.buffer && p.enabled)) {
633             float *buffer = (float *)(p.buffer);
634             buffer += offset;
635    
636             for (j = 0;j < nevents; j += 4)
637             {
638                 // read the values
639                 tmp_values[0] = *buffer;
640                 buffer++;
641                 tmp_values[1] = *buffer;
642                 buffer++;
643                 tmp_values[2] = *buffer;
644                 buffer++;
645                 tmp_values[3] = *buffer;
646                 buffer++;
647
648                 // now do the SSE based conversion/labeling
649                 __m128 v_float = *((__m128*)tmp_values);
650                 __m128i v_int;
651
652 #if AMDTP_CLIP_FLOATS
653                 // do SSE clipping
654                 v_float = _mm_max_ps(v_float, v_min);
655                 v_float = _mm_min_ps(v_float, v_max);
656 #endif
657                 // multiply
658                 v_float = _mm_mul_ps(v_float, mult);
659                 // convert to signed integer
660                 v_int = _mm_cvttps_epi32( v_float );
661                 // mask
662                 v_int = _mm_and_si128( v_int, mask );
663                 // label it
664                 v_int = _mm_or_si128( v_int, label );
665    
666                 // do endian conversion (SSE is always little endian)
667                 // do first swap
668                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
669                 // do second swap
670                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
671
672                 // store the packed int
673                 _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int);
674
675                 // increment the buffer pointers
676                 *target_event = tmp_values_int[0];
677                 target_event += m_dimension;
678                 *target_event = tmp_values_int[1];
679                 target_event += m_dimension;
680                 *target_event = tmp_values_int[2];
681                 target_event += m_dimension;
682                 *target_event = tmp_values_int[3];
683                 target_event += m_dimension;
684             }
685
686             // do the remainder of the events
687             for(;j < nevents; j += 1) {
688                 float *in = (float *)buffer;
689 #if AMDTP_CLIP_FLOATS
690                 // clip directly to the value of a maxed event
691                 if(unlikely(*in > 1.0)) {
692                     *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF);
693                 } else if(unlikely(*in < -1.0)) {
694                     *target_event = CONDSWAPTOBUS32_CONST(0x40800001);
695                 } else {
696                     float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
697                     unsigned int tmp = ((int) v);
698                     tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
699                     *target_event = CondSwapToBus32((quadlet_t)tmp);
700                 }
701 #else
702                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
703                 unsigned int tmp = ((int) v);
704                 tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
705                 *target_event = CondSwapToBus32((quadlet_t)tmp);
706 #endif
707                 buffer++;
708                 target_event += m_dimension;
709             }
710
711         } else {
712             for (j = 0;j < nevents; j += 1)
713             {
714                 // hardcoded byte swapped
715                 *target_event = 0x00000040;
716                 target_event += m_dimension;
717             }
718         }
719     }
720 }
721
722
723 /**
724  * @brief mux all audio ports to events
725  * @param data
726  * @param offset
727  * @param nevents
728  */
729 void
730 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
731                                                     unsigned int offset,
732                                                     unsigned int nevents)
733 {
734     unsigned int j;
735     quadlet_t *target_event;
736     int i;
737
738     uint32_t *client_buffers[4];
739     uint32_t tmp_values[4] __attribute__ ((aligned (16)));
740
741     // prepare the scratch buffer
742     assert(m_scratch_buffer_size_bytes > nevents * 4);
743     memset(m_scratch_buffer, 0, nevents * 4);
744
745     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
746     const __m128i mask  = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
747
748     // this assumes that audio ports are sorted by position,
749     // and that there are no gaps
750     for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) {
751         struct _MBLA_port_cache *p;
752
753         // get the port buffers
754         for (j=0; j<4; j++) {
755             p = &(m_audio_ports.at(i+j));
756             if(likely(p->buffer && p->enabled)) {
757                 client_buffers[j] = (uint32_t *) p->buffer;
758                 client_buffers[j] += offset;
759             } else {
760                 // if a port is disabled or has no valid
761                 // buffer, use the scratch buffer (all zero's)
762                 client_buffers[j] = (uint32_t *) m_scratch_buffer;
763             }
764         }
765
766         // the base event for this position
767         target_event = (quadlet_t *)(data + i);
768
769         // process the events
770         for (j=0;j < nevents; j += 1)
771         {
772             // read the values
773             tmp_values[0] = *(client_buffers[0]);
774             tmp_values[1] = *(client_buffers[1]);
775             tmp_values[2] = *(client_buffers[2]);
776             tmp_values[3] = *(client_buffers[3]);
777
778             // now do the SSE based conversion/labeling
779             __m128i *target = (__m128i*)target_event;
780             __m128i v_int = *((__m128i*)tmp_values);;
781
782             // mask
783             v_int = _mm_and_si128( v_int, mask );
784             // label it
785             v_int = _mm_or_si128( v_int, label );
786
787             // do endian conversion (SSE is always little endian)
788             // do first swap
789             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
790             // do second swap
791             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
792
793             // store the packed int
794             // (target misalignment is assumed since we don't know the m_dimension)
795             _mm_storeu_si128 (target, v_int);
796
797             // increment the buffer pointers
798             client_buffers[0]++;
799             client_buffers[1]++;
800             client_buffers[2]++;
801             client_buffers[3]++;
802
803             // go to next target event position
804             target_event += m_dimension;
805         }
806     }
807
808     // do remaining ports
809     // NOTE: these can be time-SSE'd
810     for (; i < ((int)m_nb_audio_ports); i++) {
811         struct _MBLA_port_cache &p = m_audio_ports.at(i);
812         target_event = (quadlet_t *)(data + i);
813         assert(nevents + offset <= p.buffer_size );
814
815         if(likely(p.buffer && p.enabled)) {
816             uint32_t *buffer = (uint32_t *)(p.buffer);
817             buffer += offset;
818    
819             for (j = 0;j < nevents; j += 4)
820             {
821                 // read the values
822                 tmp_values[0] = *buffer;
823                 buffer++;
824                 tmp_values[1] = *buffer;
825                 buffer++;
826                 tmp_values[2] = *buffer;
827                 buffer++;
828                 tmp_values[3] = *buffer;
829                 buffer++;
830
831                 // now do the SSE based conversion/labeling
832                 __m128i v_int = *((__m128i*)tmp_values);;
833
834                 // mask
835                 v_int = _mm_and_si128( v_int, mask );
836                 // label it
837                 v_int = _mm_or_si128( v_int, label );
838
839                 // do endian conversion (SSE is always little endian)
840                 // do first swap
841                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
842                 // do second swap
843                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
844
845                 // store the packed int
846                 _mm_store_si128 ((__m128i *)(&tmp_values), v_int);
847
848                 // increment the buffer pointers
849                 *target_event = tmp_values[0];
850                 target_event += m_dimension;
851                 *target_event = tmp_values[1];
852                 target_event += m_dimension;
853                 *target_event = tmp_values[2];
854                 target_event += m_dimension;
855                 *target_event = tmp_values[3];
856                 target_event += m_dimension;
857             }
858
859             // do the remainder of the events
860             for(;j < nevents; j += 1) {
861                 uint32_t in = (uint32_t)(*buffer);
862                 *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
863                 buffer++;
864                 target_event += m_dimension;
865             }
866
867         } else {
868             for (j = 0;j < nevents; j += 1)
869             {
870                 // hardcoded byte swapped
871                 *target_event = 0x00000040;
872                 target_event += m_dimension;
873             }
874         }
875     }
876 }
877
878 #else
879
880 /**
881  * @brief mux all audio ports to events
882  * @param data
883  * @param offset
884  * @param nevents
885  */
886 void
887 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
888                                                     unsigned int offset,
889                                                     unsigned int nevents)
890 {
891     unsigned int j;
892     quadlet_t *target_event;
893     int i;
894
895     for (i = 0; i < m_nb_audio_ports; i++) {
896         struct _MBLA_port_cache &p = m_audio_ports.at(i);
897         target_event = (quadlet_t *)(data + i);
898         assert(nevents + offset <= p.buffer_size );
899
900         if(likely(p.buffer && p.enabled)) {
901             quadlet_t *buffer = (quadlet_t *)(p.buffer);
902             buffer += offset;
903    
904             for (j = 0;j < nevents; j += 1)
905             {
906                 uint32_t in = (uint32_t)(*buffer);
907                 *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
908                 buffer++;
909                 target_event += m_dimension;
910             }
911         } else {
912             for (j = 0;j < nevents; j += 1)
913             {
914                 *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
915                 target_event += m_dimension;
916             }
917         }
918     }
919 }
920
921 /**
922  * @brief mux all audio ports to events
923  * @param data
924  * @param offset
925  * @param nevents
926  */
927 void
928 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
929                                                     unsigned int offset,
930                                                     unsigned int nevents)
931 {
932     unsigned int j;
933     quadlet_t *target_event;
934     int i;
935
936     for (i = 0; i < m_nb_audio_ports; i++) {
937         struct _MBLA_port_cache &p = m_audio_ports.at(i);
938         target_event = (quadlet_t *)(data + i);
939         assert(nevents + offset <= p.buffer_size );
940
941         if(likely(p.buffer && p.enabled)) {
942             quadlet_t *buffer = (quadlet_t *)(p.buffer);
943             buffer += offset;
944    
945             for (j = 0;j < nevents; j += 1)
946             {
947                 float *in = (float *)buffer;
948 #if AMDTP_CLIP_FLOATS
949                 // clip directly to the value of a maxed event
950                 if(unlikely(*in > 1.0)) {
951                     *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF);
952                 } else if(unlikely(*in < -1.0)) {
953                     *target_event = CONDSWAPTOBUS32_CONST(0x40800001);
954                 } else {
955                     float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
956                     unsigned int tmp = ((int) v);
957                     tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
958                     *target_event = CondSwapToBus32((quadlet_t)tmp);
959                 }
960 #else
961                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
962                 unsigned int tmp = ((int) v);
963                 tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
964                 *target_event = CondSwapToBus32((quadlet_t)tmp);
965 #endif
966                 buffer++;
967                 target_event += m_dimension;
968             }
969         } else {
970             for (j = 0;j < nevents; j += 1)
971             {
972                 *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
973                 target_event += m_dimension;
974             }
975         }
976     }
977 }
978 #endif
979
980 /**
981  * @brief encodes all midi ports in the cache to events (silence)
982  * @param data
983  * @param offset
984  * @param nevents
985  */
986 void
987 AmdtpTransmitStreamProcessor::encodeMidiPortsSilence(quadlet_t *data,
988                                                      unsigned int offset,
989                                                      unsigned int nevents)
990 {
991     quadlet_t *target_event;
992     int i;
993     unsigned int j;
994
995     for (i = 0; i < m_nb_midi_ports; i++) {
996         struct _MIDI_port_cache &p = m_midi_ports.at(i);
997
998         for (j = p.location;j < nevents; j += 8) {
999             target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
1000             *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1001         }
1002     }
1003 }
1004
1005 /**
1006  * @brief encodes all midi ports in the cache to events
1007  * @param data
1008  * @param offset
1009  * @param nevents
1010  */
1011 void
1012 AmdtpTransmitStreamProcessor::encodeMidiPorts(quadlet_t *data,
1013                                               unsigned int offset,
1014                                               unsigned int nevents)
1015 {
1016     quadlet_t *target_event;
1017     int i;
1018     unsigned int j;
1019
1020     for (i = 0; i < m_nb_midi_ports; i++) {
1021         struct _MIDI_port_cache &p = m_midi_ports.at(i);
1022         if (p.buffer && p.enabled) {
1023             uint32_t *buffer = (quadlet_t *)(p.buffer);
1024             buffer += offset;
1025
1026             for (j = p.location;j < nevents; j += 8) {
1027                 target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
1028
1029                 if ( *buffer & 0xFF000000 )   // we can send a byte
1030                 {
1031                     quadlet_t tmpval;
1032                     tmpval = ((*buffer)<<16) & 0x00FF0000;
1033                     tmpval = IEC61883_AM824_SET_LABEL(tmpval, IEC61883_AM824_LABEL_MIDI_1X);
1034                     *target_event = CondSwapToBus32(tmpval);
1035
1036 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "MIDI port %s, pos=%u, loc=%u, nevents=%u, dim=%d\n",
1037 //                                p.port->getName().c_str(), p.position, p.location, nevents, m_dimension );
1038 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "base=%p, target=%p, value=%08X\n",
1039 //                                data, target_event, tmpval );
1040                 } else {
1041                     // can't send a byte, either because there is no byte,
1042                     // or because this would exceed the maximum rate
1043                     // FIXME: this can be ifdef optimized since it's a constant
1044                     *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1045                 }
1046                 buffer+=8;
1047             }
1048         } else {
1049             for (j = p.location;j < nevents; j += 8) {
1050                 target_event = (quadlet_t *)(data + ((j * m_dimension) + p.position));
1051                 __builtin_prefetch(target_event, 1, 0); // prefetch events for write, no temporal locality
1052                 *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1053             }
1054         }
1055     }
1056 }
1057
1058 bool
1059 AmdtpTransmitStreamProcessor::initPortCache() {
1060     // make use of the fact that audio ports are the first ports in
1061     // the cluster as per AMDTP. so we can sort the ports by position
1062     // and have very efficient lookups:
1063     // m_float_ports.at(i).buffer -> audio stream i buffer
1064     // for midi ports we simply cache all port info since they are (usually) not
1065     // that numerous
1066     m_nb_audio_ports = 0;
1067     m_audio_ports.clear();
1068    
1069     m_nb_midi_ports = 0;
1070     m_midi_ports.clear();
1071    
1072     for(PortVectorIterator it = m_Ports.begin();
1073         it != m_Ports.end();
1074         ++it )
1075     {
1076         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1077         assert(pinfo); // this should not fail!!
1078
1079         switch( pinfo->getFormat() )
1080         {
1081             case AmdtpPortInfo::E_MBLA:
1082                 m_nb_audio_ports++;
1083                 break;
1084             case AmdtpPortInfo::E_SPDIF: // still unimplemented
1085                 break;
1086             case AmdtpPortInfo::E_Midi:
1087                 m_nb_midi_ports++;
1088                 break;
1089             default: // ignore
1090                 break;
1091         }
1092     }
1093
1094     int idx;
1095     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1096         for(PortVectorIterator it = m_Ports.begin();
1097             it != m_Ports.end();
1098             ++it )
1099         {
1100             AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1101             debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1102                         "idx %u: looking at port %s at position %u\n",
1103                         idx, (*it)->getName().c_str(), pinfo->getPosition());
1104             if(pinfo->getPosition() == (unsigned int)idx) {
1105                 struct _MBLA_port_cache p;
1106                 p.port = dynamic_cast<AmdtpAudioPort *>(*it);
1107                 if(p.port == NULL) {
1108                     debugError("Port is not an AmdtpAudioPort!\n");
1109                     return false;
1110                 }
1111                 p.buffer = NULL; // to be filled by updatePortCache
1112                 #ifdef DEBUG
1113                 p.buffer_size = (*it)->getBufferSize();
1114                 #endif
1115
1116                 m_audio_ports.push_back(p);
1117                 debugOutput(DEBUG_LEVEL_VERBOSE,
1118                             "Cached port %s at position %u\n",
1119                             p.port->getName().c_str(), idx);
1120                 goto next_index;
1121             }
1122         }
1123         debugError("No MBLA port found for position %d\n", idx);
1124         return false;
1125 next_index:
1126         continue;
1127     }
1128
1129     for(PortVectorIterator it = m_Ports.begin();
1130         it != m_Ports.end();
1131         ++it )
1132     {
1133         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1134         debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1135                     "idx %u: looking at port %s at position %u, location %u\n",
1136                     idx, (*it)->getName().c_str(), pinfo->getPosition(), pinfo->getLocation());
1137         if ((*it)->getPortType() == Port::E_Midi) {
1138             struct _MIDI_port_cache p;
1139             p.port = dynamic_cast<AmdtpMidiPort *>(*it);
1140             if(p.port == NULL) {
1141                 debugError("Port is not an AmdtpMidiPort!\n");
1142                 return false;
1143             }
1144             p.position = pinfo->getPosition();
1145             p.location = pinfo->getLocation();
1146             p.buffer = NULL; // to be filled by updatePortCache
1147             #ifdef DEBUG
1148             p.buffer_size = (*it)->getBufferSize();
1149             #endif
1150
1151             m_midi_ports.push_back(p);
1152             debugOutput(DEBUG_LEVEL_VERBOSE,
1153                         "Cached port %s at position %u, location %u\n",
1154                         p.port->getName().c_str(), p.position, p.location);
1155         }
1156     }
1157
1158     return true;
1159 }
1160
1161 void
1162 AmdtpTransmitStreamProcessor::updatePortCache() {
1163     int idx;
1164     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1165         struct _MBLA_port_cache& p = m_audio_ports.at(idx);
1166         AmdtpAudioPort *port = p.port;
1167         p.buffer = port->getBufferAddress();
1168         p.enabled = !port->isDisabled();
1169     }
1170     for (idx = 0; idx < m_nb_midi_ports; idx++) {
1171         struct _MIDI_port_cache& p = m_midi_ports.at(idx);
1172         AmdtpMidiPort *port = p.port;
1173         p.buffer = port->getBufferAddress();
1174         p.enabled = !port->isDisabled();
1175     }
1176 }
1177
1178 } // end of namespace Streaming
Note: See TracBrowser for help on using the browser.