root/branches/libffado-2.0/src/libstreaming/amdtp/AmdtpTransmitStreamProcessor.cpp

Revision 1376, 41.0 kB (checked in by ppalmers, 12 years ago)

fix clipping and float->int conversion. optimize this a bit. fix SSE code.

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 2 of the License, or
12  * (at your option) version 3 of the License.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #include "config.h"
25 #include "AmdtpTransmitStreamProcessor.h"
26 #include "AmdtpPort.h"
27 #include "../StreamProcessorManager.h"
28 #include "devicemanager.h"
29
30 #include "libutil/Time.h"
31 #include "libutil/float_cast.h"
32
33 #include "libieee1394/ieee1394service.h"
34 #include "libieee1394/IsoHandlerManager.h"
35 #include "libieee1394/cycletimer.h"
36
37 #include "libutil/ByteSwap.h"
38 #include <assert.h>
39 #include <cstring>
40
41 #define AMDTP_FLOAT_MULTIPLIER (1.0f * ((1<<23) - 1))
42 namespace Streaming
43 {
44
45 /* transmit */
46 AmdtpTransmitStreamProcessor::AmdtpTransmitStreamProcessor(FFADODevice &parent, int dimension)
47         : StreamProcessor(parent, ePT_Transmit)
48         , m_dimension( dimension )
49         , m_dbc( 0 )
50 #if AMDTP_ALLOW_PAYLOAD_IN_NODATA_XMIT
51         , m_send_nodata_payload ( AMDTP_SEND_PAYLOAD_IN_NODATA_XMIT_BY_DEFAULT )
52 #endif
53         , m_nb_audio_ports( 0 )
54         , m_nb_midi_ports( 0 )
55 {}
56
57 enum StreamProcessor::eChildReturnValue
58 AmdtpTransmitStreamProcessor::generatePacketHeader (
59     unsigned char *data, unsigned int *length,
60     unsigned char *tag, unsigned char *sy,
61     uint32_t pkt_ctr )
62 {
63     __builtin_prefetch(data, 1, 0); // prefetch events for write, no temporal locality
64     struct iec61883_packet *packet = (struct iec61883_packet *)data;
65     /* Our node ID can change after a bus reset, so it is best to fetch
66     * our node ID for each packet. */
67     packet->sid = m_local_node_id;
68
69     packet->dbs = m_dimension;
70     packet->fn = 0;
71     packet->qpc = 0;
72     packet->sph = 0;
73     packet->reserved = 0;
74     packet->dbc = m_dbc;
75     packet->eoh1 = 2;
76     packet->fmt = IEC61883_FMT_AMDTP;
77
78     *tag = IEC61883_TAG_WITH_CIP;
79     *sy = 0;
80
81     signed int fc;
82     uint64_t presentation_time;
83     unsigned int presentation_cycle;
84     int cycles_until_presentation;
85
86     uint64_t transmit_at_time;
87     unsigned int transmit_at_cycle;
88     int cycles_until_transmit;
89
90     debugOutputExtreme( DEBUG_LEVEL_ULTRA_VERBOSE,
91                         "Try for cycle %d\n", CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
92     // check whether the packet buffer has packets for us to send.
93     // the base timestamp is the one of the next sample in the buffer
94     ffado_timestamp_t ts_head_tmp;
95     m_data_buffer->getBufferHeadTimestamp( &ts_head_tmp, &fc ); // thread safe
96
97     // the timestamp gives us the time at which we want the sample block
98     // to be output by the device
99     presentation_time = ( uint64_t ) ts_head_tmp;
100
101     // now we calculate the time when we have to transmit the sample block
102     transmit_at_time = substractTicks( presentation_time, AMDTP_TRANSMIT_TRANSFER_DELAY );
103
104     // calculate the cycle this block should be presented in
105     // (this is just a virtual calculation since at that time it should
106     //  already be in the device's buffer)
107     presentation_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( presentation_time ) );
108
109     // calculate the cycle this block should be transmitted in
110     transmit_at_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( transmit_at_time ) );
111
112     // we can check whether this cycle is within the 'window' we have
113     // to send this packet.
114     // first calculate the number of cycles left before presentation time
115     cycles_until_presentation = diffCycles ( presentation_cycle, CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
116
117     // we can check whether this cycle is within the 'window' we have
118     // to send this packet.
119     // first calculate the number of cycles left before presentation time
120     cycles_until_transmit = diffCycles ( transmit_at_cycle, CYCLE_TIMER_GET_CYCLES(pkt_ctr) );
121
122     // two different options:
123     // 1) there are not enough frames for one packet
124     //      => determine wether this is a problem, since we might still
125     //         have some time to send it
126     // 2) there are enough packets
127     //      => determine whether we have to send them in this packet
128     if ( fc < ( signed int ) m_syt_interval )
129     {
130         // not enough frames in the buffer,
131
132         // we can still postpone the queueing of the packets
133         // if we are far enough ahead of the presentation time
134         if ( cycles_until_presentation <= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION )
135         {
136             debugOutput( DEBUG_LEVEL_NORMAL,
137                          "Insufficient frames (P): N=%02d, CY=%04u, TC=%04u, CUT=%04d\n",
138                          fc, CYCLE_TIMER_GET_CYCLES(pkt_ctr),
139                          transmit_at_cycle, cycles_until_transmit );
140             // we are too late
141             return eCRV_XRun;
142         }
143         else
144         {
145             #if DEBUG_EXTREME
146             unsigned int now_cycle = ( unsigned int ) ( TICKS_TO_CYCLES ( m_1394service.getCycleTimerTicks() ) );
147
148             debugOutputExtreme(DEBUG_LEVEL_VERBOSE,
149                                "Insufficient frames (NP): N=%02d, CY=%04u, TC=%04u, CUT=%04d, NOW=%04d\n",
150                                fc, CYCLE_TIMER_GET_CYCLES(pkt_ctr),
151                                transmit_at_cycle, cycles_until_transmit, now_cycle );
152             #endif
153
154             // there is still time left to send the packet
155             // we want the system to give this packet another go at a later time instant
156             return eCRV_Again; // note that the raw1394 again system doesn't work as expected
157
158             // we could wait here for a certain time before trying again. However, this
159             // is not going to work since we then block the iterator thread, hence also
160             // the receiving code, meaning that we are not processing received packets,
161             // and hence there is no progression in the number of frames available.
162
163             // for example:
164             // SleepRelativeUsec(125); // one cycle
165             // goto try_block_of_frames;
166
167             // or more advanced, calculate how many cycles we are ahead of 'now' and
168             // base the sleep on that.
169
170             // note that this requires that there is one thread for each IsoHandler,
171             // otherwise we're in the deadlock described above.
172         }
173     }
174     else
175     {
176         // there are enough frames, so check the time they are intended for
177         // all frames have a certain 'time window' in which they can be sent
178         // this corresponds to the range of the timestamp mechanism:
179         // we can send a packet 15 cycles in advance of the 'presentation time'
180         // in theory we can send the packet up till one cycle before the presentation time,
181         // however this is not very smart.
182
183         // There are 3 options:
184         // 1) the frame block is too early
185         //      => send an empty packet
186         // 2) the frame block is within the window
187         //      => send it
188         // 3) the frame block is too late
189         //      => discard (and raise xrun?)
190         //         get next block of frames and repeat
191
192         if(cycles_until_transmit < 0)
193         {
194             // we are too late
195             debugOutput(DEBUG_LEVEL_VERBOSE,
196                         "Too late: CY=%04u, TC=%04u, CUT=%04d, TSP=%011llu (%04u)\n",
197                         CYCLE_TIMER_GET_CYCLES(pkt_ctr),
198                         transmit_at_cycle, cycles_until_transmit,
199                         presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time) );
200             //debugShowBackLogLines(200);
201             // however, if we can send this sufficiently before the presentation
202             // time, it could be harmless.
203             // NOTE: dangerous since the device has no way of reporting that it didn't get
204             //       this packet on time.
205             if(cycles_until_presentation >= AMDTP_MIN_CYCLES_BEFORE_PRESENTATION)
206             {
207                 // we are not that late and can still try to transmit the packet
208                 m_dbc += fillDataPacketHeader(packet, length, presentation_time);
209                 m_last_timestamp = presentation_time;
210                 return (fc < (signed)(2*m_syt_interval) ? eCRV_Defer : eCRV_Packet);
211             }
212             else   // definitely too late
213             {
214                 return eCRV_XRun;
215             }
216         }
217         else if(cycles_until_transmit <= AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY)
218         {
219             // it's time send the packet
220             m_dbc += fillDataPacketHeader(packet, length, presentation_time);
221             m_last_timestamp = presentation_time;
222
223             // for timestamp tracing
224             debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
225                                "XMIT PKT: TSP= %011llu (%04u) (%04u) (%04u)\n",
226                                presentation_time,
227                                (unsigned int)CYCLE_TIMER_GET_CYCLES(pkt_ctr),
228                                presentation_cycle, transmit_at_cycle);
229
230             return (fc < (signed)(m_syt_interval) ? eCRV_Defer : eCRV_Packet);
231         }
232         else
233         {
234             debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
235                                "Too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
236                                CYCLE_TIMER_GET_CYCLES(pkt_ctr),
237                                transmit_at_cycle, cycles_until_transmit,
238                                transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
239                                presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
240 #ifdef DEBUG
241             if ( cycles_until_transmit > AMDTP_MAX_CYCLES_TO_TRANSMIT_EARLY + 1 )
242             {
243                 debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
244                                    "Way too early: CY=%04u, TC=%04u, CUT=%04d, TST=%011llu (%04u), TSP=%011llu (%04u)\n",
245                                    CYCLE_TIMER_GET_CYCLES(pkt_ctr),
246                                    transmit_at_cycle, cycles_until_transmit,
247                                    transmit_at_time, (unsigned int)TICKS_TO_CYCLES(transmit_at_time),
248                                    presentation_time, (unsigned int)TICKS_TO_CYCLES(presentation_time));
249             }
250 #endif
251             // we are too early, send only an empty packet
252             return eCRV_EmptyPacket;
253         }
254     }
255     return eCRV_Invalid;
256 }
257
258 enum StreamProcessor::eChildReturnValue
259 AmdtpTransmitStreamProcessor::generatePacketData (
260     unsigned char *data, unsigned int *length )
261 {
262     if (m_data_buffer->readFrames(m_syt_interval, (char *)(data + 8)))
263     {
264         debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
265                            "XMIT DATA: TSP= %011llu (%04u)\n",
266                            m_last_timestamp,
267                            (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
268         return eCRV_OK;
269     }
270     else return eCRV_XRun;
271 }
272
273 enum StreamProcessor::eChildReturnValue
274 AmdtpTransmitStreamProcessor::generateSilentPacketHeader (
275     unsigned char *data, unsigned int *length,
276     unsigned char *tag, unsigned char *sy,
277     uint32_t pkt_ctr )
278 {
279     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
280     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
281                        "XMIT SILENT (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
282                        CYCLE_TIMER_GET_CYCLES(pkt_ctr), m_last_timestamp,
283                        (unsigned int)TICKS_TO_CYCLES(m_last_timestamp));
284
285     packet->sid = m_local_node_id;
286
287     packet->dbs = m_dimension;
288     packet->fn = 0;
289     packet->qpc = 0;
290     packet->sph = 0;
291     packet->reserved = 0;
292     packet->dbc = m_dbc;
293     packet->eoh1 = 2;
294     packet->fmt = IEC61883_FMT_AMDTP;
295
296     *tag = IEC61883_TAG_WITH_CIP;
297     *sy = 0;
298
299     m_dbc += fillNoDataPacketHeader(packet, length);
300     return eCRV_Packet;
301 }
302
303 enum StreamProcessor::eChildReturnValue
304 AmdtpTransmitStreamProcessor::generateSilentPacketData (
305     unsigned char *data, unsigned int *length )
306 {
307     return eCRV_OK; // no need to do anything
308 }
309
310 enum StreamProcessor::eChildReturnValue
311 AmdtpTransmitStreamProcessor::generateEmptyPacketHeader (
312     unsigned char *data, unsigned int *length,
313     unsigned char *tag, unsigned char *sy,
314     uint32_t pkt_ctr )
315 {
316     struct iec61883_packet *packet = ( struct iec61883_packet * ) data;
317     debugOutputExtreme(DEBUG_LEVEL_ULTRA_VERBOSE,
318                        "XMIT EMPTY (cy %04d): CY=%04u, TSP=%011llu (%04u)\n",
319                        CYCLE_TIMER_GET_CYCLES(pkt_ctr), m_last_timestamp,
320                        (unsigned int)TICKS_TO_CYCLES(m_last_timestamp) );
321     packet->sid = m_local_node_id;
322
323     packet->dbs = m_dimension;
324     packet->fn = 0;
325     packet->qpc = 0;
326     packet->sph = 0;
327     packet->reserved = 0;
328     packet->dbc = m_dbc;
329     packet->eoh1 = 2;
330     packet->fmt = IEC61883_FMT_AMDTP;
331
332     *tag = IEC61883_TAG_WITH_CIP;
333     *sy = 0;
334
335     m_dbc += fillNoDataPacketHeader(packet, length);
336     return eCRV_OK;
337 }
338
339 enum StreamProcessor::eChildReturnValue
340 AmdtpTransmitStreamProcessor::generateEmptyPacketData (
341     unsigned char *data, unsigned int *length )
342 {
343     return eCRV_OK; // no need to do anything
344 }
345
346 unsigned int AmdtpTransmitStreamProcessor::fillDataPacketHeader (
347     struct iec61883_packet *packet, unsigned int* length,
348     uint32_t ts )
349 {
350
351     packet->fdf = m_fdf;
352
353     // convert the timestamp to SYT format
354     uint16_t timestamp_SYT = TICKS_TO_SYT ( ts );
355     packet->syt = CondSwapToBus16 ( timestamp_SYT );
356
357     // FIXME: use a precomputed value here
358     *length = m_syt_interval*sizeof ( quadlet_t ) *m_dimension + 8;
359
360     return m_syt_interval;
361 }
362
363 unsigned int AmdtpTransmitStreamProcessor::fillNoDataPacketHeader (
364     struct iec61883_packet *packet, unsigned int* length )
365 {
366     // no-data packets have syt=0xFFFF
367     // and (can) have the usual amount of events as dummy data
368     // DBC is not increased
369     packet->fdf = IEC61883_FDF_NODATA;
370     packet->syt = 0xffff;
371
372 #if AMDTP_ALLOW_PAYLOAD_IN_NODATA_XMIT
373     if ( m_send_nodata_payload )
374     { // no-data packets with payload (NOTE: DICE-II doesn't like that)
375         *length = 2*sizeof ( quadlet_t ) + m_syt_interval * m_dimension * sizeof ( quadlet_t );
376         return m_syt_interval;
377     } else { // no-data packets without payload
378         *length = 2*sizeof ( quadlet_t );
379         return 0;
380     }
381 #else
382     // no-data packets without payload
383     *length = 2*sizeof ( quadlet_t );
384     return 0;
385 #endif
386 }
387
388 unsigned int
389 AmdtpTransmitStreamProcessor::getSytInterval() {
390     switch (m_StreamProcessorManager.getNominalRate()) {
391         case 32000:
392         case 44100:
393         case 48000:
394             return 8;
395         case 88200:
396         case 96000:
397             return 16;
398         case 176400:
399         case 192000:
400             return 32;
401         default:
402             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
403             return 0;
404     }
405 }
406
407 unsigned int
408 AmdtpTransmitStreamProcessor::getAveragePacketSize()
409 {
410     // in one second we have 8000 packets
411     // containing FRAMERATE frames of m_dimension quadlets
412     // so 8000 packet headers + FRAMERATE*m_dimension quadlets
413     unsigned int one_second = 8000 * 2 * sizeof(quadlet_t) + m_StreamProcessorManager.getNominalRate() * m_dimension * sizeof(quadlet_t);
414     return one_second / 8000;
415 }
416
417 unsigned int
418 AmdtpTransmitStreamProcessor::getFDF() {
419     switch (m_StreamProcessorManager.getNominalRate()) {
420         case 32000: return IEC61883_FDF_SFC_32KHZ;
421         case 44100: return IEC61883_FDF_SFC_44K1HZ;
422         case 48000: return IEC61883_FDF_SFC_48KHZ;
423         case 88200: return IEC61883_FDF_SFC_88K2HZ;
424         case 96000: return IEC61883_FDF_SFC_96KHZ;
425         case 176400: return IEC61883_FDF_SFC_176K4HZ;
426         case 192000: return IEC61883_FDF_SFC_192KHZ;
427         default:
428             debugError("Unsupported rate: %d\n", m_StreamProcessorManager.getNominalRate());
429             return 0;
430     }
431 }
432
433 bool AmdtpTransmitStreamProcessor::prepareChild()
434 {
435     debugOutput ( DEBUG_LEVEL_VERBOSE, "Preparing (%p)...\n", this );
436     m_syt_interval = getSytInterval();
437     m_fdf = getFDF();
438
439     iec61883_cip_init (
440         &m_cip_status,
441         IEC61883_FMT_AMDTP,
442         m_fdf,
443         m_StreamProcessorManager.getNominalRate(),
444         m_dimension,
445         m_syt_interval );
446
447     if (!initPortCache()) {
448         debugError("Could not init port cache\n");
449         return false;
450     }
451
452     return true;
453 }
454
455 /*
456 * compose the event streams for the packets from the port buffers
457 */
458 bool AmdtpTransmitStreamProcessor::processWriteBlock ( char *data,
459         unsigned int nevents, unsigned int offset )
460 {
461     // update the variable parts of the cache
462     updatePortCache();
463
464     // encode audio data
465     switch(m_StreamProcessorManager.getAudioDataType()) {
466         case StreamProcessorManager::eADT_Int24:
467             encodeAudioPortsInt24((quadlet_t *)data, offset, nevents);
468             break;
469         case StreamProcessorManager::eADT_Float:
470             encodeAudioPortsFloat((quadlet_t *)data, offset, nevents);
471             break;
472     }
473
474     // do midi ports
475     encodeMidiPorts((quadlet_t *)data, offset, nevents);
476     return true;
477 }
478
479 bool
480 AmdtpTransmitStreamProcessor::transmitSilenceBlock(
481     char *data, unsigned int nevents, unsigned int offset)
482 {
483     // no need to update the port cache when transmitting silence since
484     // no dynamic values are used to do so.
485     encodeAudioPortsSilence((quadlet_t *)data, offset, nevents);
486     encodeMidiPortsSilence((quadlet_t *)data, offset, nevents);
487     return true;
488 }
489
490 /**
491  * @brief encodes all audio ports in the cache to events (silent data)
492  * @param data
493  * @param offset
494  * @param nevents
495  */
496 void
497 AmdtpTransmitStreamProcessor::encodeAudioPortsSilence(quadlet_t *data,
498                                                       unsigned int offset,
499                                                       unsigned int nevents)
500 {
501     unsigned int j;
502     quadlet_t *target_event;
503     int i;
504
505     for (i = 0; i < m_nb_audio_ports; i++) {
506         target_event = (quadlet_t *)(data + i);
507
508         for (j = 0;j < nevents; j += 1)
509         {
510             *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
511             target_event += m_dimension;
512         }
513     }
514 }
515
516 #ifdef __SSE2__
517 #include <emmintrin.h>
518 #warning SSE2 build
519
520 /**
521  * @brief mux all audio ports to events
522  * @param data
523  * @param offset
524  * @param nevents
525  */
526 void
527 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
528                                                     unsigned int offset,
529                                                     unsigned int nevents)
530 {
531     unsigned int j;
532     quadlet_t *target_event;
533     int i;
534
535     float * client_buffers[4];
536     float tmp_values[4] __attribute__ ((aligned (16)));
537     uint32_t tmp_values_int[4] __attribute__ ((aligned (16)));
538
539     // prepare the scratch buffer
540     assert(m_scratch_buffer_size_bytes > nevents * 4);
541     memset(m_scratch_buffer, 0, nevents * 4);
542
543     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
544     const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
545     const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER);
546
547 #if AMDTP_CLIP_FLOATS
548     const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0);
549     const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0);
550 #endif
551
552     // this assumes that audio ports are sorted by position,
553     // and that there are no gaps
554     for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) {
555         struct _MBLA_port_cache *p;
556
557         // get the port buffers
558         for (j=0; j<4; j++) {
559             p = &(m_audio_ports.at(i+j));
560             if(p->buffer && p->enabled) {
561                 client_buffers[j] = (float *) p->buffer;
562                 client_buffers[j] += offset;
563             } else {
564                 // if a port is disabled or has no valid
565                 // buffer, use the scratch buffer (all zero's)
566                 client_buffers[j] = (float *) m_scratch_buffer;
567             }
568         }
569
570         // the base event for this position
571         target_event = (quadlet_t *)(data + i);
572         // process the events
573         for (j=0;j < nevents; j += 1)
574         {
575             // read the values
576             tmp_values[0] = *(client_buffers[0]);
577             tmp_values[1] = *(client_buffers[1]);
578             tmp_values[2] = *(client_buffers[2]);
579             tmp_values[3] = *(client_buffers[3]);
580
581             // now do the SSE based conversion/labeling
582             __m128 v_float = *((__m128*)tmp_values);
583             __m128i *target = (__m128i*)target_event;
584             __m128i v_int;
585
586             // clip
587 #if AMDTP_CLIP_FLOATS
588             // do SSE clipping
589             v_float = _mm_max_ps(v_float, v_min);
590             v_float = _mm_min_ps(v_float, v_max);
591 #endif
592
593             // multiply
594             v_float = _mm_mul_ps(v_float, mult);
595             // convert to signed integer
596             v_int = _mm_cvttps_epi32( v_float );
597             // mask
598             v_int = _mm_and_si128( v_int, mask );
599             // label it
600             v_int = _mm_or_si128( v_int, label );
601
602             // do endian conversion (SSE is always little endian)
603             // do first swap
604             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
605             // do second swap
606             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
607             // store the packed int
608             // (target misalignment is assumed since we don't know the m_dimension)
609             _mm_storeu_si128 (target, v_int);
610
611             // increment the buffer pointers
612             client_buffers[0]++;
613             client_buffers[1]++;
614             client_buffers[2]++;
615             client_buffers[3]++;
616
617             // go to next target event position
618             target_event += m_dimension;
619         }
620     }
621
622     // do remaining ports
623     // NOTE: these can be time-SSE'd
624     for (; i < (int)m_nb_audio_ports; i++) {
625         struct _MBLA_port_cache &p = m_audio_ports.at(i);
626         target_event = (quadlet_t *)(data + i);
627         assert(nevents + offset <= p.buffer_size );
628
629         if(p.buffer && p.enabled) {
630             float *buffer = (float *)(p.buffer);
631             buffer += offset;
632    
633             for (j = 0;j < nevents; j += 4)
634             {
635                 // read the values
636                 tmp_values[0] = *buffer;
637                 buffer++;
638                 tmp_values[1] = *buffer;
639                 buffer++;
640                 tmp_values[2] = *buffer;
641                 buffer++;
642                 tmp_values[3] = *buffer;
643                 buffer++;
644
645                 // now do the SSE based conversion/labeling
646                 __m128 v_float = *((__m128*)tmp_values);
647                 __m128i v_int;
648
649 #if AMDTP_CLIP_FLOATS
650                 // do SSE clipping
651                 v_float = _mm_max_ps(v_float, v_min);
652                 v_float = _mm_min_ps(v_float, v_max);
653 #endif
654                 // multiply
655                 v_float = _mm_mul_ps(v_float, mult);
656                 // convert to signed integer
657                 v_int = _mm_cvttps_epi32( v_float );
658                 // mask
659                 v_int = _mm_and_si128( v_int, mask );
660                 // label it
661                 v_int = _mm_or_si128( v_int, label );
662    
663                 // do endian conversion (SSE is always little endian)
664                 // do first swap
665                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
666                 // do second swap
667                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
668
669                 // store the packed int
670                 _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int);
671
672                 // increment the buffer pointers
673                 *target_event = tmp_values_int[0];
674                 target_event += m_dimension;
675                 *target_event = tmp_values_int[1];
676                 target_event += m_dimension;
677                 *target_event = tmp_values_int[2];
678                 target_event += m_dimension;
679                 *target_event = tmp_values_int[3];
680                 target_event += m_dimension;
681             }
682
683             // do the remainder of the events
684             for(;j < nevents; j += 1) {
685                 float *in = (float *)buffer;
686 #if AMDTP_CLIP_FLOATS
687                 // clip directly to the value of a maxed event
688                 if(*in > 1.0) {
689                     *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF);
690                 } else if(*in < -1.0) {
691                     *target_event = CONDSWAPTOBUS32_CONST(0x40800001);
692                 } else {
693                     float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
694                     unsigned int tmp = ((int) v);
695                     tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
696                     *target_event = CondSwapToBus32((quadlet_t)tmp);
697                 }
698 #else
699                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
700                 unsigned int tmp = ((int) v);
701                 tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
702                 *target_event = CondSwapToBus32((quadlet_t)tmp);
703 #endif
704                 buffer++;
705                 target_event += m_dimension;
706             }
707
708         } else {
709             for (j = 0;j < nevents; j += 1)
710             {
711                 // hardcoded byte swapped
712                 *target_event = 0x00000040;
713                 target_event += m_dimension;
714             }
715         }
716     }
717 }
718
719
720 /**
721  * @brief mux all audio ports to events
722  * @param data
723  * @param offset
724  * @param nevents
725  */
726 void
727 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
728                                                     unsigned int offset,
729                                                     unsigned int nevents)
730 {
731     unsigned int j;
732     quadlet_t *target_event;
733     int i;
734
735     uint32_t *client_buffers[4];
736     uint32_t tmp_values[4] __attribute__ ((aligned (16)));
737
738     // prepare the scratch buffer
739     assert(m_scratch_buffer_size_bytes > nevents * 4);
740     memset(m_scratch_buffer, 0, nevents * 4);
741
742     const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000);
743     const __m128i mask  = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF);
744
745     // this assumes that audio ports are sorted by position,
746     // and that there are no gaps
747     for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) {
748         struct _MBLA_port_cache *p;
749
750         // get the port buffers
751         for (j=0; j<4; j++) {
752             p = &(m_audio_ports.at(i+j));
753             if(p->buffer && p->enabled) {
754                 client_buffers[j] = (uint32_t *) p->buffer;
755                 client_buffers[j] += offset;
756             } else {
757                 // if a port is disabled or has no valid
758                 // buffer, use the scratch buffer (all zero's)
759                 client_buffers[j] = (uint32_t *) m_scratch_buffer;
760             }
761         }
762
763         // the base event for this position
764         target_event = (quadlet_t *)(data + i);
765
766         // process the events
767         for (j=0;j < nevents; j += 1)
768         {
769             // read the values
770             tmp_values[0] = *(client_buffers[0]);
771             tmp_values[1] = *(client_buffers[1]);
772             tmp_values[2] = *(client_buffers[2]);
773             tmp_values[3] = *(client_buffers[3]);
774
775             // now do the SSE based conversion/labeling
776             __m128i *target = (__m128i*)target_event;
777             __m128i v_int = *((__m128i*)tmp_values);;
778
779             // mask
780             v_int = _mm_and_si128( v_int, mask );
781             // label it
782             v_int = _mm_or_si128( v_int, label );
783
784             // do endian conversion (SSE is always little endian)
785             // do first swap
786             v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
787             // do second swap
788             v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
789
790             // store the packed int
791             // (target misalignment is assumed since we don't know the m_dimension)
792             _mm_storeu_si128 (target, v_int);
793
794             // increment the buffer pointers
795             client_buffers[0]++;
796             client_buffers[1]++;
797             client_buffers[2]++;
798             client_buffers[3]++;
799
800             // go to next target event position
801             target_event += m_dimension;
802         }
803     }
804
805     // do remaining ports
806     // NOTE: these can be time-SSE'd
807     for (; i < ((int)m_nb_audio_ports); i++) {
808         struct _MBLA_port_cache &p = m_audio_ports.at(i);
809         target_event = (quadlet_t *)(data + i);
810         assert(nevents + offset <= p.buffer_size );
811
812         if(p.buffer && p.enabled) {
813             uint32_t *buffer = (uint32_t *)(p.buffer);
814             buffer += offset;
815    
816             for (j = 0;j < nevents; j += 4)
817             {
818                 // read the values
819                 tmp_values[0] = *buffer;
820                 buffer++;
821                 tmp_values[1] = *buffer;
822                 buffer++;
823                 tmp_values[2] = *buffer;
824                 buffer++;
825                 tmp_values[3] = *buffer;
826                 buffer++;
827
828                 // now do the SSE based conversion/labeling
829                 __m128i v_int = *((__m128i*)tmp_values);;
830
831                 // mask
832                 v_int = _mm_and_si128( v_int, mask );
833                 // label it
834                 v_int = _mm_or_si128( v_int, label );
835
836                 // do endian conversion (SSE is always little endian)
837                 // do first swap
838                 v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) );
839                 // do second swap
840                 v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) );
841
842                 // store the packed int
843                 _mm_store_si128 ((__m128i *)(&tmp_values), v_int);
844
845                 // increment the buffer pointers
846                 *target_event = tmp_values[0];
847                 target_event += m_dimension;
848                 *target_event = tmp_values[1];
849                 target_event += m_dimension;
850                 *target_event = tmp_values[2];
851                 target_event += m_dimension;
852                 *target_event = tmp_values[3];
853                 target_event += m_dimension;
854             }
855
856             // do the remainder of the events
857             for(;j < nevents; j += 1) {
858                 uint32_t in = (uint32_t)(*buffer);
859                 *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
860                 buffer++;
861                 target_event += m_dimension;
862             }
863
864         } else {
865             for (j = 0;j < nevents; j += 1)
866             {
867                 // hardcoded byte swapped
868                 *target_event = 0x00000040;
869                 target_event += m_dimension;
870             }
871         }
872     }
873 }
874
875 #else
876
877 /**
878  * @brief mux all audio ports to events
879  * @param data
880  * @param offset
881  * @param nevents
882  */
883 void
884 AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data,
885                                                     unsigned int offset,
886                                                     unsigned int nevents)
887 {
888     unsigned int j;
889     quadlet_t *target_event;
890     int i;
891
892     for (i = 0; i < m_nb_audio_ports; i++) {
893         struct _MBLA_port_cache &p = m_audio_ports.at(i);
894         target_event = (quadlet_t *)(data + i);
895         assert(nevents + offset <= p.buffer_size );
896
897         if(p.buffer && p.enabled) {
898             quadlet_t *buffer = (quadlet_t *)(p.buffer);
899             buffer += offset;
900    
901             for (j = 0;j < nevents; j += 1)
902             {
903                 uint32_t in = (uint32_t)(*buffer);
904                 *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000));
905                 buffer++;
906                 target_event += m_dimension;
907             }
908         } else {
909             for (j = 0;j < nevents; j += 1)
910             {
911                 *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
912                 target_event += m_dimension;
913             }
914         }
915     }
916 }
917
918 /**
919  * @brief mux all audio ports to events
920  * @param data
921  * @param offset
922  * @param nevents
923  */
924 void
925 AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data,
926                                                     unsigned int offset,
927                                                     unsigned int nevents)
928 {
929     unsigned int j;
930     quadlet_t *target_event;
931     int i;
932
933     for (i = 0; i < m_nb_audio_ports; i++) {
934         struct _MBLA_port_cache &p = m_audio_ports.at(i);
935         target_event = (quadlet_t *)(data + i);
936         assert(nevents + offset <= p.buffer_size );
937
938         if(p.buffer && p.enabled) {
939             quadlet_t *buffer = (quadlet_t *)(p.buffer);
940             buffer += offset;
941    
942             for (j = 0;j < nevents; j += 1)
943             {
944                 float *in = (float *)buffer;
945 #if AMDTP_CLIP_FLOATS
946                 // clip directly to the value of a maxed event
947                 if(*in > 1.0) {
948                     *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF);
949                 } else if(*in < -1.0) {
950                     *target_event = CONDSWAPTOBUS32_CONST(0x40800001);
951                 } else {
952                     float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
953                     unsigned int tmp = ((int) v);
954                     tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
955                     *target_event = CondSwapToBus32((quadlet_t)tmp);
956                 }
957 #else
958                 float v = (*in) * AMDTP_FLOAT_MULTIPLIER;
959                 unsigned int tmp = ((int) v);
960                 tmp = ( tmp & 0x00FFFFFF ) | 0x40000000;
961                 *target_event = CondSwapToBus32((quadlet_t)tmp);
962 #endif
963                 buffer++;
964                 target_event += m_dimension;
965             }
966         } else {
967             for (j = 0;j < nevents; j += 1)
968             {
969                 *target_event = CONDSWAPTOBUS32_CONST(0x40000000);
970                 target_event += m_dimension;
971             }
972         }
973     }
974 }
975 #endif
976
977 /**
978  * @brief encodes all midi ports in the cache to events (silence)
979  * @param data
980  * @param offset
981  * @param nevents
982  */
983 void
984 AmdtpTransmitStreamProcessor::encodeMidiPortsSilence(quadlet_t *data,
985                                                      unsigned int offset,
986                                                      unsigned int nevents)
987 {
988     quadlet_t *target_event;
989     int i;
990     unsigned int j;
991
992     for (i = 0; i < m_nb_midi_ports; i++) {
993         struct _MIDI_port_cache &p = m_midi_ports.at(i);
994
995         for (j = p.location;j < nevents; j += 8) {
996             target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
997             *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
998         }
999     }
1000 }
1001
1002 /**
1003  * @brief encodes all midi ports in the cache to events
1004  * @param data
1005  * @param offset
1006  * @param nevents
1007  */
1008 void
1009 AmdtpTransmitStreamProcessor::encodeMidiPorts(quadlet_t *data,
1010                                               unsigned int offset,
1011                                               unsigned int nevents)
1012 {
1013     quadlet_t *target_event;
1014     int i;
1015     unsigned int j;
1016
1017     for (i = 0; i < m_nb_midi_ports; i++) {
1018         struct _MIDI_port_cache &p = m_midi_ports.at(i);
1019         if (p.buffer && p.enabled) {
1020             uint32_t *buffer = (quadlet_t *)(p.buffer);
1021             buffer += offset;
1022
1023             for (j = p.location;j < nevents; j += 8) {
1024                 target_event = (quadlet_t *) (data + ((j * m_dimension) + p.position));
1025
1026                 if ( *buffer & 0xFF000000 )   // we can send a byte
1027                 {
1028                     quadlet_t tmpval;
1029                     tmpval = ((*buffer)<<16) & 0x00FF0000;
1030                     tmpval = IEC61883_AM824_SET_LABEL(tmpval, IEC61883_AM824_LABEL_MIDI_1X);
1031                     *target_event = CondSwapToBus32(tmpval);
1032
1033 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "MIDI port %s, pos=%u, loc=%u, nevents=%u, dim=%d\n",
1034 //                                p.port->getName().c_str(), p.position, p.location, nevents, m_dimension );
1035 //                     debugOutput ( DEBUG_LEVEL_VERBOSE, "base=%p, target=%p, value=%08X\n",
1036 //                                data, target_event, tmpval );
1037                 } else {
1038                     // can't send a byte, either because there is no byte,
1039                     // or because this would exceed the maximum rate
1040                     // FIXME: this can be ifdef optimized since it's a constant
1041                     *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1042                 }
1043                 buffer+=8;
1044             }
1045         } else {
1046             for (j = p.location;j < nevents; j += 8) {
1047                 target_event = (quadlet_t *)(data + ((j * m_dimension) + p.position));
1048                 __builtin_prefetch(target_event, 1, 0); // prefetch events for write, no temporal locality
1049                 *target_event = CondSwapToBus32(IEC61883_AM824_SET_LABEL(0, IEC61883_AM824_LABEL_MIDI_NO_DATA));
1050             }
1051         }
1052     }
1053 }
1054
1055 bool
1056 AmdtpTransmitStreamProcessor::initPortCache() {
1057     // make use of the fact that audio ports are the first ports in
1058     // the cluster as per AMDTP. so we can sort the ports by position
1059     // and have very efficient lookups:
1060     // m_float_ports.at(i).buffer -> audio stream i buffer
1061     // for midi ports we simply cache all port info since they are (usually) not
1062     // that numerous
1063     m_nb_audio_ports = 0;
1064     m_audio_ports.clear();
1065    
1066     m_nb_midi_ports = 0;
1067     m_midi_ports.clear();
1068    
1069     for(PortVectorIterator it = m_Ports.begin();
1070         it != m_Ports.end();
1071         ++it )
1072     {
1073         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1074         assert(pinfo); // this should not fail!!
1075
1076         switch( pinfo->getFormat() )
1077         {
1078             case AmdtpPortInfo::E_MBLA:
1079                 m_nb_audio_ports++;
1080                 break;
1081             case AmdtpPortInfo::E_SPDIF: // still unimplemented
1082                 break;
1083             case AmdtpPortInfo::E_Midi:
1084                 m_nb_midi_ports++;
1085                 break;
1086             default: // ignore
1087                 break;
1088         }
1089     }
1090
1091     int idx;
1092     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1093         for(PortVectorIterator it = m_Ports.begin();
1094             it != m_Ports.end();
1095             ++it )
1096         {
1097             AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1098             debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1099                         "idx %u: looking at port %s at position %u\n",
1100                         idx, (*it)->getName().c_str(), pinfo->getPosition());
1101             if(pinfo->getPosition() == (unsigned int)idx) {
1102                 struct _MBLA_port_cache p;
1103                 p.port = dynamic_cast<AmdtpAudioPort *>(*it);
1104                 if(p.port == NULL) {
1105                     debugError("Port is not an AmdtpAudioPort!\n");
1106                     return false;
1107                 }
1108                 p.buffer = NULL; // to be filled by updatePortCache
1109                 #ifdef DEBUG
1110                 p.buffer_size = (*it)->getBufferSize();
1111                 #endif
1112
1113                 m_audio_ports.push_back(p);
1114                 debugOutput(DEBUG_LEVEL_VERBOSE,
1115                             "Cached port %s at position %u\n",
1116                             p.port->getName().c_str(), idx);
1117                 goto next_index;
1118             }
1119         }
1120         debugError("No MBLA port found for position %d\n", idx);
1121         return false;
1122 next_index:
1123         continue;
1124     }
1125
1126     for(PortVectorIterator it = m_Ports.begin();
1127         it != m_Ports.end();
1128         ++it )
1129     {
1130         AmdtpPortInfo *pinfo=dynamic_cast<AmdtpPortInfo *>(*it);
1131         debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
1132                     "idx %u: looking at port %s at position %u, location %u\n",
1133                     idx, (*it)->getName().c_str(), pinfo->getPosition(), pinfo->getLocation());
1134         if ((*it)->getPortType() == Port::E_Midi) {
1135             struct _MIDI_port_cache p;
1136             p.port = dynamic_cast<AmdtpMidiPort *>(*it);
1137             if(p.port == NULL) {
1138                 debugError("Port is not an AmdtpMidiPort!\n");
1139                 return false;
1140             }
1141             p.position = pinfo->getPosition();
1142             p.location = pinfo->getLocation();
1143             p.buffer = NULL; // to be filled by updatePortCache
1144             #ifdef DEBUG
1145             p.buffer_size = (*it)->getBufferSize();
1146             #endif
1147
1148             m_midi_ports.push_back(p);
1149             debugOutput(DEBUG_LEVEL_VERBOSE,
1150                         "Cached port %s at position %u, location %u\n",
1151                         p.port->getName().c_str(), p.position, p.location);
1152         }
1153     }
1154
1155     return true;
1156 }
1157
1158 void
1159 AmdtpTransmitStreamProcessor::updatePortCache() {
1160     int idx;
1161     for (idx = 0; idx < m_nb_audio_ports; idx++) {
1162         struct _MBLA_port_cache& p = m_audio_ports.at(idx);
1163         AmdtpAudioPort *port = p.port;
1164         p.buffer = port->getBufferAddress();
1165         p.enabled = !port->isDisabled();
1166     }
1167     for (idx = 0; idx < m_nb_midi_ports; idx++) {
1168         struct _MIDI_port_cache& p = m_midi_ports.at(idx);
1169         AmdtpMidiPort *port = p.port;
1170         p.buffer = port->getBufferAddress();
1171         p.enabled = !port->isDisabled();
1172     }
1173 }
1174
1175 } // end of namespace Streaming
Note: See TracBrowser for help on using the browser.