root/trunk/libffado/src/libutil/Watchdog.cpp

Revision 2344, 10.6 kB (checked in by jwoithe, 7 years ago)

watchdog: fix possible segfault during cleanup. On some systems pthread_cancel() caused issues since the watchdog tasks did not restrict themselves to cancel-safe functions. Resolve this using a simple message pipe so pthread_cancel() is no longer needed during watchdog shutdown. At the time of writing, the only known distribution to trigger a segfault due to this issue was Slackware 14.0 (gcc 4.7.1, glibc 2.15). However, the issue does appear to be real so a fix which eliminates the need to use pthread_cancel() is justified.

Line 
1 /*
2  * Copyright (C) 2005-2008 by Pieter Palmers
3  *
4  * This file is part of FFADO
5  * FFADO = Free Firewire (pro-)audio drivers for linux
6  *
7  * FFADO is based upon FreeBoB.
8  *
9  * This program is free software: you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation, either version 2 of the License, or
12  * (at your option) version 3 of the License.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
21  *
22  */
23
24 #include "Watchdog.h"
25 #include "SystemTimeSource.h"
26 #include "PosixThread.h"
27
28 #include "config.h"
29
30 // needed for clock_nanosleep
31 #ifndef _GNU_SOURCE
32     #define _GNU_SOURCE
33 #endif
34
35 #include <time.h>
36 #include <unistd.h>
37 #include <poll.h>
38
39 namespace Util {
40
41 IMPL_DEBUG_MODULE( Watchdog, Watchdog, DEBUG_LEVEL_NORMAL );
42
43 // --- Watchdog thread common ancestor --- ///
44 Watchdog::WatchdogTask::WatchdogTask(Watchdog& parent, unsigned int interval_usecs)
45     : m_parent( parent )
46     , m_interval( interval_usecs )
47     , m_debugModule( parent.m_debugModule )
48 {
49 }
50
51 Watchdog::WatchdogTask::~WatchdogTask()
52 {
53     close(stop_msg_pipe[0]);
54     close(stop_msg_pipe[1]);
55 }
56
57 bool
58 Watchdog::WatchdogTask::Init()
59 {
60     if (pipe(stop_msg_pipe) == -1) {
61         return false;
62     }
63     return true;
64 }
65
66 bool
67 Watchdog::WatchdogTask::Execute()
68 {
69     // All watchdog threads share the need to sleep for m_interval usec, with
70     // the ability for this to be interrupted early to speed up program exit.
71     //
72     // Use ppoll() rather than SystemTimeSource::SleepUsecRelative(m_interval)
73     // so the stop message pipe can be monitored, permitting the interruption
74     // of long timing intervals to facilitate program shutdown.
75     struct pollfd fds;
76     struct timespec ts;
77     fds.fd = stop_msg_pipe[0];
78     fds.events = POLLIN;
79     ts.tv_sec = (m_interval / 1000000);
80     ts.tv_nsec = (m_interval % 1000000) * 1000;
81     if (ppoll(&fds, 1, &ts, NULL)==1 && fds.revents!=0) {
82         debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) watchdog %p received request to stop\n", this, &m_parent);
83         return false;
84     }
85     return true;
86 }
87
88 void Watchdog::WatchdogTask::ReqStop()
89 {
90     // Signal to the task that it should shop via the message pipe.  All
91     // that's needed is to make stop_msg_pipe[0] readable.
92     signed int data = 0;
93     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) watchdog %p requested to stop\n", this, &m_parent);
94     write(stop_msg_pipe[1], &data, sizeof(data));
95 }
96
97 // --- liveness check Thread --- //
98 Watchdog::WatchdogCheckTask::WatchdogCheckTask(Watchdog& parent, unsigned int interval_usecs)
99     : WatchdogTask( parent, interval_usecs )
100     , m_debugModule( parent.m_debugModule )
101 {
102 }
103
104 bool
105 Watchdog::WatchdogCheckTask::Init()
106 {
107     #ifdef DEBUG
108     m_last_loop_entry = 0;
109     m_successive_short_loops = 0;
110     #endif
111     return Watchdog::WatchdogTask::Init();
112 }
113
114 bool
115 Watchdog::WatchdogCheckTask::Execute()
116 {
117     if (Watchdog::WatchdogTask::Execute() == false)
118         return false;
119
120     if(m_parent.getHartbeat()) {
121         debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
122                     "(%p) watchdog %p still alive\n", this, &m_parent);
123         m_parent.clearHartbeat();
124     } else {
125         debugWarning("(%p) watchdog %p died\n", this, &m_parent);
126         // set all watched threads to non-rt scheduling
127         m_parent.rescheduleThreads();
128     }
129
130     #ifdef DEBUG
131     uint64_t now = Util::SystemTimeSource::getCurrentTimeAsUsecs();
132     int diff = now - m_last_loop_entry;
133     if(diff < 100) {
134         debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
135                            "(%p) short loop detected (%d usec), cnt: %d\n",
136                            this, diff, m_successive_short_loops);
137         m_successive_short_loops++;
138         if(m_successive_short_loops > 100) {
139             debugError("Shutting down runaway thread\n");
140             return false;
141         }
142     } else {
143         // reset the counter
144         m_successive_short_loops = 0;
145     }
146     m_last_loop_entry = now;
147     #endif
148
149     return true;
150 }
151
152 // --- hartbeat Thread --- //
153
154 Watchdog::WatchdogHartbeatTask::WatchdogHartbeatTask(Watchdog& parent, unsigned int interval_usecs)
155     : WatchdogTask( parent, interval_usecs )
156     , m_debugModule( parent.m_debugModule )
157 {
158 }
159
160 bool
161 Watchdog::WatchdogHartbeatTask::Init()
162 {
163     #ifdef DEBUG
164     m_last_loop_entry = 0;
165     m_successive_short_loops = 0;
166     #endif
167     return Watchdog::WatchdogTask::Init();
168 }
169
170 bool
171 Watchdog::WatchdogHartbeatTask::Execute()
172 {
173     if (Watchdog::WatchdogTask::Execute() == false)
174         return false;
175
176     debugOutput(DEBUG_LEVEL_VERY_VERBOSE,
177                 "(%p) watchdog %p hartbeat\n", this, &m_parent);
178     m_parent.setHartbeat();
179
180     #ifdef DEBUG
181     uint64_t now = Util::SystemTimeSource::getCurrentTimeAsUsecs();
182     int diff = now - m_last_loop_entry;
183     if(diff < 100) {
184         debugOutputExtreme(DEBUG_LEVEL_VERY_VERBOSE,
185                            "(%p) short loop detected (%d usec), cnt: %d\n",
186                            this, diff, m_successive_short_loops);
187         m_successive_short_loops++;
188         if(m_successive_short_loops > 100) {
189             debugError("Shutting down runaway thread\n");
190             return false;
191         }
192     } else {
193         // reset the counter
194         m_successive_short_loops = 0;
195     }
196     m_last_loop_entry = now;
197     #endif
198
199     return true;
200 }
201
202 // the actual watchdog class
203 Watchdog::Watchdog()
204 : m_hartbeat( true )
205 , m_check_interval( WATCHDOG_DEFAULT_CHECK_INTERVAL_USECS )
206 , m_realtime( WATCHDOG_DEFAULT_RUN_REALTIME )
207 , m_priority( WATCHDOG_DEFAULT_PRIORITY )
208 , m_CheckThread( NULL )
209 , m_HartbeatThread( NULL )
210 , m_CheckTask( NULL )
211 , m_HartbeatTask( NULL )
212 {
213 }
214
215 Watchdog::Watchdog(unsigned int interval_usec, bool realtime, unsigned int priority)
216 : m_hartbeat( true )
217 , m_check_interval( interval_usec )
218 , m_realtime( realtime )
219 , m_priority( priority )
220 , m_CheckThread( NULL )
221 , m_HartbeatThread( NULL )
222 , m_CheckTask( NULL )
223 , m_HartbeatTask( NULL )
224 {
225 }
226
227 Watchdog::~Watchdog()
228 {
229     // kill threads instead of stoping them since they are sleeping.
230     // Except that the threads call non-cancel-safe functions, so we have to
231     // use Stop().  Task ReqStop() methods are used to allow the tasks to
232     // prepare for exit by (for example) aborting sleeps.
233     if (m_CheckThread) {
234         m_CheckTask->ReqStop();
235         m_CheckThread->Stop();
236         //m_CheckThread->Kill();
237         delete m_CheckThread;
238     }
239     if (m_HartbeatThread) {
240         m_HartbeatTask->ReqStop();
241         m_HartbeatThread->Stop();
242         //m_HartbeatThread->Kill();
243         delete m_HartbeatThread;
244     }
245     if (m_CheckTask) {
246         delete m_CheckTask;
247     }
248     if (m_HartbeatTask) {
249         delete m_HartbeatTask;
250     }
251 }
252
253 void
254 Watchdog::setVerboseLevel(int i)
255 {
256     setDebugLevel(i);
257 }
258
259 bool
260 Watchdog::start()
261 {
262     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) Starting watchdog...\n", this);
263     debugOutput( DEBUG_LEVEL_VERBOSE, "Create hartbeat task/thread for %p...\n", this);
264     m_HartbeatTask = new WatchdogHartbeatTask( *this, m_check_interval/2 );
265     if(!m_HartbeatTask) {
266         debugFatal("No hartbeat task\n");
267         return false;
268     }
269     m_HartbeatThread = new Util::PosixThread(m_HartbeatTask, "WDGHBT", false,
270                                              0, PTHREAD_CANCEL_ASYNCHRONOUS);
271     if(!m_HartbeatThread) {
272         debugFatal("No hartbeat thread\n");
273         return false;
274     }
275     debugOutput( DEBUG_LEVEL_VERBOSE,
276                  " hartbeat task: %p, thread %p...\n",
277                  m_HartbeatTask, m_HartbeatThread);
278
279     debugOutput( DEBUG_LEVEL_VERBOSE, "Create check task/thread for %p...\n", this);
280     m_CheckTask = new WatchdogCheckTask( *this, m_check_interval );
281     if(!m_CheckTask) {
282         debugFatal("No check task\n");
283         return false;
284     }
285     m_CheckThread = new Util::PosixThread(m_CheckTask,"WDGCHK", false,
286                                           0, PTHREAD_CANCEL_ASYNCHRONOUS);
287     if(!m_CheckThread) {
288         debugFatal("No check thread\n");
289         return false;
290     }
291     debugOutput( DEBUG_LEVEL_VERBOSE,
292                  " check task: %p, thread %p...\n",
293                  m_CheckTask, m_CheckThread);
294
295     // switch to realtime if necessary
296     if(m_realtime) {
297         if(!m_CheckThread->AcquireRealTime(m_priority)) {
298             debugWarning("(%p) Could not aquire realtime priotiry for watchdog thread.\n", this);
299         }
300     }
301
302     // start threads
303     if (m_HartbeatThread->Start() != 0) {
304         debugFatal("Could not start hartbeat thread\n");
305         return false;
306     }
307     if (m_CheckThread->Start() != 0) {
308         debugFatal("Could not start check thread\n");
309         return false;
310     }
311     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) Watchdog running...\n", this);
312     return true;
313 }
314
315 bool
316 Watchdog::setThreadParameters(bool rt, int priority)
317 {
318     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) switch to: (rt=%d, prio=%d)...\n", this, rt, priority);
319     if (priority > THREAD_MAX_RTPRIO) priority = THREAD_MAX_RTPRIO; // cap the priority
320     m_realtime = rt;
321     m_priority = priority;
322
323     if (m_CheckThread) {
324         if (m_realtime) {
325             m_CheckThread->AcquireRealTime(m_priority);
326         } else {
327             m_CheckThread->DropRealTime();
328         }
329     }
330     return true;
331 }
332
333 /**
334  * register a thread to the watchdog
335  * @param thread
336  * @return
337  */
338 bool
339 Watchdog::registerThread(Thread *thread)
340 {
341     assert(thread);
342     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) Adding thread %p\n",
343         this, thread);
344
345     for ( ThreadVectorIterator it = m_Threads.begin();
346       it != m_Threads.end();
347       ++it )
348     {
349         if(*it == thread) {
350             debugError("Thread %p already registered with watchdog\n", thread);
351             return false;
352         }
353     }
354     m_Threads.push_back(thread);
355     return true;
356 }
357
358 bool
359 Watchdog::unregisterThread(Thread *thread)
360 {
361     assert(thread);
362     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) unregistering thread %p\n", this, thread);
363
364     for ( ThreadVectorIterator it = m_Threads.begin();
365       it != m_Threads.end();
366       ++it )
367     {
368         if(*it == thread) {
369             m_Threads.erase(it);
370             return true;
371         }
372     }
373     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) thread %p not found \n", this, thread);
374     return false; //not found
375 }
376
377 void Watchdog::rescheduleThreads()
378 {
379     debugOutput( DEBUG_LEVEL_VERBOSE, "(%p) rescheduling threads\n", this);
380
381     for ( ThreadVectorIterator it = m_Threads.begin();
382       it != m_Threads.end();
383       ++it )
384     {
385         (*it)->DropRealTime();
386     }
387 }
388
389 } // end of namespace Util
Note: See TracBrowser for help on using the browser.