Brilte
2006-08-14 19:30:29 UTC
I am a little new to C and programming C on Unix (in my case Linux) but
not to programming. I am writing a system monitor, in C for Linux
(http://sf.net/projects/netnms).
My code runs fine for some random amount of time, anywhere from 2
minutes to over an hour, before it hits a futex deadlock on my 2.6 kernel.
I've performed some checks and tried debugging by writing to stderr and
catching the output through an strace redirected using nohup &.
After several weeks of trying to find the cause of the deadlock, I am
completely stumped as to what's happening and why the deadlock occurs.
Basically, it always happens inside a logging library that I have
written for my system monitor, when the signal SIGUSR1 is delivered.
SIGUSR1 is the mechanism I have chosen for a child process to alert
its parent process.
Can anyone give any pointers as to the possible source/reason for futex
deadlocks?
My logging source code that the deadlock occurs in is....
} else if (env->log_env_pntr->logmethod_set == 2) {
/* Log to logfile */
time_t curtime = 0;
struct tm timestamp = {0,0,0,0,0,0,0,0,0};
int time_errorcount = 0;
if (!netnms_signal_block(env, SIGUSR1)) {
return 0;
}
if (!netnms_signal_block(env, SIGCHLD)) {
return 0;
}
while (time(&curtime) == -1) {
if (time_errorcount < ATTEMPT_LIMIT) {
time_errorcount++;
continue;
} else {
/* Failed to get epoch */
return 0;
}
}
fprintf(stderr, "1\n");
if (!netnms_signal_unblock(env, SIGUSR1)) {
return 0;
}
if (!netnms_signal_unblock(env, SIGCHLD)) {
return 0;
}
fprintf(stderr, "1.1\n");
fprintf(stderr, "%X\n", &curtime);
fprintf(stderr, "%X\n", &timestamp);
if (localtime_r(&curtime, &timestamp) == NULL) {
fprintf(stderr, "1.3\n");
return 0;
}
fprintf(stderr, "2\n");
}
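From the above code snippet, netnms_signal_block and
netnms_signal_unblock are used to control the delivery of signals. In
this case, when the signal handler is called to process the SIGUSR1,
the block/unblock functions pick up that fact and do not change the
signal mask.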
The strace below shows the futex deadlock in action, and I've found it
to occur anywhere inside the above code block. At first I thought it
was a problem with thread-safety, but I'm using local variables, blocking
signals around the non-thread-safe time() call, and using localtime_r.
The strace output is as follows...
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_SETMASK, NULL, [CHLD], 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0
write(2, "1.1\n", 4) = 4
write(2, "BFA0BB14\n", 9) = 9
write(2, "BFA0BAE8\n", 9) = 9
write(2, "2\n", 2) = 2
write(2, "3\n", 2) = 2
write(2, "4\n", 2) = 2
write(2, "5\n", 2) = 2
write(2, "6\n", 2) = 2
--- SIGUSR1 (User defined signal 1) @ 0 (0) ---
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
time(NULL) = 1155581415
futex(0x3f80bc, FUTEX_WAIT, 2, NULL
I am a little new to C and programming C on Unix (in my case Linux) but
not to programming. I am writing a system monitor, in C for Linux
(http://sf.net/projects/netnms).
My code runs fine for some random amount of time, anywhere from 2
minutes to over an hour, before it hits a futex deadlock on my 2.6 kernel.
I've performed some checks and tried debugging by writing to stderr and
catching the output through an strace redirected using nohup &.
After several weeks of trying to find the cause of the deadlock, I am
completely stumped as to what's happening and why the deadlock occurs.
Basically, it always happens inside a logging library that I have
written for my system monitor, when the signal SIGUSR1 is delivered.
SIGUSR1 is the mechanism I have chosen for a child process to alert
its parent process.
Can anyone give any pointers as to the possible source/reason for futex
deadlocks?
My logging source code that the deadlock occurs in is....
} else if (env->log_env_pntr->logmethod_set == 2) {
/* Log to logfile */
time_t curtime = 0;
struct tm timestamp = {0,0,0,0,0,0,0,0,0};
int time_errorcount = 0;
if (!netnms_signal_block(env, SIGUSR1)) {
return 0;
}
if (!netnms_signal_block(env, SIGCHLD)) {
return 0;
}
while (time(&curtime) == -1) {
if (time_errorcount < ATTEMPT_LIMIT) {
time_errorcount++;
continue;
} else {
/* Failed to get epoch */
return 0;
}
}
fprintf(stderr, "1\n");
if (!netnms_signal_unblock(env, SIGUSR1)) {
return 0;
}
if (!netnms_signal_unblock(env, SIGCHLD)) {
return 0;
}
fprintf(stderr, "1.1\n");
fprintf(stderr, "%X\n", &curtime);
fprintf(stderr, "%X\n", &timestamp);
if (localtime_r(&curtime, &timestamp) == NULL) {
fprintf(stderr, "1.3\n");
return 0;
}
fprintf(stderr, "2\n");
}
From the above code snippet, netnms_signal_block and
netnms_signal_unblock are used to control the delivery of signals. In
this case, when the signal handler is called to process the SIGUSR1,
the block/unblock functions pick up that fact and do not change the
signal mask.
The strace below shows the futex deadlock in action, and I've found it
to occur anywhere inside the above code block. At first I thought it
was a problem with thread-safety, but I'm using local variables, blocking
signals around the non-thread-safe time() call, and using localtime_r.
The strace output is as follows...
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [USR1], NULL, 8) = 0
rt_sigprocmask(SIG_SETMASK, NULL, [CHLD], 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0
write(2, "1.1\n", 4) = 4
write(2, "BFA0BB14\n", 9) = 9
write(2, "BFA0BAE8\n", 9) = 9
write(2, "2\n", 2) = 2
write(2, "3\n", 2) = 2
write(2, "4\n", 2) = 2
write(2, "5\n", 2) = 2
write(2, "6\n", 2) = 2
--- SIGUSR1 (User defined signal 1) @ 0 (0) ---
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
rt_sigprocmask(SIG_SETMASK, NULL, [USR1 CHLD], 8) = 0
time(NULL) = 1155581415
futex(0x3f80bc, FUTEX_WAIT, 2, NULL