If a worker process crashes during shutdown, dump core and prevent restarts. Before this change, if a worker process crashed during shutdown, the death() handler would exit with code 1, and the master process would restart the worker. Now workers send SIGUSR1 to the master process when shutting down. When the master process receives the SIGUSR1 signal, it stops restarting workers. SIGUSR1 is already used for log rotation, but it is fine to also use SIGUSR1 for master process shutdown notifications because the master process is never responsible for both log rotation and kid restarts. Terminate with abort(3) instead of exit(3) to leave a core dump if a Squid worker crashes during shutdown. The patch also fixes a potential infinite loop in the master process. Previously, the master process finished only when all kids had exited successfully, or all kids were hopeless, or all kids had been killed by a signal. In mixed cases, such as when some kids were hopeless and others had been killed, the master process would not exit. After the change, the master process exits when there are no running kids and no kids that should be restarted. Add a syslog notice when a kid becomes hopeless. 
=== modified file 'src/ipc/Kid.cc' --- src/ipc/Kid.cc 2010-07-07 12:44:44 +0000 +++ src/ipc/Kid.cc 2010-11-14 16:19:25 +0000 @@ -1,28 +1,29 @@ /* * $Id$ * * DEBUG: section 54 Interprocess Communication * */ #include "config.h" +#include "globals.h" #include "ipc/Kid.h" #if HAVE_SYS_WAIT_H #include #endif Kid::Kid(): badFailures(0), pid(-1), startTime(0), isRunning(false) { } Kid::Kid(const String& kid_name): theName(kid_name), badFailures(0), pid(-1), startTime(0), isRunning(false) @@ -47,40 +48,52 @@ void Kid::stop(status_type exitStatus) assert(startTime != 0); isRunning = false; time_t stop_time; time(&stop_time); if ((stop_time - startTime) < fastFailureTimeLimit) badFailures++; else badFailures = 0; // the failures are not "frequent" [any more] status = exitStatus; } /// returns true if tracking of kid is stopped bool Kid::running() const { return isRunning; } +/// returns true if master process should restart this kid +bool Kid::shouldRestart() const +{ + return !(running() || + exitedHappy() || + hopeless() || + shutting_down || + signaled(SIGKILL) || // squid -k kill + signaled(SIGINT) || // unexpected forced shutdown + signaled(SIGTERM)); // unexpected forced shutdown +} + /// returns current pid for a running kid and last pid for a stopped kid pid_t Kid::getPid() const { assert(pid > 0); return pid; } /// whether the failures are "repeated and frequent" bool Kid::hopeless() const { return badFailures > badFailureLimit; } /// returns true if the process terminated normally bool Kid::calledExit() const { return (pid > 0) && !running() && WIFEXITED(status); } /// returns the exit status of the process === modified file 'src/ipc/Kid.h' --- src/ipc/Kid.h 2010-03-30 21:54:40 +0000 +++ src/ipc/Kid.h 2010-11-13 16:39:43 +0000 @@ -23,40 +23,43 @@ public: /// keep restarting until the number of bad failures exceed this limit enum { badFailureLimit = 4 }; /// slower start failures are not "frequent enough" to be counted as "bad" enum { fastFailureTimeLimit = 10 }; 
// seconds public: Kid(); Kid(const String& kid_name); /// called when this kid got started, records PID void start(pid_t cpid); /// called when kid terminates, sets exiting status void stop(status_type exitStatus); /// returns true if tracking of kid is stopped bool running() const; + /// returns true if master should restart this kid + bool shouldRestart() const; + /// returns current pid for a running kid and last pid for a stopped kid pid_t getPid() const; /// whether the failures are "repeated and frequent" bool hopeless() const; /// returns true if the process terminated normally bool calledExit() const; /// returns the exit status of the process int exitStatus() const; /// whether the process exited with a given exit status code bool calledExit(int code) const; /// whether the process exited with code 0 bool exitedHappy() const; /// returns true if the kid was terminated by a signal bool signaled() const; === modified file 'src/ipc/Kids.cc' --- src/ipc/Kids.cc 2010-07-07 16:41:03 +0000 +++ src/ipc/Kids.cc 2010-11-14 16:18:16 +0000 @@ -63,35 +63,55 @@ Kid& Kids::get(size_t i) /// whether all kids are hopeless bool Kids::allHopeless() const { for (size_t i = 0; i < storage.size(); ++i) { if (!storage[i].hopeless()) return false; } return true; } /// whether all kids called exited happy bool Kids::allExitedHappy() const { for (size_t i = 0; i < storage.size(); ++i) { if (!storage[i].exitedHappy()) return false; } return true; } -/// whether all kids died from a given signal -bool Kids::allSignaled(int sgnl) const +/// whether some kids died from a given signal +bool Kids::someSignaled(const int sgnl) const { for (size_t i = 0; i < storage.size(); ++i) { - if (!storage[i].signaled(sgnl)) - return false; + if (storage[i].signaled(sgnl)) + return true; } - return true; + return false; +} + +/// whether some kids are running +bool Kids::someRunning() const +{ + for (size_t i = 0; i < storage.size(); ++i) { + if (storage[i].running()) + return true; + } + return 
false; +} + +/// whether some kids should be restarted by master +bool Kids::shouldRestartSome() const +{ + for (size_t i = 0; i < storage.size(); ++i) { + if (storage[i].shouldRestart()) + return true; + } + return false; } /// returns the number of kids size_t Kids::count() const { return storage.size(); } === modified file 'src/ipc/Kids.h' --- src/ipc/Kids.h 2010-07-07 16:45:24 +0000 +++ src/ipc/Kids.h 2010-11-14 16:17:40 +0000 @@ -19,37 +19,43 @@ public: private: Kids (const Kids&); ///< not implemented Kids& operator= (const Kids&); ///< not implemented public: /// maintain n kids void init(size_t n); /// returns kid by pid Kid* find(pid_t pid); /// returns the kid by index, useful for kids iteration Kid& get(size_t i); /// whether all kids are hopeless bool allHopeless() const; /// whether all kids called exited happy bool allExitedHappy() const; - /// whether all kids died from a given signal - bool allSignaled(int sgnl) const; + /// whether some kids died from a given signal + bool someSignaled(const int sgnl) const; + + /// whether some kids are running + bool someRunning() const; + + /// whether some kids should be restarted by master + bool shouldRestartSome() const; /// returns the number of kids size_t count() const; private: Vector storage; }; extern Kids TheKids; ///< All kids being maintained typedef char KidName[64]; ///< Squid process name (e.g., "squid-coord") extern KidName TheKidName; ///< current Squid process name #endif /* SQUID_IPC_KIDS_H */ === modified file 'src/main.cc' --- src/main.cc 2010-11-01 05:44:28 +0000 +++ src/main.cc 2010-11-15 00:58:00 +0000 @@ -599,48 +599,58 @@ reconfigure(int sig) ReconfigureSignal = sig; #ifndef _SQUID_MSWIN_ #if !HAVE_SIGACTION signal(sig, reconfigure); #endif #endif } void shut_down(int sig) { do_shutdown = sig == SIGINT ? 
-1 : 1; ShutdownSignal = sig; #ifdef SIGTTIN if (SIGTTIN == sig) shutdown_status = 1; #endif + + const pid_t ppid = getppid(); + + if (ppid > 1) { + // notify master that we are shutting down + if (kill(ppid, SIGUSR1) < 0) + debugs(1, DBG_IMPORTANT, "Failed to send SIGUSR1 to master process," + " pid " << ppid << ": " << xstrerror()); + } + #ifndef _SQUID_MSWIN_ #if KILL_PARENT_OPT - if (getppid() > 1) { - debugs(1, 1, "Killing master process, pid " << getppid()); + if (ppid > 1) { + debugs(1, DBG_IMPORTANT, "Killing master process, pid " << ppid); - if (kill(getppid(), sig) < 0) - debugs(1, 1, "kill " << getppid() << ": " << xstrerror()); + if (kill(ppid, sig) < 0) + debugs(1, DBG_IMPORTANT, "kill " << ppid << ": " << xstrerror()); } #endif /* KILL_PARENT_OPT */ #if SA_RESETHAND == 0 signal(SIGTERM, SIG_DFL); signal(SIGINT, SIG_DFL); #endif #endif } static void serverConnectionsOpen(void) { if (IamPrimaryProcess()) { #if USE_WCCP wccpConnectionOpen(); #endif @@ -1677,55 +1687,58 @@ watch_child(char *argv[]) #endif /* * RBCOLLINS - if cygwin stackdumps when squid is run without * -N, check the cygwin1.dll version, it needs to be AT LEAST * 1.1.3. execvp had a bit overflow error in a loop.. 
*/ /* Connect stdio to /dev/null in daemon mode */ nullfd = open(_PATH_DEVNULL, O_RDWR | O_TEXT); if (nullfd < 0) fatalf(_PATH_DEVNULL " %s\n", xstrerror()); dup2(nullfd, 0); if (Debug::log_stderr < 0) { dup2(nullfd, 1); dup2(nullfd, 2); } + // handle shutdown notifications from kids + squid_signal(SIGUSR1, sig_shutdown, SA_RESTART); + if (Config.workers > 128) { syslog(LOG_ALERT, "Suspiciously high workers value: %d", Config.workers); // but we keep going in hope that user knows best } TheKids.init(Config.workers); // keep [re]starting kids until it is time to quit for (;;) { mainStartScript(argv[0]); // start each kid that needs to be [re]started; once for (int i = TheKids.count() - 1; i >= 0; --i) { Kid& kid = TheKids.get(i); - if (kid.hopeless() || kid.exitedHappy() || kid.running()) + if (!kid.shouldRestart()) continue; if ((pid = fork()) == 0) { /* child */ openlog(APP_SHORTNAME, LOG_PID | LOG_NDELAY | LOG_CONS, LOG_LOCAL4); prog = argv[0]; argv[0] = const_cast(kid.name().termedBuf()); execvp(prog, argv); syslog(LOG_ALERT, "execvp failed: %s", xstrerror()); } kid.start(pid); syslog(LOG_NOTICE, "Squid Parent: child process %d started", pid); } /* parent */ openlog(APP_SHORTNAME, LOG_PID | LOG_NDELAY | LOG_CONS, LOG_LOCAL4); squid_signal(SIGINT, SIG_IGN, SA_RESTART); @@ -1737,68 +1750,69 @@ watch_child(char *argv[]) pid = waitpid(-1, &status, 0); #endif // Loop to collect all stopped kids before we go to sleep below. 
do { Kid* kid = TheKids.find(pid); if (kid) { kid->stop(status); if (kid->calledExit()) { syslog(LOG_NOTICE, "Squid Parent: child process %d exited with status %d", kid->getPid(), kid->exitStatus()); } else if (kid->signaled()) { syslog(LOG_NOTICE, "Squid Parent: child process %d exited due to signal %d with status %d", kid->getPid(), kid->termSignal(), kid->exitStatus()); } else { syslog(LOG_NOTICE, "Squid Parent: child process %d exited", kid->getPid()); } + if (kid->hopeless()) { + syslog(LOG_NOTICE, "Squid Parent: child process %d will not" + " be restarted due to repeated, frequent failures", + kid->getPid()); + } } else { syslog(LOG_NOTICE, "Squid Parent: unknown child process %d exited", pid); } #ifdef _SQUID_NEXT_ } while ((pid = wait3(&status, WNOHANG, NULL)) > 0); #else } while ((pid = waitpid(-1, &status, WNOHANG)) > 0); #endif - if (TheKids.allExitedHappy()) { - exit(0); - } + if (!TheKids.someRunning() && !TheKids.shouldRestartSome()) { + if (TheKids.someSignaled(SIGINT) || TheKids.someSignaled(SIGTERM)) { + syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); + exit(1); + } - if (TheKids.allHopeless()) { - syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); - exit(1); - } + if (TheKids.allHopeless()) { + syslog(LOG_ALERT, "Exiting due to repeated, frequent failures"); + exit(1); + } - if (TheKids.allSignaled(SIGKILL)) { exit(0); } - if (TheKids.allSignaled(SIGINT) || TheKids.allSignaled(SIGTERM)) { - syslog(LOG_ALERT, "Exiting due to unexpected forced shutdown"); - exit(1); - } - squid_signal(SIGINT, SIG_DFL, SA_RESTART); sleep(3); } /* NOTREACHED */ #endif /* _SQUID_MSWIN_ */ } static void SquidShutdown() { /* XXX: This function is called after the main loop has quit, which * means that no AsyncCalls would be called, including close handlers. * TODO: We need to close/shut/free everything that needs calls before * exiting the loop. 
*/ #if USE_WIN32_SERVICE WIN32_svcstatusupdate(SERVICE_STOP_PENDING, 10000); === modified file 'src/protos.h' --- src/protos.h 2010-11-01 05:44:28 +0000 +++ src/protos.h 2010-11-15 01:00:19 +0000 @@ -545,40 +545,41 @@ SQUIDCEXTERN void storeSwapInStart(store /* * store_client.c */ SQUIDCEXTERN store_client *storeClientListAdd(StoreEntry * e, void *data); SQUIDCEXTERN int storeClientCopyPending(store_client *, StoreEntry * e, void *data); SQUIDCEXTERN int storeUnregister(store_client * sc, StoreEntry * e, void *data) ; SQUIDCEXTERN int storePendingNClients(const StoreEntry * e); SQUIDCEXTERN int storeClientIsThisAClient(store_client * sc, void *someClient); SQUIDCEXTERN const char *getMyHostname(void); SQUIDCEXTERN const char *uniqueHostname(void); SQUIDCEXTERN void safeunlink(const char *path, int quiet); void death(int sig); SQUIDCEXTERN void fatal(const char *message); SQUIDCEXTERN void fatalf(const char *fmt,...) PRINTF_FORMAT_ARG1; SQUIDCEXTERN void fatal_dump(const char *message); void sigusr2_handle(int sig); void sig_child(int sig); +void sig_shutdown(int sig); ///< handles shutdown notifications from kids SQUIDCEXTERN void leave_suid(void); SQUIDCEXTERN void enter_suid(void); SQUIDCEXTERN void no_suid(void); SQUIDCEXTERN void writePidFile(void); SQUIDCEXTERN void setSocketShutdownLifetimes(int); SQUIDCEXTERN void setMaxFD(void); SQUIDCEXTERN void setSystemLimits(void); SQUIDCEXTERN void squid_signal(int sig, SIGHDLR *, int flags); SQUIDCEXTERN pid_t readPidFile(void); SQUIDCEXTERN void keepCapabilities(void); SQUIDCEXTERN void BroadcastSignalIfAny(int& sig); /// whether the current process is the parent of all other Squid processes SQUIDCEXTERN bool IamMasterProcess(); /** whether the current process is dedicated to doing things that only a single process should do, such as PID file maintenance and WCCP */ SQUIDCEXTERN bool IamPrimaryProcess(); /// whether the current process coordinates worker processes SQUIDCEXTERN bool IamCoordinatorProcess(); === 
modified file 'src/tools.cc' --- src/tools.cc 2010-10-06 03:50:45 +0000 +++ src/tools.cc 2010-11-14 15:37:26 +0000 @@ -382,43 +382,40 @@ death(int sig) releaseServerSockets(); storeDirWriteCleanLogs(0); if (!shutting_down) { PrintRusage(); dumpMallocStats(); } if (squid_curtime - SQUID_RELEASE_TIME < 864000) { /* skip if more than 10 days old */ if (Config.adminEmail) mail_warranty(); puts(dead_msg()); } - if (shutting_down) - exit(1); - abort(); } void BroadcastSignalIfAny(int& sig) { if (sig > 0) { if (IamCoordinatorProcess()) Ipc::Coordinator::Instance()->broadcastSignal(sig); sig = -1; } } void sigusr2_handle(int sig) { static int state = 0; /* no debugs() here; bad things happen if the signal is delivered during _db_print() */ DebugSignal = sig; @@ -580,40 +577,46 @@ sig_child(int sig) #else pid = waitpid(-1, &status, WNOHANG); #endif /* no debugs() here; bad things happen if the signal is delivered during _db_print() */ #if HAVE_SIGACTION } while (pid > 0); #else } while (pid > 0 || (pid < 0 && errno == EINTR)); signal(sig, sig_child); #endif #endif } +void +sig_shutdown(int sig) +{ + shutting_down = 1; +} + const char * getMyHostname(void) { LOCAL_ARRAY(char, host, SQUIDHOSTNAMELEN + 1); static int present = 0; struct addrinfo *AI = NULL; Ip::Address sa; if (Config.visibleHostname != NULL) return Config.visibleHostname; if (present) return host; host[0] = '\0'; if (Config.Sockaddr.http && sa.IsAnyAddr()) sa = Config.Sockaddr.http->s; #if USE_SSL