| 1 |
tdb |
1.22 |
/* |
| 2 |
|
|
* i-scream central monitoring system |
| 3 |
tdb |
1.23 |
* http://www.i-scream.org.uk |
| 4 |
tdb |
1.22 |
* Copyright (C) 2000-2002 i-scream |
| 5 |
|
|
* |
| 6 |
|
|
* This program is free software; you can redistribute it and/or |
| 7 |
|
|
* modify it under the terms of the GNU General Public License |
| 8 |
|
|
* as published by the Free Software Foundation; either version 2 |
| 9 |
|
|
* of the License, or (at your option) any later version. |
| 10 |
|
|
* |
| 11 |
|
|
* This program is distributed in the hope that it will be useful, |
| 12 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 |
|
|
* GNU General Public License for more details. |
| 15 |
|
|
* |
| 16 |
|
|
* You should have received a copy of the GNU General Public License |
| 17 |
|
|
* along with this program; if not, write to the Free Software |
| 18 |
|
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 19 |
|
|
*/ |
| 20 |
|
|
|
| 21 |
tdb |
1.1 |
//---PACKAGE DECLARATION--- |
| 22 |
tdb |
1.20 |
package uk.org.iscream.cms.server.client.monitors; |
| 23 |
tdb |
1.1 |
|
| 24 |
|
|
//---IMPORTS--- |
| 25 |
|
|
import java.util.HashMap; |
| 26 |
|
|
import java.util.Iterator; |
| 27 |
tdb |
1.21 |
import java.util.StringTokenizer; |
| 28 |
tdb |
1.20 |
import uk.org.iscream.cms.server.client.*; |
| 29 |
|
|
import uk.org.iscream.cms.server.core.*; |
| 30 |
tdb |
1.24 |
import uk.org.iscream.cms.util.*; |
| 31 |
tdb |
1.20 |
import uk.org.iscream.cms.server.componentmanager.*; |
| 32 |
tdb |
1.1 |
|
| 33 |
|
|
/** |
| 34 |
tdb |
1.18 |
* This Monitor watches heartbeats. |
| 35 |
|
|
* It generates an alert when a heartbeat that is expected |
| 36 |
|
|
* does not arrive. Unlike all the other monitors, this one |
| 37 |
|
|
* is driven by an event *not* occurring, rather than an |
| 38 |
|
|
* event occurring. This means it must be actively checking |
| 39 |
|
|
* for missing heartbeats, and thus has an extra inner class |
| 40 |
|
|
* thread. |
| 41 |
tdb |
1.1 |
* |
| 42 |
tdb |
1.22 |
* @author $Author: tdb $ |
| 43 |
tdb |
1.24 |
* @version $Id: Heartbeat__Monitor.java,v 1.23 2002/05/21 16:47:16 tdb Exp $ |
| 44 |
tdb |
1.1 |
*/ |
| 45 |
ajm |
1.14 |
public class Heartbeat__Monitor extends MonitorSkeleton { |
| 46 |
tdb |
1.1 |
|
| 47 |
|
|
//---FINAL ATTRIBUTES--- |
| 48 |
|
|
|
| 49 |
|
|
/** |
| 50 |
|
|
* The current CVS revision of this class |
| 51 |
|
|
*/ |
| 52 |
tdb |
1.24 |
public final String REVISION = "$Revision: 1.23 $"; |
| 53 |
tdb |
1.1 |
|
| 54 |
tdb |
1.18 |
/** |
| 55 |
|
|
* A description of this monitor |
| 56 |
|
|
*/ |
| 57 |
tdb |
1.1 |
public final String DESC = "Monitors Heartbeats."; |
| 58 |
|
|
|
| 59 |
tdb |
1.18 |
/** |
| 60 |
|
|
* The default (used if not configured) period at |
| 61 |
|
|
* which to check for old heartbeats. (in seconds) |
| 62 |
|
|
*/ |
| 63 |
tdb |
1.3 |
public final int DEFAULT_CHECK_PERIOD = 60; |
| 64 |
|
|
|
| 65 |
tdb |
1.1 |
//---STATIC METHODS--- |
| 66 |
|
|
|
| 67 |
|
|
//---CONSTRUCTORS--- |
| 68 |
tdb |
1.18 |
|
| 69 |
|
|
/** |
| 70 |
|
|
* Constructs a new Heartbeat monitor, and starts off |
| 71 |
|
|
* the worker thread. |
| 72 |
|
|
*/ |
| 73 |
tdb |
1.2 |
public Heartbeat__Monitor() { |
| 74 |
ajm |
1.16 |
super(); |
| 75 |
tdb |
1.21 |
createInitialHosts(); |
| 76 |
ajm |
1.14 |
new HeartbeatWorker().start(); |
| 77 |
tdb |
1.2 |
} |
| 78 |
|
|
|
| 79 |
tdb |
1.1 |
//---PUBLIC METHODS--- |
| 80 |
|
|
|
| 81 |
tdb |
1.18 |
/** |
| 82 |
|
|
* Analyse a packet of data. In this case, this will just |
| 83 |
|
|
* register the fact that a heartbeat has arrived. |
| 84 |
|
|
* |
| 85 |
|
|
* @param packet The packet of data to analyse |
| 86 |
|
|
*/ |
| 87 |
ajm |
1.14 |
public void analysePacket(XMLPacket packet) { |
| 88 |
|
|
String source = packet.getParam("packet.attributes.machine_name"); |
| 89 |
|
|
if (!_hosts.containsKey(source)) { |
| 90 |
tdb |
1.9 |
synchronized(this) { |
| 91 |
tdb |
1.18 |
_hosts.put(source, new HeartbeatHolder(new Register(source, _name))); |
| 92 |
tdb |
1.1 |
} |
| 93 |
|
|
} |
| 94 |
ajm |
1.14 |
HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source); |
| 95 |
|
|
lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000); |
| 96 |
tdb |
1.1 |
} |
| 97 |
|
|
|
| 98 |
|
|
/** |
| 99 |
|
|
* Overrides the {@link java.lang.Object#toString() Object.toString()} |
| 100 |
|
|
* method to provide clean logging (every class should have this). |
| 101 |
|
|
* |
| 102 |
tdb |
1.24 |
* This uses the uk.org.iscream.cms.util.NameFormat class |
| 103 |
tdb |
1.1 |
* to format the toString() |
| 104 |
|
|
* |
| 105 |
|
|
* @return the name of this class and its CVS revision |
| 106 |
|
|
*/ |
| 107 |
|
|
public String toString() { |
| 108 |
|
|
return FormatName.getName( |
| 109 |
|
|
_name, |
| 110 |
|
|
getClass().getName(), |
| 111 |
|
|
REVISION); |
| 112 |
|
|
} |
| 113 |
|
|
|
| 114 |
|
|
/** |
| 115 |
|
|
* return the String representation of what the monitor does |
| 116 |
|
|
*/ |
| 117 |
|
|
public String getDescription(){ |
| 118 |
|
|
return DESC; |
| 119 |
|
|
} |
| 120 |
|
|
|
| 121 |
|
|
//---PRIVATE METHODS--- |
| 122 |
|
|
|
| 123 |
tdb |
1.18 |
/** |
| 124 |
|
|
* Checks whether the time since the last heartbeat |
| 125 |
|
|
* is beyond the threshold(s). |
| 126 |
|
|
* |
| 127 |
|
|
* @param timeSinceLastHB a long time since the last heartbeat arrived |
| 128 |
|
|
* @param reg the Register for this host |
| 129 |
|
|
* @return the level which has been breached, if any |
| 130 |
|
|
*/ |
| 131 |
tdb |
1.2 |
private int checkAttributeThreshold(long timeSinceLastHB, Register reg) { |
| 132 |
tdb |
1.1 |
for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) { |
| 133 |
|
|
if (reg.getThreshold(thresholdLevel) != -1.0) { |
| 134 |
tdb |
1.2 |
if (((long) reg.getThreshold(thresholdLevel)) < timeSinceLastHB) { |
| 135 |
tdb |
1.1 |
return thresholdLevel; |
| 136 |
|
|
} |
| 137 |
|
|
} |
| 138 |
|
|
} |
| 139 |
tdb |
1.7 |
return Alert.thresholdNORMAL; |
| 140 |
tdb |
1.21 |
} |
| 141 |
|
|
|
| 142 |
|
|
/** |
| 143 |
|
|
* Gets an initial list of hosts from the config |
| 144 |
|
|
* and adds a fake set of heartbeats for them. |
| 145 |
|
|
* If the hosts don't respond within the timeout |
| 146 |
|
|
* period an alert will be raised. |
| 147 |
|
|
* |
| 148 |
|
|
* The effect of this is to allow us to know about |
| 149 |
|
|
* hosts which weren't on when we started up, and |
| 150 |
|
|
* will thus never have generated a heartbeat - yet |
| 151 |
|
|
* will still want to know they're not responding. |
| 152 |
|
|
*/ |
| 153 |
|
|
private void createInitialHosts() { |
| 154 |
|
|
// get the initial list of hosts from the config |
| 155 |
|
|
String initialHosts = ""; |
| 156 |
|
|
try { |
| 157 |
|
|
initialHosts = _cp.getProperty(_name, "Monitor.Heartbeat.initialHosts"); |
| 158 |
|
|
} catch (PropertyNotFoundException e) { |
| 159 |
|
|
// just leave initialHosts empty |
| 160 |
|
|
_logger.write(Heartbeat__Monitor.this.toString(), Logger.DEBUG, "No initial list of hosts set, defaulting to none."); |
| 161 |
|
|
} |
| 162 |
|
|
|
| 163 |
|
|
// parse through the initial hosts adding them |
| 164 |
|
|
StringTokenizer st = new StringTokenizer(initialHosts, ";"); |
| 165 |
|
|
while (st.hasMoreTokens()) { |
| 166 |
|
|
String source = st.nextToken(); |
| 167 |
|
|
// check if they already exist, don't want to add them twice |
| 168 |
|
|
if (!_hosts.containsKey(source)) { |
| 169 |
|
|
synchronized(this) { |
| 170 |
|
|
_hosts.put(source, new HeartbeatHolder(new Register(source, _name))); |
| 171 |
|
|
} |
| 172 |
|
|
} |
| 173 |
|
|
HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source); |
| 174 |
|
|
// set a "fake" heartbeat |
| 175 |
|
|
lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000); |
| 176 |
|
|
} |
| 177 |
tdb |
1.1 |
} |
| 178 |
|
|
|
| 179 |
|
|
//---ACCESSOR/MUTATOR METHODS--- |
| 180 |
tdb |
1.18 |
|
| 181 |
|
|
/** |
| 182 |
|
|
* Returns a reference to the Queue we're getting data |
| 183 |
|
|
* from. This is specific to this monitor. |
| 184 |
|
|
* |
| 185 |
|
|
* @return a reference to a Queue to get data from |
| 186 |
|
|
*/ |
| 187 |
ajm |
1.14 |
protected Queue getQueue() { |
| 188 |
|
|
return MonitorManager.getInstance().getHeartbeatQueue(); |
| 189 |
|
|
} |
| 190 |
|
|
|
| 191 |
tdb |
1.1 |
//---ATTRIBUTES--- |
| 192 |
|
|
|
| 193 |
|
|
/** |
| 194 |
|
|
* This is the friendly identifier of the |
| 195 |
|
|
* component this class is running in. |
| 196 |
|
|
* eg, a Filter may be called "filter1", |
| 197 |
|
|
* If this class does not have an owning |
| 198 |
|
|
* component, a name from the configuration |
| 199 |
|
|
* can be placed here. This name could also |
| 200 |
|
|
* be changed to null for utility classes. |
| 201 |
|
|
*/ |
| 202 |
|
|
private String _name = "Heartbeat"; |
| 203 |
|
|
|
| 204 |
|
|
/** |
| 205 |
|
|
* A reference to the configuration proxy in use |
| 206 |
|
|
*/ |
| 207 |
|
|
private ConfigurationProxy _cp = ConfigurationProxy.getInstance(); |
| 208 |
tdb |
1.18 |
|
| 209 |
|
|
/** |
| 210 |
|
|
* A HashMap of hosts, with associated HeartbeatHolder's. |
| 211 |
|
|
*/ |
| 212 |
tdb |
1.6 |
private HashMap _hosts = new HashMap(); |
| 213 |
tdb |
1.18 |
|
| 214 |
|
|
/** |
| 215 |
|
|
* A reference to the system logger. |
| 216 |
|
|
*/ |
| 217 |
tdb |
1.15 |
private Logger _logger = ReferenceManager.getInstance().getLogger(); |
| 218 |
tdb |
1.1 |
|
| 219 |
|
|
//---STATIC ATTRIBUTES--- |
| 220 |
|
|
|
| 221 |
|
|
//---INNER CLASSES--- |
| 222 |
tdb |
1.18 |
|
| 223 |
|
|
/** |
| 224 |
|
|
* This inner class simply holding some information |
| 225 |
|
|
* about a specific host. |
| 226 |
|
|
*/ |
| 227 |
tdb |
1.1 |
private class HeartbeatHolder { |
| 228 |
|
|
|
| 229 |
tdb |
1.18 |
/** |
| 230 |
|
|
* Construct a new HeartbeatHolder. |
| 231 |
|
|
*/ |
| 232 |
|
|
public HeartbeatHolder(Register register) { |
| 233 |
|
|
_register = register; |
| 234 |
tdb |
1.6 |
} |
| 235 |
|
|
|
| 236 |
tdb |
1.18 |
/** |
| 237 |
|
|
* Set the time of the last heartbeat |
| 238 |
|
|
*/ |
| 239 |
tdb |
1.2 |
public void setLastHeartbeat(long lastHeartbeat) { |
| 240 |
tdb |
1.1 |
_lastHeartbeat = lastHeartbeat; |
| 241 |
|
|
} |
| 242 |
|
|
|
| 243 |
tdb |
1.18 |
/** |
| 244 |
|
|
* Get the time of the last heartbeat |
| 245 |
|
|
*/ |
| 246 |
tdb |
1.2 |
public long getLastHeartbeat() { |
| 247 |
tdb |
1.1 |
return _lastHeartbeat; |
| 248 |
|
|
} |
| 249 |
|
|
|
| 250 |
tdb |
1.18 |
/** |
| 251 |
|
|
* Get the Register |
| 252 |
|
|
*/ |
| 253 |
|
|
public Register getRegister() { |
| 254 |
|
|
return _register; |
| 255 |
tdb |
1.6 |
} |
| 256 |
|
|
|
| 257 |
tdb |
1.18 |
/** |
| 258 |
|
|
* last heartbeat time |
| 259 |
|
|
*/ |
| 260 |
tdb |
1.2 |
private long _lastHeartbeat; |
| 261 |
tdb |
1.18 |
|
| 262 |
|
|
/** |
| 263 |
|
|
* register ref |
| 264 |
|
|
*/ |
| 265 |
|
|
private Register _register; |
| 266 |
ajm |
1.14 |
} |
| 267 |
|
|
|
| 268 |
tdb |
1.18 |
/** |
| 269 |
|
|
* This worker thread just checks all the hosts and then |
| 270 |
|
|
* waits a period of time before doing it again. It sends |
| 271 |
|
|
* Alerts as required. |
| 272 |
|
|
*/ |
| 273 |
ajm |
1.14 |
private class HeartbeatWorker extends Thread { |
| 274 |
|
|
|
| 275 |
tdb |
1.18 |
/** |
| 276 |
|
|
* The main run method of this worker thread. It simply |
| 277 |
|
|
* checks through all the hosts it has stored, running |
| 278 |
|
|
* the analyseHB method on each. It then removes any |
| 279 |
|
|
* that have passed a FINAL, and waits a (configured) |
| 280 |
|
|
* length of time before doing it again. |
| 281 |
|
|
*/ |
| 282 |
ajm |
1.14 |
public void run() { |
| 283 |
|
|
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
| 284 |
|
|
while(true) { |
| 285 |
|
|
// this cycle period of this monitor's checks |
| 286 |
|
|
int checkPeriod = 0; |
| 287 |
|
|
try { |
| 288 |
|
|
checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod")); |
| 289 |
|
|
} catch (PropertyNotFoundException e) { |
| 290 |
|
|
checkPeriod = DEFAULT_CHECK_PERIOD; |
| 291 |
tdb |
1.19 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds"); |
| 292 |
ajm |
1.14 |
} catch (NumberFormatException e) { |
| 293 |
|
|
checkPeriod = DEFAULT_CHECK_PERIOD; |
| 294 |
tdb |
1.19 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds"); |
| 295 |
ajm |
1.14 |
} |
| 296 |
|
|
|
| 297 |
tdb |
1.19 |
synchronized(Heartbeat__Monitor.this) { |
| 298 |
ajm |
1.14 |
// perform the checks (use HB hash, although they *should* be the same) |
| 299 |
|
|
Iterator i = _hosts.keySet().iterator(); |
| 300 |
|
|
while(i.hasNext()) { |
| 301 |
|
|
// get host |
| 302 |
|
|
String source = (String) i.next(); |
| 303 |
|
|
// check it |
| 304 |
|
|
boolean remove = analyseHB(source); |
| 305 |
tdb |
1.18 |
// remove it if it's passed a FINAL |
| 306 |
ajm |
1.14 |
if(remove) { |
| 307 |
|
|
i.remove(); |
| 308 |
|
|
} |
| 309 |
|
|
} |
| 310 |
|
|
} |
| 311 |
|
|
|
| 312 |
|
|
// wait a while |
| 313 |
|
|
try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {} |
| 314 |
|
|
} |
| 315 |
|
|
} |
| 316 |
ajm |
1.16 |
|
| 317 |
tdb |
1.18 |
/** |
| 318 |
|
|
* Analyses a given host's state, and if need be generates |
| 319 |
|
|
* a relevant Alert. Note that it also checks if the last |
| 320 |
|
|
* alert sent is FINAL, in which case it returns true to |
| 321 |
|
|
* indicate removal of this host. |
| 322 |
|
|
* |
| 323 |
|
|
* @param source the host to check |
| 324 |
|
|
* @return whether this host can be deleted |
| 325 |
|
|
*/ |
| 326 |
ajm |
1.16 |
private boolean analyseHB(String source) { |
| 327 |
|
|
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
| 328 |
|
|
HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source); |
| 329 |
tdb |
1.18 |
Register reg = hbHolder.getRegister(); |
| 330 |
ajm |
1.16 |
|
| 331 |
|
|
// get host's HB interval (seconds) |
| 332 |
|
|
// this should always exist, thus we set to 0 |
| 333 |
|
|
int hostHBinterval = 0; |
| 334 |
|
|
try { |
| 335 |
|
|
hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.TCPUpdateTime")); |
| 336 |
|
|
} catch (PropertyNotFoundException e) { |
| 337 |
|
|
hostHBinterval = 0; |
| 338 |
tdb |
1.19 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "TCPUpdateTime value unavailable using default of " + hostHBinterval + " seconds"); |
| 339 |
ajm |
1.16 |
} catch (NumberFormatException e) { |
| 340 |
|
|
hostHBinterval = 0; |
| 341 |
tdb |
1.19 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous TCPUpdateTime value in configuration using default of " + hostHBinterval + " seconds"); |
| 342 |
ajm |
1.16 |
} |
| 343 |
|
|
|
| 344 |
|
|
// get host's last HB time (seconds) |
| 345 |
|
|
long lastHeartbeat = hbHolder.getLastHeartbeat(); |
| 346 |
|
|
// time since last heartbeat (seconds) |
| 347 |
|
|
long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat; |
| 348 |
|
|
// time since (or until if negative) the expected heartbeat |
| 349 |
|
|
long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval; |
| 350 |
|
|
|
| 351 |
|
|
// best do a check in case the expected heartbeat is in the future |
| 352 |
|
|
if(timeSinceExpectedHB < 0) { |
| 353 |
|
|
timeSinceExpectedHB = 0; |
| 354 |
|
|
} |
| 355 |
|
|
|
| 356 |
|
|
// find out the threshold level we're at |
| 357 |
|
|
int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg); |
| 358 |
|
|
|
| 359 |
|
|
// process the alert |
| 360 |
ajm |
1.17 |
Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB)); |
| 361 |
ajm |
1.16 |
|
| 362 |
|
|
if(reg.getLastAlertLevel() == Alert.alertFINAL) { |
| 363 |
|
|
return true; |
| 364 |
|
|
} |
| 365 |
|
|
return false; |
| 366 |
|
|
} |
| 367 |
ajm |
1.14 |
} |
| 368 |
tdb |
1.1 |
} |