/*
 * i-scream central monitoring system
 * http://www.i-scream.org.uk
 * Copyright (C) 2000-2002 i-scream
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
20 |
|
21 |
//---PACKAGE DECLARATION--- |
22 |
package uk.org.iscream.cms.server.client.monitors; |
23 |
|
24 |
//---IMPORTS--- |
25 |
import java.util.HashMap; |
26 |
import java.util.Iterator; |
27 |
import java.util.StringTokenizer; |
28 |
import uk.org.iscream.cms.server.client.*; |
29 |
import uk.org.iscream.cms.server.core.*; |
30 |
import uk.org.iscream.cms.server.util.*; |
31 |
import uk.org.iscream.cms.server.componentmanager.*; |
32 |
|
33 |
/** |
34 |
* This Monitor watches heartbeats. |
35 |
* It generates an alert when a heartbeat that is expected |
36 |
* does not arrive. Unlike all the other monitors, this one |
37 |
* is driven by an event *not* occuring, rather than an |
38 |
* event occuring. This means it must be actively checking |
39 |
* for missing heartbeat's, and thus has an extra inner class |
40 |
* thread. |
41 |
* |
42 |
* @author $Author: tdb $ |
43 |
* @version $Id: Heartbeat__Monitor.java,v 1.22 2002/05/18 18:16:00 tdb Exp $ |
44 |
*/ |
45 |
public class Heartbeat__Monitor extends MonitorSkeleton { |
46 |
|
47 |
//---FINAL ATTRIBUTES--- |
48 |
|
49 |
/** |
50 |
* The current CVS revision of this class |
51 |
*/ |
52 |
public final String REVISION = "$Revision: 1.22 $"; |
53 |
|
54 |
/** |
55 |
* A description of this monitor |
56 |
*/ |
57 |
public final String DESC = "Monitors Heartbeats."; |
58 |
|
59 |
/** |
60 |
* The default (used if not configured) period at |
61 |
* which to check for old heartbeats. (in seconds) |
62 |
*/ |
63 |
public final int DEFAULT_CHECK_PERIOD = 60; |
64 |
|
65 |
//---STATIC METHODS--- |
66 |
|
67 |
//---CONSTRUCTORS--- |
68 |
|
69 |
/** |
70 |
* Constructs a new Heartbeat monitor, and starts off |
71 |
* the worker thread. |
72 |
*/ |
73 |
public Heartbeat__Monitor() { |
74 |
super(); |
75 |
createInitialHosts(); |
76 |
new HeartbeatWorker().start(); |
77 |
} |
78 |
|
79 |
//---PUBLIC METHODS--- |
80 |
|
81 |
/** |
82 |
* Analyse a packet of data. In this case, this will just |
83 |
* register the fact that a heartbeat has arrived. |
84 |
* |
85 |
* @param packet The packet of data to analyse |
86 |
*/ |
87 |
public void analysePacket(XMLPacket packet) { |
88 |
String source = packet.getParam("packet.attributes.machine_name"); |
89 |
if (!_hosts.containsKey(source)) { |
90 |
synchronized(this) { |
91 |
_hosts.put(source, new HeartbeatHolder(new Register(source, _name))); |
92 |
} |
93 |
} |
94 |
HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source); |
95 |
lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000); |
96 |
} |
97 |
|
98 |
/** |
99 |
* Overrides the {@link java.lang.Object#toString() Object.toString()} |
100 |
* method to provide clean logging (every class should have this). |
101 |
* |
102 |
* This uses the uk.org.iscream.cms.server.util.NameFormat class |
103 |
* to format the toString() |
104 |
* |
105 |
* @return the name of this class and its CVS revision |
106 |
*/ |
107 |
public String toString() { |
108 |
return FormatName.getName( |
109 |
_name, |
110 |
getClass().getName(), |
111 |
REVISION); |
112 |
} |
113 |
|
114 |
/** |
115 |
* return the String representation of what the monitor does |
116 |
*/ |
117 |
public String getDescription(){ |
118 |
return DESC; |
119 |
} |
120 |
|
121 |
//---PRIVATE METHODS--- |
122 |
|
123 |
/** |
124 |
* Checks whether the time since the last heartbeat |
125 |
* is beyond the threshold(s). |
126 |
* |
127 |
* @param timeSinceLastHB a long time since the last heartbeat arrived |
128 |
* @param reg the Register for this host |
129 |
* @return the level which has been breached, if any |
130 |
*/ |
131 |
private int checkAttributeThreshold(long timeSinceLastHB, Register reg) { |
132 |
for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) { |
133 |
if (reg.getThreshold(thresholdLevel) != -1.0) { |
134 |
if (((long) reg.getThreshold(thresholdLevel)) < timeSinceLastHB) { |
135 |
return thresholdLevel; |
136 |
} |
137 |
} |
138 |
} |
139 |
return Alert.thresholdNORMAL; |
140 |
} |
141 |
|
142 |
/** |
143 |
* Gets an initial list of hosts from the config |
144 |
* and adds a fake set of heartbeats for them. |
145 |
* If the hosts don't respond within the timeout |
146 |
* period an alert will be raised. |
147 |
* |
148 |
* The effect of this is to allow us to know about |
149 |
* hosts which weren't on when we started up, and |
150 |
* will thus never have generated a heartbeat - yet |
151 |
* will still want to know they're not responding. |
152 |
*/ |
153 |
private void createInitialHosts() { |
154 |
// get the initial list of hosts from the config |
155 |
String initialHosts = ""; |
156 |
try { |
157 |
initialHosts = _cp.getProperty(_name, "Monitor.Heartbeat.initialHosts"); |
158 |
} catch (PropertyNotFoundException e) { |
159 |
// just leave initialHosts empty |
160 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.DEBUG, "No initial list of hosts set, defaulting to none."); |
161 |
} |
162 |
|
163 |
// parse through the initial hosts adding them |
164 |
StringTokenizer st = new StringTokenizer(initialHosts, ";"); |
165 |
while (st.hasMoreTokens()) { |
166 |
String source = st.nextToken(); |
167 |
// check if they already exist, don't want to add them twice |
168 |
if (!_hosts.containsKey(source)) { |
169 |
synchronized(this) { |
170 |
_hosts.put(source, new HeartbeatHolder(new Register(source, _name))); |
171 |
} |
172 |
} |
173 |
HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source); |
174 |
// set a "fake" heartbeat |
175 |
lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000); |
176 |
} |
177 |
} |
178 |
|
179 |
//---ACCESSOR/MUTATOR METHODS--- |
180 |
|
181 |
/** |
182 |
* Returns a reference to the Queue we're getting data |
183 |
* from. This is specific to this monitor. |
184 |
* |
185 |
* @return a reference to a Queue to get data from |
186 |
*/ |
187 |
protected Queue getQueue() { |
188 |
return MonitorManager.getInstance().getHeartbeatQueue(); |
189 |
} |
190 |
|
191 |
//---ATTRIBUTES--- |
192 |
|
193 |
/** |
194 |
* This is the friendly identifier of the |
195 |
* component this class is running in. |
196 |
* eg, a Filter may be called "filter1", |
197 |
* If this class does not have an owning |
198 |
* component, a name from the configuration |
199 |
* can be placed here. This name could also |
200 |
* be changed to null for utility classes. |
201 |
*/ |
202 |
private String _name = "Heartbeat"; |
203 |
|
204 |
/** |
205 |
* A reference to the configuration proxy in use |
206 |
*/ |
207 |
private ConfigurationProxy _cp = ConfigurationProxy.getInstance(); |
208 |
|
209 |
/** |
210 |
* A HashMap of hosts, with associated HeartbeatHolder's. |
211 |
*/ |
212 |
private HashMap _hosts = new HashMap(); |
213 |
|
214 |
/** |
215 |
* A reference to the system logger. |
216 |
*/ |
217 |
private Logger _logger = ReferenceManager.getInstance().getLogger(); |
218 |
|
219 |
//---STATIC ATTRIBUTES--- |
220 |
|
221 |
//---INNER CLASSES--- |
222 |
|
223 |
/** |
224 |
* This inner class simply holding some information |
225 |
* about a specific host. |
226 |
*/ |
227 |
private class HeartbeatHolder { |
228 |
|
229 |
/** |
230 |
* Construct a new HeartbeatHolder. |
231 |
*/ |
232 |
public HeartbeatHolder(Register register) { |
233 |
_register = register; |
234 |
} |
235 |
|
236 |
/** |
237 |
* Set the time of the last heartbeat |
238 |
*/ |
239 |
public void setLastHeartbeat(long lastHeartbeat) { |
240 |
_lastHeartbeat = lastHeartbeat; |
241 |
} |
242 |
|
243 |
/** |
244 |
* Get the time of the last heartbeat |
245 |
*/ |
246 |
public long getLastHeartbeat() { |
247 |
return _lastHeartbeat; |
248 |
} |
249 |
|
250 |
/** |
251 |
* Get the Register |
252 |
*/ |
253 |
public Register getRegister() { |
254 |
return _register; |
255 |
} |
256 |
|
257 |
/** |
258 |
* last heartbeat time |
259 |
*/ |
260 |
private long _lastHeartbeat; |
261 |
|
262 |
/** |
263 |
* register ref |
264 |
*/ |
265 |
private Register _register; |
266 |
} |
267 |
|
268 |
/** |
269 |
* This worker thread just checks all the hosts and then |
270 |
* waits a period of time before doing it again. It sends |
271 |
* Alerts as required. |
272 |
*/ |
273 |
private class HeartbeatWorker extends Thread { |
274 |
|
275 |
/** |
276 |
* The main run method of this worker thread. It simply |
277 |
* checks through all the hosts it has stored, running |
278 |
* the analyseHB method on each. It then removes any |
279 |
* that have passed a FINAL, and waits a (configured) |
280 |
* length of time before doing it again. |
281 |
*/ |
282 |
public void run() { |
283 |
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
284 |
while(true) { |
285 |
// this cycle period of this monitor's checks |
286 |
int checkPeriod = 0; |
287 |
try { |
288 |
checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod")); |
289 |
} catch (PropertyNotFoundException e) { |
290 |
checkPeriod = DEFAULT_CHECK_PERIOD; |
291 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds"); |
292 |
} catch (NumberFormatException e) { |
293 |
checkPeriod = DEFAULT_CHECK_PERIOD; |
294 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds"); |
295 |
} |
296 |
|
297 |
synchronized(Heartbeat__Monitor.this) { |
298 |
// perform the checks (use HB hash, although they *should* be the same) |
299 |
Iterator i = _hosts.keySet().iterator(); |
300 |
while(i.hasNext()) { |
301 |
// get host |
302 |
String source = (String) i.next(); |
303 |
// check it |
304 |
boolean remove = analyseHB(source); |
305 |
// remove it if it's passed a FINAL |
306 |
if(remove) { |
307 |
i.remove(); |
308 |
} |
309 |
} |
310 |
} |
311 |
|
312 |
// wait a while |
313 |
try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {} |
314 |
} |
315 |
} |
316 |
|
317 |
/** |
318 |
* Analyses a given host's state, and if need be generates |
319 |
* a relevant Alert. Note that it also checks if the last |
320 |
* alert sent is FINAL, in which case it returns true to |
321 |
* indicate removal of this host. |
322 |
* |
323 |
* @param source the host to check |
324 |
* @return whether this host can be deleted |
325 |
*/ |
326 |
private boolean analyseHB(String source) { |
327 |
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
328 |
HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source); |
329 |
Register reg = hbHolder.getRegister(); |
330 |
|
331 |
// get host's HB interval (seconds) |
332 |
// this should always exist, thus we set to 0 |
333 |
int hostHBinterval = 0; |
334 |
try { |
335 |
hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.TCPUpdateTime")); |
336 |
} catch (PropertyNotFoundException e) { |
337 |
hostHBinterval = 0; |
338 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "TCPUpdateTime value unavailable using default of " + hostHBinterval + " seconds"); |
339 |
} catch (NumberFormatException e) { |
340 |
hostHBinterval = 0; |
341 |
_logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous TCPUpdateTime value in configuration using default of " + hostHBinterval + " seconds"); |
342 |
} |
343 |
|
344 |
// get host's last HB time (seconds) |
345 |
long lastHeartbeat = hbHolder.getLastHeartbeat(); |
346 |
// time since last heartbeat (seconds) |
347 |
long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat; |
348 |
// time since (or until if negative) the expected heartbeat |
349 |
long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval; |
350 |
|
351 |
// best do a check in case the expected heartbeat is in the future |
352 |
if(timeSinceExpectedHB < 0) { |
353 |
timeSinceExpectedHB = 0; |
354 |
} |
355 |
|
356 |
// find out the threshold level we're at |
357 |
int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg); |
358 |
|
359 |
// process the alert |
360 |
Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB)); |
361 |
|
362 |
if(reg.getLastAlertLevel() == Alert.alertFINAL) { |
363 |
return true; |
364 |
} |
365 |
return false; |
366 |
} |
367 |
} |
368 |
} |