1 |
tdb |
1.1 |
//---PACKAGE DECLARATION--- |
2 |
tdb |
1.13 |
package uk.org.iscream.client.monitors; |
3 |
tdb |
1.1 |
|
4 |
|
|
//---IMPORTS--- |
5 |
|
|
import java.util.HashMap; |
6 |
|
|
import java.util.Iterator; |
7 |
tdb |
1.13 |
import uk.org.iscream.client.*; |
8 |
|
|
import uk.org.iscream.core.*; |
9 |
|
|
import uk.org.iscream.util.*; |
10 |
|
|
import uk.org.iscream.componentmanager.*; |
11 |
tdb |
1.1 |
|
12 |
|
|
/** |
13 |
tdb |
1.18 |
* This Monitor watches heartbeats. |
14 |
|
|
* It generates an alert when a heartbeat that is expected |
15 |
|
|
* does not arrive. Unlike all the other monitors, this one |
16 |
|
|
* is driven by an event *not* occuring, rather than an |
17 |
|
|
* event occuring. This means it must be actively checking |
18 |
|
|
* for missing heartbeat's, and thus has an extra inner class |
19 |
|
|
* thread. |
20 |
tdb |
1.1 |
* |
21 |
ajm |
1.17 |
* @author $Author: ajm4 $ |
22 |
tdb |
1.18 |
* @version $Id: Heartbeat__Monitor.java,v 1.17 2001/03/23 01:08:00 ajm4 Exp $ |
23 |
tdb |
1.1 |
*/ |
24 |
ajm |
1.14 |
public class Heartbeat__Monitor extends MonitorSkeleton { |
25 |
tdb |
1.1 |
|
26 |
|
|
//---FINAL ATTRIBUTES--- |
27 |
|
|
|
28 |
|
|
/** |
29 |
|
|
* The current CVS revision of this class |
30 |
|
|
*/ |
31 |
tdb |
1.18 |
public final String REVISION = "$Revision: 1.17 $"; |
32 |
tdb |
1.1 |
|
33 |
tdb |
1.18 |
/** |
34 |
|
|
* A description of this monitor |
35 |
|
|
*/ |
36 |
tdb |
1.1 |
public final String DESC = "Monitors Heartbeats."; |
37 |
|
|
|
38 |
tdb |
1.18 |
/** |
39 |
|
|
* The default (used if not configured) period at |
40 |
|
|
* which to check for old heartbeats. (in seconds) |
41 |
|
|
*/ |
42 |
tdb |
1.3 |
public final int DEFAULT_CHECK_PERIOD = 60; |
43 |
|
|
|
44 |
tdb |
1.1 |
//---STATIC METHODS--- |
45 |
|
|
|
46 |
|
|
//---CONSTRUCTORS--- |
47 |
tdb |
1.18 |
|
48 |
|
|
/** |
49 |
|
|
* Constructs a new Heartbeat monitor, and starts off |
50 |
|
|
* the worker thread. |
51 |
|
|
*/ |
52 |
tdb |
1.2 |
public Heartbeat__Monitor() { |
53 |
ajm |
1.16 |
super(); |
54 |
ajm |
1.14 |
new HeartbeatWorker().start(); |
55 |
tdb |
1.2 |
} |
56 |
|
|
|
57 |
tdb |
1.1 |
//---PUBLIC METHODS--- |
58 |
|
|
|
59 |
tdb |
1.18 |
/** |
60 |
|
|
* Analyse a packet of data. In this case, this will just |
61 |
|
|
* register the fact that a heartbeat has arrived. |
62 |
|
|
* |
63 |
|
|
* @param packet The packet of data to analyse |
64 |
|
|
*/ |
65 |
ajm |
1.14 |
public void analysePacket(XMLPacket packet) { |
66 |
|
|
String source = packet.getParam("packet.attributes.machine_name"); |
67 |
|
|
if (!_hosts.containsKey(source)) { |
68 |
tdb |
1.9 |
synchronized(this) { |
69 |
tdb |
1.18 |
_hosts.put(source, new HeartbeatHolder(new Register(source, _name))); |
70 |
tdb |
1.1 |
} |
71 |
|
|
} |
72 |
ajm |
1.14 |
HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source); |
73 |
|
|
lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000); |
74 |
tdb |
1.1 |
} |
75 |
|
|
|
76 |
|
|
/** |
77 |
|
|
* Overrides the {@link java.lang.Object#toString() Object.toString()} |
78 |
|
|
* method to provide clean logging (every class should have this). |
79 |
|
|
* |
80 |
tdb |
1.13 |
* This uses the uk.org.iscream.util.NameFormat class |
81 |
tdb |
1.1 |
* to format the toString() |
82 |
|
|
* |
83 |
|
|
* @return the name of this class and its CVS revision |
84 |
|
|
*/ |
85 |
|
|
public String toString() { |
86 |
|
|
return FormatName.getName( |
87 |
|
|
_name, |
88 |
|
|
getClass().getName(), |
89 |
|
|
REVISION); |
90 |
|
|
} |
91 |
|
|
|
92 |
|
|
/** |
93 |
|
|
* return the String representation of what the monitor does |
94 |
|
|
*/ |
95 |
|
|
public String getDescription(){ |
96 |
|
|
return DESC; |
97 |
|
|
} |
98 |
|
|
|
99 |
|
|
//---PRIVATE METHODS--- |
100 |
|
|
|
101 |
tdb |
1.18 |
/** |
102 |
|
|
* Checks whether the time since the last heartbeat |
103 |
|
|
* is beyond the threshold(s). |
104 |
|
|
* |
105 |
|
|
* @param timeSinceLastHB a long time since the last heartbeat arrived |
106 |
|
|
* @param reg the Register for this host |
107 |
|
|
* @return the level which has been breached, if any |
108 |
|
|
*/ |
109 |
tdb |
1.2 |
private int checkAttributeThreshold(long timeSinceLastHB, Register reg) { |
110 |
tdb |
1.1 |
for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) { |
111 |
|
|
if (reg.getThreshold(thresholdLevel) != -1.0) { |
112 |
tdb |
1.2 |
if (((long) reg.getThreshold(thresholdLevel)) < timeSinceLastHB) { |
113 |
tdb |
1.1 |
return thresholdLevel; |
114 |
|
|
} |
115 |
|
|
} |
116 |
|
|
} |
117 |
tdb |
1.7 |
return Alert.thresholdNORMAL; |
118 |
tdb |
1.1 |
} |
119 |
|
|
|
120 |
|
|
//---ACCESSOR/MUTATOR METHODS--- |
121 |
tdb |
1.18 |
|
122 |
|
|
/** |
123 |
|
|
* Returns a reference to the Queue we're getting data |
124 |
|
|
* from. This is specific to this monitor. |
125 |
|
|
* |
126 |
|
|
* @return a reference to a Queue to get data from |
127 |
|
|
*/ |
128 |
ajm |
1.14 |
protected Queue getQueue() { |
129 |
|
|
return MonitorManager.getInstance().getHeartbeatQueue(); |
130 |
|
|
} |
131 |
|
|
|
132 |
tdb |
1.1 |
//---ATTRIBUTES--- |
133 |
|
|
|
134 |
|
|
/** |
135 |
|
|
* This is the friendly identifier of the |
136 |
|
|
* component this class is running in. |
137 |
|
|
* eg, a Filter may be called "filter1", |
138 |
|
|
* If this class does not have an owning |
139 |
|
|
* component, a name from the configuration |
140 |
|
|
* can be placed here. This name could also |
141 |
|
|
* be changed to null for utility classes. |
142 |
|
|
*/ |
143 |
|
|
private String _name = "Heartbeat"; |
144 |
|
|
|
145 |
|
|
/** |
146 |
|
|
* A reference to the configuration proxy in use |
147 |
|
|
*/ |
148 |
|
|
private ConfigurationProxy _cp = ConfigurationProxy.getInstance(); |
149 |
tdb |
1.18 |
|
150 |
|
|
/** |
151 |
|
|
* A HashMap of hosts, with associated HeartbeatHolder's. |
152 |
|
|
*/ |
153 |
tdb |
1.6 |
private HashMap _hosts = new HashMap(); |
154 |
tdb |
1.18 |
|
155 |
|
|
/** |
156 |
|
|
* A reference to the system logger. |
157 |
|
|
*/ |
158 |
tdb |
1.15 |
private Logger _logger = ReferenceManager.getInstance().getLogger(); |
159 |
tdb |
1.1 |
|
160 |
|
|
//---STATIC ATTRIBUTES--- |
161 |
|
|
|
162 |
|
|
//---INNER CLASSES--- |
163 |
tdb |
1.18 |
|
164 |
|
|
/** |
165 |
|
|
* This inner class simply holding some information |
166 |
|
|
* about a specific host. |
167 |
|
|
*/ |
168 |
tdb |
1.1 |
private class HeartbeatHolder { |
169 |
|
|
|
170 |
tdb |
1.18 |
/** |
171 |
|
|
* Construct a new HeartbeatHolder. |
172 |
|
|
*/ |
173 |
|
|
public HeartbeatHolder(Register register) { |
174 |
|
|
_register = register; |
175 |
tdb |
1.6 |
} |
176 |
|
|
|
177 |
tdb |
1.18 |
/** |
178 |
|
|
* Set the time of the last heartbeat |
179 |
|
|
*/ |
180 |
tdb |
1.2 |
public void setLastHeartbeat(long lastHeartbeat) { |
181 |
tdb |
1.1 |
_lastHeartbeat = lastHeartbeat; |
182 |
|
|
} |
183 |
|
|
|
184 |
tdb |
1.18 |
/** |
185 |
|
|
* Get the time of the last heartbeat |
186 |
|
|
*/ |
187 |
tdb |
1.2 |
public long getLastHeartbeat() { |
188 |
tdb |
1.1 |
return _lastHeartbeat; |
189 |
|
|
} |
190 |
|
|
|
191 |
tdb |
1.18 |
/** |
192 |
|
|
* Get the Register |
193 |
|
|
*/ |
194 |
|
|
public Register getRegister() { |
195 |
|
|
return _register; |
196 |
tdb |
1.6 |
} |
197 |
|
|
|
198 |
tdb |
1.18 |
/** |
199 |
|
|
* last heartbeat time |
200 |
|
|
*/ |
201 |
tdb |
1.2 |
private long _lastHeartbeat; |
202 |
tdb |
1.18 |
|
203 |
|
|
/** |
204 |
|
|
* register ref |
205 |
|
|
*/ |
206 |
|
|
private Register _register; |
207 |
ajm |
1.14 |
} |
208 |
|
|
|
209 |
tdb |
1.18 |
/** |
210 |
|
|
* This worker thread just checks all the hosts and then |
211 |
|
|
* waits a period of time before doing it again. It sends |
212 |
|
|
* Alerts as required. |
213 |
|
|
*/ |
214 |
ajm |
1.14 |
private class HeartbeatWorker extends Thread { |
215 |
|
|
|
216 |
tdb |
1.18 |
/** |
217 |
|
|
* The main run method of this worker thread. It simply |
218 |
|
|
* checks through all the hosts it has stored, running |
219 |
|
|
* the analyseHB method on each. It then removes any |
220 |
|
|
* that have passed a FINAL, and waits a (configured) |
221 |
|
|
* length of time before doing it again. |
222 |
|
|
*/ |
223 |
ajm |
1.14 |
public void run() { |
224 |
|
|
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
225 |
|
|
while(true) { |
226 |
|
|
// this cycle period of this monitor's checks |
227 |
|
|
int checkPeriod = 0; |
228 |
|
|
try { |
229 |
|
|
checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod")); |
230 |
|
|
} catch (PropertyNotFoundException e) { |
231 |
|
|
checkPeriod = DEFAULT_CHECK_PERIOD; |
232 |
|
|
_logger.write(this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds"); |
233 |
|
|
} catch (NumberFormatException e) { |
234 |
|
|
checkPeriod = DEFAULT_CHECK_PERIOD; |
235 |
|
|
_logger.write(this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds"); |
236 |
|
|
} |
237 |
|
|
|
238 |
|
|
synchronized(this) { |
239 |
|
|
// perform the checks (use HB hash, although they *should* be the same) |
240 |
|
|
Iterator i = _hosts.keySet().iterator(); |
241 |
|
|
while(i.hasNext()) { |
242 |
|
|
// get host |
243 |
|
|
String source = (String) i.next(); |
244 |
|
|
// check it |
245 |
|
|
boolean remove = analyseHB(source); |
246 |
tdb |
1.18 |
// remove it if it's passed a FINAL |
247 |
ajm |
1.14 |
if(remove) { |
248 |
|
|
i.remove(); |
249 |
|
|
} |
250 |
|
|
} |
251 |
|
|
} |
252 |
|
|
|
253 |
|
|
// wait a while |
254 |
|
|
try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {} |
255 |
|
|
} |
256 |
|
|
} |
257 |
ajm |
1.16 |
|
258 |
tdb |
1.18 |
/** |
259 |
|
|
* Analyses a given host's state, and if need be generates |
260 |
|
|
* a relevant Alert. Note that it also checks if the last |
261 |
|
|
* alert sent is FINAL, in which case it returns true to |
262 |
|
|
* indicate removal of this host. |
263 |
|
|
* |
264 |
|
|
* @param source the host to check |
265 |
|
|
* @return whether this host can be deleted |
266 |
|
|
*/ |
267 |
ajm |
1.16 |
private boolean analyseHB(String source) { |
268 |
|
|
ConfigurationProxy cp = ConfigurationProxy.getInstance(); |
269 |
|
|
HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source); |
270 |
tdb |
1.18 |
Register reg = hbHolder.getRegister(); |
271 |
ajm |
1.16 |
|
272 |
|
|
// get host's HB interval (seconds) |
273 |
|
|
// this should always exist, thus we set to 0 |
274 |
|
|
int hostHBinterval = 0; |
275 |
|
|
try { |
276 |
|
|
hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.TCPUpdateTime")); |
277 |
|
|
} catch (PropertyNotFoundException e) { |
278 |
|
|
hostHBinterval = 0; |
279 |
|
|
_logger.write(this.toString(), Logger.WARNING, "TCPUpdateTime value unavailable using default of " + hostHBinterval + " seconds"); |
280 |
|
|
} catch (NumberFormatException e) { |
281 |
|
|
hostHBinterval = 0; |
282 |
|
|
_logger.write(this.toString(), Logger.WARNING, "Erronous TCPUpdateTime value in configuration using default of " + hostHBinterval + " seconds"); |
283 |
|
|
} |
284 |
|
|
|
285 |
|
|
// get host's last HB time (seconds) |
286 |
|
|
long lastHeartbeat = hbHolder.getLastHeartbeat(); |
287 |
|
|
// time since last heartbeat (seconds) |
288 |
|
|
long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat; |
289 |
|
|
// time since (or until if negative) the expected heartbeat |
290 |
|
|
long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval; |
291 |
|
|
|
292 |
|
|
// best do a check in case the expected heartbeat is in the future |
293 |
|
|
if(timeSinceExpectedHB < 0) { |
294 |
|
|
timeSinceExpectedHB = 0; |
295 |
|
|
} |
296 |
|
|
|
297 |
|
|
// find out the threshold level we're at |
298 |
|
|
int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg); |
299 |
|
|
|
300 |
|
|
// process the alert |
301 |
ajm |
1.17 |
Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB)); |
302 |
ajm |
1.16 |
|
303 |
|
|
if(reg.getLastAlertLevel() == Alert.alertFINAL) { |
304 |
|
|
return true; |
305 |
|
|
} |
306 |
|
|
return false; |
307 |
|
|
} |
308 |
ajm |
1.14 |
} |
309 |
tdb |
1.1 |
} |