ViewVC Help
View File | Revision Log | Show Annotations | Revision Graph | Root Listing
root/i-scream/projects/cms/source/server/uk/org/iscream/cms/server/client/monitors/Heartbeat__Monitor.java
Revision: 1.25
Committed: Mon Feb 24 20:18:48 2003 UTC (21 years, 3 months ago) by tdb
Branch: MAIN
Changes since 1.24: +10 -6 lines
Log Message:
Fairly major commit. This will break the current version of ihost, but this
had to be done really to give Pete something to test the new ihost against.

The main change here is removal of the TCP Heartbeat functionality from the
filter. This meant the following features stopped working :-
  - Heartbeat testing
  - Configuration checking
  - Service checks

The heartbeat testing, specifically the monitor, now looks at the presence
of UDP packets instead. Before it just looked for the presence of a TCP
heartbeat packet, so the change there is fairly negligible. Of course this
means heartbeat testing now relies on the UDP working... but I don't see
this as a problem.

Configuration checking has been repositioned into the filtermanager. This
is a backwards compatible change - the filtermanager should still perform
as it should for older hosts. But now there's an extra command to check the
configuration is up-to-date, with a similar format to the old TCP protocol
in the filter. (although we may optimise this soon)

The service checks are broken. This isn't a major issue for us as they were
pretty useless in the first place. The concept is good, but the checks are
just far too primitive. I expect at some point I'll work on a separate
component that just monitors services, which will replace this function.

Further changes in the server include removal of the key checking code,
as this relied on a bolt on to the TCP heartbeat protocol to ship the
key. This got more awkward than originally planned, so I'm happy to drop the
idea. In the long term we hope to replace this with a public key system
for signing and even encryption.

Finally, general tidy up to remove other bits of code that check for
TCP heartbeat packets when they don't need to any more.

File Contents

# User Rev Content
1 tdb 1.22 /*
2     * i-scream central monitoring system
3 tdb 1.23 * http://www.i-scream.org.uk
4 tdb 1.22 * Copyright (C) 2000-2002 i-scream
5     *
6     * This program is free software; you can redistribute it and/or
7     * modify it under the terms of the GNU General Public License
8     * as published by the Free Software Foundation; either version 2
9     * of the License, or (at your option) any later version.
10     *
11     * This program is distributed in the hope that it will be useful,
12     * but WITHOUT ANY WARRANTY; without even the implied warranty of
13     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14     * GNU General Public License for more details.
15     *
16     * You should have received a copy of the GNU General Public License
17     * along with this program; if not, write to the Free Software
18     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19     */
20    
21 tdb 1.1 //---PACKAGE DECLARATION---
22 tdb 1.20 package uk.org.iscream.cms.server.client.monitors;
23 tdb 1.1
24     //---IMPORTS---
25     import java.util.HashMap;
26     import java.util.Iterator;
27 tdb 1.21 import java.util.StringTokenizer;
28 tdb 1.20 import uk.org.iscream.cms.server.client.*;
29     import uk.org.iscream.cms.server.core.*;
30 tdb 1.24 import uk.org.iscream.cms.util.*;
31 tdb 1.20 import uk.org.iscream.cms.server.componentmanager.*;
32 tdb 1.1
33     /**
34 tdb 1.18 * This Monitor watches heartbeats.
35     * It generates an alert when a heartbeat that is expected
36     * does not arrive. Unlike all the other monitors, this one
37     * is driven by an event *not* occurring, rather than an
38     * event occurring. This means it must be actively checking
39     * for missing heartbeats, and thus has an extra inner class
40     * thread.
41 tdb 1.1 *
42 tdb 1.25 * This originally took "heartbeat" packets, but they've now
43     * been deprecated. Instead we look at UDP packets, or, rather
44     * the lack of them :-)
45     *
46 tdb 1.22 * @author $Author: tdb $
47 tdb 1.25 * @version $Id: Heartbeat__Monitor.java,v 1.24 2003/02/05 16:43:45 tdb Exp $
48 tdb 1.1 */
49 ajm 1.14 public class Heartbeat__Monitor extends MonitorSkeleton {
50 tdb 1.1
51     //---FINAL ATTRIBUTES---
52    
53     /**
54     * The current CVS revision of this class
55     */
56 tdb 1.25 public final String REVISION = "$Revision: 1.24 $";
57 tdb 1.1
58 tdb 1.18 /**
59     * A description of this monitor
60     */
61 tdb 1.1 public final String DESC = "Monitors Heartbeats.";
62    
63 tdb 1.18 /**
64     * The default (used if not configured) period at
65     * which to check for old heartbeats. (in seconds)
66     */
67 tdb 1.3 public final int DEFAULT_CHECK_PERIOD = 60;
68    
69 tdb 1.1 //---STATIC METHODS---
70    
71     //---CONSTRUCTORS---
72 tdb 1.18
73     /**
74     * Constructs a new Heartbeat monitor, and starts off
75     * the worker thread.
76     */
77 tdb 1.2 public Heartbeat__Monitor() {
78 ajm 1.16 super();
79 tdb 1.21 createInitialHosts();
80 ajm 1.14 new HeartbeatWorker().start();
81 tdb 1.2 }
82    
83 tdb 1.1 //---PUBLIC METHODS---
84    
85 tdb 1.18 /**
86     * Analyse a packet of data. In this case, this will just
87     * register the fact that a heartbeat has arrived.
88     *
89     * @param packet The packet of data to analyse
90     */
91 ajm 1.14 public void analysePacket(XMLPacket packet) {
92     String source = packet.getParam("packet.attributes.machine_name");
93     if (!_hosts.containsKey(source)) {
94 tdb 1.9 synchronized(this) {
95 tdb 1.18 _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
96 tdb 1.1 }
97     }
98 ajm 1.14 HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
99     lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
100 tdb 1.1 }
101    
102     /**
103     * Overrides the {@link java.lang.Object#toString() Object.toString()}
104     * method to provide clean logging (every class should have this).
105     *
106 tdb 1.24 * This uses the uk.org.iscream.cms.util.NameFormat class
107 tdb 1.1 * to format the toString()
108     *
109     * @return the name of this class and its CVS revision
110     */
111     public String toString() {
112     return FormatName.getName(
113     _name,
114     getClass().getName(),
115     REVISION);
116     }
117    
118     /**
119     * return the String representation of what the monitor does
120     */
121     public String getDescription(){
122     return DESC;
123     }
124    
125     //---PRIVATE METHODS---
126    
127 tdb 1.18 /**
128     * Checks whether the time since the last heartbeat
129     * is beyond the threshold(s).
130     *
131     * @param timeSinceLastHB a long time since the last heartbeat arrived
132     * @param reg the Register for this host
133     * @return the level which has been breached, if any
134     */
135 tdb 1.2 private int checkAttributeThreshold(long timeSinceLastHB, Register reg) {
136 tdb 1.1 for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) {
137     if (reg.getThreshold(thresholdLevel) != -1.0) {
138 tdb 1.2 if (((long) reg.getThreshold(thresholdLevel)) < timeSinceLastHB) {
139 tdb 1.1 return thresholdLevel;
140     }
141     }
142     }
143 tdb 1.7 return Alert.thresholdNORMAL;
144 tdb 1.21 }
145    
146     /**
147     * Gets an initial list of hosts from the config
148     * and adds a fake set of heartbeats for them.
149     * If the hosts don't respond within the timeout
150     * period an alert will be raised.
151     *
152     * The effect of this is to allow us to know about
153     * hosts which weren't on when we started up, and
154     * will thus never have generated a heartbeat - yet
155     * will still want to know they're not responding.
156     */
157     private void createInitialHosts() {
158     // get the initial list of hosts from the config
159     String initialHosts = "";
160     try {
161     initialHosts = _cp.getProperty(_name, "Monitor.Heartbeat.initialHosts");
162     } catch (PropertyNotFoundException e) {
163     // just leave initialHosts empty
164     _logger.write(Heartbeat__Monitor.this.toString(), Logger.DEBUG, "No initial list of hosts set, defaulting to none.");
165     }
166    
167     // parse through the initial hosts adding them
168     StringTokenizer st = new StringTokenizer(initialHosts, ";");
169     while (st.hasMoreTokens()) {
170     String source = st.nextToken();
171     // check if they already exist, don't want to add them twice
172     if (!_hosts.containsKey(source)) {
173     synchronized(this) {
174     _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
175     }
176     }
177     HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
178     // set a "fake" heartbeat
179     lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
180     }
181 tdb 1.1 }
182    
183     //---ACCESSOR/MUTATOR METHODS---
184 tdb 1.18
185     /**
186     * Returns a reference to the Queue we're getting data
187     * from. This is specific to this monitor.
188     *
189     * @return a reference to a Queue to get data from
190     */
191 ajm 1.14 protected Queue getQueue() {
192 tdb 1.25 return MonitorManager.getInstance().getDataQueue();
193 ajm 1.14 }
194    
195 tdb 1.1 //---ATTRIBUTES---
196    
197     /**
198     * This is the friendly identifier of the
199     * component this class is running in.
200     * eg, a Filter may be called "filter1",
201     * If this class does not have an owning
202     * component, a name from the configuration
203     * can be placed here. This name could also
204     * be changed to null for utility classes.
205     */
206     private String _name = "Heartbeat";
207    
208     /**
209     * A reference to the configuration proxy in use
210     */
211     private ConfigurationProxy _cp = ConfigurationProxy.getInstance();
212 tdb 1.18
213     /**
214     * A HashMap of hosts, with associated HeartbeatHolder's.
215     */
216 tdb 1.6 private HashMap _hosts = new HashMap();
217 tdb 1.18
218     /**
219     * A reference to the system logger.
220     */
221 tdb 1.15 private Logger _logger = ReferenceManager.getInstance().getLogger();
222 tdb 1.1
223     //---STATIC ATTRIBUTES---
224    
225     //---INNER CLASSES---
226 tdb 1.18
227     /**
228     * This inner class simply holding some information
229     * about a specific host.
230     */
231 tdb 1.1 private class HeartbeatHolder {
232    
233 tdb 1.18 /**
234     * Construct a new HeartbeatHolder.
235     */
236     public HeartbeatHolder(Register register) {
237     _register = register;
238 tdb 1.6 }
239    
240 tdb 1.18 /**
241     * Set the time of the last heartbeat
242     */
243 tdb 1.2 public void setLastHeartbeat(long lastHeartbeat) {
244 tdb 1.1 _lastHeartbeat = lastHeartbeat;
245     }
246    
247 tdb 1.18 /**
248     * Get the time of the last heartbeat
249     */
250 tdb 1.2 public long getLastHeartbeat() {
251 tdb 1.1 return _lastHeartbeat;
252     }
253    
254 tdb 1.18 /**
255     * Get the Register
256     */
257     public Register getRegister() {
258     return _register;
259 tdb 1.6 }
260    
261 tdb 1.18 /**
262     * last heartbeat time
263     */
264 tdb 1.2 private long _lastHeartbeat;
265 tdb 1.18
266     /**
267     * register ref
268     */
269     private Register _register;
270 ajm 1.14 }
271    
272 tdb 1.18 /**
273     * This worker thread just checks all the hosts and then
274     * waits a period of time before doing it again. It sends
275     * Alerts as required.
276     */
277 ajm 1.14 private class HeartbeatWorker extends Thread {
278    
279 tdb 1.18 /**
280     * The main run method of this worker thread. It simply
281     * checks through all the hosts it has stored, running
282     * the analyseHB method on each. It then removes any
283     * that have passed a FINAL, and waits a (configured)
284     * length of time before doing it again.
285     */
286 ajm 1.14 public void run() {
287     ConfigurationProxy cp = ConfigurationProxy.getInstance();
288     while(true) {
289     // this cycle period of this monitor's checks
290     int checkPeriod = 0;
291     try {
292     checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod"));
293     } catch (PropertyNotFoundException e) {
294     checkPeriod = DEFAULT_CHECK_PERIOD;
295 tdb 1.19 _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds");
296 ajm 1.14 } catch (NumberFormatException e) {
297     checkPeriod = DEFAULT_CHECK_PERIOD;
298 tdb 1.19 _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds");
299 ajm 1.14 }
300    
301 tdb 1.19 synchronized(Heartbeat__Monitor.this) {
302 ajm 1.14 // perform the checks (use HB hash, although they *should* be the same)
303     Iterator i = _hosts.keySet().iterator();
304     while(i.hasNext()) {
305     // get host
306     String source = (String) i.next();
307     // check it
308     boolean remove = analyseHB(source);
309 tdb 1.18 // remove it if it's passed a FINAL
310 ajm 1.14 if(remove) {
311     i.remove();
312     }
313     }
314     }
315    
316     // wait a while
317     try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {}
318     }
319     }
320 ajm 1.16
321 tdb 1.18 /**
322     * Analyses a given host's state, and if need be generates
323     * a relevant Alert. Note that it also checks if the last
324     * alert sent is FINAL, in which case it returns true to
325     * indicate removal of this host.
326     *
327     * @param source the host to check
328     * @return whether this host can be deleted
329     */
330 ajm 1.16 private boolean analyseHB(String source) {
331     ConfigurationProxy cp = ConfigurationProxy.getInstance();
332     HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source);
333 tdb 1.18 Register reg = hbHolder.getRegister();
334 ajm 1.16
335     // get host's HB interval (seconds)
336     // this should always exist, thus we set to 0
337     int hostHBinterval = 0;
338     try {
339 tdb 1.25 hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.UDPUpdateTime"));
340 ajm 1.16 } catch (PropertyNotFoundException e) {
341     hostHBinterval = 0;
342 tdb 1.25 _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "UDPUpdateTime value unavailable using default of " + hostHBinterval + " seconds");
343 ajm 1.16 } catch (NumberFormatException e) {
344     hostHBinterval = 0;
345 tdb 1.25 _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous UDPUpdateTime value in configuration using default of " + hostHBinterval + " seconds");
346 ajm 1.16 }
347    
348     // get host's last HB time (seconds)
349     long lastHeartbeat = hbHolder.getLastHeartbeat();
350     // time since last heartbeat (seconds)
351     long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat;
352     // time since (or until if negative) the expected heartbeat
353     long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval;
354    
355     // best do a check in case the expected heartbeat is in the future
356     if(timeSinceExpectedHB < 0) {
357     timeSinceExpectedHB = 0;
358     }
359    
360     // find out the threshold level we're at
361     int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg);
362    
363     // process the alert
364 ajm 1.17 Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB));
365 ajm 1.16
366     if(reg.getLastAlertLevel() == Alert.alertFINAL) {
367     return true;
368     }
369     return false;
370     }
371 ajm 1.14 }
372 tdb 1.1 }