ViewVC Help
View File | Revision Log | Show Annotations | Revision Graph | Root Listing
root/i-scream/projects/cms/source/server/uk/org/iscream/cms/server/client/monitors/Heartbeat__Monitor.java
(Generate patch)

Comparing projects/cms/source/server/uk/org/iscream/cms/server/client/monitors/Heartbeat__Monitor.java (file contents):
Revision 1.1 by tdb, Mon Mar 5 13:30:34 2001 UTC vs.
Revision 1.25 by tdb, Mon Feb 24 20:18:48 2003 UTC

# Line 1 | Line 1
1 + /*
2 + * i-scream central monitoring system
3 + * http://www.i-scream.org.uk
4 + * Copyright (C) 2000-2002 i-scream
5 + *
6 + * This program is free software; you can redistribute it and/or
7 + * modify it under the terms of the GNU General Public License
8 + * as published by the Free Software Foundation; either version 2
9 + * of the License, or (at your option) any later version.
10 + *
11 + * This program is distributed in the hope that it will be useful,
12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 + * GNU General Public License for more details.
15 + *
16 + * You should have received a copy of the GNU General Public License
17 + * along with this program; if not, write to the Free Software
18 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19 + */
20 +
21   //---PACKAGE DECLARATION---
22 < package uk.ac.ukc.iscream.client.monitors;
22 > package uk.org.iscream.cms.server.client.monitors;
23  
24   //---IMPORTS---
25   import java.util.HashMap;
26   import java.util.Iterator;
27 < import uk.ac.ukc.iscream.client.*;
28 < import uk.ac.ukc.iscream.core.*;
29 < import uk.ac.ukc.iscream.util.*;
30 < import uk.ac.ukc.iscream.componentmanager.*;
27 > import java.util.StringTokenizer;
28 > import uk.org.iscream.cms.server.client.*;
29 > import uk.org.iscream.cms.server.core.*;
30 > import uk.org.iscream.cms.util.*;
31 > import uk.org.iscream.cms.server.componentmanager.*;
32  
33   /**
34 < * This Monitor watches heartbeats
34 > * This Monitor watches heartbeats.
35 > * It generates an alert when a heartbeat that is expected
36 > * does not arrive. Unlike all the other monitors, this one
37 > * is driven by an event *not* occurring, rather than an
38 > * event occurring. This means it must be actively checking
39 > * for missing heartbeats, and thus has an extra inner class
40 > * thread.
41   *
42 + * This originally took "heartbeat" packets, but they've now
43 + * been deprecated. Instead we look at UDP packets, or, rather
44 + * the lack of them :-)
45 + *
46   * @author  $Author$
47   * @version $Id$
48   */
49 < public class Heartbeat__Monitor extends Thread implements PluginMonitor {
49 > public class Heartbeat__Monitor extends MonitorSkeleton {
50  
51   //---FINAL ATTRIBUTES---
52  
# Line 24 | Line 55 | public class Heartbeat__Monitor extends Thread impleme
55       */
56      public final String REVISION = "$Revision$";
57      
58 +    /**
59 +     * A description of this monitor
60 +     */
61      public final String DESC = "Monitors Heartbeats.";
62      
63 +    /**
64 +     * The default (used if not configured) period at
65 +     * which to check for old heartbeats. (in seconds)
66 +     */
67 +    public final int DEFAULT_CHECK_PERIOD = 60;
68 +    
69   //---STATIC METHODS---
70  
71   //---CONSTRUCTORS---
32
33 //---PUBLIC METHODS---
72      
73 <    public void run() {
74 <        ConfigurationProxy cp = ConfigurationProxy.getInstance();
75 <        while(true) {
76 <            // this cycle period could be done better, maybe ?
77 <            int checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod"));
78 <            
79 <            // perform the checks (use HB hash, although they *should* be the same)
80 <            Iterator i = _hostsHB.keySet().iterator();
43 <            while(i.hasNext()) {
44 <                // get host
45 <                String source = (String) i.next();
46 <                // check it
47 <                analyseHB(source);
48 <            }
49 <            
50 <            // wait a while
51 <            try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {}
52 <        }
73 >    /**
74 >     * Constructs a new Heartbeat monitor, and starts off
75 >     * the worker thread.
76 >     */
77 >    public Heartbeat__Monitor() {
78 >        super();
79 >        createInitialHosts();
80 >        new HeartbeatWorker().start();
81      }
54    
55    // only use attribute num 0 :)
56    public void analyseHB(String source) {
57        ConfigurationProxy cp = ConfigurationProxy.getInstance();
58        Register reg = (Register) _hostsReg.get(source);
59        
60        // get host's HB interval (seconds)
61        int hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "TCPUpdateTime"));
62        // get host's last HB time (seconds)
63        int lastHeartbeat = ((HeartbeatHolder) _hostsHB.get(source)).getLastHeartbeat();
64        // time since last heartbeat (seconds)
65        int timeSinceLastHB = ((int) (System.currentTimeMillis()/1000)) - lastHeartbeat;
66        
67        // find out the threshold level we're at
68        int result = checkAttributeThreshold(timeSinceLastHB, reg);
69            
70        // decide what threshold level we're on, if we've changed, record that
71        if (result != reg.getLastThresholdLevel(0)) {
72            reg.setLastThresholdLevel(0, result);
73        }
74            
75        // as long as this isn't a normal level
76        if(reg.getLastThresholdLevel(0) != Alert.thresholdNORMAL) {
77            // if the time since the last alert is more than the time for
78            // its timeout, fire an alert, escalate the alert
79            long timeout = reg.getLastAlertTimeout(0);
80            if ((timeout > 0) && (reg.getTimeLastSent(0) > 0)) {
81                if((System.currentTimeMillis() - reg.getTimeLastSent(0)) > timeout) {
82                    int lastAlert = reg.getLastAlertLevel(0);
83                    reg.escalateAlert(0);
84                    reg.setTimeLastSent(0, System.currentTimeMillis());
85                    reg.setLastAlertTimeout(0, reg.getAlertTimeout(reg.getLastAlertLevel(0), 0));
86                    // -- SEND
87                    fireAlert(source, timeSinceLastHB, reg, lastAlert);
88                }
89            // if we don't have a timeout configured...we go STRAIGHT to the next level
90            } else {
91                int lastAlert = reg.getLastAlertLevel(0);
92                reg.escalateAlert(0);
93                reg.setTimeLastSent(0, System.currentTimeMillis());
94                reg.setLastAlertTimeout(0, reg.getAlertTimeout(reg.getLastAlertLevel(0), 0));
95                // -- SEND
96                fireAlert(source, timeSinceLastHB, reg, lastAlert);
97            }
98                
99        // we must be on ok, check the timeout value for this
100        } else {
101            // if we were on an OK alert before, then we don't do anything
102            // but if we weren't we only set OK, once the timeout of the last
103            // alert has occurred
104            if (reg.getLastAlertLevel(0) != Alert.alertOK) {
105                long timeout = reg.getLastAlertTimeout(0);
106                if ((timeout > 0) && (reg.getTimeLastSent(0) > 0)) {
107                    if ((System.currentTimeMillis() - reg.getTimeLastSent(0)) > timeout) {
108                        int lastAlert = reg.getLastAlertLevel(0);
109                        reg.setLastAlertLevel(0, Alert.alertOK);
110                        reg.setTimeLastSent(0, System.currentTimeMillis());
111                        reg.setLastAlertTimeout(0, timeout);
112                        // -- SEND
113                        fireAlert(source, timeSinceLastHB, reg, lastAlert);
114                    }
115                }
116            }
117        }
118    }
82  
83 + //---PUBLIC METHODS---
84 +    
85 +    /**
86 +     * Analyse a packet of data. In this case, this will just
87 +     * register the fact that a heartbeat has arrived.
88 +     *
89 +     * @param packet The packet of data to analyse
90 +     */
91      public void analysePacket(XMLPacket packet) {
92 <        if (packet.getParam("packet.attributes.type").equals("heartbeat")) {
93 <            String source = packet.getParam("packet.attributes.machine_name");
94 <            if (!_hostsHB.containsKey(source)) {
95 <                _hostsReg.put(source, new Register(source, _name, 1));
125 <                _hostsHB.put(source, new HeartbeatHolder());
92 >        String source = packet.getParam("packet.attributes.machine_name");
93 >        if (!_hosts.containsKey(source)) {
94 >            synchronized(this) {
95 >                _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
96              }
127            HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hostsReg.get(source);
128            lastHeartbeat.setLastHeartbeat((int)System.currentTimeMillis()/1000);
97          }
98 +        HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
99 +        lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
100      }
101      
102      /**
103       * Overrides the {@link java.lang.Object#toString() Object.toString()}
104       * method to provide clean logging (every class should have this).
105       *
106 <     * This uses the uk.ac.ukc.iscream.util.NameFormat class
106 >     * This uses the uk.org.iscream.cms.util.NameFormat class
107       * to format the toString()
108       *
109       * @return the name of this class and its CVS revision
# Line 154 | Line 124 | public class Heartbeat__Monitor extends Thread impleme
124  
125   //---PRIVATE METHODS---
126      
127 <    private int checkAttributeThreshold(int timeSinceLastHB, Register reg) {
127 >    /**
128 >     * Checks whether the time since the last heartbeat
129 >     * is beyond the threshold(s).
130 >     *
131 >     * @param timeSinceLastHB the time (in seconds, as a long) since the last heartbeat arrived
132 >     * @param reg the Register for this host
133 >     * @return the level which has been breached, if any
134 >     */
135 >    private int checkAttributeThreshold(long timeSinceLastHB, Register reg) {
136          for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) {
137              if (reg.getThreshold(thresholdLevel) != -1.0) {
138 <                if (reg.getThreshold(thresholdLevel) < timeSinceLastHB) {
138 >                if (((long) reg.getThreshold(thresholdLevel)) < timeSinceLastHB) {
139                      return thresholdLevel;
140                  }
141              }
142          }
143 <        return 0;
143 >        return Alert.thresholdNORMAL;
144      }
145 +    
146 +    /**
147 +     * Gets an initial list of hosts from the config
148 +     * and adds a fake set of heartbeats for them.
149 +     * If the hosts don't respond within the timeout
150 +     * period an alert will be raised.
151 +     *
152 +     * The effect of this is to allow us to know about
153 +     * hosts which weren't on when we started up, and
154 +     * will thus never have generated a heartbeat - yet we
155 +     * will still want to know they're not responding.
156 +     */
157 +    private void createInitialHosts() {
158 +        // get the initial list of hosts from the config
159 +        String initialHosts = "";
160 +        try {
161 +            initialHosts = _cp.getProperty(_name, "Monitor.Heartbeat.initialHosts");
162 +        } catch (PropertyNotFoundException e) {
163 +            // just leave initialHosts empty
164 +            _logger.write(Heartbeat__Monitor.this.toString(), Logger.DEBUG, "No initial list of hosts set, defaulting to none.");
165 +        }
166          
167 <    private void fireAlert(String source, int timeSinceLastHB, Register reg, int lastAlert) {
168 <        int alertLevel = reg.getLastAlertLevel(0);
169 <        int thresholdLevel = reg.getLastThresholdLevel(0);
170 <        String currentValue = String.valueOf(timeSinceLastHB);
171 <        String attributeName = "Heartbeat";
172 <        String thresholdValue = String.valueOf(reg.getThreshold(thresholdLevel));
173 <        String time = Long.toString(reg.getAlertTimeout(reg.getLastAlertLevel(0), 0) / 1000);
174 <        if (thresholdLevel == Alert.thresholdNORMAL) {
175 <            thresholdValue = "-";
167 >        // parse through the initial hosts adding them
168 >        StringTokenizer st = new StringTokenizer(initialHosts, ";");
169 >        while (st.hasMoreTokens()) {
170 >            String source = st.nextToken();
171 >            // check if they already exist, don't want to add them twice
172 >            if (!_hosts.containsKey(source)) {
173 >                synchronized(this) {
174 >                    _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
175 >                }
176 >            }
177 >            HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
178 >            // set a "fake" heartbeat
179 >            lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
180          }
178        if (alertLevel == Alert.alertOK) {
179            time = "0";
180        }
181        Alert alert = new Alert(alertLevel, lastAlert, thresholdLevel, source, thresholdValue, currentValue, attributeName, time);
182        _alerterQueue.add(alert);
183        _logger.write(toString(), Logger.DEBUG, "Fired alert for source:" + source + " at alert level:" + Alert.alertLevels[alertLevel] + " on:" + attributeName + " for threshold level:" + Alert.thresholdLevels[thresholdLevel] + " at:" +  currentValue + " exceeding threshold of:" +thresholdValue + " next alert sent in:" + time + "secs");
181      }
182  
183   //---ACCESSOR/MUTATOR METHODS---
184 <
184 >    
185 >    /**
186 >     * Returns a reference to the Queue we're getting data
187 >     * from. This is specific to this monitor.
188 >     *
189 >     * @return a reference to a Queue to get data from
190 >     */
191 >    protected Queue getQueue() {
192 >        return MonitorManager.getInstance().getDataQueue();
193 >    }
194 >    
195   //---ATTRIBUTES---
196  
197      /**
# Line 197 | Line 204 | public class Heartbeat__Monitor extends Thread impleme
204       * be changed to null for utility classes.
205       */
206      private String _name = "Heartbeat";
207 <
207 >    
208      /**
209 <     * This holds a reference to the
203 <     * system logger that is being used.
209 >     * A reference to the configuration proxy in use
210       */
211 <    private Logger _logger = ReferenceManager.getInstance().getLogger();
211 >    private ConfigurationProxy _cp = ConfigurationProxy.getInstance();
212      
213 <    private Queue _alerterQueue = ClientMain._alerterQueue;
213 >    /**
214 >     * A HashMap of hosts, with associated HeartbeatHolder's.
215 >     */
216 >    private HashMap _hosts = new HashMap();
217      
218      /**
219 <     * A reference to the configuration proxy in use
219 >     * A reference to the system logger.
220       */
221 <    private ConfigurationProxy _cp = ConfigurationProxy.getInstance();
221 >    private Logger _logger = ReferenceManager.getInstance().getLogger();
222  
214    private HashMap _hostsHB = new HashMap();
215    private HashMap _hostsReg = new HashMap();
216
223   //---STATIC ATTRIBUTES---
224  
225   //---INNER CLASSES---
226 <
226 >    
227 >    /**
228 >     * This inner class simply holding some information
229 >     * about a specific host.
230 >     */
231      private class HeartbeatHolder {
232          
233 <        public void setLastHeartbeat(int lastHeartbeat) {
233 >        /**
234 >         * Construct a new HeartbeatHolder.
235 >         */
236 >        public HeartbeatHolder(Register register) {
237 >            _register = register;
238 >        }
239 >        
240 >        /**
241 >         * Set the time of the last heartbeat
242 >         */
243 >        public void setLastHeartbeat(long lastHeartbeat) {
244              _lastHeartbeat = lastHeartbeat;
245          }
246          
247 <        public int getLastHeartbeat() {
247 >        /**
248 >         * Get the time of the last heartbeat
249 >         */
250 >        public long getLastHeartbeat() {
251              return _lastHeartbeat;
252          }
253          
254 <        private int _lastHeartbeat;
255 <    }  
256 <
254 >        /**
255 >         * Get the Register
256 >         */
257 >        public Register getRegister() {
258 >            return _register;
259 >        }
260 >        
261 >        /**
262 >         * last heartbeat time
263 >         */
264 >        private long _lastHeartbeat;
265 >        
266 >        /**
267 >         * register ref
268 >         */
269 >        private Register _register;
270 >    }
271 >    
272 >    /**
273 >     * This worker thread just checks all the hosts and then
274 >     * waits a period of time before doing it again. It sends
275 >     * Alerts as required.
276 >     */
277 >    private class HeartbeatWorker extends Thread {
278 >        
279 >        /**
280 >         * The main run method of this worker thread. It simply
281 >         * checks through all the hosts it has stored, running
282 >         * the analyseHB method on each. It then removes any
283 >         * that have passed a FINAL, and waits a (configured)
284 >         * length of time before doing it again.
285 >         */
286 >        public void run() {
287 >            ConfigurationProxy cp = ConfigurationProxy.getInstance();
288 >            while(true) {
289 >                // the cycle period of this monitor's checks (in seconds)
290 >                int checkPeriod = 0;
291 >                try {
292 >                    checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod"));
293 >                } catch (PropertyNotFoundException e) {
294 >                    checkPeriod = DEFAULT_CHECK_PERIOD;
295 >                    _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds");
296 >                } catch (NumberFormatException e) {
297 >                    checkPeriod = DEFAULT_CHECK_PERIOD;
298 >                    _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds");
299 >                }
300 >                
301 >                synchronized(Heartbeat__Monitor.this) {
302 >                    // perform the checks on every host currently held in _hosts
303 >                    Iterator i = _hosts.keySet().iterator();
304 >                    while(i.hasNext()) {
305 >                        // get host
306 >                        String source = (String) i.next();
307 >                        // check it
308 >                        boolean remove = analyseHB(source);
309 >                        // remove it if it's passed a FINAL
310 >                        if(remove) {
311 >                            i.remove();
312 >                        }
313 >                    }
314 >                }
315 >                
316 >                // wait a while
317 >                try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {}
318 >            }
319 >        }
320 >        
321 >        /**
322 >         * Analyses a given host's state, and if need be generates
323 >         * a relevant Alert. Note that it also checks if the last
324 >         * alert sent is FINAL, in which case it returns true to
325 >         * indicate removal of this host.
326 >         *
327 >         * @param source the host to check
328 >         * @return whether this host can be deleted
329 >         */
330 >        private boolean analyseHB(String source) {
331 >            ConfigurationProxy cp = ConfigurationProxy.getInstance();
332 >            HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source);
333 >            Register reg = hbHolder.getRegister();
334 >            
335 >            // get host's HB interval (seconds)
336 >                // this should always exist; if it is missing or invalid we fall back to 0
337 >            int hostHBinterval = 0;
338 >            try {
339 >                hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.UDPUpdateTime"));
340 >            } catch (PropertyNotFoundException e) {
341 >                hostHBinterval = 0;
342 >                _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "UDPUpdateTime value unavailable using default of " + hostHBinterval + " seconds");
343 >            } catch (NumberFormatException e) {
344 >                hostHBinterval = 0;
345 >                _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous UDPUpdateTime value in configuration using default of " + hostHBinterval + " seconds");
346 >            }
347 >            
348 >            // get host's last HB time (seconds)
349 >            long lastHeartbeat = hbHolder.getLastHeartbeat();
350 >            // time since last heartbeat (seconds)
351 >            long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat;
352 >            // time since (or until if negative) the expected heartbeat
353 >            long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval;
354 >            
355 >            // best do a check in case the expected heartbeat is in the future
356 >            if(timeSinceExpectedHB < 0) {
357 >                timeSinceExpectedHB = 0;
358 >            }
359 >            
360 >            // find out the threshold level we're at
361 >            int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg);
362 >            
363 >            // process the alert
364 >            Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB));
365 >            
366 >            if(reg.getLastAlertLevel() == Alert.alertFINAL) {
367 >                return true;
368 >            }
369 >            return false;
370 >        }
371 >    }
372   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines