ViewVC Help
View File | Revision Log | Show Annotations | Revision Graph | Root Listing
root/i-scream/projects/cms/source/server/uk/org/iscream/cms/server/client/monitors/Heartbeat__Monitor.java
(Generate patch)

Comparing projects/cms/source/server/uk/org/iscream/cms/server/client/monitors/Heartbeat__Monitor.java (file contents):
Revision 1.3 by tdb, Mon Mar 5 23:14:30 2001 UTC vs.
Revision 1.22 by tdb, Sat May 18 18:16:00 2002 UTC

# Line 1 | Line 1
1 + /*
2 + * i-scream central monitoring system
3 + * Copyright (C) 2000-2002 i-scream
4 + *
5 + * This program is free software; you can redistribute it and/or
6 + * modify it under the terms of the GNU General Public License
7 + * as published by the Free Software Foundation; either version 2
8 + * of the License, or (at your option) any later version.
9 + *
10 + * This program is distributed in the hope that it will be useful,
11 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 + * GNU General Public License for more details.
14 + *
15 + * You should have received a copy of the GNU General Public License
16 + * along with this program; if not, write to the Free Software
17 + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18 + */
19 +
20   //---PACKAGE DECLARATION---
21 < package uk.ac.ukc.iscream.client.monitors;
21 > package uk.org.iscream.cms.server.client.monitors;
22  
23   //---IMPORTS---
24   import java.util.HashMap;
25   import java.util.Iterator;
26 < import uk.ac.ukc.iscream.client.*;
27 < import uk.ac.ukc.iscream.core.*;
28 < import uk.ac.ukc.iscream.util.*;
29 < import uk.ac.ukc.iscream.componentmanager.*;
26 > import java.util.StringTokenizer;
27 > import uk.org.iscream.cms.server.client.*;
28 > import uk.org.iscream.cms.server.core.*;
29 > import uk.org.iscream.cms.server.util.*;
30 > import uk.org.iscream.cms.server.componentmanager.*;
31  
32   /**
33 < * This Monitor watches heartbeats
33 > * This Monitor watches heartbeats.
34 > * It generates an alert when a heartbeat that is expected
35 > * does not arrive. Unlike all the other monitors, this one
36 > * is driven by an event *not* occurring, rather than an
37 > * event occurring. This means it must be actively checking
38 > * for missing heartbeats, and thus has an extra inner class
39 > * thread.
40   *
41   * @author  $Author$
42   * @version $Id$
43   */
44 < public class Heartbeat__Monitor implements PluginMonitor, Runnable {
44 > public class Heartbeat__Monitor extends MonitorSkeleton {
45  
46   //---FINAL ATTRIBUTES---
47  
# Line 24 | Line 50 | public class Heartbeat__Monitor implements PluginMonit
50       */
51      public final String REVISION = "$Revision$";
52      
53 +    /**
54 +     * A description of this monitor
55 +     */
56      public final String DESC = "Monitors Heartbeats.";
57      
58 +    /**
59 +     * The default (used if not configured) period at
60 +     * which to check for old heartbeats. (in seconds)
61 +     */
62      public final int DEFAULT_CHECK_PERIOD = 60;
63      
64   //---STATIC METHODS---
65  
66   //---CONSTRUCTORS---
67 <
67 >    
68 >    /**
69 >     * Constructs a new Heartbeat monitor, and starts off
70 >     * the worker thread.
71 >     */
72      public Heartbeat__Monitor() {
73 <        new Thread(this).start();
73 >        super();
74 >        createInitialHosts();
75 >        new HeartbeatWorker().start();
76      }
77  
78   //---PUBLIC METHODS---
79      
80 <    public void run() {
81 <        ConfigurationProxy cp = ConfigurationProxy.getInstance();
82 <        while(true) {
83 <            // this cycle period of this monitor's checks
84 <            int checkPeriod = 0;
85 <            try {
47 <                checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod"));
48 <            } catch (PropertyNotFoundException e) {
49 <                checkPeriod = DEFAULT_CHECK_PERIOD;
50 <                _logger.write(toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds");
51 <            } catch (NumberFormatException e) {
52 <                checkPeriod = DEFAULT_CHECK_PERIOD;
53 <                _logger.write(toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds");
54 <            }
55 <            
56 <            // perform the checks (use HB hash, although they *should* be the same)
57 <            Iterator i = _hostsHB.keySet().iterator();
58 <            while(i.hasNext()) {
59 <                // get host
60 <                String source = (String) i.next();
61 <                // check it
62 <                analyseHB(source);
63 <            }
64 <            
65 <            // wait a while
66 <            try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {}
67 <        }
68 <    }
69 <    
70 <    // only use attribute num 0 :)
71 <    public void analyseHB(String source) {
72 <        ConfigurationProxy cp = ConfigurationProxy.getInstance();
73 <        Register reg = (Register) _hostsReg.get(source);
74 <        
75 <        // get host's HB interval (seconds)
76 <        // this should always exist, thus we set to 0
77 <        int hostHBinterval = 0;
78 <        try {
79 <            hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "TCPUpdateTime"));
80 <        } catch (PropertyNotFoundException e) {
81 <            hostHBinterval = 0;
82 <            _logger.write(toString(), Logger.WARNING, "TCPUpdateTime value unavailable using default of " + hostHBinterval + " seconds");
83 <        } catch (NumberFormatException e) {
84 <            hostHBinterval = 0;
85 <            _logger.write(toString(), Logger.WARNING, "Erronous TCPUpdateTime value in configuration using default of " + hostHBinterval + " seconds");
86 <        }
87 <
88 <        // get host's last HB time (seconds)
89 <        long lastHeartbeat = ((HeartbeatHolder) _hostsHB.get(source)).getLastHeartbeat();
90 <        // time since last heartbeat (seconds)
91 <        long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat;
92 <        
93 <        // find out the threshold level we're at
94 <        int result = checkAttributeThreshold(timeSinceLastHB, reg);
95 <            
96 <        // decide what threshold level we're on, if we've changed, record that
97 <        if (result != reg.getLastThresholdLevel(0)) {
98 <            reg.setLastThresholdLevel(0, result);
99 <        }
100 <            
101 <        // as long as this isn't a normal level
102 <        if(reg.getLastThresholdLevel(0) != Alert.thresholdNORMAL) {
103 <            // if the time since the last alert is more than the time for
104 <            // its timeout, fire an alert, escalate the alert
105 <            long timeout = reg.getLastAlertTimeout(0);
106 <            if ((timeout > 0) && (reg.getTimeLastSent(0) > 0)) {
107 <                if((System.currentTimeMillis() - reg.getTimeLastSent(0)) > timeout) {
108 <                    int lastAlert = reg.getLastAlertLevel(0);
109 <                    reg.escalateAlert(0);
110 <                    reg.setTimeLastSent(0, System.currentTimeMillis());
111 <                    reg.setLastAlertTimeout(0, reg.getAlertTimeout(reg.getLastAlertLevel(0), 0));
112 <                    // -- SEND
113 <                    fireAlert(source, timeSinceLastHB, reg, lastAlert);
114 <                }
115 <            // if we don't have a timeout configured...we go STRAIGHT to the next level
116 <            } else {
117 <                int lastAlert = reg.getLastAlertLevel(0);
118 <                reg.escalateAlert(0);
119 <                reg.setTimeLastSent(0, System.currentTimeMillis());
120 <                reg.setLastAlertTimeout(0, reg.getAlertTimeout(reg.getLastAlertLevel(0), 0));
121 <                // -- SEND
122 <                fireAlert(source, timeSinceLastHB, reg, lastAlert);
123 <            }
124 <                
125 <        // we must be on ok, check the timeout value for this
126 <        } else {
127 <            // if we were on an OK alert before, then we don't do anything
128 <            // but if we weren't we only set OK, once the timeout of the last
129 <            // alert has occurred
130 <            if (reg.getLastAlertLevel(0) != Alert.alertOK) {
131 <                long timeout = reg.getLastAlertTimeout(0);
132 <                if ((timeout > 0) && (reg.getTimeLastSent(0) > 0)) {
133 <                    if ((System.currentTimeMillis() - reg.getTimeLastSent(0)) > timeout) {
134 <                        int lastAlert = reg.getLastAlertLevel(0);
135 <                        reg.setLastAlertLevel(0, Alert.alertOK);
136 <                        reg.setTimeLastSent(0, System.currentTimeMillis());
137 <                        reg.setLastAlertTimeout(0, timeout);
138 <                        // -- SEND
139 <                        fireAlert(source, timeSinceLastHB, reg, lastAlert);
140 <                    }
141 <                }
142 <            }
143 <        }
144 <    }
145 <
80 >    /**
81 >     * Analyse a packet of data. In this case, this will just
82 >     * register the fact that a heartbeat has arrived.
83 >     *
84 >     * @param packet The packet of data to analyse
85 >     */
86      public void analysePacket(XMLPacket packet) {
87 <        if (packet.getParam("packet.attributes.type").equals("heartbeat")) {
88 <            String source = packet.getParam("packet.attributes.machine_name");
89 <            if (!_hostsHB.containsKey(source)) {
90 <                _hostsReg.put(source, new Register(source, _name, 1));
151 <                _hostsHB.put(source, new HeartbeatHolder());
87 >        String source = packet.getParam("packet.attributes.machine_name");
88 >        if (!_hosts.containsKey(source)) {
89 >            synchronized(this) {
90 >                _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
91              }
153            HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hostsHB.get(source);
154            lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
92          }
93 +        HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
94 +        lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
95      }
96      
97      /**
98       * Overrides the {@link java.lang.Object#toString() Object.toString()}
99       * method to provide clean logging (every class should have this).
100       *
101 <     * This uses the uk.ac.ukc.iscream.util.NameFormat class
101 >     * This uses the uk.org.iscream.cms.server.util.NameFormat class
102       * to format the toString()
103       *
104       * @return the name of this class and its CVS revision
# Line 180 | Line 119 | public class Heartbeat__Monitor implements PluginMonit
119  
120   //---PRIVATE METHODS---
121      
122 +    /**
123 +     * Checks whether the time since the last heartbeat
124 +     * is beyond the threshold(s).
125 +     *
126 +     * @param timeSinceLastHB a long time since the last heartbeat arrived
127 +     * @param reg the Register for this host
128 +     * @return the level which has been breached, if any
129 +     */
130      private int checkAttributeThreshold(long timeSinceLastHB, Register reg) {
131          for(int thresholdLevel = Alert.thresholdLevels.length - 1; thresholdLevel >= 0; thresholdLevel--) {
132              if (reg.getThreshold(thresholdLevel) != -1.0) {
# Line 188 | Line 135 | public class Heartbeat__Monitor implements PluginMonit
135                  }
136              }
137          }
138 <        return 0;
138 >        return Alert.thresholdNORMAL;
139      }
140 +    
141 +    /**
142 +     * Gets an initial list of hosts from the config
143 +     * and adds a fake set of heartbeats for them.
144 +     * If the hosts don't respond within the timeout
145 +     * period an alert will be raised.
146 +     *
147 +     * The effect of this is to allow us to know about
148 +     * hosts which weren't on when we started up, and
149 +     * will thus never have generated a heartbeat - yet
150 +     * we will still want to know they're not responding.
151 +     */
152 +    private void createInitialHosts() {
153 +        // get the initial list of hosts from the config
154 +        String initialHosts = "";
155 +        try {
156 +            initialHosts = _cp.getProperty(_name, "Monitor.Heartbeat.initialHosts");
157 +        } catch (PropertyNotFoundException e) {
158 +            // just leave initialHosts empty
159 +            _logger.write(Heartbeat__Monitor.this.toString(), Logger.DEBUG, "No initial list of hosts set, defaulting to none.");
160 +        }
161          
162 <    private void fireAlert(String source, long timeSinceLastHB, Register reg, int lastAlert) {
163 <        int alertLevel = reg.getLastAlertLevel(0);
164 <        int thresholdLevel = reg.getLastThresholdLevel(0);
165 <        String currentValue = String.valueOf(timeSinceLastHB);
166 <        String attributeName = "Heartbeat";
167 <        String thresholdValue = String.valueOf(reg.getThreshold(thresholdLevel));
168 <        String time = Long.toString(reg.getAlertTimeout(reg.getLastAlertLevel(0), 0) / 1000);
169 <        if (thresholdLevel == Alert.thresholdNORMAL) {
170 <            thresholdValue = "-";
162 >        // parse through the initial hosts adding them
163 >        StringTokenizer st = new StringTokenizer(initialHosts, ";");
164 >        while (st.hasMoreTokens()) {
165 >            String source = st.nextToken();
166 >            // check if they already exist, don't want to add them twice
167 >            if (!_hosts.containsKey(source)) {
168 >                synchronized(this) {
169 >                    _hosts.put(source, new HeartbeatHolder(new Register(source, _name)));
170 >                }
171 >            }
172 >            HeartbeatHolder lastHeartbeat = (HeartbeatHolder) _hosts.get(source);
173 >            // set a "fake" heartbeat
174 >            lastHeartbeat.setLastHeartbeat(System.currentTimeMillis()/1000);
175          }
204        if (alertLevel == Alert.alertOK) {
205            time = "0";
206        }
207        Alert alert = new Alert(alertLevel, lastAlert, thresholdLevel, source, thresholdValue, currentValue, attributeName, time);
208        _alerterQueue.add(alert);
209        _logger.write(toString(), Logger.DEBUG, "Fired alert for source:" + source + " at alert level:" + Alert.alertLevels[alertLevel] + " on:" + attributeName + " for threshold level:" + Alert.thresholdLevels[thresholdLevel] + " at:" +  currentValue + " exceeding threshold of:" +thresholdValue + " next alert sent in:" + time + "secs");
176      }
177  
178   //---ACCESSOR/MUTATOR METHODS---
179 <
179 >    
180 >    /**
181 >     * Returns a reference to the Queue we're getting data
182 >     * from. This is specific to this monitor.
183 >     *
184 >     * @return a reference to a Queue to get data from
185 >     */
186 >    protected Queue getQueue() {
187 >        return MonitorManager.getInstance().getHeartbeatQueue();
188 >    }
189 >    
190   //---ATTRIBUTES---
191  
192      /**
# Line 223 | Line 199 | public class Heartbeat__Monitor implements PluginMonit
199       * be changed to null for utility classes.
200       */
201      private String _name = "Heartbeat";
202 <
202 >    
203      /**
204 <     * This holds a reference to the
229 <     * system logger that is being used.
204 >     * A reference to the configuration proxy in use
205       */
206 <    private Logger _logger = ReferenceManager.getInstance().getLogger();
206 >    private ConfigurationProxy _cp = ConfigurationProxy.getInstance();
207      
208 <    private Queue _alerterQueue = ClientMain._alerterQueue;
208 >    /**
209 >     * A HashMap of hosts, with associated HeartbeatHolders.
210 >     */
211 >    private HashMap _hosts = new HashMap();
212      
213      /**
214 <     * A reference to the configuration proxy in use
214 >     * A reference to the system logger.
215       */
216 <    private ConfigurationProxy _cp = ConfigurationProxy.getInstance();
216 >    private Logger _logger = ReferenceManager.getInstance().getLogger();
217  
240    private HashMap _hostsHB = new HashMap();
241    private HashMap _hostsReg = new HashMap();
242
218   //---STATIC ATTRIBUTES---
219  
220   //---INNER CLASSES---
221 <
221 >    
222 >    /**
223 >     * This inner class simply holds some information
224 >     * about a specific host.
225 >     */
226      private class HeartbeatHolder {
227          
228 +        /**
229 +         * Construct a new HeartbeatHolder.
230 +         */
231 +        public HeartbeatHolder(Register register) {
232 +            _register = register;
233 +        }
234 +        
235 +        /**
236 +         * Set the time of the last heartbeat
237 +         */
238          public void setLastHeartbeat(long lastHeartbeat) {
239              _lastHeartbeat = lastHeartbeat;
240          }
241          
242 +        /**
243 +         * Get the time of the last heartbeat
244 +         */
245          public long getLastHeartbeat() {
246              return _lastHeartbeat;
247          }
248          
249 +        /**
250 +         * Get the Register
251 +         */
252 +        public Register getRegister() {
253 +            return _register;
254 +        }
255 +        
256 +        /**
257 +         * last heartbeat time
258 +         */
259          private long _lastHeartbeat;
260 <    }  
261 <
260 >        
261 >        /**
262 >         * register ref
263 >         */
264 >        private Register _register;
265 >    }
266 >    
267 >    /**
268 >     * This worker thread just checks all the hosts and then
269 >     * waits a period of time before doing it again. It sends
270 >     * Alerts as required.
271 >     */
272 >    private class HeartbeatWorker extends Thread {
273 >        
274 >        /**
275 >         * The main run method of this worker thread. It simply
276 >         * checks through all the hosts it has stored, running
277 >         * the analyseHB method on each. It then removes any
278 >         * that have passed a FINAL, and waits a (configured)
279 >         * length of time before doing it again.
280 >         */
281 >        public void run() {
282 >            ConfigurationProxy cp = ConfigurationProxy.getInstance();
283 >            while(true) {
284 >                // this cycle period of this monitor's checks
285 >                int checkPeriod = 0;
286 >                try {
287 >                    checkPeriod = Integer.parseInt(cp.getProperty(_name, "Monitor.Heartbeat.checkPeriod"));
288 >                } catch (PropertyNotFoundException e) {
289 >                    checkPeriod = DEFAULT_CHECK_PERIOD;
290 >                    _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Monitor.Heartbeat.checkPeriod value unavailable using default of " + checkPeriod + " seconds");
291 >                } catch (NumberFormatException e) {
292 >                    checkPeriod = DEFAULT_CHECK_PERIOD;
293 >                    _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous Monitor.Heartbeat.checkPeriod value in configuration using default of " + checkPeriod + " seconds");
294 >                }
295 >                
296 >                synchronized(Heartbeat__Monitor.this) {
297 >                    // perform the checks (use HB hash, although they *should* be the same)
298 >                    Iterator i = _hosts.keySet().iterator();
299 >                    while(i.hasNext()) {
300 >                        // get host
301 >                        String source = (String) i.next();
302 >                        // check it
303 >                        boolean remove = analyseHB(source);
304 >                        // remove it if it's passed a FINAL
305 >                        if(remove) {
306 >                            i.remove();
307 >                        }
308 >                    }
309 >                }
310 >                
311 >                // wait a while
312 >                try {Thread.sleep(checkPeriod * 1000);} catch (InterruptedException e) {}
313 >            }
314 >        }
315 >        
316 >        /**
317 >         * Analyses a given host's state, and if need be generates
318 >         * a relevant Alert. Note that it also checks if the last
319 >         * alert sent is FINAL, in which case it returns true to
320 >         * indicate removal of this host.
321 >         *
322 >         * @param source the host to check
323 >         * @return whether this host can be deleted
324 >         */
325 >        private boolean analyseHB(String source) {
326 >            ConfigurationProxy cp = ConfigurationProxy.getInstance();
327 >            HeartbeatHolder hbHolder = (HeartbeatHolder) _hosts.get(source);
328 >            Register reg = hbHolder.getRegister();
329 >            
330 >            // get host's HB interval (seconds)
331 >            // this should always exist, thus we set to 0
332 >            int hostHBinterval = 0;
333 >            try {
334 >                hostHBinterval = Integer.parseInt(cp.getProperty("Host."+source, "Host.TCPUpdateTime"));
335 >            } catch (PropertyNotFoundException e) {
336 >                hostHBinterval = 0;
337 >                _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "TCPUpdateTime value unavailable using default of " + hostHBinterval + " seconds");
338 >            } catch (NumberFormatException e) {
339 >                hostHBinterval = 0;
340 >                _logger.write(Heartbeat__Monitor.this.toString(), Logger.WARNING, "Erronous TCPUpdateTime value in configuration using default of " + hostHBinterval + " seconds");
341 >            }
342 >            
343 >            // get host's last HB time (seconds)
344 >            long lastHeartbeat = hbHolder.getLastHeartbeat();
345 >            // time since last heartbeat (seconds)
346 >            long timeSinceLastHB = (System.currentTimeMillis()/1000) - lastHeartbeat;
347 >            // time since (or until if negative) the expected heartbeat
348 >            long timeSinceExpectedHB = timeSinceLastHB - (long) hostHBinterval;
349 >            
350 >            // best do a check in case the expected heartbeat is in the future
351 >            if(timeSinceExpectedHB < 0) {
352 >                timeSinceExpectedHB = 0;
353 >            }
354 >            
355 >            // find out the threshold level we're at
356 >            int newThreshold = checkAttributeThreshold(timeSinceExpectedHB, reg);
357 >            
358 >            // process the alert
359 >            Heartbeat__Monitor.this.processAlert(newThreshold, "Heartbeat", reg, source, String.valueOf(timeSinceExpectedHB));
360 >            
361 >            if(reg.getLastAlertLevel() == Alert.alertFINAL) {
362 >                return true;
363 >            }
364 >            return false;
365 >        }
366 >    }
367   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines