Viewing file: health.py (3.38 KB) -rw-r--r-- Select action/file-type: (+) | (+) | (+) | Code (+) | Session (+) | (+) | SDB (+) | (+) | (+) | (+) | (+) | (+) |
"""This module implements health status reporting for watchdog operation.
The module receives important health metrics and exports an overall health assessment. This health assessment can be used by external watchdog scripts to initiate an agent restart.
Process health is assessed as follows (the first matching rule decides the verdict):

* it is being shut down and shutdown timeout has not elapsed -> HEALTHY
* it is being shut down and shutdown timeout has elapsed -> FAULTY
* it is not registered -> HEALTHY
* process was started more than 6 hours ago and no data was sent to server within last 6 hours -> FAULTY
* process was started more than 18 hours ago and no data was received from server within last 18 hours -> FAULTY

Otherwise process is considered HEALTHY.
As the agent exports this information through its RPC interface, there is an additional implicit "health" requirement:

* it responds to RPC requests.

This implicit requirement is considered valid because the UI fully depends on RPC, so it does not make the health assessment any worse than it should be."""
import collections
# Result of a health assessment: ``healthy`` is the boolean verdict and
# ``why`` is a short human-readable reason for it.
HealthStatus = collections.namedtuple("HealthStatus", "healthy why")
class HealthSensor:
    """Accumulates agent lifecycle events and derives an overall health verdict.

    A freshly constructed sensor starts from these assumptions:

    * the process was started long ago;
    * no shutdown has been requested;
    * the last data from the server arrived long ago;
    * the last data to the server was sent long ago;
    * the agent is registered (license is valid).

    As a result, the initial health status is False (faulty).
    """

    # Time limits, in seconds.
    SHUTDOWN_TIMEOUT = 10 * 60
    SEND_WINDOW = 6 * 60 * 60
    RECEIVE_WINDOW = 18 * 60 * 60

    def __init__(self):
        # Every timestamp starts at 0.0.  For ``_shutdown_at`` this means
        # "no shutdown requested": status() only reacts to values > 0.
        self._started_at = self._shutdown_at = 0.0
        self._last_received = self._last_sent = 0.0
        self._is_registered = True

    def starting(self, when: float) -> None:
        """Remember ``when`` as the agent startup time."""
        self._started_at = when

    def shutting_down(self, when: float) -> None:
        """Remember ``when`` as the time an external shutdown was requested."""
        self._shutdown_at = when

    def server_data_received(self, when: float) -> None:
        """Remember ``when`` as the latest time data arrived from the server."""
        self._last_received = when

    def server_data_sent(self, when: float) -> None:
        """Remember ``when`` as the latest time data was sent to the server."""
        self._last_sent = when

    def registered(self) -> None:
        """Flag the agent as registered."""
        self._is_registered = True

    def unregistered(self) -> None:
        """Flag the agent as not registered."""
        self._is_registered = False

    def status(self, now: float) -> HealthStatus:
        """Assess the agent's health at the moment ``now``.

        The first matching rule wins:

        1. a shutdown was requested: faulty once SHUTDOWN_TIMEOUT has
           elapsed since the request, healthy before that;
        2. the agent is not registered: healthy;
        3. the process has been up for at least RECEIVE_WINDOW and nothing
           was received from the server within RECEIVE_WINDOW: faulty;
        4. the process has been up for at least SEND_WINDOW and nothing
           was sent to the server within SEND_WINDOW: faulty;
        5. otherwise: healthy.

        Returns a HealthStatus carrying the verdict and its reason.
        """
        if self._shutdown_at > 0:
            overdue = now - self._shutdown_at >= self.SHUTDOWN_TIMEOUT
            return (
                HealthStatus(False, "stuck at shutdown")
                if overdue
                else HealthStatus(True, "shutdown is in progress")
            )

        if not self._is_registered:
            return HealthStatus(True, "not registered")

        # A silence only counts once the process itself has been running
        # for at least the corresponding window.
        uptime = now - self._started_at

        window = self.RECEIVE_WINDOW
        if uptime >= window and now - self._last_received >= window:
            return HealthStatus(False, "no data received from server")

        window = self.SEND_WINDOW
        if uptime >= window and now - self._last_sent >= window:
            return HealthStatus(False, "no data sent to server")

        return HealthStatus(True, "all is ok")
# Module-level HealthSensor instance, created at import time; presumably the
# shared instance other modules report events to (verify against importers).
sensor = HealthSensor()
|