ok
Direktori : /usr/lib/fm-agent/plugins/ |
Current File : //usr/lib/fm-agent/plugins/cpu_usage.py |
"""CPU usage and load-average plugin.

Exposes load averages and CPU usage percentages through
``CPUUsagePlugin``.  The data source is chosen from ``sys.platform``:
``uptime``/``iostat`` (AIX, macOS, FreeBSD), ``uptime``/``mpstat`` (SunOS),
``esxtop`` (VMware), ``top`` (HP-UX) and ``top``/``psutil`` everywhere else.
"""
import agent_util
import time
import sys
import platform
import os
import socket

# NOTE: this deliberately shadows the builtin ``float`` with the helper
# exported by agent_util; every float() call below goes through it.
from agent_util import float

try:
    import psutil
except Exception:
    # Best effort: without psutil the default (Linux) code path reports
    # itself as unsupported instead of failing at import time.
    psutil = None

try:
    import distro
except Exception:
    distro = None


# Counter names in the order they appear on a "cpu" line of /proc/stat.
_PROC_STAT_FIELDS = (
    "user",
    "nice",
    "system",
    "idle",
    "iowait",
    "irq",
    "softirq",
    "steal",
    "guest",
    "guest_nice",
)

# Labels of the percentage metrics that take no option.  The
# "softirg_usage_percentage" spelling is the textkey both get_metadata() and
# check() use, so it must stay as it is.
_PERCENT_LABELS = {
    "user_usage_percentage": "User usage percentage",
    "system_usage_percentage": "System usage percentage",
    "idle_usage_percentage": "Idle usage percentage",
    "iowait_usage_percentage": "I/O Wait usage percentage",
    "cpu_entitlement_percentage": "CPU entitlement percentage",
    "irq_usage_percentage": "Hardware IRQ usage percentage",
    "softirg_usage_percentage": "Software IRQ usage percentage",
    "stealtime_usage_percentage": "Steal Time usage percentage",
    "nice_usage_percentage": "Nice usage percentage",
}

# Position of each load average among the trailing fields of an
# ``uptime`` / ``top`` summary line.
_LOAD_AVERAGE_INDEX = {
    "load_average.1": -3,
    "load_average.5": -2,
    "load_average.15": -1,
}


def search_esxtop(headers, search_string):
    """Return the index of the first header containing ``search_string``.

    Returns None when no header matches.  Index 0 is a valid result, so
    callers must compare against None rather than test truthiness.
    """
    for idx, column in enumerate(headers):
        if search_string in column:
            return idx
    return None


def get_cpu_metrics(cls):
    """Parse ``cat /proc/stat`` into a dict of per-CPU counters.

    ``cls`` only needs a ``log`` attribute.  The aggregate "cpu" line is
    stored under the key "Total"; per-core lines keep their own name
    ("cpu0", "cpu1", ...).  Lines carrying at least ten counters get all ten
    names, lines carrying seven to nine counters get the first seven, and
    shorter lines are ignored.
    """
    retcode, output = agent_util.execute_command("cat /proc/stat")
    cls.log.debug("cat /proc/stat output: %s" % str(output))

    cpus = {}
    for line in output.splitlines():
        if not line.startswith("cpu"):
            continue
        # A list comprehension behaves the same on Python 2 and 3.
        parts = [p for p in line.split(" ") if p]
        core = parts[0]
        if core == "cpu":
            core = "Total"
        if len(parts) >= 11:
            names = _PROC_STAT_FIELDS
        elif len(parts) >= 8:
            # Name + seven counters is enough for the short form.
            names = _PROC_STAT_FIELDS[:7]
        else:
            continue
        values = map(int, parts[1 : 1 + len(names)])
        cpus[core] = dict(zip(names, values))
    return cpus


def _metric(label, status, msg, unit="percent", options=None, bounds=None):
    """Build one metadata entry; ``bounds`` is an optional (min, max) pair."""
    entry = {
        "label": label,
        "options": options,
        "status": status,
        "error_message": msg,
    }
    if bounds is not None:
        entry["min_value"], entry["max_value"] = bounds
    entry["unit"] = unit
    return entry


def _build_metadata(
    status, msg, usage_options, extra_textkeys, usage_label="Usage percentage"
):
    """Build the metadata shared by every platform.

    Contains the three load averages, "usage_percentage" (with
    ``usage_options``) and one option-less percentage entry for each textkey
    in ``extra_textkeys``.
    """
    metadata = {}
    for minutes in (1, 5, 15):
        metadata["load_average.%d" % minutes] = _metric(
            "%d minute CPU load average" % minutes, status, msg, unit="avg"
        )
    metadata["usage_percentage"] = _metric(
        usage_label, status, msg, options=usage_options
    )
    for textkey in extra_textkeys:
        metadata[textkey] = _metric(_PERCENT_LABELS[textkey], status, msg)
    return metadata


def _uptime_load_average(textkey):
    """Return the load average named by ``textkey`` from ``uptime``.

    Returns None for a textkey that is not one of the three load averages.
    """
    retcode, load = agent_util.execute_command("uptime")
    fields = load.strip().split()
    index = _LOAD_AVERAGE_INDEX.get(textkey)
    if index is None:
        return None
    return float(fields[index].strip(","))


class CPUUsagePlugin(agent_util.Plugin):
    """Plugin reporting CPU load averages and usage percentages."""

    textkey = "cpu_usage"
    label = "CPU"

    @classmethod
    def get_metadata(cls, config):
        """Describe the metrics available on the current platform.

        Returns a dict mapping each metric textkey to its label, options,
        status, error message and unit.

        Raises ValueError on the default (Linux) path when neither
        ``platform.dist()`` nor the ``distro`` module is available.
        """
        status = agent_util.SUPPORTED
        msg = None
        plat = sys.platform

        if "aix" in plat:
            return _build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                    "iowait_usage_percentage",
                    "cpu_entitlement_percentage",
                ),
            )

        if "sunos" in plat:
            return _build_metadata(
                status,
                msg,
                sorted(get_cpu_metrics(cls).keys()),
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                    "iowait_usage_percentage",
                ),
            )

        if "freebsd" in plat or "darwin" in plat:
            return _build_metadata(
                status,
                msg,
                ["Total"],
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                ),
            )

        if "hp-ux" in plat:
            return _build_metadata(
                status,
                msg,
                ["Total"],
                (
                    "user_usage_percentage",
                    "system_usage_percentage",
                    "idle_usage_percentage",
                ),
                usage_label="Total Usage percentage",
            )

        if "vmware" in plat:
            # Gather the CPU cores that can be monitored and add a Total
            # aggregation at the end.
            ret, out = agent_util.execute_command(
                'esxcli hardware cpu list | grep "CPU:"'
            )
            cpus = ["Cpu (%s)" % c.split(":")[1] for c in out.split("\n") if c != ""]
            cpus.append("Total")

            data = _build_metadata(status, msg, cpus, ())
            # On VMware both per-CPU metrics are bounded to 0..100.
            for key in ("usage_percentage", "idle_usage_percentage"):
                label = "Usage percentage"
                if key != "usage_percentage":
                    label = _PERCENT_LABELS[key]
                data[key] = _metric(label, status, msg, options=cpus, bounds=(0, 100))
            return data

        # Default path (core Linux).
        if psutil is None:
            # Unable to import psutil
            cls.log.info(
                "Unable to import psutil library, no process metrics available"
            )
            status = agent_util.UNSUPPORTED
            msg = "Unable to import psutil library, please install and rebuild metadata"

        if not agent_util.which("top", exc=False):
            cls.log.info("top binary not found")
            status = agent_util.UNSUPPORTED
            msg = "top binary not found"

        try:
            distro_info = platform.dist()
        except AttributeError:
            # platform.dist() is missing here; fall back to the distro module.
            if distro:
                distro_info = distro.linux_distribution()
                distro_info = ". ".join(distro_info)
            else:
                raise ValueError(
                    "Unable to grab distribution information. Please verify dependencies. Distro for Python3.8"
                )

        # NOTE(review): these lookups are case-sensitive and behave
        # differently for the platform.dist() result and the joined distro
        # string -- confirm the fallback still matches the intended distros.
        if (
            "centos" in distro_info
            or "redhat" in distro_info
            or "oracle" in distro_info
        ) and not agent_util.which("iostat", exc=False):
            cls.log.info("Missing sysstat package.")
            status = agent_util.UNSUPPORTED
            msg = "iostat/sysstat binary not found. Please install"

        return _build_metadata(
            status,
            msg,
            sorted(get_cpu_metrics(cls).keys()),
            (
                "user_usage_percentage",
                "system_usage_percentage",
                "idle_usage_percentage",
                "iowait_usage_percentage",
                "irq_usage_percentage",
                "softirg_usage_percentage",
                "stealtime_usage_percentage",
                "nice_usage_percentage",
            ),
        )

    def check(self, textkey, data, config=None):
        """Return the current value of the metric ``textkey``.

        ``data`` is the selected option ("Total" or a core name) for metrics
        that take one.  ``config`` is an optional dict; only its "debug" key
        is read.  Returns a number, or None when the value cannot be
        determined (the default path returns 0 for a textkey it does not
        recognise).
        """
        if config is None:
            config = {}

        plat = sys.platform
        if "aix" in plat or "darwin" in plat or "freebsd" in plat:
            return self._check_iostat(textkey)
        if "sunos" in plat:
            return self._check_sunos(textkey)
        if "vmware" in plat:
            return self._check_vmware(textkey, data)
        if "hp-ux" in plat:
            return self._check_hpux(textkey)
        return self._check_linux(textkey, data, config)

    def _check_iostat(self, textkey):
        """AIX / macOS / FreeBSD: load from ``uptime``, usage from ``iostat``."""
        if textkey.startswith("load_average"):
            return _uptime_load_average(textkey)

        iostat = str(agent_util.which("iostat"))
        if "aix" in sys.platform:
            retcode, output = agent_util.execute_command(iostat + " | grep -p tty")
        else:
            retcode, output = agent_util.execute_command(
                iostat + " -C -c 2 | tail -1"
            )
        output = output.strip().split("\n")
        self.log.debug("iostat output: %s" % output)

        enti = False
        entc = 0
        inuse = 0
        user = 0
        system = 0
        idle = 0
        iowait = 0
        for line in output:
            if line.startswith("tty"):
                # Header line: an "entc" last column means the data line
                # ends with an entitlement value.
                if "entc" in line.split()[-1]:
                    enti = True
                continue
            fields = line.split()
            if not fields:
                continue
            if "darwin" in sys.platform:
                user = float(fields[-6])
                system = float(fields[-5])
                idle = float(fields[-4])
            elif "freebsd" in sys.platform:
                # Was float(-5): the literal -5, not the fifth field from
                # the end.
                user = float(fields[-5])
                idle = float(fields[-1])
                system = float(fields[-3])
            else:
                user = float(fields[2])
                system = float(fields[3])
                idle = float(fields[4])
                iowait = float(fields[5])
                if enti:
                    entc = float(fields[-1])
            inuse = 100.0 - idle

        values = {
            "usage_percentage": inuse,
            "user_usage_percentage": user,
            "system_usage_percentage": system,
            "idle_usage_percentage": idle,
            "iowait_usage_percentage": iowait,
        }
        if textkey in values:
            return values[textkey]
        if textkey == "cpu_entitlement_percentage" and enti:
            return entc
        # If we got here, we don't know how to gather this metric.
        return None

    def _check_sunos(self, textkey):
        """SunOS: load from ``uptime``, usage from the first ``mpstat`` row."""
        if textkey.startswith("load_average"):
            return _uptime_load_average(textkey)

        retcode, output = agent_util.execute_command("mpstat")
        for line in output.split("\n"):
            if "CPU" in line or not line:
                continue
            fields = line.split()
            if textkey == "usage_percentage":
                return 100.0 - float(fields[-1])
            elif textkey == "user_usage_percentage":
                return float(fields[-4])
            elif textkey == "system_usage_percentage":
                return float(fields[-3])
            elif textkey == "idle_usage_percentage":
                return float(fields[-1])
            elif textkey == "iowait_usage_percentage":
                return float(fields[-2])
        # If we got here we don't know how to gather this metric for Solaris
        return None

    def _check_vmware(self, textkey, data):
        """VMware: read the metric from ``esxtop`` batch (CSV) output."""
        hostname = socket.gethostname()
        search_string = "\\\\%s\\Physical " % hostname

        # actually gather the data to parse
        ret, out = agent_util.execute_command(
            "esxtop -b -n 2 -d 2", cache_timeout=agent_util.DEFAULT_CACHE_TIMEOUT
        )
        out_list = out.split("\n")
        headers = out_list[0].replace('"', "").split(",")

        # Keep the last non-empty row, i.e. the most recent sample.
        esxtop_data = []
        for row in out_list:
            if row:
                esxtop_data = row.replace('"', "").split(",")

        # finish building search string
        if textkey.startswith("load_average"):
            search_string += (
                "Cpu Load\\Cpu Load (%s Minute Avg)" % textkey.split(".")[-1]
            )
        elif data and (
            textkey == "usage_percentage" or textkey == "idle_usage_percentage"
        ):
            if data == "Total":
                search_string += "Cpu(_Total)"
            else:
                search_string += data
            search_string += "\\% Processor Time"

        # find index from headers and match to esxtop_data collected
        search_idx = search_esxtop(headers, search_string)
        if search_idx is None:
            self.log.error("Unable to parse ESXTOP output for %s" % search_string)
            return None

        if textkey == "idle_usage_percentage":
            return 100 - float(esxtop_data[search_idx])
        return float(esxtop_data[search_idx])

    def _check_hpux(self, textkey):
        """HP-UX: parse the summary lines printed by ``top``."""
        # add terminal specification for hpux
        os.environ["TERM"] = "xterm"
        # !!! applicable to HP-UX 11.31 !!!
        ret, out = agent_util.execute_command("top -s2 -d2", env=os.environ)
        top = out.strip().splitlines()
        self.log.debug(top)

        metric_mapping = {}
        cpu_str = ""
        load_str = ""
        for line in top:
            if line.lower().startswith("avg"):
                cpu_str = line
            elif line.lower().startswith("load averages"):
                load_str = line

        cpu = cpu_str.replace("%", "").split()
        self.log.debug(cpu)
        metric_mapping["user_usage_percentage"] = float(cpu[2])
        metric_mapping["system_usage_percentage"] = float(cpu[4])
        metric_mapping["idle_usage_percentage"] = float(cpu[5])
        metric_mapping["usage_percentage"] = (
            100.0 - metric_mapping["idle_usage_percentage"]
        )

        load = load_str.strip().replace(",", "").split()
        self.log.debug(load)
        self.log.debug("'%s'" % load[4][:4])
        metric_mapping["load_average.1"] = float(load[2])
        metric_mapping["load_average.5"] = float(load[3])
        metric_mapping["load_average.15"] = float(load[4][:4])

        value = metric_mapping.get(textkey)
        if value is None:
            # Unknown textkey: report "no value" instead of raising from
            # float(None).
            return None
        return float(value)

    def _check_linux(self, textkey, data, config):
        """Default path: load from ``top``, usage from psutil counters."""
        if psutil is None:
            self.log.error("PSUTIL PACKAGE MISSING! UNABLE TO COLLECT CPU METRICS")
            return None

        if textkey.startswith("load_average"):
            return self._linux_load_average(textkey, config)
        if textkey.endswith("usage_percentage") and textkey != "usage_percentage":
            return self._linux_detailed_usage(textkey)
        if textkey == "usage_percentage":
            if str(data).lower() == "total":
                return self._linux_total_usage()
            return self._linux_core_usage(data)
        return 0

    def _linux_load_average(self, textkey, config):
        """Read a load average from the summary line of ``top -b``."""
        retcode, output = agent_util.execute_command("top -b -n 2 -d 0.5")
        if config.get("debug", False):
            self.log.debug(
                "#####################################################"
            )
            self.log.debug("CPU usage command 'top -b -n 2 -d 0.5:")
            self.log.debug(str(output))
            self.log.debug(
                "#####################################################"
            )
        self.log.debug("top -b -n 2 -d 0.5: %s" % str(output))

        lines = output.splitlines()
        blank_positions = [0] + [pos for pos, item in enumerate(lines) if item == ""]
        # Start reading at the second blank line; fall back to the beginning
        # when the output has fewer blank lines than that.
        start = blank_positions[2] if len(blank_positions) > 2 else 0
        lines = [line for line in lines[start:] if line.strip()]

        index = _LOAD_AVERAGE_INDEX.get(textkey)
        if index is None or not lines:
            return None
        fields = lines[0].split()
        return float(fields[index].strip(","))

    def _linux_detailed_usage(self, textkey):
        """Percentage of time spent in one CPU state since the last sample."""
        num_cores = psutil.cpu_count()
        usage_textkey_map = {
            "user_usage_percentage": "user",
            "system_usage_percentage": "system",
            "idle_usage_percentage": "idle",
            "iowait_usage_percentage": "iowait",
            "irq_usage_percentage": "irq",
            "softirg_usage_percentage": "softirq",
            "stealtime_usage_percentage": "steal",
            "nice_usage_percentage": "nice",
        }
        key_name = usage_textkey_map.get(textkey, None)
        if key_name is None:
            self.log.error("Unknown resource textkey '%s'!" % textkey)
            return None

        c = self.get_cache_results("psutil", "detailed_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % c)
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % getattr(cur_cpu, key_name))
        last_cpu = c[0][1] if c else None
        self.cache_result("psutil", "detailed_cpu_usage", cur_cpu, replace=True)
        if last_cpu is None:
            return None

        use_diff = (
            getattr(cur_cpu, key_name) - getattr(last_cpu, key_name)
        ) / num_cores
        if use_diff < 0:
            # The system was likely rebooted, and the cached
            # CPU stats are no longer relevant.
            # Cache new values and exit without reporting a value.
            return None
        elapsed = c[0][0]
        if not elapsed:
            # No time recorded between the two samples: nothing to report.
            return None
        return (use_diff / elapsed) * 100.0

    def _linux_total_usage(self):
        """Overall usage: 100 minus the idle + steal share since last sample."""
        num_cores = psutil.cpu_count()
        c = self.get_cache_results("psutil", "total_cpu_usage")
        self.log.debug("Retrieved cached value:\n%s" % c)
        cur_cpu = psutil.cpu_times()
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        last_cpu = c[0][1] if c else None
        self.cache_result("psutil", "total_cpu_usage", cur_cpu, replace=True)
        if last_cpu is None:
            return None

        idle_diff = (cur_cpu.idle - last_cpu.idle) / num_cores
        steal_diff = (cur_cpu.steal - last_cpu.steal) / num_cores
        if idle_diff < 0 or steal_diff < 0:
            # The system was likely rebooted, and the cached
            # CPU stats are no longer relevant.
            # Cache new values and exit without reporting a value.
            return None
        use_diff = idle_diff + steal_diff
        # Instead of using the time between cached samples, calculate the
        # exact time between measures by subtracting the previous sum of all
        # counters from the current one. This avoids issues where our usage
        # was too small and the seconds of the extra cache would give a
        # negative result.
        elapsed = (sum(cur_cpu) - sum(last_cpu)) / float(num_cores)
        if elapsed <= 0:
            return None
        return 100 - ((use_diff / elapsed) * 100.0)

    def _linux_core_usage(self, data):
        """Usage of the single core named by ``data`` (e.g. "cpu0")."""
        self.log.debug("Checking for core %s" % data)
        c = self.get_cache_results("psutil", "%s_cpu_usage" % data)
        self.log.debug("Retrieved cached value:\n%s" % c)
        try:
            cur_cpu = psutil.cpu_times(percpu=True)[int(str(data).strip("cpu"))]
        except (IndexError, ValueError):
            self.log.critical("UNABLE TO FIND CPU #%s" % data)
            return None
        self.log.debug("Retrieved instant value:\n%s" % cur_cpu.idle)
        last_cpu = c[0][1] if c else None
        self.cache_result("psutil", "%s_cpu_usage" % data, cur_cpu, replace=True)
        if last_cpu is None:
            return None

        idle_diff = cur_cpu.idle - last_cpu.idle
        steal_diff = cur_cpu.steal - last_cpu.steal
        if idle_diff < 0 or steal_diff < 0:
            # The system was likely rebooted, and the cached
            # CPU stats are no longer relevant.
            # Cache new values and exit without reporting a value.
            return None
        use_diff = idle_diff + steal_diff
        elapsed = sum(cur_cpu) - sum(last_cpu)
        if elapsed <= 0:
            return None
        return 100 - ((use_diff / elapsed) * 100.0)