#!/usr/bin/env python

#####################################################################
# xenmon is a front-end for xenbaked.
# There is a curses interface for live monitoring. XenMon also allows
# logging to a file. For options, run python xenmon.py -h
#
# Copyright (C) 2005,2006 by Hewlett Packard, Palo Alto and Fort Collins
# Authors: Lucy Cherkasova, lucy.cherkasova@hp.com
#          Rob Gardner, rob.gardner@hp.com
#          Diwaker Gupta, diwaker.gupta@hp.com
#####################################################################
#   This program is free software; you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; under version 2 of the License.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#####################################################################

import mmap
import struct
import os
import time
import optparse as _o
import curses as _c
import math
import sys

# constants
NSAMPLES = 100
NDOMAINS = 32
IDLE_DOMAIN = -1 # idle domain's ID

# the struct strings for qos_info
ST_DOM_INFO = "6Q3i2H32s"
ST_QDATA = "%dQ" % (6*NDOMAINS + 4)

# size of mmaped file
QOS_DATA_SIZE = struct.calcsize(ST_QDATA)*NSAMPLES + struct.calcsize(ST_DOM_INFO)*NDOMAINS + struct.calcsize("4i")

# location of mmaped file, hard coded right now
SHM_FILE = "/var/run/xenq-shm"

# format strings
TOTALS = 15*' ' + "%6.2f%%" + 35*' ' + "%6.2f%%"

ALLOCATED = "Allocated"
GOTTEN = "Gotten"
BLOCKED = "Blocked"
WAITED = "Waited"
IOCOUNT = "I/O Count"
EXCOUNT = "Exec Count"

# screen columns used by the live display
_COL_DOMAIN = 2
_COL_10S_TOTAL = 6
_COL_10S_PERCENT = 18
_COL_10S_AVG = 30
_COL_1S_TOTAL = 48
_COL_1S_PERCENT = 60
_COL_1S_AVG = 72
_COL_LABEL = 90

# domain id value that the shared memory uses for the idle domain
_RAW_IDLE_DOMID = 32767

# globals
dom_in_use = []

# our curses screen
stdscr = None

# parsed options
options, args = None, None


# the optparse module is quite smart
# to see help, just run xenmon -h
def setup_cmdline_parser():
    """Build and return the optparse parser for the xenmon command line."""
    parser = _o.OptionParser()
    parser.add_option("-l", "--live", dest="live", action="store_true",
                      default=True,
                      help = "show the ncurses live monitoring frontend (default)")
    # -l and -n share one destination, so the default must be the boolean
    # True here as well (it used to be the string "True").
    parser.add_option("-n", "--notlive", dest="live", action="store_false",
                      default=True,
                      help = "write to file instead of live monitoring")
    parser.add_option("-p", "--prefix", dest="prefix",
                      default = "log",
                      help="prefix to use for output files")
    parser.add_option("-t", "--time", dest="duration",
                      action="store", type="int", default=10,
                      help="stop logging to file after this much time has elapsed (in seconds). set to 0 to keep logging indefinitely")
    parser.add_option("-i", "--interval", dest="interval",
                      action="store", type="int", default=1000,
                      help="interval for logging (in ms)")
    parser.add_option("--ms_per_sample", dest="mspersample",
                      action="store", type="int", default=100,
                      help = "determines how many ms worth of data goes in a sample")
    parser.add_option("--cpu", dest="cpu", action="store", type="int", default=0,
                      help = "specifies which cpu to display data for")

    # each statistic gets a --<name> / --no<name> pair writing the same dest
    toggles = [("allocated", False, "allocated time"),
               ("blocked", True, "blocked time"),
               ("waited", True, "waiting time"),
               ("excount", False, "execution count"),
               ("iocount", False, "I/O count")]
    for (name, default, what) in toggles:
        parser.add_option("--" + name, dest=name, action="store_true",
                          default=default,
                          help="Display %s for each domain" % what)
        parser.add_option("--no" + name, dest=name, action="store_false",
                          default=default,
                          help="Don't display %s for each domain" % what)

    return parser


def _per_second(value, passed):
    """Scale value, accumulated over `passed` nanoseconds, to a per-second rate."""
    return value / (float(passed) / 10**9)


# encapsulate information about a domain
class DomainInfo(object):
    """Accumulated scheduling counters for one domain over a sample window."""

    def __init__(self):
        self.allocated_sum = 0
        self.gotten_sum = 0
        self.blocked_sum = 0
        self.waited_sum = 0
        self.exec_count = 0
        self.iocount_sum = 0
        self.ffp_samples = []

    def _time_stats(self, total, count, passed):
        """Return [time per second, percent of passed, average per count]."""
        total = float(total)
        if count > 0:
            avg = total / count
        else:
            avg = 0
        return [_per_second(total, passed), 100 * total / passed, avg]

    def gotten_stats(self, passed):
        """CPU time gotten: [per second, percent, average per execution]."""
        return self._time_stats(self.gotten_sum, self.exec_count, passed)

    def waited_stats(self, passed):
        """Time waited: [per second, percent, average per execution]."""
        return self._time_stats(self.waited_sum, self.exec_count, passed)

    def blocked_stats(self, passed):
        """Time blocked: [per second, percent, average per I/O]."""
        return self._time_stats(self.blocked_sum, self.iocount_sum, passed)

    def allocated_stats(self, passed):
        """Average allocated time per execution (0 if never executed)."""
        if self.exec_count > 0:
            return float(self.allocated_sum) / self.exec_count
        return 0

    def ec_stats(self, passed):
        """Executions per second."""
        return float(_per_second(self.exec_count, passed))

    def io_stats(self, passed):
        """I/O count: [per second, average per execution]."""
        total = float(self.iocount_sum)
        if self.exec_count > 0:
            avg = total / self.exec_count
        else:
            avg = 0
        return [_per_second(total, passed), avg]

    def stats(self, passed):
        """Return [gotten, allocated, blocked, waited, exec count, io] stats."""
        return [self.gotten_stats(passed), self.allocated_stats(passed),
                self.blocked_stats(passed), self.waited_stats(passed),
                self.ec_stats(passed), self.io_stats(passed)]


def _min_sum_max(values):
    """Return [min, sum, max] of values, or zeros when there are none."""
    if not values:
        return [0, 0, 0]
    return [min(values), sum(values), max(values)]


# report values over desired interval
def summarize(startat, endat, duration, samples):
    """Accumulate samples backwards from startat until duration ns are covered.

    startat  -- index of the newest sample to use; -1 means the last slot
    endat    -- index at which to stop even if duration is not yet covered
    duration -- amount of time to summarize, in nanoseconds
    samples  -- the NSAMPLES unpacked ST_QDATA tuples of one cpu

    Returns [ldoms, lostinfo, ffpinfo]. ldoms has one entry per domain slot:
    DomainInfo.stats() output for slots flagged in the global dom_in_use,
    0 otherwise. lostinfo and ffpinfo are [min, sum, max] lists built from
    each visited sample's fields 6*NDOMAINS+2 and 6*NDOMAINS+3.
    """
    dominfos = {}
    for i in range(0, NDOMAINS):
        dominfos[i] = DomainInfo()

    passed = 1              # to prevent zero division
    # Normalise the start index: callers pass next-1, which is -1 when next
    # is 0.  Without this, samples[-1] was read and then the wrap-around
    # below read the very same slot (NSAMPLES-1) a second time.
    curid = startat % NSAMPLES
    lost_samples = []
    ffp_samples = []

    while passed < duration:
        sample = samples[curid]
        for i in range(0, NDOMAINS):
            if dom_in_use[i]:
                info = dominfos[i]
                info.gotten_sum += sample[0*NDOMAINS + i]
                info.allocated_sum += sample[1*NDOMAINS + i]
                info.waited_sum += sample[2*NDOMAINS + i]
                info.blocked_sum += sample[3*NDOMAINS + i]
                info.exec_count += sample[4*NDOMAINS + i]
                info.iocount_sum += sample[5*NDOMAINS + i]

        passed += sample[6*NDOMAINS]
        lost_samples.append(sample[6*NDOMAINS + 2])
        ffp_samples.append(sample[6*NDOMAINS + 3])

        # walk backwards through the ring buffer
        if curid > 0:
            curid -= 1
        else:
            curid = NSAMPLES - 1
        if curid == endat:
            break

    # an empty window (duration <= 1) used to crash in min()/max()
    lostinfo = _min_sum_max(lost_samples)
    ffpinfo = _min_sum_max(ffp_samples)

    ldoms = []
    for x in range(0, NDOMAINS):
        if dom_in_use[x]:
            ldoms.append(dominfos[x].stats(passed))
        else:
            ldoms.append(0)

    return [ldoms, lostinfo, ffpinfo]


# scale nanoseconds to microseconds, milliseconds or seconds as necessary
def time_scale(ns):
    """Format a nanosecond count with the largest unit that keeps it >= 1."""
    if ns < 1000:
        return "%4.2f ns" % float(ns)
    elif ns < 1000*1000:
        return "%4.2f us" % (float(ns)/10**3)
    elif ns < 10**9:
        return "%4.2f ms" % (float(ns)/10**6)
    else:
        return "%4.2f s" % (float(ns)/10**9)


def _restore_terminal(scr):
    """Undo the curses setup of show_livestats; harmless if already done."""
    try:
        if _c.isendwin():
            return
        _c.nocbreak()
        scr.keypad(0)
        _c.echo()
        _c.endwin()
    except _c.error:
        # nothing more we can do for the terminal at this point
        pass


# paint message on curses screen, but detect screen size errors
def display(scr, row, col, str, attr=0):
    """Draw str at (row, col); exit with a hint if it does not fit on screen."""
    try:
        scr.addstr(row, col, str, attr)
    except _c.error:
        # curses reports drawing outside the window as curses.error
        scr.erase()
        _restore_terminal(scr)
        print("Your terminal screen is not big enough; Please resize it.")
        print("row=%d, col=%d, str='%s'" % (row, col, str))
        sys.exit(1)


# display domain id
def display_domain_id(scr, row, col, dom):
    """Draw a domain id at (row, col); the idle domain is shown as "Idle"."""
    if dom == IDLE_DOMAIN:
        display(scr, row, col-1, "Idle")
    else:
        display(scr, row, col, "%d" % dom)


def _read_cpu_data(shm, offset):
    """Unpack one cpu's block of the shared memory area starting at offset.

    Returns (samples, in_use, domain_id, next_sample, ncpu, slen):
    the NSAMPLES sample tuples, the per-slot in_use flags and domain ids
    (the idle domain mapped to IDLE_DOMAIN), and three of the four trailing
    ints as written by xenbaked.
    """
    sample_size = struct.calcsize(ST_QDATA)
    dominfo_size = struct.calcsize(ST_DOM_INFO)
    trailer_size = struct.calcsize("4i")

    samples = []
    for i in range(0, NSAMPLES):
        samples.append(struct.unpack(ST_QDATA, shm[offset:offset + sample_size]))
        offset += sample_size

    in_use = []
    domain_id = []
    for i in range(0, NDOMAINS):
        dom = struct.unpack(ST_DOM_INFO, shm[offset:offset + dominfo_size])
        # (last_update_time, start_time, runnable_start_time, blocked_start_time,
        #  ns_since_boot, ns_oncpu_since_boot, runnable_at_last_update,
        #  runnable, in_use, domid, junk, name) = dom
        in_use.append(dom[8])
        domid = dom[9]
        if domid == _RAW_IDLE_DOMID:
            domid = IDLE_DOMAIN
        domain_id.append(domid)
        offset += dominfo_size

    (next_sample, ncpu, slen, freq) = struct.unpack("4i", shm[offset:offset + trailer_size])
    return (samples, in_use, domain_id, next_sample, ncpu, slen)


def _remap(shmf, shm, size):
    """Map `size` bytes of shmf, closing the previous mapping."""
    new_shm = mmap.mmap(shmf.fileno(), size)
    shm.close()
    return new_shm


def _sample_window(next_sample):
    """Return (startat, endat) for summarize().

    Never look at "next" itself since it represents live data that may be
    in transition.
    """
    startat = next_sample - 1
    if next_sample + 10 < NSAMPLES:
        endat = next_sample + 10
    else:
        endat = 10
    return (startat, endat)


def _time_cells(s10, s1, unit, attr_1s_percent=0):
    """Build the six screen cells of a [total, percent, average] statistic."""
    return [(_COL_10S_TOTAL, "%s" % time_scale(s10[0]), 0),
            (_COL_10S_PERCENT, "%3.2f%%" % s10[1], 0),
            (_COL_10S_AVG, "%s/%s" % (time_scale(s10[2]), unit), 0),
            (_COL_1S_TOTAL, "%s" % time_scale(s1[0]), 0),
            (_COL_1S_PERCENT, "%3.2f%%" % s1[1], attr_1s_percent),
            (_COL_1S_AVG, "%s/%s" % (time_scale(s1[2]), unit), 0)]


def _display_row(scr, row, domid, cells, label):
    """Draw one statistics line: domain id, (col, text, attr) cells, label."""
    display_domain_id(scr, row, _COL_DOMAIN, domid)
    for (col, text, attr) in cells:
        display(scr, row, col, text, attr)
    display(scr, row, _COL_LABEL, label)


# the live monitoring code
def show_livestats(cpu):
    """Run the curses front-end, initially showing data for the given cpu.

    Keys: q quits, c/n cycle to the next cpu, p to the previous one.
    The terminal state, the mapping and the shm file are released on every
    exit path.
    """
    global dom_in_use, options

    ncpu = 1         # number of cpu's on this platform
    slen = 0         # size of shared data structure, including padding
    cpu_1sec_usage = 0.0
    cpu_10sec_usage = 0.0
    heartbeat = 1

    # mmap the (the first chunk of the) file
    shmf = open(SHM_FILE, "r+")
    shm = None
    scr = None
    try:
        shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

        # initialize curses
        scr = _c.initscr()
        _c.noecho()
        _c.cbreak()
        scr.keypad(1)
        scr.timeout(1000)
        [maxy, maxx] = scr.getmaxyx()

        # display in a loop
        while True:
            cpuidx = 0
            while cpuidx < ncpu:
                # calculate offset in mmap file to start from
                offset = cpuidx * slen
                (samples, dom_in_use, domain_id,
                 next_sample, new_ncpu, slen) = _read_cpu_data(shm, offset)

                # xenbaked tells us how many cpu's it's got, so re-do
                # the mmap if necessary to get multiple cpu data
                if new_ncpu != ncpu:
                    ncpu = new_ncpu
                    shm = _remap(shmf, shm, ncpu*slen)

                # if we've just calculated data for the cpu of interest, then
                # stop examining mmap data and start displaying stuff
                if cpuidx == cpu:
                    break
                cpuidx = cpuidx + 1

            # calculate starting and ending datapoints
            (startat, endat) = _sample_window(next_sample)

            # get summary over desired interval
            [h1, l1, f1] = summarize(startat, endat, 10**9, samples)
            [h2, l2, f2] = summarize(startat, endat, 10 * 10**9, samples)

            # the actual display code; the usage figures in the header are
            # the ones computed during the previous refresh
            row = 0
            display(scr, row, 1, "CPU = %d" % cpu, _c.A_STANDOUT)
            display(scr, row, 10,
                    "%sLast 10 seconds (%3.2f%%)%sLast 1 second (%3.2f%%)" %
                    (6*' ', cpu_10sec_usage, 30*' ', cpu_1sec_usage), _c.A_BOLD)
            row += 1
            display(scr, row, 1, "%s" % ((maxx-2)*'='))

            total_h1_cpu = 0
            total_h2_cpu = 0
            cpu_1sec_usage = 0.0
            cpu_10sec_usage = 0.0

            for dom in range(0, NDOMAINS):
                if not dom_in_use[dom]:
                    continue

                domid = domain_id[dom]
                # compare the domain *id*, not the slot index, with
                # IDLE_DOMAIN; the index is never negative
                is_idle = (domid == IDLE_DOMAIN)
                s1 = h1[dom]
                s10 = h2[dom]

                if s1[0][1] > 0 or is_idle:
                    # display gotten
                    row += 1
                    _display_row(scr, row, domid,
                                 _time_cells(s10[0], s1[0], "ex", _c.A_STANDOUT),
                                 GOTTEN)
                    if not is_idle:
                        cpu_10sec_usage += s10[0][1]
                        cpu_1sec_usage += s1[0][1]

                    # display allocated
                    if options.allocated:
                        row += 1
                        _display_row(scr, row, domid,
                                     [(_COL_10S_AVG, "%s/ex" % time_scale(s10[1]), 0),
                                      (_COL_1S_AVG, "%s/ex" % time_scale(s1[1]), 0)],
                                     ALLOCATED)

                    # display blocked
                    if options.blocked:
                        row += 1
                        _display_row(scr, row, domid,
                                     _time_cells(s10[2], s1[2], "io"), BLOCKED)

                    # display waited
                    if options.waited:
                        row += 1
                        _display_row(scr, row, domid,
                                     _time_cells(s10[3], s1[3], "ex"), WAITED)

                    # display ex count
                    if options.excount:
                        row += 1
                        _display_row(scr, row, domid,
                                     [(_COL_10S_AVG, "%d/s" % s10[4], 0),
                                      (_COL_1S_AVG, "%d" % s1[4], 0)],
                                     "Execution count")

                    # display io count
                    if options.iocount:
                        row += 1
                        _display_row(scr, row, domid,
                                     [(_COL_10S_TOTAL, "%d/s" % s10[5][0], 0),
                                      (_COL_10S_AVG, "%d/ex" % s10[5][1], 0),
                                      (_COL_1S_TOTAL, "%d" % s1[5][0], 0),
                                      (_COL_1S_AVG, "%3.2f/ex" % s1[5][1], 0)],
                                     IOCOUNT)

                #row += 1
                #scr.hline(row, 1, '-', maxx - 2)
                total_h1_cpu += s1[0][1]
                total_h2_cpu += s10[0][1]

            row += 1
            star = heartbeat * '*'
            heartbeat = 1 - heartbeat
            display(scr, row, 1, star)
            display(scr, row, 2, TOTALS % (total_h2_cpu, total_h1_cpu))
            row += 1
            # display(scr, row, 2,
            #         "\tFFP: %d (Min: %d, Max: %d)\t\t\tFFP: %d (Min: %d, Max %d)" %
            #         (math.ceil(f2[1]), f2[0], f2[2], math.ceil(f1[1]), f1[0], f1[2]), _c.A_BOLD)

            if l1[1] > 1:
                row += 1
                display(scr, row, 2,
                        "\tRecords lost: %d (Min: %d, Max: %d)\t\t\tRecords lost: %d (Min: %d, Max %d)" %
                        (math.ceil(l2[1]), l2[0], l2[2], math.ceil(l1[1]), l1[0], l1[2]), _c.A_BOLD)

            # grab a char from tty input; exit if interrupt hit
            try:
                c = scr.getch()
            except (KeyboardInterrupt, _c.error):
                break

            # q = quit
            if c == ord('q'):
                break

            # c = cycle to a new cpu of interest
            if c == ord('c'):
                cpu = (cpu + 1) % ncpu

            # n/p = cycle to the next/previous CPU
            if c == ord('n'):
                cpu = (cpu + 1) % ncpu
            if c == ord('p'):
                cpu = (cpu - 1) % ncpu

            scr.erase()
    finally:
        if scr is not None:
            _restore_terminal(scr)
        if shm is not None:
            shm.close()
        shmf.close()


# simple functions to allow initialization of log files without actually
# physically creating files that are never used; only on the first real
# write does the file get created
class Delayed(object):
    """Log file that is only created on the first real write().

    The text given to delayed_write() is emitted once, just before the
    first written data.
    """

    def __init__(self, filename, mode):
        self.filename = filename
        self.saved_mode = mode
        self.delay_data = ""
        self.opened = 0

    def delayed_write(self, str):
        """Remember str to be written when the file is first opened."""
        self.delay_data = str

    def write(self, str):
        """Write str, creating the file (and its delayed text) if needed."""
        if not self.opened:
            self.file = open(self.filename, self.saved_mode)
            self.opened = 1
            self.file.write(self.delay_data)
        self.file.write(str)

    def rename(self, name):
        """Change the file name; only effective before the first write()."""
        self.filename = name

    def flush(self):
        if self.opened:
            self.file.flush()

    def close(self):
        if self.opened:
            self.file.close()


def writelog():
    """Log per-domain statistics to <prefix>-dom<id>.log / <prefix>-idle.log.

    Runs until options.duration seconds have elapsed (forever if it is 0),
    sampling every options.interval milliseconds.
    """
    global options
    global dom_in_use

    ncpu = 1        # number of cpu's
    slen = 0        # size of shared structure inc. padding

    shmf = open(SHM_FILE, "r+")
    shm = None
    outfiles = {}
    try:
        shm = mmap.mmap(shmf.fileno(), QOS_DATA_SIZE)

        interval = 0
        curr = last = time.time()
        for dom in range(0, NDOMAINS):
            outfiles[dom] = Delayed("%s-dom%d.log" % (options.prefix, dom), 'w')
            outfiles[dom].delayed_write("# passed cpu dom cpu(tot) cpu(%) cpu/ex allocated/ex blocked(tot) blocked(%) blocked/io waited(tot) waited(%) waited/ex ex/s io(tot) io/ex\n")

        while options.duration == 0 or interval < (options.duration * 1000):
            cpuidx = 0
            while cpuidx < ncpu:
                offset = cpuidx * slen      # offset needed in mmap file
                (samples, dom_in_use, domain_id,
                 next_sample, new_ncpu, slen) = _read_cpu_data(shm, offset)

                # name each slot's file after the domain currently in it
                for i in range(0, NDOMAINS):
                    if domain_id[i] == IDLE_DOMAIN:
                        outfiles[i].rename("%s-idle.log" % options.prefix)
                    else:
                        outfiles[i].rename("%s-dom%d.log" % (options.prefix, domain_id[i]))

                if new_ncpu != ncpu:
                    ncpu = new_ncpu
                    shm = _remap(shmf, shm, ncpu*slen)

                (startat, endat) = _sample_window(next_sample)

                [h1, l1, f1] = summarize(startat, endat, options.interval * 10**6, samples)
                for dom in range(0, NDOMAINS):
                    if not dom_in_use[dom]:
                        continue
                    # compare the domain *id*, not the slot index, with
                    # IDLE_DOMAIN so the idle domain is always logged
                    if h1[dom][0][1] > 0 or domain_id[dom] == IDLE_DOMAIN:
                        outfiles[dom].write("%.3f %d %d %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n" %
                                            (interval, cpuidx, domain_id[dom],
                                             h1[dom][0][0], h1[dom][0][1], h1[dom][0][2],
                                             h1[dom][1],
                                             h1[dom][2][0], h1[dom][2][1], h1[dom][2][2],
                                             h1[dom][3][0], h1[dom][3][1], h1[dom][3][2],
                                             h1[dom][4],
                                             h1[dom][5][0], h1[dom][5][1]))
                        outfiles[dom].flush()
                curr = time.time()
                interval += (curr - last) * 1000
                last = curr
                cpuidx = cpuidx + 1
            time.sleep(options.interval / 1000.0)
    finally:
        for logfile in outfiles.values():
            logfile.close()
        if shm is not None:
            shm.close()
        shmf.close()


# start xenbaked
def start_xenbaked():
    """Kill any running xenbaked, start a new one in the background and
    wait a second."""
    global options
    global kill_cmd
    global xenbaked_cmd

    os.system(kill_cmd)
    os.system(xenbaked_cmd + " --ms_per_sample=%d &" %
              options.mspersample)
    time.sleep(1)


# stop xenbaked
def stop_xenbaked():
    """Run stop_cmd, which sends SIGINT to xenbaked via pkill."""
    global stop_cmd
    os.system(stop_cmd)


def main():
    """Parse the command line, start xenbaked and run the chosen front-end."""
    global options
    global args
    global domains
    global stop_cmd
    global kill_cmd
    global xenbaked_cmd

    if os.uname()[0] == "SunOS":
        xenbaked_cmd = "/usr/lib/xenbaked"
        stop_cmd = "/usr/bin/pkill -INT -z global xenbaked"
        kill_cmd = "/usr/bin/pkill -KILL -z global xenbaked"
    else:
        # assumes that xenbaked is in your path
        xenbaked_cmd = "xenbaked"
        stop_cmd = "/usr/bin/pkill -INT xenbaked"
        kill_cmd = "/usr/bin/pkill -KILL xenbaked"

    parser = setup_cmdline_parser()
    (options, args) = parser.parse_args()

    if len(args):
        parser.error("No parameter required")
    if options.mspersample < 0:
        parser.error("option --ms_per_sample: invalid negative value: '%d'" %
                     options.mspersample)
    # If --ms_per_sample= is too large, no data may be logged.
    if not options.live and options.duration != 0 and \
       options.mspersample > options.duration * 1000:
        parser.error("option --ms_per_sample: too large (> %d ms)" %
                     (options.duration * 1000))

    start_xenbaked()
    try:
        if options.live:
            show_livestats(options.cpu)
        else:
            try:
                writelog()
            except KeyboardInterrupt:
                print('Quitting.')
            except Exception:
                # still best-effort, but say what went wrong instead of
                # silently swallowing it
                sys.stderr.write("xenmon: %s\n" % (sys.exc_info()[1],))
                print('Quitting.')
    finally:
        # never leave xenbaked running behind us
        stop_xenbaked()


if __name__ == "__main__":
    main()