aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Shriram Rajagopalan <rshriram@cs.ubc.ca> 2011-04-25 13:27:12 +0100
committer: Shriram Rajagopalan <rshriram@cs.ubc.ca> 2011-04-25 13:27:12 +0100
commit: d006b9dae589eb8f83626ce6ba84d7dc867335ec (patch)
tree: 17a046469596d0b8c36bd8e10d2bcdfa4aadd48a
parent: 416c5575813326b2826f9fc3a0612df45d0934e4 (diff)
download: xen-d006b9dae589eb8f83626ce6ba84d7dc867335ec.tar.gz
xen-d006b9dae589eb8f83626ce6ba84d7dc867335ec.tar.bz2
xen-d006b9dae589eb8f83626ce6ba84d7dc867335ec.zip
remus: proper cleanup on checkpoint failure.
While running remus, when an error occurs during checkpointing (e.g., timeouts on the primary, failure to checkpoint the network buffer or disk, or even a communication failure), the domU is sometimes left in a suspended state on the primary. Instead of blindly closing the checkpoint file handle, attempt to resume the domain before the close.

Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>
xen-unstable changeset: 23195:13ec53a59a42
xen-unstable date: Fri Apr 08 16:49:04 2011 +0100
-rw-r--r-- tools/python/xen/lowlevel/checkpoint/checkpoint.c 3
-rw-r--r-- tools/python/xen/remus/save.py 6
2 files changed, 8 insertions, 1 deletions
diff --git a/tools/python/xen/lowlevel/checkpoint/checkpoint.c b/tools/python/xen/lowlevel/checkpoint/checkpoint.c
index 7545d7deb6..1581b64095 100644
--- a/tools/python/xen/lowlevel/checkpoint/checkpoint.c
+++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.c
@@ -80,6 +80,9 @@ static PyObject* pycheckpoint_close(PyObject* obj, PyObject* args)
{
CheckpointObject* self = (CheckpointObject*)obj;
+ if (checkpoint_resume(&self->cps) < 0)
+ fprintf(stderr, "%s\n", checkpoint_error(&self->cps));
+
checkpoint_close(&self->cps);
Py_XDECREF(self->suspend_cb);
diff --git a/tools/python/xen/remus/save.py b/tools/python/xen/remus/save.py
index 71517da8c1..9858aec571 100644
--- a/tools/python/xen/remus/save.py
+++ b/tools/python/xen/remus/save.py
@@ -158,9 +158,13 @@ class Saver(object):
self.checkpointer.open(self.vm.domid)
self.checkpointer.start(self.fd, self.suspendcb, self.resumecb,
self.checkpointcb, self.interval)
- self.checkpointer.close()
except xen.lowlevel.checkpoint.error, e:
raise CheckpointError(e)
+ finally:
+ try: #errors in checkpoint close are not critical atm.
+ self.checkpointer.close()
+ except:
+ pass
def _resume(self):
"""low-overhead version of XendDomainInfo.resumeDomain"""