New subject: [libvirt] [Qemu-devel] [PATCH v4] XBZRLE delta for live migration of large memory apps

8 Aug 2011

On 08/08/2011 03:42 AM, Shribman, Aidan wrote:
...
Subject: [PATCH v4] XBZRLE delta for live migration of large memory apps
From: Aidan Shribman<aidan.shribman@sap.com>
By using XBZRLE (Xor Binary Zero Run-Length-Encoding) we can reduce VM downtime
and total live-migration time of VMs running memory write intensive workloads
typical of large enterprise applications such as SAP ERP Systems, and generally
speaking for any application with a sparse memory update pattern.
On the sender side XBZRLE is used as a compact delta encoding of page updates,
retrieving the old page content from an LRU cache (default size of 64 MB). The
receiving side uses the existing page content and XBZRLE to decode the new page
content.
Work was originally based on research results published VEE 2011: Evaluation of
Delta Compression Techniques for Efficient Live Migration of Large Virtual
Machines by Benoit, Svard, Tordsson and Elmroth. Additionally the delta encoder
XBRLE was improved further using XBZRLE instead.
XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it
ideal for in-line, real-time encoding such as is needed for live-migration.
A typical usage scenario:
     {qemu} migrate_set_cachesize 256m
     {qemu} migrate -x -d tcp:destination.host:4444
     {qemu} info migrate
     ...
     transferred ram-duplicate: A kbytes
     transferred ram-duplicate: B pages
     transferred ram-normal: C kbytes
     transferred ram-normal: D pages
     transferred ram-xbrle: E kbytes
     transferred ram-xbrle: F pages
     overflow ram-xbrle: G pages
     cache-hit ram-xbrle: H pages
     cache-lookup ram-xbrle: J pages
Testing: live migration with XBZRLE completed in 110 seconds, without live
migration was not able to complete.
A simple synthetic memory r/w load generator:
..    include<stdlib.h>
..    include<stdio.h>
..    int main()
..    {
..        char *buf = (char *) calloc(4096, 4096);
..        while (1) {
..            int i;
..            for (i = 0; i<  4096 * 4; i++) {
..                buf[i * 4096 / 4]++;
..            }
..            printf(".");
..        }
..    }
Signed-off-by: Benoit Hudzia<benoit.hudzia@sap.com>
Signed-off-by: Petter Svard<petters@cs.umu.se>
Signed-off-by: Aidan Shribman<aidan.shribman@sap.com>
One thing that strikes me about this algorithm is that it's very good 
for a particular type of workload--shockingly good really.

I think workload aware migration compression is possible for a lot of 
different types of workloads.  That makes me a bit wary of QEMU growing 
quite a lot of compression mechanisms.

It makes me think that this logic may really belong at a higher level 
where more information is known about the workload.  For instance, I can 
imagine XBZRLE living in something like libvirt.

Today, parsing migration traffic is pretty horrible but I think we're 
pretty strongly committed to fixing that in 1.0.  That makes me wonder 
if it would be nicer architecturally for a higher level tool to own 
something like this.

Originally, when I added migration, I had the view that we would have 
transport plugins based on the exec: protocol.  That hasn't really 
happened since libvirt really owns migration but I think having XBZRLE 
as a transport plugin for libvirt is something worth considering.

I'm curious what people think about this type of approach.  CC'ing 
libvirt to get their input.

Regards,

Anthony Liguori
...
--
Makefile.target   |    1 +
  arch_init.c       |  351 ++++++++++++++++++++++++++++++++++++++++++++++------
  block-migration.c |    3 +-
  hash.h            |   72 +++++++++++
  hmp-commands.hx   |   36 ++++--
  hw/hw.h           |    3 +-
  lru.c             |  142 +++++++++++++++++++++
  lru.h             |   13 ++
  migration-exec.c  |    6 +-
  migration-fd.c    |    6 +-
  migration-tcp.c   |    6 +-
  migration-unix.c  |    6 +-
  migration.c       |  119 +++++++++++++++++-
  migration.h       |   25 +++-
  qmp-commands.hx   |   43 ++++++-
  savevm.c          |   13 ++-
  sysemu.h          |   13 ++-
  xbzrle.c          |  126 +++++++++++++++++++
  xbzrle.h          |   12 ++
  19 files changed, 917 insertions(+), 79 deletions(-)

diff --git a/Makefile.target b/Makefile.target
index 2800f47..b3215de 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -186,6 +186,7 @@ endif #CONFIG_BSD_USER
  ifdef CONFIG_SOFTMMU
obj-y = arch_init.o cpus.o monitor.o machine.o gdbstub.o balloon.o
+obj-y += lru.o xbzrle.o
  # virtio has to be here due to weird dependency between PCI and virtio-net.
  # need to fix this properly
  obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-serial-bus.o
diff --git a/arch_init.c b/arch_init.c
old mode 100644
new mode 100755
index 4486925..d67dc82
--- a/arch_init.c
+++ b/arch_init.c
@@ -40,6 +40,17 @@
  #include "net.h"
  #include "gdbstub.h"
  #include "hw/smbios.h"
+#include "lru.h"
+#include "xbzrle.h"
+
+//#define DEBUG_ARCH_INIT
+#ifdef DEBUG_ARCH_INIT
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stdout, "arch_init: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
#ifdef TARGET_SPARC
  int graphic_width = 1024;
@@ -88,6 +99,161 @@ const uint32_t arch_type = QEMU_ARCH;
  #define RAM_SAVE_FLAG_PAGE     0x08
  #define RAM_SAVE_FLAG_EOS      0x10
  #define RAM_SAVE_FLAG_CONTINUE 0x20
+#define RAM_SAVE_FLAG_XBZRLE    0x40
+
+/***********************************************************/
+/* RAM Migration State */
+typedef struct ArchMigrationState {
+    int use_xbrle;
+    int64_t xbrle_cache_size;
+} ArchMigrationState;
+
+static ArchMigrationState arch_mig_state;
+
+void arch_set_params(int blk_enable, int shared_base, int use_xbrle,
+        int64_t xbrle_cache_size, void *opaque)
+{
+    arch_mig_state.use_xbrle = use_xbrle;
+    arch_mig_state.xbrle_cache_size = xbrle_cache_size;
+}
+
+#define BE16_MAGIC 0x0123
+
+/***********************************************************/
+/* XBZRLE (Xor Binary Zero Run-Length Encoding) */
+typedef struct XBZRLEHeader {
+    uint32_t xh_cksum; /* not used */
+    uint16_t xh_magic;
+    uint16_t xh_len;
+    uint8_t xh_flags;
+} XBZRLEHeader;
+
+static uint8_t dup_buf[TARGET_PAGE_SIZE];
+
+/***********************************************************/
+/* accounting */
+typedef struct AccountingInfo{
+    uint64_t dup_pages;
+    uint64_t norm_pages;
+    uint64_t xbrle_bytes;
+    uint64_t xbrle_pages;
+    uint64_t xbrle_overflow;
+    uint64_t xbrle_cache_lookup;
+    uint64_t xbrle_cache_hit;
+    uint64_t iterations;
+} AccountingInfo;
+
+static AccountingInfo acct_info;
+
+static void acct_clear(void)
+{
+    memset(&acct_info, 0, sizeof(acct_info));
+}
+
+uint64_t dup_mig_bytes_transferred(void)
+{
+    return acct_info.dup_pages;
+}
+
+uint64_t dup_mig_pages_transferred(void)
+{
+    return acct_info.dup_pages;
+}
+
+uint64_t norm_mig_bytes_transferred(void)
+{
+    return acct_info.norm_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t norm_mig_pages_transferred(void)
+{
+    return acct_info.norm_pages;
+}
+
+uint64_t xbrle_mig_bytes_transferred(void)
+{
+    return acct_info.xbrle_bytes;
+}
+
+uint64_t xbrle_mig_pages_transferred(void)
+{
+    return acct_info.xbrle_pages;
+}
+
+uint64_t xbrle_mig_pages_overflow(void)
+{
+    return acct_info.xbrle_overflow;
+}
+
+uint64_t xbrle_mig_pages_cache_hit(void)
+{
+    return acct_info.xbrle_cache_hit;
+}
+
+uint64_t xbrle_mig_pages_cache_lookup(void)
+{
+    return acct_info.xbrle_cache_lookup;
+}
+
+static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+        int cont, int flag)
+{
+        qemu_put_be64(f, offset | cont | flag);
+        if (!cont) {
+                qemu_put_byte(f, strlen(block->idstr));
+                qemu_put_buffer(f, (uint8_t *)block->idstr,
+                                strlen(block->idstr));
+        }
+}
+
+#define ENCODING_FLAG_XBZRLE 0x1
+
+static int save_xbrle_page(QEMUFile *f, uint8_t *current_page,
+        ram_addr_t current_addr, RAMBlock *block, ram_addr_t offset, int cont)
+{
+    int encoded_len = 0, bytes_sent = 0;
+    XBZRLEHeader hdr = {0, BE16_MAGIC};
+    uint8_t *encoded, *old_page;
+
+    /* abort if page not cached */
+    acct_info.xbrle_cache_lookup++;
+    old_page = lru_lookup(current_addr);
+    if (!old_page) {
+        goto done;
+    }
+    acct_info.xbrle_cache_hit++;
+
+    /* XBZRLE (XOR+ZRLE) encoding */
+    encoded = (uint8_t *) qemu_malloc(TARGET_PAGE_SIZE);
+    encoded_len = xbzrle_encode(encoded, old_page, current_page,
+            TARGET_PAGE_SIZE);
+
+    if (encoded_len<  0) {
+        DPRINTF("XBZRLE encoding overflow - sending uncompressed\n");
+        acct_info.xbrle_overflow++;
+        goto done;
+    }
+
+    hdr.xh_len = encoded_len;
+    hdr.xh_flags |= ENCODING_FLAG_XBZRLE;
+
+    /* Send XBZRLE compressed page */
+    save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
+
+    qemu_put_be32(f, hdr.xh_cksum);
+    qemu_put_buffer(f, (uint8_t *)&hdr.xh_magic, sizeof (hdr.xh_magic));
+    qemu_put_be16(f, hdr.xh_len);
+    qemu_put_byte(f, hdr.xh_flags);
+
+    qemu_put_buffer(f, encoded, encoded_len);
+    acct_info.xbrle_pages++;
+    bytes_sent = encoded_len + sizeof(hdr);
+    acct_info.xbrle_bytes += bytes_sent;
+
+done:
+    qemu_free(encoded);
+    return bytes_sent;
+}
static int is_dup_page(uint8_t *page, uint8_t ch)
  {
@@ -107,7 +273,7 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
  static RAMBlock *last_block;
  static ram_addr_t last_offset;
-static int ram_save_block(QEMUFile *f)
+static int ram_save_block(QEMUFile *f, int stage)
  {
      RAMBlock *block = last_block;
      ram_addr_t offset = last_offset;
@@ -120,6 +286,7 @@ static int ram_save_block(QEMUFile *f)
      current_addr = block->offset + offset;
do {
+        lru_free_cb_t free_cb = qemu_free;
          if (cpu_physical_memory_get_dirty(current_addr, MIGRATION_DIRTY_FLAG)) {
              uint8_t *p;
              int cont = (block == last_block) ? RAM_SAVE_FLAG_CONTINUE : 0;
@@ -128,28 +295,35 @@ static int ram_save_block(QEMUFile *f)
                                              current_addr + TARGET_PAGE_SIZE,
                                              MIGRATION_DIRTY_FLAG);
-            p = block->host + offset;
+            if (arch_mig_state.use_xbrle) {
+                p = qemu_malloc(TARGET_PAGE_SIZE);
+                memcpy(p, block->host + offset, TARGET_PAGE_SIZE);
+            } else {
+                p = block->host + offset;
+            }
if (is_dup_page(p, *p)) {
-                qemu_put_be64(f, offset | cont | RAM_SAVE_FLAG_COMPRESS);
-                if (!cont) {
-                    qemu_put_byte(f, strlen(block->idstr));
-                    qemu_put_buffer(f, (uint8_t *)block->idstr,
-                                    strlen(block->idstr));
-                }
+                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
                  qemu_put_byte(f, *p);
                  bytes_sent = 1;
-            } else {
-                qemu_put_be64(f, offset | cont | RAM_SAVE_FLAG_PAGE);
-                if (!cont) {
-                    qemu_put_byte(f, strlen(block->idstr));
-                    qemu_put_buffer(f, (uint8_t *)block->idstr,
-                                    strlen(block->idstr));
+                acct_info.dup_pages++;
+                if (arch_mig_state.use_xbrle&&  !*p) {
+                    p = dup_buf;
+                    free_cb = NULL;
                  }
+            } else if (stage == 2&&  arch_mig_state.use_xbrle) {
+                bytes_sent = save_xbrle_page(f, p, current_addr, block,
+                    offset, cont);
+            }
+            if (!bytes_sent) {
+                save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
                  qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
                  bytes_sent = TARGET_PAGE_SIZE;
+                acct_info.norm_pages++;
+            }
+            if (arch_mig_state.use_xbrle) {
+                lru_insert(current_addr, p, free_cb);
              }
-
              break;
          }
@@ -221,6 +395,9 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
if (stage<  0) {
          cpu_physical_memory_set_dirty_tracking(0);
+        if (arch_mig_state.use_xbrle) {
+            lru_fini();
+        }
          return 0;
      }
@@ -235,6 +412,11 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
          last_block = NULL;
          last_offset = 0;
+        if (arch_mig_state.use_xbrle) {
+            lru_init(arch_mig_state.xbrle_cache_size/TARGET_PAGE_SIZE, 0);
+            acct_clear();
+        }
+
          /* Make sure all dirty bits are set */
          QLIST_FOREACH(block,&ram_list.blocks, next) {
              for (addr = block->offset; addr<  block->offset + block->length;
@@ -264,8 +446,9 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
      while (!qemu_file_rate_limit(f)) {
          int bytes_sent;
-        bytes_sent = ram_save_block(f);
+        bytes_sent = ram_save_block(f, stage);
          bytes_transferred += bytes_sent;
+        acct_info.iterations++;
          if (bytes_sent == 0) { /* no more blocks */
              break;
          }
@@ -285,19 +468,79 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
          int bytes_sent;
/* flush all remaining blocks regardless of rate limiting */
-        while ((bytes_sent = ram_save_block(f)) != 0) {
+        while ((bytes_sent = ram_save_block(f, stage))) {
              bytes_transferred += bytes_sent;
          }
          cpu_physical_memory_set_dirty_tracking(0);
+        if (arch_mig_state.use_xbrle) {
+            lru_fini();
+        }
      }
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
expected_time = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
+    DPRINTF("ram_save_live: expected(%ld)<= max(%ld)?\n", expected_time,
+        migrate_max_downtime());
+
      return (stage == 2)&&  (expected_time<= migrate_max_downtime());
  }
+static int load_xbrle(QEMUFile *f, ram_addr_t addr, void *host)
+{
+    int len, rc = -1;
+    uint8_t *encoded;
+    XBZRLEHeader hdr = {0};
+
+    /* extract ZRLE header */
+    hdr.xh_cksum = qemu_get_be32(f);
+    qemu_get_buffer(f, (uint8_t *)&hdr.xh_magic, sizeof (hdr.xh_magic));
+    hdr.xh_len = qemu_get_be16(f);
+    hdr.xh_flags = qemu_get_byte(f);
+
+    if (!(hdr.xh_flags&  ENCODING_FLAG_XBZRLE)) {
+        fprintf(stderr, "Failed to load XZBRLE page - wrong compression!\n");
+        goto done;
+    }
+
+    if (hdr.xh_len>  TARGET_PAGE_SIZE) {
+        fprintf(stderr, "Failed to load XZBRLE page - len overflow!\n");
+        goto done;
+    }
+
+    /* load data and decode */
+    encoded = (uint8_t *) qemu_malloc(hdr.xh_len);
+    qemu_get_buffer(f, encoded, hdr.xh_len);
+    /* covert endianess if magic indicated destination differs from source */
+    if (hdr.xh_magic != BE16_MAGIC) {
+        const uint64_t *end = (uint64_t *) encoded +
+            hdr.xh_len / sizeof (uint64_t);
+        uint64_t *p;
+        for (p = (uint64_t *) encoded; p<  end; p++) {
+            bswap64s(p);
+        }
+    }
+
+    /* decode ZRLE */
+    len = xbzrle_decode(host, host, encoded, hdr.xh_len);
+    if (len == -1) {
+        fprintf(stderr, "Failed to load XBZRLE page - decode error!\n");
+        goto done;
+    }
+
+    if (len != TARGET_PAGE_SIZE) {
+        fprintf(stderr, "Failed to load XBZRLE page - size %d expected %d!\n",
+            len, TARGET_PAGE_SIZE);
+        goto done;
+    }
+
+    rc = 0;
+done:
+    qemu_free(encoded);
+    return rc;
+}
+
  static inline void *host_from_stream_offset(QEMUFile *f,
                                              ram_addr_t offset,
                                              int flags)
@@ -328,16 +571,38 @@ static inline void *host_from_stream_offset(QEMUFile *f,
      return NULL;
  }
+static inline void *host_from_stream_offset_versioned(int version_id,
+        QEMUFile *f, ram_addr_t offset, int flags)
+{
+        void *host;
+        if (version_id == 3) {
+                host = qemu_get_ram_ptr(offset);
+        } else {
+                host = host_from_stream_offset(f, offset, flags);
+        }
+        if (!host) {
+            fprintf(stderr, "Failed to convert RAM address to host"
+                    " for offset 0x%lX!\n", offset);
+            abort();
+        }
+        return host;
+}
+
  int ram_load(QEMUFile *f, void *opaque, int version_id)
  {
      ram_addr_t addr;
-    int flags;
+    int flags, ret = 0;
+    static uint64_t seq_iter;
+
+    seq_iter++;
if (version_id<  3 || version_id>  4) {
-        return -EINVAL;
+        ret = -EINVAL;
+        goto done;
      }
do {
+        void *host;
          addr = qemu_get_be64(f);
flags = addr&  ~TARGET_PAGE_MASK;
@@ -346,7 +611,8 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
          if (flags&  RAM_SAVE_FLAG_MEM_SIZE) {
              if (version_id == 3) {
                  if (addr != ram_bytes_total()) {
-                    return -EINVAL;
+                    ret = -EINVAL;
+                    goto done;
                  }
              } else {
                  /* Synchronize RAM block list */
@@ -365,8 +631,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
QLIST_FOREACH(block,&ram_list.blocks, next) {
                          if (!strncmp(id, block->idstr, sizeof(id))) {
-                            if (block->length != length)
-                                return -EINVAL;
+                            if (block->length != length) {
+                                ret = -EINVAL;
+                                goto done;
+                            }
                              break;
                          }
                      }
@@ -374,7 +642,8 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
                      if (!block) {
                          fprintf(stderr, "Unknown ramblock \"%s\", cannot "
                                  "accept migration\n", id);
-                        return -EINVAL;
+                        ret = -EINVAL;
+                        goto done;
                      }
total_ram_bytes -= length;
@@ -383,17 +652,10 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
          }
if (flags&  RAM_SAVE_FLAG_COMPRESS) {
-            void *host;
              uint8_t ch;
-            if (version_id == 3)
-                host = qemu_get_ram_ptr(addr);
-            else
-                host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                return -EINVAL;
-            }
-
+            host = host_from_stream_offset_versioned(version_id,
+                            f, addr, flags);
              ch = qemu_get_byte(f);
              memset(host, ch, TARGET_PAGE_SIZE);
  #ifndef _WIN32
@@ -403,21 +665,28 @@ int ram_load(QEMUFile *f, void *opaque, int version_id)
              }
  #endif
          } else if (flags&  RAM_SAVE_FLAG_PAGE) {
-            void *host;
-
-            if (version_id == 3)
-                host = qemu_get_ram_ptr(addr);
-            else
-                host = host_from_stream_offset(f, addr, flags);
-
+            host = host_from_stream_offset_versioned(version_id,
+                            f, addr, flags);
              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
+        } else if (flags&  RAM_SAVE_FLAG_XBZRLE) {
+            host = host_from_stream_offset_versioned(version_id,
+                            f, addr, flags);
+            if (load_xbrle(f, addr, host)<  0) {
+                ret = -EINVAL;
+                goto done;
+            }
          }
+
          if (qemu_file_has_error(f)) {
-            return -EIO;
+            ret = -EIO;
+            goto done;
          }
      } while (!(flags&  RAM_SAVE_FLAG_EOS));
-    return 0;
+done:
+    DPRINTF("Completed load of VM with exit code %d seq iteration %ld\n",
+            ret, seq_iter);
+    return ret;
  }
void qemu_service_io(void)
diff --git a/block-migration.c b/block-migration.c
index 3e66f49..504df70 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -689,7 +689,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
      return 0;
  }
-static void block_set_params(int blk_enable, int shared_base, void *opaque)
+static void block_set_params(int blk_enable, int shared_base,
+        int use_xbrle, int64_t xbrle_cache_size, void *opaque)
  {
      block_mig_state.blk_enable = blk_enable;
      block_mig_state.shared_base = shared_base;
diff --git a/hash.h b/hash.h
new file mode 100644
index 0000000..7109905
--- /dev/null
+++ b/hash.h
@@ -0,0 +1,72 @@
+#ifndef _LINUX_HASH_H
+#define _LINUX_HASH_H
+/* Fast hashing routine for ints,  longs and pointers.
+   (C) 2002 William Lee Irwin III, IBM */
+
+/*
+ * Knuth recommends primes in approximately golden ratio to the maximum
+ * integer representable by a machine word for multiplicative hashing.
+ * Chuck Lever verified the effectiveness of this technique:
+ * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
+ *
+ * These primes are chosen to be bit-sparse, that is operations on
+ * them can use shifts and additions instead of multiplications for
+ * machines where multiplications are slow.
+ */
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+#define BITS_PER_LONG TARGET_LONG_BITS
+
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
+
+#if BITS_PER_LONG == 32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define hash_long(val, bits) hash_32(val, bits)
+#elif BITS_PER_LONG == 64
+#define hash_long(val, bits) hash_64(val, bits)
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
+#else
+#error Wordsize not 32 or 64
+#endif
+
+static inline u64 hash_64(u64 val, unsigned int bits)
+{
+    u64 hash = val;
+
+    /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
+    u64 n = hash;
+    n<<= 18;
+    hash -= n;
+    n<<= 33;
+    hash -= n;
+    n<<= 3;
+    hash += n;
+    n<<= 3;
+    hash -= n;
+    n<<= 4;
+    hash += n;
+    n<<= 2;
+    hash += n;
+
+    /* High bits are more random, so use them. */
+    return hash>>  (64 - bits);
+}
+
+static inline u32 hash_32(u32 val, unsigned int bits)
+{
+    /* On some cpus multiply is faster, on others gcc will do shifts */
+    u32 hash = val * GOLDEN_RATIO_PRIME_32;
+
+    /* High bits are more random, so use them. */
+    return hash>>  (32 - bits);
+}
+
+static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
+{
+    return hash_long((unsigned long)ptr, bits);
+}
+#endif /* _LINUX_HASH_H */
diff --git a/hmp-commands.hx b/hmp-commands.hx
old mode 100644
new mode 100755
index e5585ba..e49d5be
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -717,24 +717,27 @@ ETEXI
{
          .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
-        .params     = "[-d] [-b] [-i] uri",
-        .help       = "migrate to URI (using -d to not wait for completion)"
-                     "\n\t\t\t -b for migration without shared storage with"
-                     " full copy of disk\n\t\t\t -i for migration without "
-                     "shared storage with incremental copy of disk "
-                     "(base image shared between src and destination)",
+        .args_type  = "detach:-d,blk:-b,inc:-i,xbrle:-x,uri:s",
+        .params     = "[-d] [-b] [-i] [-x] uri",
+        .help       = "migrate to URI"
+                      "\n\t -d to not wait for completion"
+                      "\n\t -b for migration without shared storage with"
+                      " full copy of disk"
+                      "\n\t -i for migration without"
+                      " shared storage with incremental copy of disk"
+                      " (base image shared between source and destination)"
+                      "\n\t -x to use XBRLE page delta compression",
          .user_print = monitor_user_noop,
         .mhandler.cmd_new = do_migrate,
      },
-
  STEXI
-@item migrate [-d] [-b] [-i] @var{uri}
+@item migrate [-d] [-b] [-i] [-x] @var{uri}
  @findex migrate
  Migrate to @var{uri} (using -d to not wait for completion).
         -b for migration with full copy of disk
         -i for migration with incremental copy of disk (base image is shared)
+    -x to use XBRLE page delta compression
  ETEXI
{
@@ -753,10 +756,23 @@ Cancel the current VM migration.
  ETEXI
{
+        .name       = "migrate_set_cachesize",
+        .args_type  = "value:s",
+        .params     = "value",
+        .help       = "set cache size (in MB) for XBRLE migrations",
+        .mhandler.cmd = do_migrate_set_cachesize,
+    },
+
+STEXI
+@item migrate_set_cachesize @var{value}
+Set cache size (in MB) for xbrle migrations.
+ETEXI
+
+    {
          .name       = "migrate_set_speed",
          .args_type  = "value:o",
          .params     = "value",
-        .help       = "set maximum speed (in bytes) for migrations. "
+        .help       = "set maximum XBRLE cache size (in bytes) for migrations. "
         "Defaults to MB if no size suffix is specified, ie. B/K/M/G/T",
          .user_print = monitor_user_noop,
          .mhandler.cmd_new = do_migrate_set_speed,
diff --git a/hw/hw.h b/hw/hw.h
index 9d2cfc2..aa336ec 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -239,7 +239,8 @@ static inline void qemu_get_sbe64s(QEMUFile *f, int64_t *pv)
  int64_t qemu_ftell(QEMUFile *f);
  int64_t qemu_fseek(QEMUFile *f, int64_t pos, int whence);
-typedef void SaveSetParamsHandler(int blk_enable, int shared, void * opaque);
+typedef void SaveSetParamsHandler(int blk_enable, int shared,
+        int use_xbrle, int64_t xbrle_cache_size, void *opaque);
  typedef void SaveStateHandler(QEMUFile *f, void *opaque);
  typedef int SaveLiveStateHandler(Monitor *mon, QEMUFile *f, int stage,
                                   void *opaque);
diff --git a/lru.c b/lru.c
new file mode 100644
index 0000000..e7230d0
--- /dev/null
+++ b/lru.c
@@ -0,0 +1,142 @@
+#include<assert.h>
+#include<math.h>
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "host-utils.h"
+#include "lru.h"
+#include "hash.h"
+
+typedef struct CacheItem {
+    ram_addr_t it_addr;
+    uint8_t *it_data;
+    lru_free_cb_t it_free;
+    QCIRCLEQ_ENTRY(CacheItem) it_lru_next;
+    QCIRCLEQ_ENTRY(CacheItem) it_bucket_next;
+} CacheItem;
+
+typedef QCIRCLEQ_HEAD(, CacheItem) CacheBucket;
+static CacheBucket *page_hash;
+static int64_t cache_table_size;
+static uint64_t cache_max_items;
+static int64_t cache_num_items;
+static uint8_t cache_hash_bits;
+
+static QCIRCLEQ_HEAD(page_lru, CacheItem) page_lru;
+
+static uint64_t next_pow_of_2(uint64_t v)
+{
+    v--;
+    v |= v>>  1;
+    v |= v>>  2;
+    v |= v>>  4;
+    v |= v>>  8;
+    v |= v>>  16;
+    v |= v>>  32;
+    v++;
+    return v;
+}
+
+void lru_init(int64_t max_items, void *param)
+{
+    int i;
+
+    cache_num_items = 0;
+    cache_max_items = max_items;
+    /* add 20% to table size to reduce collisions */
+    cache_table_size = next_pow_of_2(1.2 * max_items);
+    cache_hash_bits = ctz64(cache_table_size) - 1;
+
+    QCIRCLEQ_INIT(&page_lru);
+
+    page_hash = qemu_mallocz(sizeof(CacheBucket) * cache_table_size);
+    assert(page_hash);
+    for (i = 0; i<  cache_table_size; i++) {
+        QCIRCLEQ_INIT(&page_hash[i]);
+    }
+}
+
+static CacheBucket *page_bucket_list(ram_addr_t addr)
+{
+    return&page_hash[hash_long(addr, cache_hash_bits)];
+}
+
+static void do_lru_remove(CacheItem *it)
+{
+    assert(it);
+
+    QCIRCLEQ_REMOVE(&page_lru, it, it_lru_next);
+    QCIRCLEQ_REMOVE(page_bucket_list(it->it_addr), it, it_bucket_next);
+    if (it->it_free) {
+        (*it->it_free)(it->it_data);
+    }
+    qemu_free(it);
+    cache_num_items--;
+}
+
+static int do_lru_remove_first(void)
+{
+    CacheItem *first;
+
+    if (QCIRCLEQ_EMPTY(&page_lru)) {
+        return -1;
+    }
+    first = QCIRCLEQ_FIRST(&page_lru);
+    do_lru_remove(first);
+    return 0;
+}
+
+
+void lru_fini(void)
+{
+    while (!do_lru_remove_first()) {
+    }
+    qemu_free(page_hash);
+}
+
+static CacheItem *do_lru_lookup(ram_addr_t addr)
+{
+    CacheBucket *head = page_bucket_list(addr);
+    CacheItem *it;
+
+    if (QCIRCLEQ_EMPTY(head)) {
+        return NULL;
+    }
+    QCIRCLEQ_FOREACH(it, head, it_bucket_next) {
+        if (addr == it->it_addr) {
+            return it;
+        }
+    }
+    return NULL;
+}
+
+uint8_t *lru_lookup(ram_addr_t addr)
+{
+    CacheItem *it = do_lru_lookup(addr);
+    return it ? it->it_data : NULL;
+}
+
+void lru_insert(ram_addr_t addr, uint8_t *data, lru_free_cb_t free_cb)
+{
+    CacheItem *it;
+
+    /* remove old if item exists */
+    it = do_lru_lookup(addr);
+    if (it) {
+        do_lru_remove(it);
+    }
+
+    /* evict LRU if require free space */
+    if (cache_num_items == cache_max_items) {
+        do_lru_remove_first();
+    }
+
+    /* add new entry */
+    it = qemu_mallocz(sizeof(*it));
+    it->it_addr = addr;
+    it->it_data = data;
+    it->it_free = free_cb;
+    QCIRCLEQ_INSERT_HEAD(page_bucket_list(addr), it, it_bucket_next);
+    QCIRCLEQ_INSERT_TAIL(&page_lru, it, it_lru_next);
+    cache_num_items++;
+}
+
diff --git a/lru.h b/lru.h
new file mode 100644
index 0000000..6c70095
--- /dev/null
+++ b/lru.h
@@ -0,0 +1,13 @@
+#ifndef _LRU_H_
+#define _LRU_H_
+
+#include<unistd.h>
+#include<stdint.h>
+#include "cpu-all.h"
+typedef void (*lru_free_cb_t)(void *);
+void lru_init(ssize_t num_items, void *param);
+void lru_fini(void);
+void lru_insert(ram_addr_t id, uint8_t *pdata, lru_free_cb_t free_cb);
+uint8_t *lru_lookup(ram_addr_t addr);
+#endif
+
diff --git a/migration-exec.c b/migration-exec.c
index 14718dd..fe8254a 100644
--- a/migration-exec.c
+++ b/migration-exec.c
@@ -67,7 +67,9 @@ MigrationState *exec_start_outgoing_migration(Monitor *mon,
                                               int64_t bandwidth_limit,
                                               int detach,
                                               int blk,
-                                             int inc)
+                          int inc,
+                          int use_xbrle,
+                          int64_t xbrle_cache_size)
  {
      FdMigrationState *s;
      FILE *f;
@@ -99,6 +101,8 @@ MigrationState *exec_start_outgoing_migration(Monitor *mon,
s->mig_state.blk = blk;
      s->mig_state.shared = inc;
+    s->mig_state.use_xbrle = use_xbrle;
+    s->mig_state.xbrle_cache_size = xbrle_cache_size;
s->state = MIG_STATE_ACTIVE;
      s->mon = NULL;
diff --git a/migration-fd.c b/migration-fd.c
index 6d14505..4a1ddbd 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -56,7 +56,9 @@ MigrationState *fd_start_outgoing_migration(Monitor *mon,
                                             int64_t bandwidth_limit,
                                             int detach,
                                             int blk,
-                                           int inc)
+                        int inc,
+                        int use_xbrle,
+                        int64_t xbrle_cache_size)
  {
      FdMigrationState *s;
@@ -82,6 +84,8 @@ MigrationState *fd_start_outgoing_migration(Monitor *mon,
s->mig_state.blk = blk;
      s->mig_state.shared = inc;
+    s->mig_state.use_xbrle = use_xbrle;
+    s->mig_state.xbrle_cache_size = xbrle_cache_size;
s->state = MIG_STATE_ACTIVE;
      s->mon = NULL;
diff --git a/migration-tcp.c b/migration-tcp.c
index b55f419..4ca5bf6 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -81,7 +81,9 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon,
                                               int64_t bandwidth_limit,
                                               int detach,
                                              int blk,
-                                            int inc)
+                         int inc,
+                         int use_xbrle,
+                         int64_t xbrle_cache_size)
  {
      struct sockaddr_in addr;
      FdMigrationState *s;
@@ -101,6 +103,8 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon,
s->mig_state.blk = blk;
      s->mig_state.shared = inc;
+    s->mig_state.use_xbrle = use_xbrle;
+    s->mig_state.xbrle_cache_size = xbrle_cache_size;
s->state = MIG_STATE_ACTIVE;
      s->mon = NULL;
diff --git a/migration-unix.c b/migration-unix.c
index 57232c0..0813902 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -80,7 +80,9 @@ MigrationState *unix_start_outgoing_migration(Monitor *mon,
                                               int64_t bandwidth_limit,
                                               int detach,
                                               int blk,
-                                             int inc)
+                          int inc,
+                          int use_xbrle,
+                          int64_t xbrle_cache_size)
  {
      FdMigrationState *s;
      struct sockaddr_un addr;
@@ -100,6 +102,8 @@ MigrationState *unix_start_outgoing_migration(Monitor *mon,
s->mig_state.blk = blk;
      s->mig_state.shared = inc;
+    s->mig_state.use_xbrle = use_xbrle;
+    s->mig_state.xbrle_cache_size = xbrle_cache_size;
s->state = MIG_STATE_ACTIVE;
      s->mon = NULL;
diff --git a/migration.c b/migration.c
old mode 100644
new mode 100755
index 9ee8b17..ccacf81
--- a/migration.c
+++ b/migration.c
@@ -34,6 +34,11 @@
  /* Migration speed throttling */
  static uint32_t max_throttle = (32<<  20);
+/* Migration XBRLE cache size */
+#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
+
+static int64_t migrate_cache_size = DEFAULT_MIGRATE_CACHE_SIZE;
+
  static MigrationState *current_migration;
int qemu_start_incoming_migration(const char *uri)
@@ -80,6 +85,7 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data)
      int detach = qdict_get_try_bool(qdict, "detach", 0);
      int blk = qdict_get_try_bool(qdict, "blk", 0);
      int inc = qdict_get_try_bool(qdict, "inc", 0);
+    int use_xbrle = qdict_get_try_bool(qdict, "xbrle", 0);
      const char *uri = qdict_get_str(qdict, "uri");
if (current_migration&&
@@ -90,17 +96,21 @@ int do_migrate(Monitor *mon, const QDict *qdict, QObject **ret_data)
if (strstart(uri, "tcp:",&p)) {
          s = tcp_start_outgoing_migration(mon, p, max_throttle, detach,
-                                         blk, inc);
+                                         blk, inc, use_xbrle,
+                                         migrate_cache_size);
  #if !defined(WIN32)
      } else if (strstart(uri, "exec:",&p)) {
          s = exec_start_outgoing_migration(mon, p, max_throttle, detach,
-                                          blk, inc);
+                                          blk, inc, use_xbrle,
+                                          migrate_cache_size);
      } else if (strstart(uri, "unix:",&p)) {
          s = unix_start_outgoing_migration(mon, p, max_throttle, detach,
-                                          blk, inc);
+                                          blk, inc, use_xbrle,
+                                          migrate_cache_size);
      } else if (strstart(uri, "fd:",&p)) {
          s = fd_start_outgoing_migration(mon, p, max_throttle, detach,
-                                        blk, inc);
+                                        blk, inc, use_xbrle,
+                                        migrate_cache_size);
  #endif
      } else {
          monitor_printf(mon, "unknown migration protocol: %s\n", uri);
@@ -185,6 +195,36 @@ static void migrate_print_status(Monitor *mon, const char *name,
                          qdict_get_int(qdict, "total")>>  10);
  }
+static void migrate_print_ram_status(Monitor *mon, const char *name,
+                                 const QDict *status_dict)
+{
+    QDict *qdict;
+    uint64_t overflow, cache_hit, cache_lookup;
+
+    qdict = qobject_to_qdict(qdict_get(status_dict, name));
+
+    monitor_printf(mon, "transferred %s: %" PRIu64 " kbytes\n", name,
+                        qdict_get_int(qdict, "bytes")>>  10);
+    monitor_printf(mon, "transferred %s: %" PRIu64 " pages\n", name,
+                        qdict_get_int(qdict, "pages"));
+    overflow = qdict_get_int(qdict, "overflow");
+    if (overflow>  0) {
+        monitor_printf(mon, "overflow %s: %" PRIu64 " pages\n", name,
+            overflow);
+    }
+    cache_hit = qdict_get_int(qdict, "cache-hit");
+    if (cache_hit>  0) {
+        monitor_printf(mon, "cache-hit %s: %" PRIu64 " pages\n", name,
+            cache_hit);
+    }
+    cache_lookup = qdict_get_int(qdict, "cache-lookup");
+    if (cache_lookup>  0) {
+        monitor_printf(mon, "cache-lookup %s: %" PRIu64 " pages\n", name,
+            cache_lookup);
+    }
+
+}
+
  void do_info_migrate_print(Monitor *mon, const QObject *data)
  {
      QDict *qdict;
@@ -198,6 +238,18 @@ void do_info_migrate_print(Monitor *mon, const QObject *data)
          migrate_print_status(mon, "ram", qdict);
      }
+    if (qdict_haskey(qdict, "ram-duplicate")) {
+        migrate_print_ram_status(mon, "ram-duplicate", qdict);
+    }
+
+    if (qdict_haskey(qdict, "ram-normal")) {
+        migrate_print_ram_status(mon, "ram-normal", qdict);
+    }
+
+    if (qdict_haskey(qdict, "ram-xbrle")) {
+        migrate_print_ram_status(mon, "ram-xbrle", qdict);
+    }
+
      if (qdict_haskey(qdict, "disk")) {
          migrate_print_status(mon, "disk", qdict);
      }
@@ -214,6 +266,23 @@ static void migrate_put_status(QDict *qdict, const char *name,
      qdict_put_obj(qdict, name, obj);
  }
+static void migrate_put_ram_status(QDict *qdict, const char *name,
+                               uint64_t bytes, uint64_t pages,
+                               uint64_t overflow, uint64_t cache_hit,
+                               uint64_t cache_lookup)
+{
+    QObject *obj;
+
+    obj = qobject_from_jsonf("{ 'bytes': %" PRId64 ", "
+                               "'pages': %" PRId64 ", "
+                               "'overflow': %" PRId64 ", "
+                               "'cache-hit': %" PRId64 ", "
+                               "'cache-lookup': %" PRId64 " }",
+                               bytes, pages, overflow, cache_hit,
+                               cache_lookup);
+    qdict_put_obj(qdict, name, obj);
+}
+
  void do_info_migrate(Monitor *mon, QObject **ret_data)
  {
      QDict *qdict;
@@ -228,6 +297,21 @@ void do_info_migrate(Monitor *mon, QObject **ret_data)
              migrate_put_status(qdict, "ram", ram_bytes_transferred(),
                                 ram_bytes_remaining(), ram_bytes_total());
+            if (s->use_xbrle) {
+                migrate_put_ram_status(qdict, "ram-duplicate",
+                                   dup_mig_bytes_transferred(),
+                                   dup_mig_pages_transferred(), 0, 0, 0);
+                migrate_put_ram_status(qdict, "ram-normal",
+                                   norm_mig_bytes_transferred(),
+                                   norm_mig_pages_transferred(), 0, 0, 0);
+                migrate_put_ram_status(qdict, "ram-xbrle",
+                                   xbrle_mig_bytes_transferred(),
+                                   xbrle_mig_pages_transferred(),
+                                   xbrle_mig_pages_overflow(),
+                                   xbrle_mig_pages_cache_hit(),
+                                   xbrle_mig_pages_cache_lookup());
+            }
+
              if (blk_mig_active()) {
                  migrate_put_status(qdict, "disk", blk_mig_bytes_transferred(),
                                     blk_mig_bytes_remaining(),
@@ -341,7 +425,8 @@ void migrate_fd_connect(FdMigrationState *s)
DPRINTF("beginning savevm\n");
      ret = qemu_savevm_state_begin(s->mon, s->file, s->mig_state.blk,
-                                  s->mig_state.shared);
+                                  s->mig_state.shared, s->mig_state.use_xbrle,
+                                  s->mig_state.xbrle_cache_size);
      if (ret<  0) {
          DPRINTF("failed, %d\n", ret);
          migrate_fd_error(s);
@@ -448,3 +533,27 @@ int migrate_fd_close(void *opaque)
      qemu_set_fd_handler2(s->fd, NULL, NULL, NULL, NULL);
      return s->close(s);
  }
+
+void do_migrate_set_cachesize(Monitor *mon, const QDict *qdict)
+{
+    ssize_t bytes;
+    const char *value = qdict_get_str(qdict, "value");
+
+    bytes = strtosz(value, NULL);
+    if (bytes<  0) {
+        monitor_printf(mon, "invalid cache size: %s\n", value);
+        return;
+    }
+
+    /* On 32-bit hosts, QEMU is limited by virtual address space */
+    if (bytes>  (2047<<  20)&&  HOST_LONG_BITS == 32) {
+        monitor_printf(mon, "cache can't exceed 2047 MB RAM limit on host\n");
+        return;
+    }
+    if (bytes != (uint64_t) bytes) {
+        monitor_printf(mon, "cache size too large\n");
+        return;
+    }
+    migrate_cache_size = bytes;
+}
+
diff --git a/migration.h b/migration.h
index d13ed4f..6dc0543 100644
--- a/migration.h
+++ b/migration.h
@@ -32,6 +32,8 @@ struct MigrationState
      void (*release)(MigrationState *s);
      int blk;
      int shared;
+    int use_xbrle;
+    int64_t xbrle_cache_size;
  };
typedef struct FdMigrationState FdMigrationState;
@@ -76,7 +78,9 @@ MigrationState *exec_start_outgoing_migration(Monitor *mon,
                                               int64_t bandwidth_limit,
                                               int detach,
                                               int blk,
-                                             int inc);
+                          int inc,
+                          int use_xbrle,
+                          int64_t xbrle_cache_size);
int tcp_start_incoming_migration(const char *host_port);
@@ -85,7 +89,9 @@ MigrationState *tcp_start_outgoing_migration(Monitor *mon,
                                              int64_t bandwidth_limit,
                                              int detach,
                                              int blk,
-                                            int inc);
+                         int inc,
+                         int use_xbrle,
+                         int64_t xbrle_cache_size);
int unix_start_incoming_migration(const char *path);
@@ -94,7 +100,9 @@ MigrationState *unix_start_outgoing_migration(Monitor *mon,
                                               int64_t bandwidth_limit,
                                               int detach,
                                               int blk,
-                                             int inc);
+                          int inc,
+                          int use_xbrle,
+                          int64_t xbrle_cache_size);
int fd_start_incoming_migration(const char *path);
@@ -103,7 +111,9 @@ MigrationState *fd_start_outgoing_migration(Monitor *mon,
                                             int64_t bandwidth_limit,
                                             int detach,
                                             int blk,
-                                           int inc);
+                        int inc,
+                        int use_xbrle,
+                        int64_t xbrle_cache_size);
void migrate_fd_monitor_suspend(FdMigrationState *s, Monitor *mon);
@@ -134,4 +144,11 @@ static inline FdMigrationState *migrate_to_fms(MigrationState *mig_state)
      return container_of(mig_state, FdMigrationState, mig_state);
  }
+void do_migrate_set_cachesize(Monitor *mon, const QDict *qdict);
+
+void arch_set_params(int blk_enable, int shared_base,
+        int use_xbrle, int64_t xbrle_cache_size, void *opaque);
+
+int xbrle_mig_active(void);
+
  #endif
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 793cf1c..8fbe64b 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -431,13 +431,16 @@ EQMP
{
          .name       = "migrate",
-        .args_type  = "detach:-d,blk:-b,inc:-i,uri:s",
-        .params     = "[-d] [-b] [-i] uri",
-        .help       = "migrate to URI (using -d to not wait for completion)"
-                     "\n\t\t\t -b for migration without shared storage with"
-                     " full copy of disk\n\t\t\t -i for migration without "
-                     "shared storage with incremental copy of disk "
-                     "(base image shared between src and destination)",
+        .args_type  = "detach:-d,blk:-b,inc:-i,xbrle:-x,uri:s",
+        .params     = "[-d] [-b] [-i] [-x] uri",
+        .help       = "migrate to URI"
+                      "\n\t -d to not wait for completion"
+                      "\n\t -b for migration without shared storage with"
+                      " full copy of disk"
+                      "\n\t -i for migration without"
+                      " shared storage with incremental copy of disk"
+                      " (base image shared between source and destination)"
+                      "\n\t -x to use XBRLE page delta compression",
          .user_print = monitor_user_noop,
         .mhandler.cmd_new = do_migrate,
      },
@@ -453,6 +456,7 @@ Arguments:
  - "blk": block migration, full disk copy (json-bool, optional)
  - "inc": incremental disk copy (json-bool, optional)
  - "uri": Destination URI (json-string)
+- "xbrle": to use XBRLE page delta compression
Example:
@@ -494,6 +498,31 @@ Example:
  EQMP
{
+        .name       = "migrate_set_cachesize",
+        .args_type  = "value:s",
+        .params     = "value",
+        .help       = "set cache size (in MB) for xbrle migrations",
+        .mhandler.cmd = do_migrate_set_cachesize,
+    },
+
+SQMP
+migrate_set_cachesize
+---------------------
+
+Set cache size to be used by XBRLE migration
+
+Arguments:
+
+- "value": cache size in bytes (json-number)
+
+Example:
+
+->  { "execute": "migrate_set_cachesize", "arguments": { "value": 500M } }
+<- { "return": {} }
+
+EQMP
+
+    {
          .name       = "migrate_set_speed",
          .args_type  = "value:f",
          .params     = "value",
diff --git a/savevm.c b/savevm.c
index 4e49765..93b512b 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1141,7 +1141,8 @@ int register_savevm(DeviceState *dev,
                      void *opaque)
  {
      return register_savevm_live(dev, idstr, instance_id, version_id,
-                                NULL, NULL, save_state, load_state, opaque);
+                                arch_set_params, NULL, save_state,
+                                load_state, opaque);
  }
void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
@@ -1428,15 +1429,17 @@ static int vmstate_save(QEMUFile *f, SaveStateEntry *se)
  #define QEMU_VM_SUBSECTION           0x05
int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable,
-                            int shared)
+                            int shared, int use_xbrle,
+                            int64_t xbrle_cache_size)
  {
      SaveStateEntry *se;
QTAILQ_FOREACH(se,&savevm_handlers, entry) {
          if(se->set_params == NULL) {
              continue;
-       }
-       se->set_params(blk_enable, shared, se->opaque);
+        }
+        se->set_params(blk_enable, shared, use_xbrle, xbrle_cache_size,
+                se->opaque);
      }
qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
@@ -1577,7 +1580,7 @@ static int qemu_savevm_state(Monitor *mon, QEMUFile *f)
bdrv_flush_all();
-    ret = qemu_savevm_state_begin(mon, f, 0, 0);
+    ret = qemu_savevm_state_begin(mon, f, 0, 0, 0, 0);
      if (ret<  0)
          goto out;
diff --git a/sysemu.h b/sysemu.h
index b81a70e..eb53bf7 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -44,6 +44,16 @@ uint64_t ram_bytes_remaining(void);
  uint64_t ram_bytes_transferred(void);
  uint64_t ram_bytes_total(void);
+uint64_t dup_mig_bytes_transferred(void);
+uint64_t dup_mig_pages_transferred(void);
+uint64_t norm_mig_bytes_transferred(void);
+uint64_t norm_mig_pages_transferred(void);
+uint64_t xbrle_mig_bytes_transferred(void);
+uint64_t xbrle_mig_pages_transferred(void);
+uint64_t xbrle_mig_pages_overflow(void);
+uint64_t xbrle_mig_pages_cache_lookup(void);
+uint64_t xbrle_mig_pages_cache_hit(void);
+
  int64_t cpu_get_ticks(void);
  void cpu_enable_ticks(void);
  void cpu_disable_ticks(void);
@@ -74,7 +84,8 @@ void qemu_announce_self(void);
  void main_loop_wait(int nonblocking);
int qemu_savevm_state_begin(Monitor *mon, QEMUFile *f, int blk_enable,
-                            int shared);
+                            int shared, int use_xbrle,
+                            int64_t xbrle_cache_size);
  int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f);
  int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f);
  void qemu_savevm_state_cancel(Monitor *mon, QEMUFile *f);
diff --git a/xbzrle.c b/xbzrle.c
new file mode 100644
index 0000000..e9285e0
--- /dev/null
+++ b/xbzrle.c
@@ -0,0 +1,126 @@
+#include<stdint.h>
+#include<string.h>
+#include<assert.h>
+#include "cpu-all.h"
+#include "xbzrle.h"
+
+typedef struct {
+    uint64_t c;
+    uint64_t num;
+} zero_encoding_t;
+
+typedef struct {
+    uint64_t c;
+} char_encoding_t;
+
+static int rle_encode(uint64_t *in, int slen, uint8_t *out, const int dlen)
+{
+    int dl = 0;
+    uint64_t cp = 0, c, run_len = 0;
+
+    if (slen<=  0)
+        return -1;
+
+    while (1) {
+        if (!slen)
+            break;
+        c = *in++;
+        slen--;
+        if (!(cp || c)) {
+            run_len++;
+        } else if (!cp) {
+            ((zero_encoding_t *)out)->c = cp;
+            ((zero_encoding_t *)out)->num = run_len;
+            dl += sizeof(zero_encoding_t);
+            out += sizeof(zero_encoding_t);
+            run_len = 1;
+        } else {
+            ((char_encoding_t *)out)->c = cp;
+            dl += sizeof(char_encoding_t);
+            out += sizeof(char_encoding_t);
+                }
+        cp = c;
+    }
+
+    if (!cp) {
+        ((zero_encoding_t *)out)->c = cp;
+        ((zero_encoding_t *)out)->num = run_len;
+        dl += sizeof(zero_encoding_t);
+        out += sizeof(zero_encoding_t);
+    } else {
+        ((char_encoding_t *)out)->c = cp;
+        dl += sizeof(char_encoding_t);
+        out += sizeof(char_encoding_t);
+    }
+    return dl;
+}
+
+static int rle_decode(const uint8_t *in, int slen, uint64_t *out, int dlen)
+{
+    int tb = 0;
+    uint64_t run_len, c;
+
+    while (slen>  0) {
+        c = ((char_encoding_t *) in)->c;
+        if (c) {
+            slen -= sizeof(char_encoding_t);
+            in += sizeof(char_encoding_t);
+            *out++ = c;
+            tb++;
+            continue;
+        }
+        run_len = ((zero_encoding_t *) in)->num;
+        slen -= sizeof(zero_encoding_t);
+        in += sizeof(zero_encoding_t);
+        while (run_len-->  0) {
+            *out++ = c;
+            tb++;
+        }
+    }
+    return tb;
+}
+
+static void xor_encode_word(uint8_t *dst, const uint8_t *src1,
+    const uint8_t *src2)
+{
+    int len = TARGET_PAGE_SIZE / sizeof (uint64_t);
+    uint64_t *dstw = (uint64_t *) dst;
+    const uint64_t *srcw1 = (const uint64_t *) src1;
+    const uint64_t *srcw2 = (const uint64_t *) src2;
+
+    while (len--) {
+        *dstw++ = *srcw1++ ^ *srcw2++;
+    }
+}
+
+int xbzrle_encode(uint8_t *xbzrle, const uint8_t *old, const uint8_t *curr,
+    const size_t max_compressed_len)
+{
+    int compressed_len;
+    uint8_t xor_buf[TARGET_PAGE_SIZE];
+    uint8_t work_buf[TARGET_PAGE_SIZE * 2]; /* worst case xbzrle is 150% */
+
+    xor_encode_word(xor_buf, old, curr);
+    compressed_len = rle_encode((uint64_t *)xor_buf,
+        sizeof(xor_buf)/sizeof(uint64_t), work_buf,
+        sizeof(work_buf));
+    if (compressed_len>  max_compressed_len) {
+        return -1;
+    }
+    memcpy(xbzrle, work_buf, compressed_len);
+    return compressed_len;
+}
+
+int xbzrle_decode(uint8_t *curr, const uint8_t *old, const uint8_t *xbrle,
+    const size_t compressed_len)
+{
+    uint8_t xor_buf[TARGET_PAGE_SIZE];
+
+    int len = rle_decode(xbrle, compressed_len,
+         (uint64_t *)xor_buf, sizeof(xor_buf)/sizeof(uint64_t));
+    if (len<  0) {
+        return len;
+    }
+    xor_encode_word(curr, old, xor_buf);
+    return len * sizeof(uint64_t);
+}
diff --git a/xbzrle.h b/xbzrle.h
new file mode 100644
index 0000000..5d625a0
--- /dev/null
+++ b/xbzrle.h
@@ -0,0 +1,12 @@
+#ifndef _XBZRLE_H_
+#define _XBZRLE_H_
+
+#include<stdio.h>
+
+int xbzrle_encode(uint8_t *xbrle, const uint8_t *old, const uint8_t *curr,
+    const size_t len);
+int xbzrle_decode(uint8_t *curr, const uint8_t *old, const uint8_t *xbrle,
+    const size_t len);
+
+#endif
+


    

Re: [libvirt] [Qemu-devel] [PATCH v4] XBZRLE delta for live migration of large memory apps

Anthony Liguori

Alexander Graf

Anthony Liguori

Avi Kivity

Avi Kivity

Anthony Liguori

Avi Kivity

Anthony Liguori

Avi Kivity

Anthony Liguori

Daniel P. Berrange

Avi Kivity

tags

participants (4)