Message ID | 1384762641-18297-3-git-send-email-pl@kamp.de |
---|---|
State | New |
Headers | show |
On Nov 18, 2013 12:20 AM, "Peter Lieven" <pl@kamp.de> wrote: > > vnc_update_client currently scans the dirty bitmap of each client > bitwise which is a very costly operation if only few bits are dirty. > vnc_refresh_server_surface does almost the same. > this patch optimizes both by utilizing the heavily optimized > function find_next_bit to find the offset of the next dirty > bit in the dirty bitmaps. > > Signed-off-by: Peter Lieven <pl@kamp.de> Can you include performance data? Regards, Anthony Liguori > --- > ui/vnc.c | 146 ++++++++++++++++++++++++++++++++++++++------------------------ > ui/vnc.h | 3 ++ > 2 files changed, 92 insertions(+), 57 deletions(-) > > diff --git a/ui/vnc.c b/ui/vnc.c > index 67b1f75..edf33be 100644 > --- a/ui/vnc.c > +++ b/ui/vnc.c > @@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y) > ptr += x * VNC_SERVER_FB_BYTES; > return ptr; > } > +/* this sets only the visible pixels of a dirty bitmap */ > +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\ > + int x, y;\ > + memset(bitmap, 0x00, sizeof(bitmap));\ > + for (y = 0; y < h; y++) {\ > + for (x = 0; x < w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\ > + set_bit(x, bitmap[y]);\ > + } \ > + } \ > + } > > static void vnc_dpy_switch(DisplayChangeListener *dcl, > DisplaySurface *surface) > @@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, > qemu_pixman_image_unref(vd->guest.fb); > vd->guest.fb = pixman_image_ref(surface->image); > vd->guest.format = surface->format; > - memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty)); > + VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty, > + surface_width(vd->ds), > + surface_height(vd->ds)); > > QTAILQ_FOREACH(vs, &vd->clients, next) { > vnc_colordepth(vs); > @@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, > if (vs->vd->cursor) { > vnc_cursor_define(vs); > } > - memset(vs->dirty, 0xFF, sizeof(vs->dirty)); > + VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty, > + surface_width(vd->ds), > + surface_height(vd->ds)); > } > } > > @@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int has_dirty) > return ret; > } > > +#define VNC_CLIENT_UPDATE_RECT() \ > + if (last_x != -1) {\ > + int h = find_and_clear_dirty_height(vs, y, last_x, x, height);\ > + n += vnc_job_add_rect(job,\ > + last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\ > + (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\ > + } > + > static int vnc_update_client(VncState *vs, int has_dirty) > { > if (vs->need_update && vs->csock != -1) { > @@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int has_dirty) > width = MIN(pixman_image_get_width(vd->server), vs->client_width); > height = MIN(pixman_image_get_height(vd->server), vs->client_height); > > - for (y = 0; y < height; y++) { > + y = 0; > + for (;;) { > int x; > int last_x = -1; > - for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { > + unsigned long offset = find_next_bit((unsigned long *) &vs->dirty, > + height * VNC_DIRTY_BITS_PER_LINE(vs), > + y * VNC_DIRTY_BITS_PER_LINE(vs)); > + if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) { > + /* no more dirty bits */ > + break; > + } > + y = offset / VNC_DIRTY_BITS_PER_LINE(vs); > + > + for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs); > + x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { > if (test_and_clear_bit(x, vs->dirty[y])) { > if (last_x == -1) { > last_x = x; > } > } else { > - if (last_x != -1) { > - int h = find_and_clear_dirty_height(vs, y, last_x, x, > - height); > - > - n += vnc_job_add_rect(job, > - last_x * VNC_DIRTY_PIXELS_PER_BIT, > - y, > - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, > - h); > - } > + VNC_CLIENT_UPDATE_RECT(); > last_x = -1; > } > } > - if (last_x != -1) { > - int h = find_and_clear_dirty_height(vs, y, last_x, x, height); > - n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT, > - y, > - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, > - h); > - } > + VNC_CLIENT_UPDATE_RECT(); > + y++; > } > > vnc_job_push(job); > @@ -2676,8 +2695,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd) > int width = pixman_image_get_width(vd->guest.fb); > int height = pixman_image_get_height(vd->guest.fb); > int y; > - uint8_t *guest_row; > - uint8_t *server_row; > + uint8_t *guest_row0 = NULL, *server_row0; > + int guest_stride, server_stride; > int cmp_bytes; > VncState *vs; > int has_dirty = 0; > @@ -2702,44 +2721,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd) > if (vd->guest.format != VNC_SERVER_FB_FORMAT) { > int width = pixman_image_get_width(vd->server); > tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width); > - } > - guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb); > - server_row = (uint8_t *)pixman_image_get_data(vd->server); > - for (y = 0; y < height; y++) { > - if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) { > - int x; > - uint8_t *guest_ptr; > - uint8_t *server_ptr; > + } else { > + guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb); > + } > + server_row0 = (uint8_t *)pixman_image_get_data(vd->server); > + guest_stride = pixman_image_get_stride(vd->guest.fb); > + server_stride = pixman_image_get_stride(vd->server); > + > + y = 0; > + for (;;) { > + int x; > + uint8_t *guest_ptr, *server_ptr; > + unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty, > + height * VNC_DIRTY_BITS_PER_LINE(&vd->guest), > + y * VNC_DIRTY_BITS_PER_LINE(&vd->guest)); > + if (offset == height * VNC_DIRTY_BITS_PER_LINE(&vd->guest)) { > + /* no more dirty bits */ > + break; > + } > + y = offset / VNC_DIRTY_BITS_PER_LINE(&vd->guest); > > - if (vd->guest.format != VNC_SERVER_FB_FORMAT) { > - qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y); > - guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); > - } else { > - guest_ptr = guest_row; > - } > - server_ptr = server_row; > + server_ptr = server_row0 + y * server_stride; > > - for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; > - x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, > - server_ptr += cmp_bytes) { > - if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), > - vd->guest.dirty[y])) { > - continue; > - } > - if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { > - continue; > - } > - memcpy(server_ptr, guest_ptr, cmp_bytes); > - if (!vd->non_adaptive) > - vnc_rect_updated(vd, x, y, &tv); > - QTAILQ_FOREACH(vs, &vd->clients, next) { > - set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); > - } > - has_dirty++; > + if (vd->guest.format != VNC_SERVER_FB_FORMAT) { > + qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y); > + guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); > + } else { > + guest_ptr = guest_row0 + y * guest_stride; > + } > + > + for (x = offset % VNC_DIRTY_BITS_PER_LINE(&vd->guest); > + x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; > + x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, > + server_ptr += cmp_bytes) { > + if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), > + vd->guest.dirty[y])) { > + continue; > + } > + if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { > + continue; > + } > + memcpy(server_ptr, guest_ptr, cmp_bytes); > + if (!vd->non_adaptive) { > + vnc_rect_updated(vd, x, y, &tv); > } > + QTAILQ_FOREACH(vs, &vd->clients, next) { > + set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); > + } > + has_dirty++; > } > - guest_row += pixman_image_get_stride(vd->guest.fb); > - server_row += pixman_image_get_stride(vd->server); > + > + y++; > } > qemu_pixman_image_unref(tmpbuf); > return has_dirty; > diff --git a/ui/vnc.h b/ui/vnc.h > index 4a8f33c..82c8ea8 100644 > --- a/ui/vnc.h > +++ b/ui/vnc.h > @@ -88,6 +88,9 @@ typedef void VncSendHextileTile(VncState *vs, > /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */ > #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) > > +/* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ > +#define VNC_DIRTY_BITS_PER_LINE(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE) > + > #define VNC_STAT_RECT 64 > #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT) > #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT) > -- > 1.7.9.5 > >
Am 18.11.2013 17:27, schrieb Anthony Liguori: > > > On Nov 18, 2013 12:20 AM, "Peter Lieven" <pl@kamp.de <mailto:pl@kamp.de>> wrote: > > > > vnc_update_client currently scans the dirty bitmap of each client > > bitwise which is a very costly operation if only few bits are dirty. > > vnc_refresh_server_surface does almost the same. > > this patch optimizes both by utilizing the heavily optimized > > function find_next_bit to find the offset of the next dirty > > bit in the dirty bitmaps. > > > > Signed-off-by: Peter Lieven <pl@kamp.de <mailto:pl@kamp.de>> > > Can you include performance data? > I hoped that the checking 32bits (pipelined) at once compared to checking 32bits one-by-one would be convincing enough ;-) Do you have a special test in mind? Otherwise I could try to create an artificial test case with e.g. no bits dirty, all bits dirty, only a few bits dirty (cursor update) and compare the timing for both versions. Peter
On 18.11.2013 17:27, Anthony Liguori wrote: > > > On Nov 18, 2013 12:20 AM, "Peter Lieven" <pl@kamp.de <mailto:pl@kamp.de>> wrote: > > > > vnc_update_client currently scans the dirty bitmap of each client > > bitwise which is a very costly operation if only few bits are dirty. > > vnc_refresh_server_surface does almost the same. > > this patch optimizes both by utilizing the heavily optimized > > function find_next_bit to find the offset of the next dirty > > bit in the dirty bitmaps. > > > > Signed-off-by: Peter Lieven <pl@kamp.de <mailto:pl@kamp.de>> > > Can you include performance data? > I made some aritificial analysis of vnc_update_client with the attached test code. $ gcc -O2 -o vnc_perf vnc_perf.c $ ./vnc_perf All bits clean - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.82 secs All bits dirty - vnc_update_client_new: 9.81 secs vnc_update_client_old: 20.00 secs Few bits dirty - vnc_update_client_new: 0.08 secs vnc_update_client_old: 10.62 secs find_and_clear_dirty_height() is still very slow, but I will look at this separately. Peter --- #define ITERATIONS 16*1024 #define VNC_MAX_WIDTH 2560 #define VNC_MAX_HEIGHT 2048 #define VNC_DIRTY_PIXELS_PER_BIT 16 /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */ #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) /* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ #define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BIT(nr) (1UL << (nr)) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define ctzl ctz64 #include <stdio.h> #include <stdint.h> #include <string.h> #include "time.h" static inline int ctz64(uint64_t val) { return val ? __builtin_ctzll(val) : 64; } DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS); unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG-1); unsigned long tmp; if (offset >= size) { return size; } size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp &= (~0UL << offset); if (size < BITS_PER_LONG) { goto found_first; } if (tmp) { goto found_middle; } size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size >= 4*BITS_PER_LONG) { unsigned long d1, d2, d3; tmp = *p; d1 = *(p+1); d2 = *(p+2); d3 = *(p+3); if (tmp) { goto found_middle; } if (d1 | d2 | d3) { break; } p += 4; result += 4*BITS_PER_LONG; size -= 4*BITS_PER_LONG; } while (size >= BITS_PER_LONG) { if ((tmp = *(p++))) { goto found_middle; } result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) { return result; } tmp = *p; found_first: tmp &= (~0UL >> (BITS_PER_LONG - size)); if (tmp == 0UL) { /* Are any bits set? */ return result + size; /* Nope. */ } found_middle: return result + ctzl(tmp); } static inline int test_bit(int nr, const unsigned long *addr) { return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); } static inline void clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p &= ~mask; } static inline void set_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p |= mask; } static inline int test_and_clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); unsigned long old = *p; *p = old & ~mask; return (old & mask) != 0; } static int find_and_clear_dirty_height(int y, int last_x, int x, int height) { int h; for (h = 1; h < (height - y); h++) { int tmp_x; if (!test_bit(last_x, dirty[y + h])) { break; } for (tmp_x = last_x; tmp_x < x; tmp_x++) { clear_bit(tmp_x, dirty[y + h]); } } return h; } #define VNC_CLIENT_UPDATE_RECT() \ if (last_x != -1) {\ int h = find_and_clear_dirty_height(y, last_x, x, height);\ } void vnc_update_client_new() { int width = VNC_MAX_WIDTH; int height = VNC_MAX_HEIGHT; int y = 0; for (;;) { int x; int last_x = -1; unsigned long offset = find_next_bit((unsigned long *) &dirty, height * VNC_DIRTY_BITS_PER_LINE(dirty), y * VNC_DIRTY_BITS_PER_LINE(dirty)); if (offset == height * VNC_DIRTY_BITS_PER_LINE(dirty)) { /* no more dirty bits */ break; } y = offset / VNC_DIRTY_BITS_PER_LINE(dirty); for (x = offset % VNC_DIRTY_BITS_PER_LINE(dirty); x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { if (test_and_clear_bit(x, dirty[y])) { if (last_x == -1) { last_x = x; } } else { VNC_CLIENT_UPDATE_RECT(); last_x = -1; } } VNC_CLIENT_UPDATE_RECT(); y++; }} void vnc_update_client_old() { int width = VNC_MAX_WIDTH; int height = VNC_MAX_HEIGHT; int y; for (y = 0; y < height; y++) { int x; int last_x = -1; for (x = 0; x < width / 16; x++) { if (test_and_clear_bit(x, dirty[y])) { if (last_x == -1) { last_x = x; } } else { VNC_CLIENT_UPDATE_RECT(); last_x = -1; } } VNC_CLIENT_UPDATE_RECT(); } } void main() { int i; clock_t start, end; start = clock(); for (i = 0; i < ITERATIONS; i++) { memset(dirty, 0x00, sizeof(dirty)); vnc_update_client_new(); } end = clock(); printf("All bits clean - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); start = clock(); for (i = 0; i < ITERATIONS; i++) { memset(dirty, 0x00, sizeof(dirty)); vnc_update_client_old(); } end = clock(); printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); start = clock(); for (i = 0; i < ITERATIONS; i++) { memset(dirty, 0xff, sizeof(dirty)); vnc_update_client_new(); } end = clock(); printf("All bits dirty - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); start = clock(); for (i = 0; i < ITERATIONS; i++) { memset(dirty, 0xff, sizeof(dirty)); vnc_update_client_old(); } end = clock(); printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); start = clock(); for (i = 0; i < ITERATIONS; i++) { int y; memset(dirty, 0x00, sizeof(dirty)); for (y = VNC_MAX_HEIGHT/2-8; y < VNC_MAX_HEIGHT/2+8; y++) { set_bit(VNC_DIRTY_BITS/2,dirty[y]); } vnc_update_client_new(); } end = clock(); printf("Few bits dirty - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); start = clock(); for (i = 0; i < ITERATIONS; i++) { int y; memset(dirty, 0x00, sizeof(dirty)); for (y = VNC_MAX_HEIGHT/2-8; y < VNC_MAX_HEIGHT/2+8; y++) { set_bit(VNC_DIRTY_BITS/2,dirty[y]); } vnc_update_client_old(); } end = clock(); printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); return; }
On 19.11.2013 14:48, Peter Lieven wrote: > On 18.11.2013 17:27, Anthony Liguori wrote: >> >> >> On Nov 18, 2013 12:20 AM, "Peter Lieven" <pl@kamp.de <mailto:pl@kamp.de>> wrote: >> > >> > vnc_update_client currently scans the dirty bitmap of each client >> > bitwise which is a very costly operation if only few bits are dirty. >> > vnc_refresh_server_surface does almost the same. >> > this patch optimizes both by utilizing the heavily optimized >> > function find_next_bit to find the offset of the next dirty >> > bit in the dirty bitmaps. >> > >> > Signed-off-by: Peter Lieven <pl@kamp.de <mailto:pl@kamp.de>> >> >> Can you include performance data? >> > > I made some aritificial analysis of vnc_update_client with the attached test code. > > $ gcc -O2 -o vnc_perf vnc_perf.c > $ ./vnc_perf > All bits clean - vnc_update_client_new: 0.07 secs > vnc_update_client_old: 10.82 secs > > All bits dirty - vnc_update_client_new: 9.81 secs > vnc_update_client_old: 20.00 secs > > Few bits dirty - vnc_update_client_new: 0.08 secs > vnc_update_client_old: 10.62 secs > > find_and_clear_dirty_height() is still very slow, but I will look at this separately. quite easy, but great effect: replacing: for (tmp_x = last_x; tmp_x < x; tmp_x++) { clear_bit(tmp_x, dirty[y + h]); } with: bitmap_clear(dirty[y + h], last_x, x - last_x); in find_and_clear_dirty_height(), yields the following performance ;-) All bits clean - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.65 secs All bits dirty - vnc_update_client_new: 0.69 secs vnc_update_client_old: 19.86 secs Few bits dirty - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.69 secs > Peter > > --- > > #define ITERATIONS 16*1024 > > #define VNC_MAX_WIDTH 2560 > #define VNC_MAX_HEIGHT 2048 > > #define VNC_DIRTY_PIXELS_PER_BIT 16 > > /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */ > #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) > > /* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ > #define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE) > > #define BITS_PER_BYTE 8 > #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) > > #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) > > #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) > > #define BIT(nr) (1UL << (nr)) > #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) > #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) > #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) > > #define DECLARE_BITMAP(name,bits) \ > unsigned long name[BITS_TO_LONGS(bits)] > > #define ctzl ctz64 > > #include <stdio.h> > #include <stdint.h> > #include <string.h> > #include "time.h" > > static inline int ctz64(uint64_t val) > { > return val ? __builtin_ctzll(val) : 64; > } > > DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS); > > unsigned long find_next_bit(const unsigned long *addr, unsigned long size, > unsigned long offset) > { > const unsigned long *p = addr + BITOP_WORD(offset); > unsigned long result = offset & ~(BITS_PER_LONG-1); > unsigned long tmp; > > if (offset >= size) { > return size; > } > size -= result; > offset %= BITS_PER_LONG; > if (offset) { > tmp = *(p++); > tmp &= (~0UL << offset); > if (size < BITS_PER_LONG) { > goto found_first; > } > if (tmp) { > goto found_middle; > } > size -= BITS_PER_LONG; > result += BITS_PER_LONG; > } > while (size >= 4*BITS_PER_LONG) { > unsigned long d1, d2, d3; > tmp = *p; > d1 = *(p+1); > d2 = *(p+2); > d3 = *(p+3); > if (tmp) { > goto found_middle; > } > if (d1 | d2 | d3) { > break; > } > p += 4; > result += 4*BITS_PER_LONG; > size -= 4*BITS_PER_LONG; > } > while (size >= BITS_PER_LONG) { > if ((tmp = *(p++))) { > goto found_middle; > } > result += BITS_PER_LONG; > size -= BITS_PER_LONG; > } > if (!size) { > return result; > } > tmp = *p; > > found_first: > tmp &= (~0UL >> (BITS_PER_LONG - size)); > if (tmp == 0UL) { /* Are any bits set? */ > return result + size; /* Nope. */ > } > found_middle: > return result + ctzl(tmp); > } > > static inline int test_bit(int nr, const unsigned long *addr) > { > return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); > } > > static inline void clear_bit(int nr, unsigned long *addr) > { > unsigned long mask = BIT_MASK(nr); > unsigned long *p = addr + BIT_WORD(nr); > > *p &= ~mask; > } > > static inline void set_bit(int nr, unsigned long *addr) > { > unsigned long mask = BIT_MASK(nr); > unsigned long *p = addr + BIT_WORD(nr); > > *p |= mask; > } > > static inline int test_and_clear_bit(int nr, unsigned long *addr) > { > unsigned long mask = BIT_MASK(nr); > unsigned long *p = addr + BIT_WORD(nr); > unsigned long old = *p; > > *p = old & ~mask; > return (old & mask) != 0; > } > > static int find_and_clear_dirty_height(int y, int last_x, int x, int height) > { > int h; > > for (h = 1; h < (height - y); h++) { > int tmp_x; > if (!test_bit(last_x, dirty[y + h])) { > break; > } > for (tmp_x = last_x; tmp_x < x; tmp_x++) { > clear_bit(tmp_x, dirty[y + h]); > } > } > > return h; > } > > #define VNC_CLIENT_UPDATE_RECT() \ > if (last_x != -1) {\ > int h = find_and_clear_dirty_height(y, last_x, x, height);\ > } > > void vnc_update_client_new() { > int width = VNC_MAX_WIDTH; > int height = VNC_MAX_HEIGHT; > int y = 0; > for (;;) { > int x; > int last_x = -1; > unsigned long offset = find_next_bit((unsigned long *) &dirty, > height * VNC_DIRTY_BITS_PER_LINE(dirty), > y * VNC_DIRTY_BITS_PER_LINE(dirty)); > if (offset == height * VNC_DIRTY_BITS_PER_LINE(dirty)) { > /* no more dirty bits */ > break; > } > y = offset / VNC_DIRTY_BITS_PER_LINE(dirty); > > for (x = offset % VNC_DIRTY_BITS_PER_LINE(dirty); > x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { > if (test_and_clear_bit(x, dirty[y])) { > if (last_x == -1) { > last_x = x; > } > } else { > VNC_CLIENT_UPDATE_RECT(); > last_x = -1; > } > } > VNC_CLIENT_UPDATE_RECT(); > y++; > }} > > void vnc_update_client_old() { > int width = VNC_MAX_WIDTH; > int height = VNC_MAX_HEIGHT; > int y; > for (y = 0; y < height; y++) { > int x; > int last_x = -1; > for (x = 0; x < width / 16; x++) { > if (test_and_clear_bit(x, dirty[y])) { > if (last_x == -1) { > last_x = x; > } > } else { > VNC_CLIENT_UPDATE_RECT(); > last_x = -1; > } > } > VNC_CLIENT_UPDATE_RECT(); > } > } > > void main() { > int i; > clock_t start, end; > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > memset(dirty, 0x00, sizeof(dirty)); > vnc_update_client_new(); > } > end = clock(); > printf("All bits clean - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > memset(dirty, 0x00, sizeof(dirty)); > vnc_update_client_old(); > } > end = clock(); > printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > memset(dirty, 0xff, sizeof(dirty)); > vnc_update_client_new(); > } > end = clock(); > printf("All bits dirty - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > memset(dirty, 0xff, sizeof(dirty)); > vnc_update_client_old(); > } > end = clock(); > printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > int y; > memset(dirty, 0x00, sizeof(dirty)); > for (y = VNC_MAX_HEIGHT/2-8; y < VNC_MAX_HEIGHT/2+8; y++) { > set_bit(VNC_DIRTY_BITS/2,dirty[y]); > } > vnc_update_client_new(); > } > end = clock(); > printf("Few bits dirty - vnc_update_client_new: %.2f secs\n", (double) (end-start)/CLOCKS_PER_SEC); > start = clock(); > for (i = 0; i < ITERATIONS; i++) { > int y; > memset(dirty, 0x00, sizeof(dirty)); > for (y = VNC_MAX_HEIGHT/2-8; y < VNC_MAX_HEIGHT/2+8; y++) { > set_bit(VNC_DIRTY_BITS/2,dirty[y]); > } > vnc_update_client_old(); > } > end = clock(); > printf(" vnc_update_client_old: %.2f secs\n\n", (double) (end-start)/CLOCKS_PER_SEC); > return; > } >
diff --git a/ui/vnc.c b/ui/vnc.c index 67b1f75..edf33be 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y) ptr += x * VNC_SERVER_FB_BYTES; return ptr; } +/* this sets only the visible pixels of a dirty bitmap */ +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\ + int x, y;\ + memset(bitmap, 0x00, sizeof(bitmap));\ + for (y = 0; y < h; y++) {\ + for (x = 0; x < w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\ + set_bit(x, bitmap[y]);\ + } \ + } \ + } static void vnc_dpy_switch(DisplayChangeListener *dcl, DisplaySurface *surface) @@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, qemu_pixman_image_unref(vd->guest.fb); vd->guest.fb = pixman_image_ref(surface->image); vd->guest.format = surface->format; - memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty)); + VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty, + surface_width(vd->ds), + surface_height(vd->ds)); QTAILQ_FOREACH(vs, &vd->clients, next) { vnc_colordepth(vs); @@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, if (vs->vd->cursor) { vnc_cursor_define(vs); } - memset(vs->dirty, 0xFF, sizeof(vs->dirty)); + VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty, + surface_width(vd->ds), + surface_height(vd->ds)); } } @@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int has_dirty) return ret; } +#define VNC_CLIENT_UPDATE_RECT() \ + if (last_x != -1) {\ + int h = find_and_clear_dirty_height(vs, y, last_x, x, height);\ + n += vnc_job_add_rect(job,\ + last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\ + (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\ + } + static int vnc_update_client(VncState *vs, int has_dirty) { if (vs->need_update && vs->csock != -1) { @@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int has_dirty) width = MIN(pixman_image_get_width(vd->server), vs->client_width); height = MIN(pixman_image_get_height(vd->server), vs->client_height); - for (y = 0; y < height; y++) { + y = 0; + for (;;) { int x; int last_x = -1; - for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { + unsigned long offset = find_next_bit((unsigned long *) &vs->dirty, + height * VNC_DIRTY_BITS_PER_LINE(vs), + y * VNC_DIRTY_BITS_PER_LINE(vs)); + if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) { + /* no more dirty bits */ + break; + } + y = offset / VNC_DIRTY_BITS_PER_LINE(vs); + + for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs); + x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) { if (test_and_clear_bit(x, vs->dirty[y])) { if (last_x == -1) { last_x = x; } } else { - if (last_x != -1) { - int h = find_and_clear_dirty_height(vs, y, last_x, x, - height); - - n += vnc_job_add_rect(job, - last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, - h); - } + VNC_CLIENT_UPDATE_RECT(); last_x = -1; } } - if (last_x != -1) { - int h = find_and_clear_dirty_height(vs, y, last_x, x, height); - n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, - h); - } + VNC_CLIENT_UPDATE_RECT(); + y++; } vnc_job_push(job); @@ -2676,8 +2695,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd) int width = pixman_image_get_width(vd->guest.fb); int height = pixman_image_get_height(vd->guest.fb); int y; - uint8_t *guest_row; - uint8_t *server_row; + uint8_t *guest_row0 = NULL, *server_row0; + int guest_stride, server_stride; int cmp_bytes; VncState *vs; int has_dirty = 0; @@ -2702,44 +2721,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd) if (vd->guest.format != VNC_SERVER_FB_FORMAT) { int width = pixman_image_get_width(vd->server); tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width); - } - guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb); - server_row = (uint8_t *)pixman_image_get_data(vd->server); - for (y = 0; y < height; y++) { - if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) { - int x; - uint8_t *guest_ptr; - uint8_t *server_ptr; + } else { + guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb); + } + server_row0 = (uint8_t *)pixman_image_get_data(vd->server); + guest_stride = pixman_image_get_stride(vd->guest.fb); + server_stride = pixman_image_get_stride(vd->server); + + y = 0; + for (;;) { + int x; + uint8_t *guest_ptr, *server_ptr; + unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty, + height * VNC_DIRTY_BITS_PER_LINE(&vd->guest), + y * VNC_DIRTY_BITS_PER_LINE(&vd->guest)); + if (offset == height * VNC_DIRTY_BITS_PER_LINE(&vd->guest)) { + /* no more dirty bits */ + break; + } + y = offset / VNC_DIRTY_BITS_PER_LINE(&vd->guest); - if (vd->guest.format != VNC_SERVER_FB_FORMAT) { - qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y); - guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); - } else { - guest_ptr = guest_row; - } - server_ptr = server_row; + server_ptr = server_row0 + y * server_stride; - for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; - x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, - server_ptr += cmp_bytes) { - if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), - vd->guest.dirty[y])) { - continue; - } - if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { - continue; - } - memcpy(server_ptr, guest_ptr, cmp_bytes); - if (!vd->non_adaptive) - vnc_rect_updated(vd, x, y, &tv); - QTAILQ_FOREACH(vs, &vd->clients, next) { - set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); - } - has_dirty++; + if (vd->guest.format != VNC_SERVER_FB_FORMAT) { + qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y); + guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf); + } else { + guest_ptr = guest_row0 + y * guest_stride; + } + + for (x = offset % VNC_DIRTY_BITS_PER_LINE(&vd->guest); + x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width; + x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes, + server_ptr += cmp_bytes) { + if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT), + vd->guest.dirty[y])) { + continue; + } + if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) { + continue; + } + memcpy(server_ptr, guest_ptr, cmp_bytes); + if (!vd->non_adaptive) { + vnc_rect_updated(vd, x, y, &tv); } + QTAILQ_FOREACH(vs, &vd->clients, next) { + set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]); + } + has_dirty++; } - guest_row += pixman_image_get_stride(vd->guest.fb); - server_row += pixman_image_get_stride(vd->server); + + y++; } qemu_pixman_image_unref(tmpbuf); return has_dirty; diff --git a/ui/vnc.h b/ui/vnc.h index 4a8f33c..82c8ea8 100644 --- a/ui/vnc.h +++ b/ui/vnc.h @@ -88,6 +88,9 @@ typedef void VncSendHextileTile(VncState *vs, /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */ #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) +/* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ +#define VNC_DIRTY_BITS_PER_LINE(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE) + #define VNC_STAT_RECT 64 #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT) #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)
vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven <pl@kamp.de> --- ui/vnc.c | 146 ++++++++++++++++++++++++++++++++++++++------------------------ ui/vnc.h | 3 ++ 2 files changed, 92 insertions(+), 57 deletions(-)