diff mbox

[PATCHv3,3/6] ui/vnc: optimize dirty bitmap tracking

Message ID 1388944951-14767-4-git-send-email-pl@kamp.de
State New
Headers show

Commit Message

Peter Lieven Jan. 5, 2014, 6:02 p.m. UTC
vnc_update_client currently scans the dirty bitmap of each client
bitwise which is a very costly operation if only few bits are dirty.
vnc_refresh_server_surface does almost the same.
this patch optimizes both by utilizing the heavily optimized
function find_next_bit to find the offset of the next dirty
bit in the dirty bitmaps.

The following artifical test (just the bitmap operation part) running
vnc_update_client 65536 times on a 2560x2048 surface illustrates the
performance difference:

All bits clean - vnc_update_client_new: 0.07 secs
                 vnc_update_client_old: 10.98 secs

All bits dirty - vnc_update_client_new: 11.26 secs
                 vnc_update_client_old: 20.19 secs

Few bits dirty - vnc_update_client_new: 0.08 secs
                 vnc_update_client_old: 10.98 secs

The case for all bits dirty is still rather slow, this
is due to the implementation of find_and_clear_dirty_height.
This will be addresses in a separate patch.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 ui/vnc.c |  154 +++++++++++++++++++++++++++++++++-----------------------------
 ui/vnc.h |    4 ++
 2 files changed, 87 insertions(+), 71 deletions(-)

Comments

Wayne Xia Jan. 6, 2014, 10:08 a.m. UTC | #1
于 2014/1/6 2:02, Peter Lieven 写道:
> vnc_update_client currently scans the dirty bitmap of each client
> bitwise which is a very costly operation if only few bits are dirty.
> vnc_refresh_server_surface does almost the same.
> this patch optimizes both by utilizing the heavily optimized
> function find_next_bit to find the offset of the next dirty
> bit in the dirty bitmaps.
> 
> The following artifical test (just the bitmap operation part) running
> vnc_update_client 65536 times on a 2560x2048 surface illustrates the
> performance difference:
> 
> All bits clean - vnc_update_client_new: 0.07 secs
>                   vnc_update_client_old: 10.98 secs
> 
> All bits dirty - vnc_update_client_new: 11.26 secs
>                   vnc_update_client_old: 20.19 secs
> 
> Few bits dirty - vnc_update_client_new: 0.08 secs
>                   vnc_update_client_old: 10.98 secs
> 
> The case for all bits dirty is still rather slow, this
> is due to the implementation of find_and_clear_dirty_height.
> This will be addresses in a separate patch.
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>   ui/vnc.c |  154 +++++++++++++++++++++++++++++++++-----------------------------
>   ui/vnc.h |    4 ++
>   2 files changed, 87 insertions(+), 71 deletions(-)
> 
> diff --git a/ui/vnc.c b/ui/vnc.c
> index 1d2aa1a..6a0c03e 100644
> --- a/ui/vnc.c
> +++ b/ui/vnc.c
> @@ -572,6 +572,14 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
>       ptr += x * VNC_SERVER_FB_BYTES;
>       return ptr;
>   }
> +/* this sets only the visible pixels of a dirty bitmap */
> +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
> +        int y;\
> +        memset(bitmap, 0x00, sizeof(bitmap));\
> +        for (y = 0; y < h; y++) {\
> +            bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\

  Will it be a problem when vnc's width % VNC_DIRTY_PIXELS_PER_BIT != 0?
Although it is a rare case, but I think it is better round it up, since
"v" and "VNC_DIRTY_PIXELS_PER_BIT" are variables. A macro computing it
would be nice:

#define VNC_DIRTY_BITS_FROM_WIDTH(w) (w + VNC_DIRTY_PIXELS_PER_BIT - 1/
VNC_DIRTY_PIXELS_PER_BIT)
#define VNC_DIRTY_BITS (VNC_DIRTY_BITS_FROM_WIDTH(VNC_MAX_WIDTH)

then here:
    bitmap_set(bitmap[y], 0, VNC_DIRTY_BITS_FROM_WIDTH(w));

Or simply warn or coredump when v % VNC_DIRTY_PIXELS_PER_BIT != 0.

Also, in vnc.h:
/* VNC_MAX_WIDTH must be a multiple of 16. */
#define VNC_MAX_WIDTH 2560
#define VNC_MAX_HEIGHT 2048

Maybe it should be updated as:
/* VNC_MAX_WIDTH must be a multiple of VNC_DIRTY_PIXELS_PER_BIT. */

> +        } \
> +    }
> 
>   static void vnc_dpy_switch(DisplayChangeListener *dcl,
>                              DisplaySurface *surface)
> @@ -597,7 +605,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>       qemu_pixman_image_unref(vd->guest.fb);
>       vd->guest.fb = pixman_image_ref(surface->image);
>       vd->guest.format = surface->format;
> -    memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty));
> +    VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty,
> +                                 surface_width(vd->ds),
> +                                 surface_height(vd->ds));
> 
>       QTAILQ_FOREACH(vs, &vd->clients, next) {
>           vnc_colordepth(vs);
> @@ -605,7 +615,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>           if (vs->vd->cursor) {
>               vnc_cursor_define(vs);
>           }
> -        memset(vs->dirty, 0xFF, sizeof(vs->dirty));
> +        VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty,
> +                                     surface_width(vd->ds),
> +                                     surface_height(vd->ds));
>       }
>   }
> 
> @@ -889,10 +901,9 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>           VncDisplay *vd = vs->vd;
>           VncJob *job;
>           int y;
> -        int width, height;
> +        int height;
>           int n = 0;
> 
> -
>           if (vs->output.offset && !vs->audio_cap && !vs->force_update)
>               /* kernel send buffers are full -> drop frames to throttle */
>               return 0;
> @@ -908,39 +919,27 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>            */
>           job = vnc_job_new(vs);
> 
> -        width = MIN(pixman_image_get_width(vd->server), vs->client_width);
>           height = MIN(pixman_image_get_height(vd->server), vs->client_height);
> 
> -        for (y = 0; y < height; y++) {
> -            int x;
> -            int last_x = -1;
> -            for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
> -                if (test_and_clear_bit(x, vs->dirty[y])) {
> -                    if (last_x == -1) {
> -                        last_x = x;
> -                    }
> -                } else {
> -                    if (last_x != -1) {
> -                        int h = find_and_clear_dirty_height(vs, y, last_x, x,
> -                                                            height);
> -
> -                        n += vnc_job_add_rect(job,
> -                                              last_x * VNC_DIRTY_PIXELS_PER_BIT,
> -                                              y,
> -                                              (x - last_x) *
> -                                              VNC_DIRTY_PIXELS_PER_BIT,
> -                                              h);
> -                    }
> -                    last_x = -1;
> -                }
> -            }
> -            if (last_x != -1) {
> -                int h = find_and_clear_dirty_height(vs, y, last_x, x, height);
> -                n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT,
> -                                      y,
> -                                      (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT,
> -                                      h);
> +        y = 0;
> +        for (;;) {
> +            int x, h;
> +            unsigned long x2;
> +            unsigned long offset = find_next_bit((unsigned long *) &vs->dirty,
> +                                                 height * VNC_DIRTY_BPL(vs),
> +                                                 y * VNC_DIRTY_BPL(vs));
> +            if (offset == height * VNC_DIRTY_BPL(vs)) {
> +                /* no more dirty bits */
> +                break;
>               }
> +            y = offset / VNC_DIRTY_BPL(vs);
> +            x = offset % VNC_DIRTY_BPL(vs);
> +            x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y],
> +                                    VNC_DIRTY_BPL(vs), x);
> +            bitmap_clear(vs->dirty[y], x, x2 - x);
> +            h = find_and_clear_dirty_height(vs, y, x, x2, height);
> +            n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y,
> +                                  (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h);
>           }

Minor comments:
  VNC_DIRTY_BPL(vs) is accessing memory by pointer, should we use a
variable instead of VNC_DIRTY_BPL(vs) in every place, in case of
compiler didn't optimize it for us?

> 
>           vnc_job_push(job);
> @@ -2678,8 +2677,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>       int width = pixman_image_get_width(vd->guest.fb);
>       int height = pixman_image_get_height(vd->guest.fb);
>       int y;
> -    uint8_t *guest_row;
> -    uint8_t *server_row;
> +    uint8_t *guest_row0 = NULL, *server_row0;
  Any reason that rename those variable?

> +    int guest_stride = 0, server_stride;
>       int cmp_bytes;
>       VncState *vs;
>       int has_dirty = 0;
> @@ -2704,44 +2703,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>       if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>           int width = pixman_image_get_width(vd->server);
>           tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
> -    }
> -    guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb);
> -    server_row = (uint8_t *)pixman_image_get_data(vd->server);
> -    for (y = 0; y < height; y++) {
> -        if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) {
> -            int x;
> -            uint8_t *guest_ptr;
> -            uint8_t *server_ptr;
> -
> -            if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
> -                qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
> -                guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
> -            } else {
> -                guest_ptr = guest_row;
> -            }
> -            server_ptr = server_row;
> +    } else {
> +        guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
> +        guest_stride = pixman_image_get_stride(vd->guest.fb);
> +    }
> +    server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
> +    server_stride = pixman_image_get_stride(vd->server);
> +
> +    y = 0;
> +    for (;;) {
> +        int x;
> +        uint8_t *guest_ptr, *server_ptr;
> +        unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty,
> +                                             height * VNC_DIRTY_BPL(&vd->guest),
> +                                             y * VNC_DIRTY_BPL(&vd->guest));
> +        if (offset == height * VNC_DIRTY_BPL(&vd->guest)) {
> +            /* no more dirty bits */
> +            break;
> +        }
> +        y = offset / VNC_DIRTY_BPL(&vd->guest);
> 
> -            for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
> -                 x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
> -                 server_ptr += cmp_bytes) {
> -                if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
> -                    vd->guest.dirty[y])) {
> -                    continue;
> -                }
> -                if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
> -                    continue;
> -                }
> -                memcpy(server_ptr, guest_ptr, cmp_bytes);
> -                if (!vd->non_adaptive)
> -                    vnc_rect_updated(vd, x, y, &tv);
> -                QTAILQ_FOREACH(vs, &vd->clients, next) {
> -                    set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
> -                }
> -                has_dirty++;
> +        server_ptr = server_row0 + y * server_stride;
> +
> +        if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
> +            qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
> +            guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
> +        } else {
> +            guest_ptr = guest_row0 + y * guest_stride;
> +        }
> +
> +        for (x = offset % VNC_DIRTY_BPL(&vd->guest);
> +             x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
> +             x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
> +             server_ptr += cmp_bytes) {
> +            if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
> +                vd->guest.dirty[y])) {
> +                continue;
> +            }
> +            if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
> +                continue;
> +            }
> +            memcpy(server_ptr, guest_ptr, cmp_bytes);
> +            if (!vd->non_adaptive) {
> +                vnc_rect_updated(vd, x, y, &tv);
>               }
> +            QTAILQ_FOREACH(vs, &vd->clients, next) {
> +                set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
> +            }
> +            has_dirty++;
>           }
> -        guest_row  += pixman_image_get_stride(vd->guest.fb);
> -        server_row += pixman_image_get_stride(vd->server);
> +
> +        y++;
>       }
>       qemu_pixman_image_unref(tmpbuf);
>       return has_dirty;
> diff --git a/ui/vnc.h b/ui/vnc.h
> index 4a8f33c..07e1f59 100644
> --- a/ui/vnc.h
> +++ b/ui/vnc.h
> @@ -88,6 +88,10 @@ typedef void VncSendHextileTile(VncState *vs,
>   /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
>   #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)
> 
> +/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than
> + * VNC_DIRTY_BITS due to alignment */
> +#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE)
> +
>   #define VNC_STAT_RECT  64
>   #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT)
>   #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)
>
Peter Lieven Jan. 6, 2014, 1:31 p.m. UTC | #2
On 06.01.2014 11:08, Wenchao Xia wrote:
> 于 2014/1/6 2:02, Peter Lieven 写道:
>> vnc_update_client currently scans the dirty bitmap of each client
>> bitwise which is a very costly operation if only few bits are dirty.
>> vnc_refresh_server_surface does almost the same.
>> this patch optimizes both by utilizing the heavily optimized
>> function find_next_bit to find the offset of the next dirty
>> bit in the dirty bitmaps.
>>
>> The following artifical test (just the bitmap operation part) running
>> vnc_update_client 65536 times on a 2560x2048 surface illustrates the
>> performance difference:
>>
>> All bits clean - vnc_update_client_new: 0.07 secs
>>                   vnc_update_client_old: 10.98 secs
>>
>> All bits dirty - vnc_update_client_new: 11.26 secs
>>                   vnc_update_client_old: 20.19 secs
>>
>> Few bits dirty - vnc_update_client_new: 0.08 secs
>>                   vnc_update_client_old: 10.98 secs
>>
>> The case for all bits dirty is still rather slow, this
>> is due to the implementation of find_and_clear_dirty_height.
>> This will be addresses in a separate patch.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>   ui/vnc.c |  154 +++++++++++++++++++++++++++++++++-----------------------------
>>   ui/vnc.h |    4 ++
>>   2 files changed, 87 insertions(+), 71 deletions(-)
>>
>> diff --git a/ui/vnc.c b/ui/vnc.c
>> index 1d2aa1a..6a0c03e 100644
>> --- a/ui/vnc.c
>> +++ b/ui/vnc.c
>> @@ -572,6 +572,14 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
>>       ptr += x * VNC_SERVER_FB_BYTES;
>>       return ptr;
>>   }
>> +/* this sets only the visible pixels of a dirty bitmap */
>> +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
>> +        int y;\
>> +        memset(bitmap, 0x00, sizeof(bitmap));\
>> +        for (y = 0; y < h; y++) {\
>> +            bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\
>   Will it be a problem when vnc's width % VNC_DIRTY_PIXELS_PER_BIT != 0?
> Although it is a rare case, but I think it is better round it up, since
> "v" and "VNC_DIRTY_PIXELS_PER_BIT" are variables. A macro computing it
> would be nice:
Good point, I will use DIV_ROUND_UP here.
>
> #define VNC_DIRTY_BITS_FROM_WIDTH(w) (w + VNC_DIRTY_PIXELS_PER_BIT - 1/
> VNC_DIRTY_PIXELS_PER_BIT)
> #define VNC_DIRTY_BITS (VNC_DIRTY_BITS_FROM_WIDTH(VNC_MAX_WIDTH)
>
> then here:
>     bitmap_set(bitmap[y], 0, VNC_DIRTY_BITS_FROM_WIDTH(w));
>
> Or simply warn or coredump when v % VNC_DIRTY_PIXELS_PER_BIT != 0.
>
> Also, in vnc.h:
> /* VNC_MAX_WIDTH must be a multiple of 16. */
> #define VNC_MAX_WIDTH 2560
> #define VNC_MAX_HEIGHT 2048
>
> Maybe it should be updated as:
> /* VNC_MAX_WIDTH must be a multiple of VNC_DIRTY_PIXELS_PER_BIT. */
correct. will fix as well.
>
>> +        } \
>> +    }
>>
>>   static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>                              DisplaySurface *surface)
>> @@ -597,7 +605,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>       qemu_pixman_image_unref(vd->guest.fb);
>>       vd->guest.fb = pixman_image_ref(surface->image);
>>       vd->guest.format = surface->format;
>> -    memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty));
>> +    VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty,
>> +                                 surface_width(vd->ds),
>> +                                 surface_height(vd->ds));
>>
>>       QTAILQ_FOREACH(vs, &vd->clients, next) {
>>           vnc_colordepth(vs);
>> @@ -605,7 +615,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>           if (vs->vd->cursor) {
>>               vnc_cursor_define(vs);
>>           }
>> -        memset(vs->dirty, 0xFF, sizeof(vs->dirty));
>> +        VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty,
>> +                                     surface_width(vd->ds),
>> +                                     surface_height(vd->ds));
>>       }
>>   }
>>
>> @@ -889,10 +901,9 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>>           VncDisplay *vd = vs->vd;
>>           VncJob *job;
>>           int y;
>> -        int width, height;
>> +        int height;
>>           int n = 0;
>>
>> -
>>           if (vs->output.offset && !vs->audio_cap && !vs->force_update)
>>               /* kernel send buffers are full -> drop frames to throttle */
>>               return 0;
>> @@ -908,39 +919,27 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>>            */
>>           job = vnc_job_new(vs);
>>
>> -        width = MIN(pixman_image_get_width(vd->server), vs->client_width);
>>           height = MIN(pixman_image_get_height(vd->server), vs->client_height);
>>
>> -        for (y = 0; y < height; y++) {
>> -            int x;
>> -            int last_x = -1;
>> -            for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
>> -                if (test_and_clear_bit(x, vs->dirty[y])) {
>> -                    if (last_x == -1) {
>> -                        last_x = x;
>> -                    }
>> -                } else {
>> -                    if (last_x != -1) {
>> -                        int h = find_and_clear_dirty_height(vs, y, last_x, x,
>> -                                                            height);
>> -
>> -                        n += vnc_job_add_rect(job,
>> -                                              last_x * VNC_DIRTY_PIXELS_PER_BIT,
>> -                                              y,
>> -                                              (x - last_x) *
>> -                                              VNC_DIRTY_PIXELS_PER_BIT,
>> -                                              h);
>> -                    }
>> -                    last_x = -1;
>> -                }
>> -            }
>> -            if (last_x != -1) {
>> -                int h = find_and_clear_dirty_height(vs, y, last_x, x, height);
>> -                n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT,
>> -                                      y,
>> -                                      (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT,
>> -                                      h);
>> +        y = 0;
>> +        for (;;) {
>> +            int x, h;
>> +            unsigned long x2;
>> +            unsigned long offset = find_next_bit((unsigned long *) &vs->dirty,
>> +                                                 height * VNC_DIRTY_BPL(vs),
>> +                                                 y * VNC_DIRTY_BPL(vs));
>> +            if (offset == height * VNC_DIRTY_BPL(vs)) {
>> +                /* no more dirty bits */
>> +                break;
>>               }
>> +            y = offset / VNC_DIRTY_BPL(vs);
>> +            x = offset % VNC_DIRTY_BPL(vs);
>> +            x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y],
>> +                                    VNC_DIRTY_BPL(vs), x);
>> +            bitmap_clear(vs->dirty[y], x, x2 - x);
>> +            h = find_and_clear_dirty_height(vs, y, x, x2, height);
>> +            n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y,
>> +                                  (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h);
>>           }
> Minor comments:
>   VNC_DIRTY_BPL(vs) is accessing memory by pointer, should we use a
> variable instead of VNC_DIRTY_BPL(vs) in every place, in case of
> compiler didn't optimize it for us?
I am pretty sure that sizeof is evaluated at compile time or do you have other
evidence?
>
>>           vnc_job_push(job);
>> @@ -2678,8 +2677,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>>       int width = pixman_image_get_width(vd->guest.fb);
>>       int height = pixman_image_get_height(vd->guest.fb);
>>       int y;
>> -    uint8_t *guest_row;
>> -    uint8_t *server_row;
>> +    uint8_t *guest_row0 = NULL, *server_row0;
>   Any reason that rename those variable?
Its actually a pointer to row0 and not to any specific row. This is why
I renamed it.
>
>> +    int guest_stride = 0, server_stride;
>>       int cmp_bytes;
>>       VncState *vs;
>>       int has_dirty = 0;
>> @@ -2704,44 +2703,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>>       if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>>           int width = pixman_image_get_width(vd->server);
>>           tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
>> -    }
>> -    guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>> -    server_row = (uint8_t *)pixman_image_get_data(vd->server);
>> -    for (y = 0; y < height; y++) {
>> -        if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) {
>> -            int x;
>> -            uint8_t *guest_ptr;
>> -            uint8_t *server_ptr;
>> -
>> -            if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>> -                qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>> -                guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>> -            } else {
>> -                guest_ptr = guest_row;
>> -            }
>> -            server_ptr = server_row;
>> +    } else {
>> +        guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>> +        guest_stride = pixman_image_get_stride(vd->guest.fb);
>> +    }
>> +    server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
>> +    server_stride = pixman_image_get_stride(vd->server);
>> +
>> +    y = 0;
>> +    for (;;) {
>> +        int x;
>> +        uint8_t *guest_ptr, *server_ptr;
>> +        unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty,
>> +                                             height * VNC_DIRTY_BPL(&vd->guest),
>> +                                             y * VNC_DIRTY_BPL(&vd->guest));
>> +        if (offset == height * VNC_DIRTY_BPL(&vd->guest)) {
>> +            /* no more dirty bits */
>> +            break;
>> +        }
>> +        y = offset / VNC_DIRTY_BPL(&vd->guest);
>>
>> -            for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>> -                 x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>> -                 server_ptr += cmp_bytes) {
>> -                if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>> -                    vd->guest.dirty[y])) {
>> -                    continue;
>> -                }
>> -                if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>> -                    continue;
>> -                }
>> -                memcpy(server_ptr, guest_ptr, cmp_bytes);
>> -                if (!vd->non_adaptive)
>> -                    vnc_rect_updated(vd, x, y, &tv);
>> -                QTAILQ_FOREACH(vs, &vd->clients, next) {
>> -                    set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>> -                }
>> -                has_dirty++;
>> +        server_ptr = server_row0 + y * server_stride;
>> +
>> +        if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>> +            qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>> +            guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>> +        } else {
>> +            guest_ptr = guest_row0 + y * guest_stride;
>> +        }
>> +
>> +        for (x = offset % VNC_DIRTY_BPL(&vd->guest);
>> +             x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>> +             x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>> +             server_ptr += cmp_bytes) {
>> +            if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>> +                vd->guest.dirty[y])) {
>> +                continue;
>> +            }
>> +            if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>> +                continue;
>> +            }
>> +            memcpy(server_ptr, guest_ptr, cmp_bytes);
>> +            if (!vd->non_adaptive) {
>> +                vnc_rect_updated(vd, x, y, &tv);
>>               }
>> +            QTAILQ_FOREACH(vs, &vd->clients, next) {
>> +                set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>> +            }
>> +            has_dirty++;
>>           }
>> -        guest_row  += pixman_image_get_stride(vd->guest.fb);
>> -        server_row += pixman_image_get_stride(vd->server);
>> +
>> +        y++;
>>       }
>>       qemu_pixman_image_unref(tmpbuf);
>>       return has_dirty;
>> diff --git a/ui/vnc.h b/ui/vnc.h
>> index 4a8f33c..07e1f59 100644
>> --- a/ui/vnc.h
>> +++ b/ui/vnc.h
>> @@ -88,6 +88,10 @@ typedef void VncSendHextileTile(VncState *vs,
>>   /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
>>   #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)
>>
>> +/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than
>> + * VNC_DIRTY_BITS due to alignment */
>> +#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE)
>> +
>>   #define VNC_STAT_RECT  64
>>   #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT)
>>   #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)
>>
Wayne Xia Jan. 7, 2014, 2 a.m. UTC | #3
于 2014/1/6 21:31, Peter Lieven 写道:
> On 06.01.2014 11:08, Wenchao Xia wrote:
>> 于 2014/1/6 2:02, Peter Lieven 写道:
>>> vnc_update_client currently scans the dirty bitmap of each client
>>> bitwise which is a very costly operation if only few bits are dirty.
>>> vnc_refresh_server_surface does almost the same.
>>> this patch optimizes both by utilizing the heavily optimized
>>> function find_next_bit to find the offset of the next dirty
>>> bit in the dirty bitmaps.
>>>
>>> The following artifical test (just the bitmap operation part) running
>>> vnc_update_client 65536 times on a 2560x2048 surface illustrates the
>>> performance difference:
>>>
>>> All bits clean - vnc_update_client_new: 0.07 secs
>>>                    vnc_update_client_old: 10.98 secs
>>>
>>> All bits dirty - vnc_update_client_new: 11.26 secs
>>>                    vnc_update_client_old: 20.19 secs
>>>
>>> Few bits dirty - vnc_update_client_new: 0.08 secs
>>>                    vnc_update_client_old: 10.98 secs
>>>
>>> The case for all bits dirty is still rather slow, this
>>> is due to the implementation of find_and_clear_dirty_height.
>>> This will be addresses in a separate patch.
>>>
>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>> ---
>>>    ui/vnc.c |  154 +++++++++++++++++++++++++++++++++-----------------------------
>>>    ui/vnc.h |    4 ++
>>>    2 files changed, 87 insertions(+), 71 deletions(-)
>>>
>>> diff --git a/ui/vnc.c b/ui/vnc.c
>>> index 1d2aa1a..6a0c03e 100644
>>> --- a/ui/vnc.c
>>> +++ b/ui/vnc.c
>>> @@ -572,6 +572,14 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
>>>        ptr += x * VNC_SERVER_FB_BYTES;
>>>        return ptr;
>>>    }
>>> +/* this sets only the visible pixels of a dirty bitmap */
>>> +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
>>> +        int y;\
>>> +        memset(bitmap, 0x00, sizeof(bitmap));\
>>> +        for (y = 0; y < h; y++) {\
>>> +            bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\
>>    Will it be a problem when vnc's width % VNC_DIRTY_PIXELS_PER_BIT != 0?
>> Although it is a rare case, but I think it is better round it up, since
>> "v" and "VNC_DIRTY_PIXELS_PER_BIT" are variables. A macro computing it
>> would be nice:
> Good point, I will use DIV_ROUND_UP here.
>>
>> #define VNC_DIRTY_BITS_FROM_WIDTH(w) (w + VNC_DIRTY_PIXELS_PER_BIT - 1/
>> VNC_DIRTY_PIXELS_PER_BIT)
>> #define VNC_DIRTY_BITS (VNC_DIRTY_BITS_FROM_WIDTH(VNC_MAX_WIDTH)
>>
>> then here:
>>      bitmap_set(bitmap[y], 0, VNC_DIRTY_BITS_FROM_WIDTH(w));
>>
>> Or simply warn or coredump when v % VNC_DIRTY_PIXELS_PER_BIT != 0.
>>
>> Also, in vnc.h:
>> /* VNC_MAX_WIDTH must be a multiple of 16. */
>> #define VNC_MAX_WIDTH 2560
>> #define VNC_MAX_HEIGHT 2048
>>
>> Maybe it should be updated as:
>> /* VNC_MAX_WIDTH must be a multiple of VNC_DIRTY_PIXELS_PER_BIT. */
> correct. will fix as well.
>>
>>> +        } \
>>> +    }
>>>
>>>    static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>>                               DisplaySurface *surface)
>>> @@ -597,7 +605,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>>        qemu_pixman_image_unref(vd->guest.fb);
>>>        vd->guest.fb = pixman_image_ref(surface->image);
>>>        vd->guest.format = surface->format;
>>> -    memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty));
>>> +    VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty,
>>> +                                 surface_width(vd->ds),
>>> +                                 surface_height(vd->ds));
>>>
>>>        QTAILQ_FOREACH(vs, &vd->clients, next) {
>>>            vnc_colordepth(vs);
>>> @@ -605,7 +615,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
>>>            if (vs->vd->cursor) {
>>>                vnc_cursor_define(vs);
>>>            }
>>> -        memset(vs->dirty, 0xFF, sizeof(vs->dirty));
>>> +        VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty,
>>> +                                     surface_width(vd->ds),
>>> +                                     surface_height(vd->ds));
>>>        }
>>>    }
>>>
>>> @@ -889,10 +901,9 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>>>            VncDisplay *vd = vs->vd;
>>>            VncJob *job;
>>>            int y;
>>> -        int width, height;
>>> +        int height;
>>>            int n = 0;
>>>
>>> -
>>>            if (vs->output.offset && !vs->audio_cap && !vs->force_update)
>>>                /* kernel send buffers are full -> drop frames to throttle */
>>>                return 0;
>>> @@ -908,39 +919,27 @@ static int vnc_update_client(VncState *vs, int has_dirty)
>>>             */
>>>            job = vnc_job_new(vs);
>>>
>>> -        width = MIN(pixman_image_get_width(vd->server), vs->client_width);
>>>            height = MIN(pixman_image_get_height(vd->server), vs->client_height);
>>>
>>> -        for (y = 0; y < height; y++) {
>>> -            int x;
>>> -            int last_x = -1;
>>> -            for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
>>> -                if (test_and_clear_bit(x, vs->dirty[y])) {
>>> -                    if (last_x == -1) {
>>> -                        last_x = x;
>>> -                    }
>>> -                } else {
>>> -                    if (last_x != -1) {
>>> -                        int h = find_and_clear_dirty_height(vs, y, last_x, x,
>>> -                                                            height);
>>> -
>>> -                        n += vnc_job_add_rect(job,
>>> -                                              last_x * VNC_DIRTY_PIXELS_PER_BIT,
>>> -                                              y,
>>> -                                              (x - last_x) *
>>> -                                              VNC_DIRTY_PIXELS_PER_BIT,
>>> -                                              h);
>>> -                    }
>>> -                    last_x = -1;
>>> -                }
>>> -            }
>>> -            if (last_x != -1) {
>>> -                int h = find_and_clear_dirty_height(vs, y, last_x, x, height);
>>> -                n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT,
>>> -                                      y,
>>> -                                      (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT,
>>> -                                      h);
>>> +        y = 0;
>>> +        for (;;) {
>>> +            int x, h;
>>> +            unsigned long x2;
>>> +            unsigned long offset = find_next_bit((unsigned long *) &vs->dirty,
>>> +                                                 height * VNC_DIRTY_BPL(vs),
>>> +                                                 y * VNC_DIRTY_BPL(vs));
>>> +            if (offset == height * VNC_DIRTY_BPL(vs)) {
>>> +                /* no more dirty bits */
>>> +                break;
>>>                }
>>> +            y = offset / VNC_DIRTY_BPL(vs);
>>> +            x = offset % VNC_DIRTY_BPL(vs);
>>> +            x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y],
>>> +                                    VNC_DIRTY_BPL(vs), x);
>>> +            bitmap_clear(vs->dirty[y], x, x2 - x);
>>> +            h = find_and_clear_dirty_height(vs, y, x, x2, height);
>>> +            n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y,
>>> +                                  (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h);
>>>            }
>> Minor comments:
>>    VNC_DIRTY_BPL(vs) is accessing memory by pointer, should we use a
>> variable instead of VNC_DIRTY_BPL(vs) in every place, in case of
>> compiler didn't optimize it for us?
> I am pretty sure that sizeof is evaluated at compile time or do you have other
> evidence?

  You are right, it is sizeof((x)->dirty), I missed the bracket before.

>>
>>>            vnc_job_push(job);
>>> @@ -2678,8 +2677,8 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>>>        int width = pixman_image_get_width(vd->guest.fb);
>>>        int height = pixman_image_get_height(vd->guest.fb);
>>>        int y;
>>> -    uint8_t *guest_row;
>>> -    uint8_t *server_row;
>>> +    uint8_t *guest_row0 = NULL, *server_row0;
>>    Any reason that rename those variable?
> Its actually a pointer to row0 and not to any specific row. This is why
> I renamed it.

  I see, thanks for the explanation.

>>
>>> +    int guest_stride = 0, server_stride;
>>>        int cmp_bytes;
>>>        VncState *vs;
>>>        int has_dirty = 0;
>>> @@ -2704,44 +2703,57 @@ static int vnc_refresh_server_surface(VncDisplay *vd)
>>>        if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>>>            int width = pixman_image_get_width(vd->server);
>>>            tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
>>> -    }
>>> -    guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>>> -    server_row = (uint8_t *)pixman_image_get_data(vd->server);
>>> -    for (y = 0; y < height; y++) {
>>> -        if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) {
>>> -            int x;
>>> -            uint8_t *guest_ptr;
>>> -            uint8_t *server_ptr;
>>> -
>>> -            if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>>> -                qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>>> -                guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>>> -            } else {
>>> -                guest_ptr = guest_row;
>>> -            }
>>> -            server_ptr = server_row;
>>> +    } else {
>>> +        guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
>>> +        guest_stride = pixman_image_get_stride(vd->guest.fb);
>>> +    }
>>> +    server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
>>> +    server_stride = pixman_image_get_stride(vd->server);
>>> +
>>> +    y = 0;
>>> +    for (;;) {
>>> +        int x;
>>> +        uint8_t *guest_ptr, *server_ptr;
>>> +        unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty,
>>> +                                             height * VNC_DIRTY_BPL(&vd->guest),
>>> +                                             y * VNC_DIRTY_BPL(&vd->guest));
>>> +        if (offset == height * VNC_DIRTY_BPL(&vd->guest)) {
>>> +            /* no more dirty bits */
>>> +            break;
>>> +        }
>>> +        y = offset / VNC_DIRTY_BPL(&vd->guest);
>>>
>>> -            for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>>> -                 x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>>> -                 server_ptr += cmp_bytes) {
>>> -                if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>>> -                    vd->guest.dirty[y])) {
>>> -                    continue;
>>> -                }
>>> -                if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>>> -                    continue;
>>> -                }
>>> -                memcpy(server_ptr, guest_ptr, cmp_bytes);
>>> -                if (!vd->non_adaptive)
>>> -                    vnc_rect_updated(vd, x, y, &tv);
>>> -                QTAILQ_FOREACH(vs, &vd->clients, next) {
>>> -                    set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>>> -                }
>>> -                has_dirty++;
>>> +        server_ptr = server_row0 + y * server_stride;
>>> +
>>> +        if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
>>> +            qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
>>> +            guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
>>> +        } else {
>>> +            guest_ptr = guest_row0 + y * guest_stride;
>>> +        }
>>> +
>>> +        for (x = offset % VNC_DIRTY_BPL(&vd->guest);
>>> +             x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
>>> +             x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
>>> +             server_ptr += cmp_bytes) {
>>> +            if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
>>> +                vd->guest.dirty[y])) {
>>> +                continue;
>>> +            }
>>> +            if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
>>> +                continue;
>>> +            }
>>> +            memcpy(server_ptr, guest_ptr, cmp_bytes);
>>> +            if (!vd->non_adaptive) {
>>> +                vnc_rect_updated(vd, x, y, &tv);
>>>                }
>>> +            QTAILQ_FOREACH(vs, &vd->clients, next) {
>>> +                set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
>>> +            }
>>> +            has_dirty++;
>>>            }
>>> -        guest_row  += pixman_image_get_stride(vd->guest.fb);
>>> -        server_row += pixman_image_get_stride(vd->server);
>>> +
>>> +        y++;
>>>        }
>>>        qemu_pixman_image_unref(tmpbuf);
>>>        return has_dirty;
>>> diff --git a/ui/vnc.h b/ui/vnc.h
>>> index 4a8f33c..07e1f59 100644
>>> --- a/ui/vnc.h
>>> +++ b/ui/vnc.h
>>> @@ -88,6 +88,10 @@ typedef void VncSendHextileTile(VncState *vs,
>>>    /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
>>>    #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)
>>>
>>> +/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than
>>> + * VNC_DIRTY_BITS due to alignment */
>>> +#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE)
>>> +
>>>    #define VNC_STAT_RECT  64
>>>    #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT)
>>>    #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)
>>>
> 
> 
>
diff mbox

Patch

diff --git a/ui/vnc.c b/ui/vnc.c
index 1d2aa1a..6a0c03e 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -572,6 +572,14 @@  void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
     ptr += x * VNC_SERVER_FB_BYTES;
     return ptr;
 }
+/* this sets only the visible pixels of a dirty bitmap */
+#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
+        int y;\
+        memset(bitmap, 0x00, sizeof(bitmap));\
+        for (y = 0; y < h; y++) {\
+            bitmap_set(bitmap[y], 0, w / VNC_DIRTY_PIXELS_PER_BIT);\
+        } \
+    }
 
 static void vnc_dpy_switch(DisplayChangeListener *dcl,
                            DisplaySurface *surface)
@@ -597,7 +605,9 @@  static void vnc_dpy_switch(DisplayChangeListener *dcl,
     qemu_pixman_image_unref(vd->guest.fb);
     vd->guest.fb = pixman_image_ref(surface->image);
     vd->guest.format = surface->format;
-    memset(vd->guest.dirty, 0xFF, sizeof(vd->guest.dirty));
+    VNC_SET_VISIBLE_PIXELS_DIRTY(vd->guest.dirty,
+                                 surface_width(vd->ds),
+                                 surface_height(vd->ds));
 
     QTAILQ_FOREACH(vs, &vd->clients, next) {
         vnc_colordepth(vs);
@@ -605,7 +615,9 @@  static void vnc_dpy_switch(DisplayChangeListener *dcl,
         if (vs->vd->cursor) {
             vnc_cursor_define(vs);
         }
-        memset(vs->dirty, 0xFF, sizeof(vs->dirty));
+        VNC_SET_VISIBLE_PIXELS_DIRTY(vs->dirty,
+                                     surface_width(vd->ds),
+                                     surface_height(vd->ds));
     }
 }
 
@@ -889,10 +901,9 @@  static int vnc_update_client(VncState *vs, int has_dirty)
         VncDisplay *vd = vs->vd;
         VncJob *job;
         int y;
-        int width, height;
+        int height;
         int n = 0;
 
-
         if (vs->output.offset && !vs->audio_cap && !vs->force_update)
             /* kernel send buffers are full -> drop frames to throttle */
             return 0;
@@ -908,39 +919,27 @@  static int vnc_update_client(VncState *vs, int has_dirty)
          */
         job = vnc_job_new(vs);
 
-        width = MIN(pixman_image_get_width(vd->server), vs->client_width);
         height = MIN(pixman_image_get_height(vd->server), vs->client_height);
 
-        for (y = 0; y < height; y++) {
-            int x;
-            int last_x = -1;
-            for (x = 0; x < width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
-                if (test_and_clear_bit(x, vs->dirty[y])) {
-                    if (last_x == -1) {
-                        last_x = x;
-                    }
-                } else {
-                    if (last_x != -1) {
-                        int h = find_and_clear_dirty_height(vs, y, last_x, x,
-                                                            height);
-
-                        n += vnc_job_add_rect(job,
-                                              last_x * VNC_DIRTY_PIXELS_PER_BIT,
-                                              y,
-                                              (x - last_x) *
-                                              VNC_DIRTY_PIXELS_PER_BIT,
-                                              h);
-                    }
-                    last_x = -1;
-                }
-            }
-            if (last_x != -1) {
-                int h = find_and_clear_dirty_height(vs, y, last_x, x, height);
-                n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT,
-                                      y,
-                                      (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT,
-                                      h);
+        y = 0;
+        for (;;) {
+            int x, h;
+            unsigned long x2;
+            unsigned long offset = find_next_bit((unsigned long *) &vs->dirty,
+                                                 height * VNC_DIRTY_BPL(vs),
+                                                 y * VNC_DIRTY_BPL(vs));
+            if (offset == height * VNC_DIRTY_BPL(vs)) {
+                /* no more dirty bits */
+                break;
             }
+            y = offset / VNC_DIRTY_BPL(vs);
+            x = offset % VNC_DIRTY_BPL(vs);
+            x2 = find_next_zero_bit((unsigned long *) &vs->dirty[y],
+                                    VNC_DIRTY_BPL(vs), x);
+            bitmap_clear(vs->dirty[y], x, x2 - x);
+            h = find_and_clear_dirty_height(vs, y, x, x2, height);
+            n += vnc_job_add_rect(job, x * VNC_DIRTY_PIXELS_PER_BIT, y,
+                                  (x2 - x) * VNC_DIRTY_PIXELS_PER_BIT, h);
         }
 
         vnc_job_push(job);
@@ -2678,8 +2677,8 @@  static int vnc_refresh_server_surface(VncDisplay *vd)
     int width = pixman_image_get_width(vd->guest.fb);
     int height = pixman_image_get_height(vd->guest.fb);
     int y;
-    uint8_t *guest_row;
-    uint8_t *server_row;
+    uint8_t *guest_row0 = NULL, *server_row0;
+    int guest_stride = 0, server_stride;
     int cmp_bytes;
     VncState *vs;
     int has_dirty = 0;
@@ -2704,44 +2703,57 @@  static int vnc_refresh_server_surface(VncDisplay *vd)
     if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
         int width = pixman_image_get_width(vd->server);
         tmpbuf = qemu_pixman_linebuf_create(VNC_SERVER_FB_FORMAT, width);
-    }
-    guest_row = (uint8_t *)pixman_image_get_data(vd->guest.fb);
-    server_row = (uint8_t *)pixman_image_get_data(vd->server);
-    for (y = 0; y < height; y++) {
-        if (!bitmap_empty(vd->guest.dirty[y], VNC_DIRTY_BITS)) {
-            int x;
-            uint8_t *guest_ptr;
-            uint8_t *server_ptr;
-
-            if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
-                qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
-                guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
-            } else {
-                guest_ptr = guest_row;
-            }
-            server_ptr = server_row;
+    } else {
+        guest_row0 = (uint8_t *)pixman_image_get_data(vd->guest.fb);
+        guest_stride = pixman_image_get_stride(vd->guest.fb);
+    }
+    server_row0 = (uint8_t *)pixman_image_get_data(vd->server);
+    server_stride = pixman_image_get_stride(vd->server);
+
+    y = 0;
+    for (;;) {
+        int x;
+        uint8_t *guest_ptr, *server_ptr;
+        unsigned long offset = find_next_bit((unsigned long *) &vd->guest.dirty,
+                                             height * VNC_DIRTY_BPL(&vd->guest),
+                                             y * VNC_DIRTY_BPL(&vd->guest));
+        if (offset == height * VNC_DIRTY_BPL(&vd->guest)) {
+            /* no more dirty bits */
+            break;
+        }
+        y = offset / VNC_DIRTY_BPL(&vd->guest);
 
-            for (x = 0; x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
-                 x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
-                 server_ptr += cmp_bytes) {
-                if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
-                    vd->guest.dirty[y])) {
-                    continue;
-                }
-                if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
-                    continue;
-                }
-                memcpy(server_ptr, guest_ptr, cmp_bytes);
-                if (!vd->non_adaptive)
-                    vnc_rect_updated(vd, x, y, &tv);
-                QTAILQ_FOREACH(vs, &vd->clients, next) {
-                    set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
-                }
-                has_dirty++;
+        server_ptr = server_row0 + y * server_stride;
+
+        if (vd->guest.format != VNC_SERVER_FB_FORMAT) {
+            qemu_pixman_linebuf_fill(tmpbuf, vd->guest.fb, width, 0, y);
+            guest_ptr = (uint8_t *)pixman_image_get_data(tmpbuf);
+        } else {
+            guest_ptr = guest_row0 + y * guest_stride;
+        }
+
+        for (x = offset % VNC_DIRTY_BPL(&vd->guest);
+             x + VNC_DIRTY_PIXELS_PER_BIT - 1 < width;
+             x += VNC_DIRTY_PIXELS_PER_BIT, guest_ptr += cmp_bytes,
+             server_ptr += cmp_bytes) {
+            if (!test_and_clear_bit((x / VNC_DIRTY_PIXELS_PER_BIT),
+                vd->guest.dirty[y])) {
+                continue;
+            }
+            if (memcmp(server_ptr, guest_ptr, cmp_bytes) == 0) {
+                continue;
+            }
+            memcpy(server_ptr, guest_ptr, cmp_bytes);
+            if (!vd->non_adaptive) {
+                vnc_rect_updated(vd, x, y, &tv);
             }
+            QTAILQ_FOREACH(vs, &vd->clients, next) {
+                set_bit((x / VNC_DIRTY_PIXELS_PER_BIT), vs->dirty[y]);
+            }
+            has_dirty++;
         }
-        guest_row  += pixman_image_get_stride(vd->guest.fb);
-        server_row += pixman_image_get_stride(vd->server);
+
+        y++;
     }
     qemu_pixman_image_unref(tmpbuf);
     return has_dirty;
diff --git a/ui/vnc.h b/ui/vnc.h
index 4a8f33c..07e1f59 100644
--- a/ui/vnc.h
+++ b/ui/vnc.h
@@ -88,6 +88,10 @@  typedef void VncSendHextileTile(VncState *vs,
 /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
 #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)
 
+/* VNC_DIRTY_BPL (BPL = bits per line) might be greater than
+ * VNC_DIRTY_BITS due to alignment */
+#define VNC_DIRTY_BPL(x) (sizeof((x)->dirty) / VNC_MAX_HEIGHT * BITS_PER_BYTE)
+
 #define VNC_STAT_RECT  64
 #define VNC_STAT_COLS (VNC_MAX_WIDTH / VNC_STAT_RECT)
 #define VNC_STAT_ROWS (VNC_MAX_HEIGHT / VNC_STAT_RECT)