diff mbox

[08/13] vvfat: correctly create long names for non-ASCII filenames

Message ID 20170515203114.9477-9-hpoussin@reactos.org
State New
Headers show

Commit Message

Hervé Poussineau May 15, 2017, 8:31 p.m. UTC
Assume that the input filename is encoded as UTF-8, and convert it to a correct UTF-16 encoding.
Reuse the long_file_name structure to return the generated long name to the caller.
It will be used in the next commit to transform the long file name into a short file name.

Reference: http://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
---
 block/vvfat.c | 132 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 97 insertions(+), 35 deletions(-)

Comments

Kevin Wolf May 16, 2017, 3:33 p.m. UTC | #1
Am 15.05.2017 um 22:31 hat Hervé Poussineau geschrieben:
> Assume that input filename is encoded as UTF-8, so correctly create UTF-16 encoding.
> Reuse long_file_name structure to give back to caller the generated long name.
> It will be used in next commit to transform the long file name into short file name.
> 
> Reference: http://stackoverflow.com/questions/7153935/how-to-convert-utf-8-stdstring-to-utf-16-stdwstring
> Signed-off-by: Hervé Poussineau <hpoussin@reactos.org>
> ---
>  block/vvfat.c | 132 ++++++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 97 insertions(+), 35 deletions(-)
> 
> diff --git a/block/vvfat.c b/block/vvfat.c
> index 7da07068b8..5f6356c834 100644
> --- a/block/vvfat.c
> +++ b/block/vvfat.c
> @@ -357,6 +357,23 @@ typedef struct BDRVVVFATState {
>      Error *migration_blocker;
>  } BDRVVVFATState;
>  
> +typedef struct {
> +    /*
> +     * Since the sequence number is at most 0x3f, and the filename
> +     * length is at most 13 times the sequence number, the maximal
> +     * filename length is 0x3f * 13 bytes.
> +     */
> +    unsigned char name[0x3f * 13 + 1];
> +    int checksum, len;
> +    int sequence_number;
> +} long_file_name;
> +
> +static void lfn_init(long_file_name *lfn)
> +{
> +   lfn->sequence_number = lfn->len = 0;
> +   lfn->checksum = 0x100;
> +}
> +
>  /* take the sector position spos and convert it to Cylinder/Head/Sector position
>   * if the position is outside the specified geometry, fill maximum value for CHS
>   * and return 1 to signal overflow.
> @@ -418,29 +435,90 @@ static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs)
>  
>  /* direntry functions */
>  
> -/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */
> -static inline int short2long_name(char* dest,const char* src)
> -{
> -    int i;
> -    int len;
> -    for(i=0;i<129 && src[i];i++) {
> -        dest[2*i]=src[i];
> -        dest[2*i+1]=0;
> +/* fills lfn with UTF-16 representation of src filename */
> +/* return true if src is valid UTF-8 string, false otherwise */
> +static bool filename2long_name(long_file_name *lfn, const char* src)
> +{
> +    uint8_t *dest = lfn->name;
> +    int i = 0, j;
> +    int len = 0;
> +    while (src[i]) {
> +        uint32_t uni = 0;
> +        size_t todo;
> +        uint8_t ch = src[i++];
> +        if (ch <= 0x7f) {
> +            uni = ch;
> +            todo = 0;
> +        } else if (ch <= 0xbf) {
> +            return false;
> +        } else if (ch <= 0xdf) {
> +            uni = ch & 0x1f;
> +            todo = 1;
> +        } else if (ch <= 0xef) {
> +            uni = ch & 0x0f;
> +            todo = 2;
> +        } else if (ch <= 0xf7) {
> +            uni = ch & 0x07;
> +            todo = 3;
> +        } else {
> +            return false;
> +        }
> +        for (j = 0; j < todo; j++) {
> +            uint8_t ch;
> +            if (src[i] == '\0') {
> +                return false;
> +            }
> +            ch = src[i++];
> +            if (ch < 0x80 || ch >= 0xbf) {
> +                return false;
> +            }
> +            uni <<= 6;
> +            uni += ch & 0x3f;
> +        }

I'm not sure if we really want to add an ad-hoc UTF-8 parser here...
Shouldn't we be using something like g_utf8_get_char() instead?

> +        if (uni >= 0xd800 && uni <= 0xdfff) {
> +            return false;
> +        } else if (uni >= 0x10ffff) {
> +            return false;
> +        }
> +        if (uni <= 0xffff) {
> +            dest[len++] = uni & 0xff;
> +            dest[len++] = uni >> 8;
> +        } else {
> +            uint16_t w;
> +            uni -= 0x10000;
> +            w = (uni >> 10) + 0xd800;
> +            dest[len++] = w & 0xff;
> +            dest[len++] = w >> 8;
> +            w = (uni & 0x3ff) + 0xdc00;
> +            dest[len++] = w & 0xff;
> +            dest[len++] = w >> 8;
> +        }

Who guarantees that src was short enough that we don't overrun the
buffer in lfn->name?

> +    }
> +    dest[len++] = 0;
> +    dest[len++] = 0;
> +    while (len % 26 != 0) {
> +        dest[len++] = 0xff;
>      }
> -    len=2*i;
> -    dest[2*i]=dest[2*i+1]=0;
> -    for(i=2*i+2;(i%26);i++)
> -        dest[i]=0xff;
> -    return len;
> +    lfn->len = len;
> +    return true;
>  }

Kevin
diff mbox

Patch

diff --git a/block/vvfat.c b/block/vvfat.c
index 7da07068b8..5f6356c834 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -357,6 +357,23 @@  typedef struct BDRVVVFATState {
     Error *migration_blocker;
 } BDRVVVFATState;
 
+typedef struct {
+    /*
+     * Since the sequence number is at most 0x3f, and the filename
+     * length is at most 13 times the sequence number, the maximal
+     * filename length is 0x3f * 13 bytes.
+     */
+    unsigned char name[0x3f * 13 + 1];
+    int checksum, len;
+    int sequence_number;
+} long_file_name;
+
+static void lfn_init(long_file_name *lfn)
+{
+   lfn->sequence_number = lfn->len = 0;
+   lfn->checksum = 0x100;
+}
+
 /* take the sector position spos and convert it to Cylinder/Head/Sector position
  * if the position is outside the specified geometry, fill maximum value for CHS
  * and return 1 to signal overflow.
@@ -418,29 +435,90 @@  static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs)
 
 /* direntry functions */
 
-/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */
-static inline int short2long_name(char* dest,const char* src)
-{
-    int i;
-    int len;
-    for(i=0;i<129 && src[i];i++) {
-        dest[2*i]=src[i];
-        dest[2*i+1]=0;
+/* fills lfn with UTF-16 representation of src filename */
+/* return true if src is valid UTF-8 string, false otherwise */
+static bool filename2long_name(long_file_name *lfn, const char* src)
+{
+    uint8_t *dest = lfn->name;
+    int i = 0, j;
+    int len = 0;
+    while (src[i]) {
+        uint32_t uni = 0;
+        size_t todo;
+        uint8_t ch = src[i++];
+        if (ch <= 0x7f) {
+            uni = ch;
+            todo = 0;
+        } else if (ch <= 0xbf) {
+            return false;
+        } else if (ch <= 0xdf) {
+            uni = ch & 0x1f;
+            todo = 1;
+        } else if (ch <= 0xef) {
+            uni = ch & 0x0f;
+            todo = 2;
+        } else if (ch <= 0xf7) {
+            uni = ch & 0x07;
+            todo = 3;
+        } else {
+            return false;
+        }
+        for (j = 0; j < todo; j++) {
+            uint8_t ch;
+            if (src[i] == '\0') {
+                return false;
+            }
+            ch = src[i++];
+            if (ch < 0x80 || ch >= 0xbf) {
+                return false;
+            }
+            uni <<= 6;
+            uni += ch & 0x3f;
+        }
+        if (uni >= 0xd800 && uni <= 0xdfff) {
+            return false;
+        } else if (uni >= 0x10ffff) {
+            return false;
+        }
+        if (uni <= 0xffff) {
+            dest[len++] = uni & 0xff;
+            dest[len++] = uni >> 8;
+        } else {
+            uint16_t w;
+            uni -= 0x10000;
+            w = (uni >> 10) + 0xd800;
+            dest[len++] = w & 0xff;
+            dest[len++] = w >> 8;
+            w = (uni & 0x3ff) + 0xdc00;
+            dest[len++] = w & 0xff;
+            dest[len++] = w >> 8;
+        }
+    }
+    dest[len++] = 0;
+    dest[len++] = 0;
+    while (len % 26 != 0) {
+        dest[len++] = 0xff;
     }
-    len=2*i;
-    dest[2*i]=dest[2*i+1]=0;
-    for(i=2*i+2;(i%26);i++)
-        dest[i]=0xff;
-    return len;
+    lfn->len = len;
+    return true;
 }
 
-static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename)
+static direntry_t *create_long_filename(BDRVVVFATState *s, const char *filename,
+                                        long_file_name *lfn)
 {
-    char buffer[258];
-    int length=short2long_name(buffer,filename),
-        number_of_entries=(length+25)/26,i;
+    uint8_t *buffer;
+    int length, number_of_entries, i;
     direntry_t* entry;
 
+    lfn_init(lfn);
+    if (!filename2long_name(lfn, filename)) {
+        fprintf(stderr, "vvfat: invalid UTF-8 name: %s\n", filename);
+        return NULL;
+    }
+    buffer = lfn->name;
+    length = lfn->len;
+    number_of_entries = (length + 25) / 26;
+
     for(i=0;i<number_of_entries;i++) {
         entry=array_get_next(&(s->directory));
         entry->attributes=0xf;
@@ -612,6 +690,7 @@  static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
     int i,j,long_index=s->directory.next;
     direntry_t* entry = NULL;
     direntry_t* entry_long = NULL;
+    long_file_name lfn;
 
     if(is_dot) {
         entry=array_get_next(&(s->directory));
@@ -620,7 +699,7 @@  static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
         return entry;
     }
 
-    entry_long=create_long_filename(s,filename);
+    entry_long = create_long_filename(s, filename, &lfn);
 
     i = strlen(filename);
     for(j = i - 1; j>0  && filename[j]!='.';j--);
@@ -1575,23 +1654,6 @@  static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path)
     commit->action = ACTION_MKDIR;
 }
 
-typedef struct {
-    /*
-     * Since the sequence number is at most 0x3f, and the filename
-     * length is at most 13 times the sequence number, the maximal
-     * filename length is 0x3f * 13 bytes.
-     */
-    unsigned char name[0x3f * 13 + 1];
-    int checksum, len;
-    int sequence_number;
-} long_file_name;
-
-static void lfn_init(long_file_name* lfn)
-{
-   lfn->sequence_number = lfn->len = 0;
-   lfn->checksum = 0x100;
-}
-
 /* return 0 if parsed successfully, > 0 if no long name, < 0 if error */
 static int parse_long_name(long_file_name* lfn,
         const direntry_t* direntry)