Encoding HFS+ names in UTF-16 rather than UCS-2.

2013-11-26 12:47:43 +01:00
parent 654ff82345
commit bc5e2227c8
3 changed files with 134 additions and 1 deletions
--- a/libisofs/hfsplus.c
+++ b/libisofs/hfsplus.c
@@ -128,7 +128,7 @@ int set_hfsplus_name(Ecma119Image *t, char *name, HFSPlusNode *node)
        return ISO_SUCCESS;
    }

-    ret = str2ucs(t->input_charset, name, &ucs_name);
+    ret = str2utf16be(t->input_charset, name, &ucs_name);
    if (ret < 0) {
        iso_msg_debug(t->image->id, "Can't convert %s", name);
        return ret;
--- a/libisofs/util.c
+++ b/libisofs/util.c
@@ -667,6 +667,123 @@ int str2ucs(const char *icharset, const char *input, uint16_t **output)
    return ISO_SUCCESS;
 }

+int str2utf16be(const char *icharset, const char *input, uint16_t **output)
+{
+    int result;
+    wchar_t *wsrc_ = NULL;
+    char *src;
+    char *ret = NULL;
+    char *ret_ = NULL;
+    struct iso_iconv_handle conv;
+    int conv_ret = 0;
+    int direct_conv = 0;
+    size_t loop_counter = 0, loop_limit = 3;
+    size_t numchars;
+    size_t outbytes;
+    size_t inbytes;
+    size_t n;
+
+    if (icharset == NULL || input == NULL || output == NULL) {
+        return ISO_NULL_POINTER;
+    }
+
+    /* 
+      Try the direct conversion.
+    */ 
+    conv_ret = iso_iconv_open(&conv, "UTF-16BE", (char *) icharset, 0);
+    if (conv_ret > 0) {
+        direct_conv = 1;
+        src = (char *) input;
+        inbytes = strlen(input);
+        loop_limit = inbytes + 3;
+        outbytes = (2 * inbytes + 1) * sizeof(uint16_t);
+        ret_ = malloc(outbytes);
+        if (ret_ == NULL)
+            return ISO_OUT_OF_MEM;
+        ret = ret_;
+    }  else {
+        /* Try via intermediate character set WCHAR_T.
+        */
+        result = str2wchar(icharset, input, &wsrc_);
+        if (result == (int) ISO_SUCCESS) {
+            src = (char *)wsrc_;
+            numchars = wcslen(wsrc_);
+
+            inbytes = numchars * sizeof(wchar_t);
+            loop_limit = inbytes + 3;
+
+            ret_ = malloc((2 * numchars+1) * sizeof(uint16_t));
+            if (ret_ == NULL)
+                return ISO_OUT_OF_MEM;
+            outbytes = 2 * numchars * sizeof(uint16_t);
+            ret = ret_;
+
+            /* initialize iconv */
+            conv_ret = iso_iconv_open(&conv, "UTF-16BE", "WCHAR_T", 0);
+            if (conv_ret <= 0) {
+                free(wsrc_);
+                free(ret_);
+            }
+        } else if (result != (int) ISO_CHARSET_CONV_ERROR)
+            return result;
+    }
+
+    if (conv_ret <= 0) {
+        return ISO_CHARSET_CONV_ERROR;
+    }
+
+    n = iso_iconv(&conv, &src, &inbytes, &ret, &outbytes, 0);
+    while (n == (size_t) -1) {
+        /* The destination buffer is too small. Stops here. */
+        if (errno == E2BIG)
+            break;
+
+        /* An incomplete multi bytes sequence was found. We 
+         * can't do anything here. That's quite unlikely. */
+        if (errno == EINVAL)
+            break;
+
+        /* The last possible error is an invalid multi bytes
+         * sequence. Just replace the character with a "_". 
+         * Probably the character doesn't exist in UCS */
+        set_ucsbe((uint16_t*) ret, '_');
+        ret += sizeof(uint16_t);
+        outbytes -= sizeof(uint16_t);
+
+        if (!outbytes)
+            break;
+
+        /* There was an error with one character but some other remain
+         * to be converted. That's probably a multibyte character.
+         * See above comment. */
+        if (direct_conv) {
+            src++;
+            inbytes--;
+        } else {
+            src += sizeof(wchar_t);
+            inbytes -= sizeof(wchar_t);
+        }
+
+        if (!inbytes)
+            break;
+
+        /* Just to appease my remorse about unclear loop ends */
+        loop_counter++;
+        if (loop_counter > loop_limit)
+            break;
+        n = iso_iconv(&conv, &src, &inbytes, &ret, &outbytes, 0);
+    }
+    iso_iconv_close(&conv, 0);
+
+    /* close the UTF-16 string */
+    set_ucsbe((uint16_t*) ret, '\0');
+    if (wsrc_ != NULL)
+        free(wsrc_);
+
+    *output = (uint16_t*)ret_;
+    return ISO_SUCCESS;
+}
+
 static int valid_d_char(char c)
 {
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c == '_');
--- a/libisofs/util.h
+++ b/libisofs/util.h
@@ -88,6 +88,22 @@ int str2ascii(const char *icharset, const char *input, char **output);
 */
 int str2ucs(const char *icharset, const char *input, uint16_t **output);

+/**
+ * Convert a given string from any input charset to UTF-16BE charset,
+ * used for HFS+ file identifiers.
+ * (UTF-16 differs from older UCS-2 by having multi word characters.)
+ * 
+ * @param icharset
+ *      Input charset. Must be supported by iconv
+ * @param input
+ *      Input string
+ * @param output
+ *      Location where the pointer to the ouput string will be stored
+ * @return
+ *      1 on success, < 0 on error
+ */
+int str2utf16be(const char *icharset, const char *input, uint16_t **output);
+
 /**
 * Create a level 1 directory identifier.
 *