From 88555bd0597224414f9e605d675d5c05cfde5579 Mon Sep 17 00:00:00 2001 From: Thomas Schmitt Date: Tue, 17 Dec 2013 21:45:52 +0100 Subject: [PATCH] New API call iso_write_opts_set_joliet_utf16() and ability to read Joliet names as UTF-16BE --- ChangeLog | 5 ++++ libisofs/ecma119.c | 30 +++++++++++++++++++++++ libisofs/ecma119.h | 15 ++++++++++++ libisofs/fs_image.c | 11 +++++---- libisofs/joliet.c | 55 +++++++++++++++++++++++++++++++++++-------- libisofs/libisofs.h | 13 ++++++++++ libisofs/libisofs.ver | 1 + libisofs/messages.c | 2 ++ libisofs/util.c | 22 +++++++++++++++-- libisofs/util.h | 6 +++++ 10 files changed, 144 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9f8a50b..0b21a2c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +bzr branch lp:libisofs/for-libisoburn (to become libisofs-1.3.6.tar.gz) +=============================================================================== +* New API call iso_write_opts_set_joliet_utf16() and ability to read Joliet + names as UTF-16BE + libisofs-1.3.4.tar.gz Thu Dec 12 2013 =============================================================================== * Giving sort weight 2 as default to El Torito boot images diff --git a/libisofs/ecma119.c b/libisofs/ecma119.c index 15995bc..d1d3d8d 100644 --- a/libisofs/ecma119.c +++ b/libisofs/ecma119.c @@ -1510,6 +1510,21 @@ ex:; } +static +void issue_write_warning_summary(Ecma119Image *target) +{ + if (target->joliet_ucs2_failures > ISO_JOLIET_UCS2_WARN_MAX) { + iso_msg_submit(-1, ISO_NAME_NOT_UCS2, 0, + "More filenames found which were not suitable for Joliet character set UCS-2"); + } + if (target->joliet_ucs2_failures > 0) { + iso_msg_submit(-1, ISO_NAME_NOT_UCS2, 0, + "Sum of filenames not suitable for Joliet character set UCS-2: %.f", + (double) target->joliet_ucs2_failures); + } +} + + static void *write_function(void *arg) { @@ -1566,6 +1581,8 @@ void *write_function(void *arg) if (res <= 0) goto write_error; + issue_write_warning_summary(target); 
+ target->image->generator_is_running = 0; /* Give up reference claim made in ecma119_image_new(). @@ -1783,6 +1800,7 @@ int ecma119_image_new(IsoImage *src, IsoWriteOpts *opts, Ecma119Image **img) target->relaxed_vol_atts = opts->relaxed_vol_atts; target->joliet_longer_paths = opts->joliet_longer_paths; target->joliet_long_names = opts->joliet_long_names; + target->joliet_utf16 = opts->joliet_utf16; target->rrip_version_1_10 = opts->rrip_version_1_10; target->rrip_1_10_px_ino = opts->rrip_1_10_px_ino; target->aaip_susp_1_10 = opts->aaip_susp_1_10; @@ -2007,6 +2025,8 @@ int ecma119_image_new(IsoImage *src, IsoWriteOpts *opts, Ecma119Image **img) target->filesrc_start = 0; target->filesrc_blocks = 0; + target->joliet_ucs2_failures = 0; + /* * 2. Based on those options, create needed writers: iso, joliet... * Each writer inits its structures and stores needed info into @@ -2758,6 +2778,7 @@ int iso_write_opts_new(IsoWriteOpts **opts, int profile) wopts->fat = 0; wopts->fifo_size = 1024; /* 2 MB buffer */ wopts->sort_files = 1; /* file sorting is always good */ + wopts->joliet_utf16 = 0; wopts->rr_reloc_dir = NULL; wopts->rr_reloc_flags = 0; wopts->system_area_data = NULL; @@ -3035,6 +3056,15 @@ int iso_write_opts_set_joliet_long_names(IsoWriteOpts *opts, int allow) return ISO_SUCCESS; } +int iso_write_opts_set_joliet_utf16(IsoWriteOpts *opts, int allow) +{ + if (opts == NULL) { + return ISO_NULL_POINTER; + } + opts->joliet_utf16 = allow ? 1 : 0; + return ISO_SUCCESS; +} + int iso_write_opts_set_rrip_version_1_10(IsoWriteOpts *opts, int oldvers) { if (opts == NULL) { diff --git a/libisofs/ecma119.h b/libisofs/ecma119.h index a1279b7..13b146f 100644 --- a/libisofs/ecma119.h +++ b/libisofs/ecma119.h @@ -87,6 +87,12 @@ #define ISO_GPT_ENTRIES_MAX 248 +/* How many warnings to issue about writing Joliet names which cannot be + properly represented in UCS-2 and thus had to be defaulted to '_'. 
+*/ +#define ISO_JOLIET_UCS2_WARN_MAX 3 + + /** * Holds the options for the image generation. */ @@ -192,6 +198,11 @@ struct iso_write_opts { */ unsigned int joliet_long_names :1; + /** + * Use UTF-16BE rather than its subset UCS-2 + */ + unsigned int joliet_utf16 :1; + /** * Write Rock Ridge info as of specification RRIP-1.10 rather than * RRIP-1.12: signature "RRIP_1991A" rather than "IEEE_1282", @@ -540,6 +551,9 @@ struct ecma119_image /** Allow Joliet names up to 103 characters rather than 64 */ unsigned int joliet_long_names :1; + /** Use UTF-16BE rather than its subset UCS-2 */ + unsigned int joliet_utf16 :1; + /** Write old fashioned RRIP-1.10 rather than RRIP-1.12 */ unsigned int rrip_version_1_10 :1; @@ -642,6 +656,7 @@ struct ecma119_image uint32_t joliet_path_table_size; uint32_t joliet_l_path_table_pos; uint32_t joliet_m_path_table_pos; + size_t joliet_ucs2_failures; /* * HFS+ related information diff --git a/libisofs/fs_image.c b/libisofs/fs_image.c index 79b3cba..1cc2968 100644 --- a/libisofs/fs_image.c +++ b/libisofs/fs_image.c @@ -1252,7 +1252,7 @@ char *get_name(_ImageFsData *fsdata, const char *str, size_t len) return name; } else { ret = iso_msg_submit(fsdata->msgid, ISO_FILENAME_WRONG_CHARSET, ret, - "Charset conversion error. Cannot convert from %s to %s", + "Cannot convert from charset %s to %s", fsdata->input_charset, fsdata->local_charset); if (ret < 0) { return NULL; /* aborted */ @@ -1751,7 +1751,7 @@ if (name != NULL && !namecont) { LIBISO_FREE_MEM(msg); LIBISO_ALLOC_MEM(msg, char, 160); sprintf(msg, - "Charset conversion error. 
Cannot convert from %.40s to %.40s", + "Cannot convert from charset %.40s to %.40s", fsdata->input_charset, fsdata->local_charset); ret = iso_rr_msg_submit(fsdata, 17, ISO_FILENAME_WRONG_CHARSET, ret, msg); @@ -2906,7 +2906,10 @@ int iso_image_filesystem_new(IsoDataSource *src, struct iso_read_opts *opts, if (!opts->nojoliet && opts->preferjoliet && data->joliet) { /* if user prefers joliet, that is used */ iso_msg_debug(data->msgid, "Reading Joliet extensions."); - data->input_charset = strdup("UCS-2BE"); + /* Although Joliet prescribes UCS-2BE, interpret names by its + superset UTF-16BE in order to avoid conversion failures. + */ + data->input_charset = strdup("UTF-16BE"); data->rr = RR_EXT_NO; data->iso_root_block = data->svd_root_block; } else { @@ -2919,7 +2922,7 @@ int iso_image_filesystem_new(IsoDataSource *src, struct iso_read_opts *opts, if (!opts->nojoliet && data->joliet) { /* joliet will be used */ iso_msg_debug(data->msgid, "Reading Joliet extensions."); - data->input_charset = strdup("UCS-2BE"); + data->input_charset = strdup("UTF-16BE"); data->iso_root_block = data->svd_root_block; } else if (!opts->noiso1999 && data->iso1999) { /* we will read ISO 9660:1999 */ diff --git a/libisofs/joliet.c b/libisofs/joliet.c index 9880459..da8eef8 100644 --- a/libisofs/joliet.c +++ b/libisofs/joliet.c @@ -31,19 +31,41 @@ static int get_joliet_name(Ecma119Image *t, IsoNode *iso, uint16_t **name) { - int ret; - uint16_t *ucs_name; + int ret = ISO_SUCCESS; + uint16_t *ucs_name = NULL, *utf16_name = NULL; uint16_t *jname = NULL; if (iso->name == NULL) { /* it is not necessarily an error, it can be the root */ + *name = NULL; return ISO_SUCCESS; } - ret = str2ucs(t->input_charset, iso->name, &ucs_name); - if (ret < 0) { - iso_msg_debug(t->image->id, "Can't convert %s", iso->name); - return ret; + if (t->joliet_utf16) { + ret = str2utf16be(t->input_charset, iso->name, &ucs_name); + if (ret < 0) { + iso_msg_debug(t->image->id, "Cannot convert to UTF-16 : \"%s\"", + 
iso->name); + goto ex; + } + } else { + ret = str2ucs(t->input_charset, iso->name, &ucs_name); + if (ret < 0) { + iso_msg_debug(t->image->id, "Cannot convert to UCS-2 : \"%s\"", + iso->name); + goto ex; + } + ret = str2utf16be(t->input_charset, iso->name, &utf16_name); + if (ret == ISO_SUCCESS) { + if (ucscmp(ucs_name, utf16_name) != 0) { + t->joliet_ucs2_failures++; + if (t->joliet_ucs2_failures <= ISO_JOLIET_UCS2_WARN_MAX) { + iso_msg_submit(t->image->id, ISO_NAME_NOT_UCS2, 0, + "Filename not suitable for Joliet character set UCS-2 : \"%s\"", + iso->name); + } + } + } } if (iso->type == LIBISO_DIR) { jname = iso_j_dir_id(ucs_name, t->joliet_long_names << 1); @@ -51,8 +73,17 @@ int get_joliet_name(Ecma119Image *t, IsoNode *iso, uint16_t **name) jname = iso_j_file_id(ucs_name, (t->joliet_long_names << 1) | !!(t->no_force_dots & 2)); } - free(ucs_name); - if (jname != NULL) { + ret = ISO_SUCCESS; +ex:; + if (ucs_name != NULL) + free(ucs_name); + if (utf16_name != NULL) + free(utf16_name); + if (ret != ISO_SUCCESS) { + if (jname != NULL) + free(jname); + return ret; + } else if (jname != NULL) { *name = jname; return ISO_SUCCESS; } else { @@ -828,18 +859,22 @@ void ucsncpy_pad(uint16_t *dest, const uint16_t *src, size_t max) csrc = (char*)src; if (src != NULL) { - len = MIN(ucslen(src) * 2, max); + len = MIN(ucslen(src) * 2, max - (max % 2)); } else { len = 0; } for (i = 0; i < len; ++i) cdest[i] = csrc[i]; + if (len >= 2) + iso_handle_split_utf16(dest + (len / 2 - 1)); - for (i = len; i < max; i += 2) { + for (i = len; i + 1 < max; i += 2) { cdest[i] = '\0'; cdest[i + 1] = ' '; } + if (max % 2) + cdest[max - 1] = 0; } int joliet_writer_write_vol_desc(IsoImageWriter *writer) diff --git a/libisofs/libisofs.h b/libisofs/libisofs.h index da1b004..497bc5b 100644 --- a/libisofs/libisofs.h +++ b/libisofs/libisofs.h @@ -1771,6 +1771,16 @@ int iso_write_opts_set_joliet_longer_paths(IsoWriteOpts *opts, int allow); */ int iso_write_opts_set_joliet_long_names(IsoWriteOpts 
*opts, int allow); +/** + * Use character set UTF-16BE with Joliet, which is a superset of the + * actually prescribed character set UCS-2. + * This breaks Joliet specification with exotic characters which would + * elsewise be mapped to underscore '_'. Use with caution. + * + * @since 1.3.6 + */ +int iso_write_opts_set_joliet_utf16(IsoWriteOpts *opts, int allow); + /** * Write Rock Ridge info as of specification RRIP-1.10 rather than RRIP-1.12: * signature "RRIP_1991A" rather than "IEEE_1282", field PX without file @@ -7544,6 +7554,9 @@ int iso_image_hfsplus_get_blessed(IsoImage *img, IsoNode ***blessed_nodes, /** Unrecognized file type in ISO image (FAILURE, HIGH, -396) */ #define ISO_BAD_ISO_FILETYPE 0xE830FE74 +/** Filename not suitable for character set UCS-2 (WARNING, HIGH, -397) */ +#define ISO_NAME_NOT_UCS2 0xD030FE73 + /* Internal developer note: Place new error codes directly above this comment. diff --git a/libisofs/libisofs.ver b/libisofs/libisofs.ver index 78a7ccf..79bc576 100644 --- a/libisofs/libisofs.ver +++ b/libisofs/libisofs.ver @@ -303,6 +303,7 @@ iso_write_opts_set_iso_level; iso_write_opts_set_joliet; iso_write_opts_set_joliet_long_names; iso_write_opts_set_joliet_longer_paths; +iso_write_opts_set_joliet_utf16; iso_write_opts_set_max_37_char_filenames; iso_write_opts_set_ms_block; iso_write_opts_set_no_force_dots; diff --git a/libisofs/messages.c b/libisofs/messages.c index 81953ab..b04e655 100644 --- a/libisofs/messages.c +++ b/libisofs/messages.c @@ -503,6 +503,8 @@ const char *iso_error_to_msg(int errcode) return "Too many chained symbolic links"; case ISO_BAD_ISO_FILETYPE: return "Unrecognized file type in ISO image"; + case ISO_NAME_NOT_UCS2: + return "Filename not suitable for character set UCS-2"; default: return "Unknown error"; } diff --git a/libisofs/util.c b/libisofs/util.c index 303c1f7..0250d40 100644 --- a/libisofs/util.c +++ b/libisofs/util.c @@ -1197,7 +1197,7 @@ uint16_t *iso_j_file_id(const uint16_t *src, int flag) { uint16_t 
*dot, *retval = NULL; size_t lname, lext, lnname, lnext, pos, i, maxchar = 64; - uint16_t *dest = NULL; + uint16_t *dest = NULL, c; LIBISO_ALLOC_MEM_VOID(dest, uint16_t, LIBISO_JOLIET_NAME_MAX); /* was: 66 = 64 (name + ext) + 1 (.) + 1 (\0) */ @@ -1237,7 +1237,7 @@ uint16_t *iso_j_file_id(const uint16_t *src, int flag) /* Convert up to lnname characters of the filename. */ for (i = 0; i < lnname; i++) { - uint16_t c = src[i]; + c = src[i]; if (valid_j_char(c)) { dest[pos++] = c; } else { @@ -1245,6 +1245,7 @@ uint16_t *iso_j_file_id(const uint16_t *src, int flag) pos++; } } + if (pos > 0) iso_handle_split_utf16(dest + (pos - 1)); /* pos is size_t: pos - 1 would wrap at 0 */ if ((flag & 1) && lnext <= 0) goto is_done; @@ -1262,6 +1263,7 @@ uint16_t *iso_j_file_id(const uint16_t *src, int flag) pos++; } } + if (pos > 0) iso_handle_split_utf16(dest + (pos - 1)); is_done:; set_ucsbe(dest + pos, '\0'); @@ -1298,6 +1300,7 @@ uint16_t *iso_j_dir_id(const uint16_t *src, int flag) set_ucsbe(dest + i, '_'); } } + if (len > 0) iso_handle_split_utf16(dest + (len - 1)); /* never inspect the word before dest */ set_ucsbe(dest + len, '\0'); retval = ucsdup(dest); ex: @@ -1379,6 +1382,8 @@ uint16_t *ucsncpy(uint16_t *dest, const uint16_t *src, size_t n) { n = MIN(n, ucslen(src) + 1); memcpy(dest, src, n*2); + if (n >= 2) + iso_handle_split_utf16(dest + (n - 2)); return dest; } @@ -2209,3 +2214,16 @@ uint16_t iso_htons(uint16_t v) return ret; } + +/* If utf_word is the first word of a UTF-16 surrogate pair, i.e. its first byte is 0xd8 to 0xdb : Change it to UTF-16 '_'. + (UCS-2 is promised to reserve 0xd800 to 0xdbff for UTF-16). utf_word must point inside the caller's buffer. +*/ +void iso_handle_split_utf16(uint16_t *utf_word) +{ + unsigned char *hb; + + hb = (unsigned char *) utf_word; + if ((hb[0] & 0xfc) == 0xd8) + set_ucsbe(utf_word, '_'); +} + diff --git a/libisofs/util.h b/libisofs/util.h index c2f75c3..2e9321d 100644 --- a/libisofs/util.h +++ b/libisofs/util.h @@ -236,6 +236,12 @@ uint16_t *ucscpy(uint16_t *dest, const uint16_t *src); */ uint16_t *ucsncpy(uint16_t *dest, const uint16_t *src, size_t n); +/** + * Check whether utf_word is the first surrogate word of a pair. 
+ * If so, change it to UTF-16 character '_'. + */ +void iso_handle_split_utf16(uint16_t *utf_word); + /** * Convert a given input string to d-chars. * @return