From 315f1a1717d4654e742ce77cdbfcbd4a61a720f0 Mon Sep 17 00:00:00 2001 From: Vreixo Formoso Date: Fri, 14 Dec 2007 21:42:55 +0100 Subject: [PATCH] Support for charset conversion from any encoding to ASCII. --- src/error.h | 1 + src/util.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/util.h | 1 + 3 files changed, 161 insertions(+) diff --git a/src/error.h b/src/error.h index e5a6ca8..ad1b406 100644 --- a/src/error.h +++ b/src/error.h @@ -36,5 +36,6 @@ #define ISO_FILE_IS_NOT_DIR -108 #define ISO_FILE_IS_NOT_SYMLINK -109 +#define ISO_CHARSET_CONV_ERROR -150 #endif /*LIBISO_ERROR_H_*/ diff --git a/src/util.c b/src/util.c index 7bf4d63..b7ab0a4 100644 --- a/src/util.c +++ b/src/util.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2007 Vreixo Formoso + * Copyright (c) 2007 Mario Danic * * This file is part of the libisofs project; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 as @@ -7,6 +8,13 @@ */ #include "util.h" +#include "error.h" + +#include +#include +#include +#include +#include int div_up(int n, int div) { @@ -17,3 +25,154 @@ int round_up(int n, int mul) { return div_up(n, mul) * mul; } + +/** + * Convert a str in a specified codeset to WCHAR_T. + * The result must be free() when no more needed + * + * @return + * 1 success, < 0 error + */ +static +int str2wchar(const char *icharset, const char *input, wchar_t **output) +{ + iconv_t conv; + size_t inbytes; + size_t outbytes; + char *ret; + char *src; + wchar_t *wstr; + size_t n; + + if (icharset == NULL || input == NULL || output == NULL) { + return ISO_NULL_POINTER; + } + + conv = iconv_open("WCHAR_T", icharset); + if (conv == (iconv_t)-1) { + return ISO_CHARSET_CONV_ERROR; + } + + inbytes = strlen(input); + outbytes = (inbytes + 1) * sizeof(wchar_t); + + /* we are sure that numchars <= inbytes */ + wstr = malloc(outbytes); + if (wstr == NULL) { + return ISO_MEM_ERROR; + } + ret = (char *)wstr; + src = (char *)input; + + n = iconv(conv, &src, &inbytes, &ret, &outbytes); + while (n == -1) { + + if( errno != EINVAL ) { + /* error, should never occur */ + iconv_close(conv); + free(wstr); + return ISO_CHARSET_CONV_ERROR; + } + + /* invalid input string charset, just ignore */ + /* printf("String %s is not encoded in %s\n", str, codeset); */ + inbytes--; + + if (!inbytes) + break; + n = iconv(conv, &src, &inbytes, &ret, &outbytes); + } + iconv_close(conv); + + *( (wchar_t *)ret )='\0'; + *output = wstr; + return ISO_SUCCESS; +} + +int str2ascii(const char *icharset, const char *input, char **output) +{ + int result; + wchar_t *wsrc_; + char *ret; + char *ret_; + char *src; + iconv_t conv; + size_t numchars; + size_t outbytes; + size_t inbytes; + size_t n; + + if (icharset == NULL || input == NULL || output == NULL) { + return ISO_NULL_POINTER; + } + + /* convert the string to a wide character string. Note: outbytes + * is in fact the number of characters in the string and doesn't + * include the last NULL character. + */ + result = str2wchar(icharset, input, &wsrc_); + if (result < 0) { + return result; + } + src = (char *)wsrc_; + numchars = wcslen(wsrc_); + + inbytes = numchars * sizeof(wchar_t); + + ret_ = malloc(numchars + 1); + if (ret_ == NULL) { + return ISO_MEM_ERROR; + } + outbytes = numchars; + ret = ret_; + + /* initialize iconv */ + conv = iconv_open("ASCII", "WCHAR_T"); + if (conv == (iconv_t)-1) { + free(wsrc_); + free(ret_); + return ISO_CHARSET_CONV_ERROR; + } + + n = iconv(conv, &src, &inbytes, &ret, &outbytes); + while(n == -1) { + /* The destination buffer is too small. Stops here. */ + if (errno == E2BIG) + break; + + /* An incomplete multi bytes sequence was found. We + * can't do anything here. That's quite unlikely. */ + if (errno == EINVAL) + break; + + /* The last possible error is an invalid multi bytes + * sequence. Just replace the character with a "_". + * Probably the character doesn't exist in ascii like + * "é, è, à, ç, ..." in French. */ + *ret++ = '_'; + outbytes--; + + if (!outbytes) + break; + + /* There was an error with one character but some other remain + * to be converted. That's probably a multibyte character. + * See above comment. */ + src += sizeof(wchar_t); + inbytes -= sizeof(wchar_t); + + if (!inbytes) + break; + + n = iconv(conv, &src, &inbytes, &ret, &outbytes); + } + + iconv_close(conv); + + *ret='\0'; + free(wsrc_); + + *output = ret_; + return ISO_SUCCESS; +} + diff --git a/src/util.h b/src/util.h index 9a1ebe2..9c77a71 100644 --- a/src/util.h +++ b/src/util.h @@ -13,5 +13,6 @@ extern inline int div_up(int n, int div); extern inline int round_up(int n, int mul); +int str2ascii(const char *icharset, const char *input, char **output); #endif /*LIBISO_UTIL_H_*/