Support for charset conversion from any encoding to ASCII.

This commit is contained in:
Vreixo Formoso 2007-12-14 21:42:55 +01:00
parent 115da82c9e
commit 315f1a1717
3 changed files with 161 additions and 0 deletions

View File

@ -36,5 +36,6 @@
#define ISO_FILE_IS_NOT_DIR -108
#define ISO_FILE_IS_NOT_SYMLINK -109
/* A charset conversion through iconv could not be opened or failed */
#define ISO_CHARSET_CONV_ERROR -150
#endif /*LIBISO_ERROR_H_*/

View File

@ -1,5 +1,6 @@
/*
* Copyright (c) 2007 Vreixo Formoso
* Copyright (c) 2007 Mario Danic
*
* This file is part of the libisofs project; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2 as
@ -7,6 +8,13 @@
*/
#include "util.h"
#include "error.h"
#include <stdlib.h>
#include <wchar.h>
#include <iconv.h>
#include <string.h>
#include <errno.h>
int div_up(int n, int div)
{
@ -17,3 +25,154 @@ int round_up(int n, int mul)
{
return div_up(n, mul) * mul;
}
/**
 * Convert a string in the specified charset to a wide character (WCHAR_T)
 * string.
 *
 * @param icharset
 *      iconv name of the charset the input string is encoded in.
 * @param input
 *      NUL-terminated string to convert.
 * @param output
 *      On success, set to a newly allocated, NUL-terminated wide string.
 *      It must be free()'d by the caller when no longer needed.
 * @return
 *      ISO_SUCCESS on success, < 0 on error: ISO_NULL_POINTER if any
 *      argument is NULL, ISO_MEM_ERROR if the buffer can't be allocated,
 *      ISO_CHARSET_CONV_ERROR if the conversion can't be opened or fails.
 */
static
int str2wchar(const char *icharset, const char *input, wchar_t **output)
{
    iconv_t conv;
    size_t inbytes;
    size_t outbytes;
    char *ret;
    char *src;
    wchar_t *wstr;
    size_t n;

    if (icharset == NULL || input == NULL || output == NULL) {
        return ISO_NULL_POINTER;
    }

    conv = iconv_open("WCHAR_T", icharset);
    if (conv == (iconv_t)-1) {
        return ISO_CHARSET_CONV_ERROR;
    }

    inbytes = strlen(input);

    /* we assume numchars <= inbytes; one extra slot holds the terminator */
    wstr = malloc((inbytes + 1) * sizeof(wchar_t));
    if (wstr == NULL) {
        /* don't leak the conversion descriptor */
        iconv_close(conv);
        return ISO_MEM_ERROR;
    }

    /* Only offer iconv the space before the terminator slot, so that the
     * final write below is always inside the buffer. */
    outbytes = inbytes * sizeof(wchar_t);
    ret = (char *)wstr;
    src = (char *)input; /* iconv() takes a non-const pointer */

    n = iconv(conv, &src, &inbytes, &ret, &outbytes);
    while (n == (size_t)-1) {
        if (errno != EINVAL) {
            /* error, should never occur */
            iconv_close(conv);
            free(wstr);
            return ISO_CHARSET_CONV_ERROR;
        }

        /* EINVAL: the input ends with an incomplete multibyte sequence.
         * Drop trailing bytes one at a time and retry; whatever was
         * converted so far is kept. */
        inbytes--;
        if (!inbytes) {
            break;
        }
        n = iconv(conv, &src, &inbytes, &ret, &outbytes);
    }
    iconv_close(conv);

    /* ret points just past the last wide character written by iconv */
    *((wchar_t *)ret) = '\0';
    *output = wstr;
    return ISO_SUCCESS;
}
/**
 * Convert a string in the specified charset to ASCII.
 *
 * The input is first converted to a wide character string and then to
 * ASCII. Characters that iconv refuses to convert to ASCII are replaced
 * by '_'. If the conversion stops early (E2BIG or EINVAL from iconv), the
 * part converted so far is returned.
 *
 * @param icharset
 *      iconv name of the charset the input string is encoded in.
 * @param input
 *      NUL-terminated string to convert.
 * @param output
 *      On success, set to a newly allocated, NUL-terminated string.
 *      It must be free()'d by the caller when no longer needed.
 * @return
 *      ISO_SUCCESS on success, < 0 on error: ISO_NULL_POINTER if any
 *      argument is NULL, ISO_MEM_ERROR if a buffer can't be allocated,
 *      ISO_CHARSET_CONV_ERROR if a conversion can't be opened or fails.
 */
int str2ascii(const char *icharset, const char *input, char **output)
{
    int result;
    wchar_t *wsrc_;
    char *ret;
    char *ret_;
    char *src;
    iconv_t conv;
    size_t numchars;
    size_t outbytes;
    size_t inbytes;
    size_t n;

    if (icharset == NULL || input == NULL || output == NULL) {
        return ISO_NULL_POINTER;
    }

    /* First convert the string to a wide character string, so that each
     * input character has a fixed size of sizeof(wchar_t) bytes. */
    result = str2wchar(icharset, input, &wsrc_);
    if (result < 0) {
        return result;
    }
    src = (char *)wsrc_;

    /* number of characters, not counting the terminator */
    numchars = wcslen(wsrc_);
    inbytes = numchars * sizeof(wchar_t);

    /* one output byte per character, plus the terminating NUL */
    ret_ = malloc(numchars + 1);
    if (ret_ == NULL) {
        /* don't leak the intermediate wide string */
        free(wsrc_);
        return ISO_MEM_ERROR;
    }
    outbytes = numchars;
    ret = ret_;

    /* initialize iconv */
    conv = iconv_open("ASCII", "WCHAR_T");
    if (conv == (iconv_t)-1) {
        free(wsrc_);
        free(ret_);
        return ISO_CHARSET_CONV_ERROR;
    }

    n = iconv(conv, &src, &inbytes, &ret, &outbytes);
    while (n == (size_t)-1) {
        /* The destination buffer is too small. Stop here. */
        if (errno == E2BIG) {
            break;
        }

        /* An incomplete multibyte sequence was found. We can't do
         * anything here. That's quite unlikely. */
        if (errno == EINVAL) {
            break;
        }

        /* The last possible error is an invalid multibyte sequence:
         * most likely the character doesn't exist in ASCII, like the
         * accented letters used in French. Replace it with a "_". */
        *ret++ = '_';
        outbytes--;
        if (!outbytes) {
            break;
        }

        /* Skip the character that could not be converted and go on with
         * the remaining ones, if any. */
        src += sizeof(wchar_t);
        inbytes -= sizeof(wchar_t);
        if (!inbytes) {
            break;
        }

        n = iconv(conv, &src, &inbytes, &ret, &outbytes);
    }
    iconv_close(conv);

    /* ret points just past the last byte written */
    *ret = '\0';
    free(wsrc_);

    *output = ret_;
    return ISO_SUCCESS;
}

View File

@ -13,5 +13,6 @@ extern inline int div_up(int n, int div);
extern inline int round_up(int n, int mul);
/**
 * Convert a string in the given charset to ASCII. Characters that can't
 * be converted to ASCII are replaced by '_'.
 *
 * @param icharset
 *      iconv name of the charset the input string is encoded in.
 * @param input
 *      NUL-terminated string to convert.
 * @param output
 *      On success, set to a newly allocated NUL-terminated string, that
 *      must be free()'d by the caller when no longer needed.
 * @return
 *      ISO_SUCCESS on success, < 0 on error (ISO_NULL_POINTER,
 *      ISO_MEM_ERROR or ISO_CHARSET_CONV_ERROR).
 */
int str2ascii(const char *icharset, const char *input, char **output);
#endif /*LIBISO_UTIL_H_*/