| /* Character set conversion with error handling and autodetection. |
| Copyright (C) 2002, 2005, 2007, 2009-2020 Free Software Foundation, Inc. |
| Written by Bruno Haible. |
| |
| This program is free software: you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or |
| (at your option) any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
| |
| #include <config.h> |
| |
| /* Specification. */ |
| #include "striconveha.h" |
| |
| #include <errno.h> |
| #include <stdlib.h> |
| #include <string.h> |
| |
| #include "malloca.h" |
| #include "c-strcase.h" |
| #include "striconveh.h" |
| |
/* Number of elements of the array A.  A must be a real array object, not a
   pointer; the argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
| |
| |
| /* Autodetection list. */ |
| |
/* One node of the singly linked list of autodetection aliases.  */
struct autodetect_alias
{
  /* Following node of the list, or NULL for the last node.  */
  struct autodetect_alias *next;
  /* The alias itself; it is compared against the caller's from_codeset.  */
  const char *name;
  /* NULL-terminated array of encoding names, attempted in array order.  */
  const char * const *encodings_to_try;
};
| |
/* Candidates for "autodetect_utf8".
   UTF-8 goes first: hardly any ISO-8859-1 input happens to be valid UTF-8,
   whereas a lot of UTF-8 input is also valid ISO-8859-1.  */
static const char * const autodetect_utf8_try[] =
{
  "UTF-8",
  "ISO-8859-1",
  NULL
};
/* Candidates for "autodetect_jp".
   The 7-bit encoding comes first; it fails as soon as the input contains
   a byte >= 0x80.
   EUC-JP is next.  Short SHIFT_JIS inputs may then come out wrong, which
   is unavoidable, and people will blame SHIFT_JIS for it.  With SHIFT_JIS
   ahead of EUC-JP, some short EUC-JP inputs would come out wrong instead,
   and the blame would fall on EUC-JP and Unix, which would not be good.
   SHIFT_JIS is therefore the last resort.  */
static const char * const autodetect_jp_try[] =
{
  "ISO-2022-JP-2",
  "EUC-JP",
  "SHIFT_JIS",
  NULL
};
/* Candidates for "autodetect_kr".
   The 7-bit encoding comes first; it fails as soon as the input contains
   a byte >= 0x80.  EUC-KR is the fallback.  */
static const char * const autodetect_kr_try[] =
{
  "ISO-2022-KR",
  "EUC-KR",
  NULL
};
| |
| static struct autodetect_alias autodetect_predefined[] = |
| { |
| { &autodetect_predefined[1], "autodetect_utf8", autodetect_utf8_try }, |
| { &autodetect_predefined[2], "autodetect_jp", autodetect_jp_try }, |
| { NULL, "autodetect_kr", autodetect_kr_try } |
| }; |
| |
| static struct autodetect_alias *autodetect_list = &autodetect_predefined[0]; |
| static struct autodetect_alias **autodetect_list_end = |
| &autodetect_predefined[SIZEOF(autodetect_predefined)-1].next; |
| |
| int |
| uniconv_register_autodetect (const char *name, |
| const char * const *try_in_order) |
| { |
| size_t namelen; |
| size_t listlen; |
| size_t memneed; |
| size_t i; |
| char *memory; |
| struct autodetect_alias *new_alias; |
| char *new_name; |
| const char **new_try_in_order; |
| |
| /* The TRY_IN_ORDER list must not be empty. */ |
| if (try_in_order[0] == NULL) |
| { |
| errno = EINVAL; |
| return -1; |
| } |
| |
| /* We must deep-copy NAME and TRY_IN_ORDER, because they may be allocated |
| with dynamic extent. */ |
| namelen = strlen (name) + 1; |
| memneed = sizeof (struct autodetect_alias) + namelen + sizeof (char *); |
| for (i = 0; try_in_order[i] != NULL; i++) |
| memneed += sizeof (char *) + strlen (try_in_order[i]) + 1; |
| listlen = i; |
| |
| memory = (char *) malloc (memneed); |
| if (memory != NULL) |
| { |
| new_alias = (struct autodetect_alias *) memory; |
| memory += sizeof (struct autodetect_alias); |
| |
| new_try_in_order = (const char **) memory; |
| memory += (listlen + 1) * sizeof (char *); |
| |
| new_name = (char *) memory; |
| memcpy (new_name, name, namelen); |
| memory += namelen; |
| |
| for (i = 0; i < listlen; i++) |
| { |
| size_t len = strlen (try_in_order[i]) + 1; |
| memcpy (memory, try_in_order[i], len); |
| new_try_in_order[i] = (const char *) memory; |
| memory += len; |
| } |
| new_try_in_order[i] = NULL; |
| |
| /* Now insert the new alias. */ |
| new_alias->name = new_name; |
| new_alias->encodings_to_try = new_try_in_order; |
| new_alias->next = NULL; |
| /* FIXME: Not multithread-safe. */ |
| *autodetect_list_end = new_alias; |
| autodetect_list_end = &new_alias->next; |
| return 0; |
| } |
| else |
| { |
| errno = ENOMEM; |
| return -1; |
| } |
| } |
| |
| /* Like mem_iconveha, except no handling of transliteration. */ |
| static int |
| mem_iconveha_notranslit (const char *src, size_t srclen, |
| const char *from_codeset, const char *to_codeset, |
| enum iconv_ilseq_handler handler, |
| size_t *offsets, |
| char **resultp, size_t *lengthp) |
| { |
| int retval = mem_iconveh (src, srclen, from_codeset, to_codeset, handler, |
| offsets, resultp, lengthp); |
| if (retval >= 0 || errno != EINVAL) |
| return retval; |
| else |
| { |
| struct autodetect_alias *alias; |
| |
| /* Unsupported from_codeset or to_codeset. Check whether the caller |
| requested autodetection. */ |
| for (alias = autodetect_list; alias != NULL; alias = alias->next) |
| if (strcmp (from_codeset, alias->name) == 0) |
| { |
| const char * const *encodings; |
| |
| if (handler != iconveh_error) |
| { |
| /* First try all encodings without any forgiving. */ |
| encodings = alias->encodings_to_try; |
| do |
| { |
| retval = mem_iconveha_notranslit (src, srclen, |
| *encodings, to_codeset, |
| iconveh_error, offsets, |
| resultp, lengthp); |
| if (!(retval < 0 && errno == EILSEQ)) |
| return retval; |
| encodings++; |
| } |
| while (*encodings != NULL); |
| } |
| |
| encodings = alias->encodings_to_try; |
| do |
| { |
| retval = mem_iconveha_notranslit (src, srclen, |
| *encodings, to_codeset, |
| handler, offsets, |
| resultp, lengthp); |
| if (!(retval < 0 && errno == EILSEQ)) |
| return retval; |
| encodings++; |
| } |
| while (*encodings != NULL); |
| |
| /* Return the last call's result. */ |
| return -1; |
| } |
| |
| /* It wasn't an autodetection name. */ |
| errno = EINVAL; |
| return -1; |
| } |
| } |
| |
| int |
| mem_iconveha (const char *src, size_t srclen, |
| const char *from_codeset, const char *to_codeset, |
| bool transliterate, |
| enum iconv_ilseq_handler handler, |
| size_t *offsets, |
| char **resultp, size_t *lengthp) |
| { |
| if (srclen == 0) |
| { |
| /* Nothing to convert. */ |
| *lengthp = 0; |
| return 0; |
| } |
| |
| /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5, |
| we want to use transliteration. */ |
| #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ |
| && !defined __UCLIBC__) \ |
| || _LIBICONV_VERSION >= 0x0105 |
| if (transliterate) |
| { |
| int retval; |
| size_t len = strlen (to_codeset); |
| char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); |
| memcpy (to_codeset_suffixed, to_codeset, len); |
| memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); |
| |
| retval = mem_iconveha_notranslit (src, srclen, |
| from_codeset, to_codeset_suffixed, |
| handler, offsets, resultp, lengthp); |
| |
| freea (to_codeset_suffixed); |
| |
| return retval; |
| } |
| else |
| #endif |
| return mem_iconveha_notranslit (src, srclen, |
| from_codeset, to_codeset, |
| handler, offsets, resultp, lengthp); |
| } |
| |
| /* Like str_iconveha, except no handling of transliteration. */ |
| static char * |
| str_iconveha_notranslit (const char *src, |
| const char *from_codeset, const char *to_codeset, |
| enum iconv_ilseq_handler handler) |
| { |
| char *result = str_iconveh (src, from_codeset, to_codeset, handler); |
| |
| if (result != NULL || errno != EINVAL) |
| return result; |
| else |
| { |
| struct autodetect_alias *alias; |
| |
| /* Unsupported from_codeset or to_codeset. Check whether the caller |
| requested autodetection. */ |
| for (alias = autodetect_list; alias != NULL; alias = alias->next) |
| if (strcmp (from_codeset, alias->name) == 0) |
| { |
| const char * const *encodings; |
| |
| if (handler != iconveh_error) |
| { |
| /* First try all encodings without any forgiving. */ |
| encodings = alias->encodings_to_try; |
| do |
| { |
| result = str_iconveha_notranslit (src, |
| *encodings, to_codeset, |
| iconveh_error); |
| if (!(result == NULL && errno == EILSEQ)) |
| return result; |
| encodings++; |
| } |
| while (*encodings != NULL); |
| } |
| |
| encodings = alias->encodings_to_try; |
| do |
| { |
| result = str_iconveha_notranslit (src, |
| *encodings, to_codeset, |
| handler); |
| if (!(result == NULL && errno == EILSEQ)) |
| return result; |
| encodings++; |
| } |
| while (*encodings != NULL); |
| |
| /* Return the last call's result. */ |
| return NULL; |
| } |
| |
| /* It wasn't an autodetection name. */ |
| errno = EINVAL; |
| return NULL; |
| } |
| } |
| |
| char * |
| str_iconveha (const char *src, |
| const char *from_codeset, const char *to_codeset, |
| bool transliterate, |
| enum iconv_ilseq_handler handler) |
| { |
| if (*src == '\0' || c_strcasecmp (from_codeset, to_codeset) == 0) |
| { |
| char *result = strdup (src); |
| |
| if (result == NULL) |
| errno = ENOMEM; |
| return result; |
| } |
| |
| /* When using GNU libc >= 2.2 or GNU libiconv >= 1.5, |
| we want to use transliteration. */ |
| #if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \ |
| && !defined __UCLIBC__) \ |
| || _LIBICONV_VERSION >= 0x0105 |
| if (transliterate) |
| { |
| char *result; |
| size_t len = strlen (to_codeset); |
| char *to_codeset_suffixed = (char *) malloca (len + 10 + 1); |
| memcpy (to_codeset_suffixed, to_codeset, len); |
| memcpy (to_codeset_suffixed + len, "//TRANSLIT", 10 + 1); |
| |
| result = str_iconveha_notranslit (src, from_codeset, to_codeset_suffixed, |
| handler); |
| |
| freea (to_codeset_suffixed); |
| |
| return result; |
| } |
| else |
| #endif |
| return str_iconveha_notranslit (src, from_codeset, to_codeset, handler); |
| } |