| /* |
| * Copyright (C) 2017 The Android Open Source Project |
| * All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
| * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| */ |
| |
| #include <iconv.h> |
| |
| #include <ctype.h> |
| #include <endian.h> |
| #include <errno.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <uchar.h> |
| |
| #include "private/bionic_mbstate.h" |
| |
// The failure sentinel for conversion descriptors: iconv_open() returns it on error, and
// iconv()/iconv_close() reject it with EBADF.
#define INVALID_ICONV_T (reinterpret_cast<iconv_t>(-1))
| |
// The encodings this implementation can convert between.
//
// icu4c would be the ideal backend, but its API is too different from iconv's. Instead we
// offer roughly what <uchar.h> offers, in a form that is slightly easier to use on runs of
// text. If you are here to add more encodings, consider finishing the icu4c NDK wrappers
// instead.
enum Encoding {
  US_ASCII,   // 7-bit ASCII; one byte per character.
  UTF_8,      // UTF-8.
  UTF_16_LE,  // UTF-16, little-endian code units.
  UTF_16_BE,  // UTF-16, big-endian code units.
  UTF_32_LE,  // UTF-32, little-endian.
  UTF_32_BE,  // UTF-32, big-endian.
  WCHAR_T,    // "wchar_t"; read and written as four little-endian bytes.
};
| |
// What to do with a character that cannot be converted.
enum Mode {
  ERROR,     // Default: stop and fail the conversion.
  IGNORE,    // "//IGNORE" suffix: skip the character and carry on.
  TRANSLIT,  // "//TRANSLIT" suffix: substitute '?' and count the replacement.
};
| |
// Returns true if the caller-supplied charset name `lhs` names the same encoding as `rhs`.
//
// `rhs` is expected to be already normalized (lowercase alphanumerics only, no leading
// zeros), e.g. "utf8". `lhs` is matched loosely, following
// http://www.unicode.org/reports/tr22/#Charset_Alias_Matching:
//   * non-alphanumeric characters are skipped, so "UTF-8", "UTF_8", "UTF8" and "UTF 8" are
//     all equivalent;
//   * case is ignored;
//   * a '0' that is not preceded by a digit (and that is followed by one) is skipped, so
//     "UTF-08" matches "utf8" but "UTF-80" and "UTF-106LE" do not match "utf8"/"utf16le".
// A GNU-style "//..." suffix on `lhs` (such as "//IGNORE") is treated as end of string.
static bool __match_encoding(const char* lhs, const char* rhs) {
  // Whether the most recently matched character of lhs was a digit. Skipped characters
  // (punctuation and dropped leading zeros) do not affect this.
  bool after_digit = false;
  while (*lhs && *rhs) {
    // Advance lhs to its next significant character.
    for (; *lhs; ++lhs) {
      // <ctype.h> functions require a value representable as unsigned char.
      const unsigned char ch = *lhs;
      if (!isalnum(ch)) continue;
      const unsigned char next = *(lhs + 1);
      // Drop leading zeros only: a zero that follows a digit is significant.
      if (ch == '0' && !after_digit && isdigit(next)) continue;
      break;
    }
    // Case doesn't matter either.
    const unsigned char l = *lhs;
    const unsigned char r = *rhs;
    if (tolower(l) != tolower(r)) break;
    after_digit = (isdigit(l) != 0);
    ++lhs;
    ++rhs;
  }
  // As a special case we treat the GNU "//" extensions as end of string.
  if ((*lhs == '\0' || strncmp(lhs, "//", 2) == 0) && *rhs == '\0') return true;
  return false;
}
| |
| static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) { |
| const char* suffix = strstr(s, "//"); |
| if (suffix) { |
| if (!mode) return false; |
| if (strcmp(suffix, "//IGNORE") == 0) { |
| *mode = IGNORE; |
| } else if (strcmp(suffix, "//TRANSLIT") == 0) { |
| *mode = TRANSLIT; |
| } else { |
| return false; |
| } |
| } |
| if (__match_encoding(s, "utf8")) { |
| *encoding = UTF_8; |
| } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) { |
| *encoding = US_ASCII; |
| } else if (__match_encoding(s, "utf16le")) { |
| *encoding = UTF_16_LE; |
| } else if (__match_encoding(s, "utf16be")) { |
| *encoding = UTF_16_BE; |
| } else if (__match_encoding(s, "utf32le")) { |
| *encoding = UTF_32_LE; |
| } else if (__match_encoding(s, "utf32be")) { |
| *encoding = UTF_32_BE; |
| } else if (__match_encoding(s, "wchart")) { |
| *encoding = WCHAR_T; |
| } else { |
| return false; |
| } |
| return true; |
| } |
| |
| struct __iconv_t { |
| Encoding src_encoding; |
| Encoding dst_encoding; |
| Mode mode; |
| |
| __iconv_t() : mode(ERROR) { |
| } |
| |
| int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) { |
| // Reset state. |
| wc = 0; |
| memset(&ps, 0, sizeof(ps)); |
| replacement_count = 0; |
| ignored = false; |
| src_buf = src_buf0; |
| src_bytes_left = src_bytes_left0; |
| dst_buf = dst_buf0; |
| dst_bytes_left = dst_bytes_left0; |
| |
| while (*src_bytes_left > 0) { |
| if (!GetNext() || !Convert()) return -1; |
| } |
| return Done(); |
| } |
| |
| private: |
| char32_t wc; |
| char buf[16]; |
| size_t src_bytes_used; |
| size_t dst_bytes_used; |
| mbstate_t ps; |
| |
| size_t replacement_count; |
| bool ignored; |
| |
| char** src_buf; |
| size_t* src_bytes_left; |
| char** dst_buf; |
| size_t* dst_bytes_left; |
| |
| bool GetNext() { |
| errno = 0; |
| switch (src_encoding) { |
| case US_ASCII: |
| wc = **src_buf; |
| src_bytes_used = 1; |
| if (wc > 0x7f) errno = EILSEQ; |
| break; |
| |
| case UTF_8: |
| src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps); |
| if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { |
| break; // EILSEQ already set. |
| } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { |
| errno = EINVAL; |
| return false; |
| } |
| break; |
| |
| case UTF_16_BE: |
| case UTF_16_LE: { |
| if (*src_bytes_left < 2) { |
| errno = EINVAL; |
| return false; |
| } |
| bool swap = (src_encoding == UTF_16_BE); |
| wc = In16(*src_buf, swap); |
| // 0xd800-0xdbff: high surrogates |
| // 0xdc00-0xdfff: low surrogates |
| if (wc >= 0xd800 && wc <= 0xdfff) { |
| if (wc >= 0xdc00) { // Low surrogate before high surrogate. |
| errno = EILSEQ; |
| return false; |
| } |
| if (*src_bytes_left < 4) { |
| errno = EINVAL; |
| return false; |
| } |
| uint16_t hi = wc; |
| uint16_t lo = In16(*src_buf + 2, swap); |
| wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00); |
| src_bytes_used = 4; |
| } |
| break; |
| } |
| |
| case UTF_32_BE: |
| case UTF_32_LE: |
| case WCHAR_T: |
| if (*src_bytes_left < 4) { |
| errno = EINVAL; |
| return false; |
| } |
| wc = In32(*src_buf, (src_encoding == UTF_32_BE)); |
| break; |
| } |
| |
| if (errno == EILSEQ) { |
| switch (mode) { |
| case ERROR: |
| return false; |
| case IGNORE: |
| *src_buf += src_bytes_used; |
| *src_bytes_left -= src_bytes_used; |
| ignored = true; |
| return GetNext(); |
| case TRANSLIT: |
| wc = '?'; |
| ++replacement_count; |
| return true; |
| } |
| } |
| return true; |
| } |
| |
| bool Convert() { |
| errno = 0; |
| switch (dst_encoding) { |
| case US_ASCII: |
| buf[0] = wc; |
| dst_bytes_used = 1; |
| if (wc > 0x7f) errno = EILSEQ; |
| break; |
| |
| case UTF_8: |
| dst_bytes_used = c32rtomb(buf, wc, &ps); |
| if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) { |
| break; // EILSEQ already set. |
| } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) { |
| errno = EINVAL; |
| return false; |
| } |
| break; |
| |
| case UTF_16_BE: |
| case UTF_16_LE: { |
| bool swap = (dst_encoding == UTF_16_BE); |
| if (wc < 0x10000) { // BMP. |
| Out16(buf, wc, swap); |
| } else { // Supplementary plane; output surrogate pair. |
| wc -= 0x10000; |
| char16_t hi = 0xd800 | (wc >> 10); |
| char16_t lo = 0xdc00 | (wc & 0x3ff); |
| Out16(buf + 0, hi, swap); |
| Out16(buf + 2, lo, swap); |
| dst_bytes_used = 4; |
| } |
| } break; |
| |
| case UTF_32_BE: |
| case UTF_32_LE: |
| case WCHAR_T: |
| Out32(wc, (dst_encoding == UTF_32_BE)); |
| break; |
| } |
| |
| if (errno == EILSEQ) { |
| if (mode == IGNORE) { |
| *src_buf += src_bytes_used; |
| *src_bytes_left -= src_bytes_used; |
| ignored = true; |
| return true; |
| } else if (mode == TRANSLIT) { |
| wc = '?'; |
| ++replacement_count; |
| return Convert(); |
| } |
| return false; |
| } |
| |
| return Emit(); |
| } |
| |
| uint16_t In16(const char* buf, bool swap) { |
| const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); |
| uint16_t wc = (src[0]) | (src[1] << 8); |
| if (swap) wc = __swap16(wc); |
| src_bytes_used = 2; |
| return wc; |
| } |
| |
| uint32_t In32(const char* buf, bool swap) { |
| const uint8_t* src = reinterpret_cast<const uint8_t*>(buf); |
| uint32_t wc = (src[0]) | (src[1] << 8) | (src[2] << 16) | (src[3] << 24); |
| if (swap) wc = __swap32(wc); |
| src_bytes_used = 4; |
| return wc; |
| } |
| |
| void Out16(char* dst, char16_t ch, bool swap) { |
| if (swap) ch = __swap16(ch); |
| dst[0] = ch; |
| dst[1] = ch >> 8; |
| dst_bytes_used = 2; |
| } |
| |
| void Out32(char32_t ch, bool swap) { |
| if (swap) ch = __swap32(ch); |
| buf[0] = ch; |
| buf[1] = ch >> 8; |
| buf[2] = ch >> 16; |
| buf[3] = ch >> 24; |
| dst_bytes_used = 4; |
| } |
| |
| bool Emit() { |
| if (dst_bytes_used > *dst_bytes_left) { |
| errno = E2BIG; |
| return false; |
| } |
| |
| memcpy(*dst_buf, buf, dst_bytes_used); |
| *src_buf += src_bytes_used; |
| *src_bytes_left -= src_bytes_used; |
| *dst_buf += dst_bytes_used; |
| *dst_bytes_left -= dst_bytes_used; |
| return true; |
| } |
| |
| int Done() { |
| if (mode == TRANSLIT) return replacement_count; |
| if (ignored) { |
| errno = EILSEQ; |
| return -1; |
| } |
| return 0; |
| } |
| }; |
| |
| iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) { |
| iconv_t result = new __iconv_t; |
| if (!__parse_encoding(__src_encoding, &result->src_encoding, nullptr) || |
| !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) { |
| delete result; |
| errno = EINVAL; |
| return INVALID_ICONV_T; |
| } |
| return result; |
| } |
| |
| size_t iconv(iconv_t __converter, |
| char** __src_buf, size_t* __src_bytes_left, |
| char** __dst_buf, size_t* __dst_bytes_left) { |
| if (__converter == INVALID_ICONV_T) { |
| errno = EBADF; |
| return -1; |
| } |
| return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left); |
| } |
| |
| int iconv_close(iconv_t __converter) { |
| if (__converter == INVALID_ICONV_T) { |
| errno = EBADF; |
| return -1; |
| } |
| delete __converter; |
| return 0; |
| } |