/* | |
* Copyright (C) 1999 Tom Tromey | |
* Copyright (C) 2000 Red Hat, Inc. | |
* | |
* This library is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU Lesser General Public | |
* License as published by the Free Software Foundation; either | |
* version 2 of the License, or (at your option) any later version. | |
* | |
* This library is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
* Lesser General Public License for more details. | |
* | |
* You should have received a copy of the GNU Lesser General Public | |
* License along with this library; if not, write to the | |
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
* Boston, MA 02111-1307, USA. | |
* | |
* | |
************************************************ | |
* Also licensed with permission from Tom Tromey | |
* and Owen Taylor under the Apache license. | |
* Original location: | |
* http://cvs.gnome.org/viewcvs/glib/glib/gutf8.c?rev=1.50&view=log | |
************************************************ | |
* | |
* Copyright 2003-2006 The Apache Software Foundation | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
#include "CLucene/StdHeader.h" | |
/* GLib-style alias: unsigned integer wide enough to hold a character code
 * while it is being shifted apart into UTF-8 bytes. */
typedef unsigned long gunichar;
/* GLib-style alias for a small unsigned (byte-sized) integer. */
typedef unsigned char guchar;
/*
 * UTF8_COMPUTE: classify the lead byte of a UTF-8 sequence.
 *
 * Char: the lead byte. Pass an unsigned value; a negative (signed char)
 *       value would satisfy the first "< 128" test and be treated as ASCII.
 * Mask: lvalue that receives the bit mask selecting the payload bits
 *       carried by the lead byte.
 * Len:  lvalue that receives the total sequence length, 1 to 6.
 *
 * When Char is not a valid lead byte (a continuation byte 0x80-0xBF, or
 * 0xFE/0xFF) Len is set to -1 and Mask is left unchanged.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and can be used safely in an unbraced if/else. Char is
 * evaluated several times, so it must not have side effects.
 */
#define UTF8_COMPUTE(Char, Mask, Len)            \
  do {                                           \
    if ((Char) < 128) {                          \
      (Len) = 1;                                 \
      (Mask) = 0x7f;                             \
    } else if (((Char) & 0xe0) == 0xc0) {        \
      (Len) = 2;                                 \
      (Mask) = 0x1f;                             \
    } else if (((Char) & 0xf0) == 0xe0) {        \
      (Len) = 3;                                 \
      (Mask) = 0x0f;                             \
    } else if (((Char) & 0xf8) == 0xf0) {        \
      (Len) = 4;                                 \
      (Mask) = 0x07;                             \
    } else if (((Char) & 0xfc) == 0xf8) {        \
      (Len) = 5;                                 \
      (Mask) = 0x03;                             \
    } else if (((Char) & 0xfe) == 0xfc) {        \
      (Len) = 6;                                 \
      (Mask) = 0x01;                             \
    } else {                                     \
      (Len) = -1;                                \
    }                                            \
  } while (0)
/*#define UTF8_LENGTH(Char) \ | |
((Char) < 0x80 ? 1 : \ | |
((Char) < 0x800 ? 2 : \ | |
((Char) < 0x10000 ? 3 : \ | |
((Char) < 0x200000 ? 4 : \ | |
((Char) < 0x4000000 ? 5 : 6)))))*/ | |
/*
 * UTF8_GET: assemble the code point of one UTF-8 sequence.
 *
 * Result: integer lvalue that receives the decoded value, or -1 when one of
 *         the trailing bytes is not a continuation byte (10xxxxxx).
 * Chars:  indexable byte sequence; Chars[0] is the lead byte.
 * Count:  integer lvalue used as the loop index. On success it ends equal
 *         to Len; on failure it holds the index of the offending byte.
 * Mask:   payload mask for the lead byte.
 * Len:    total number of bytes in the sequence.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement; the inner break only leaves the for loop. Arguments are
 * evaluated several times, so they must not have side effects.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)          \
  do {                                                     \
    (Result) = (Chars)[0] & (Mask);                        \
    for ((Count) = 1; (Count) < (Len); ++(Count)) {        \
      if (((Chars)[(Count)] & 0xc0) != 0x80) {             \
        (Result) = -1;                                     \
        break;                                             \
      }                                                    \
      (Result) <<= 6;                                      \
      (Result) |= ((Chars)[(Count)] & 0x3f);               \
    }                                                      \
  } while (0)
/** | |
* lucene_wctoutf8: | |
* @c: a ISO10646 character code | |
* @outbuf: output buffer, must have at least 6 bytes of space. | |
* If %NULL, the length will be computed and returned | |
* and nothing will be written to @outbuf. | |
* | |
* Converts a single character to UTF-8. | |
* | |
* Return value: number of bytes written | |
**/ | |
size_t lucene_wctoutf8(char * outbuf, const wchar_t ch) | |
{ | |
gunichar c = ch; | |
guchar len = 0; | |
int first; | |
int i; | |
if (c < 0x80) | |
{ | |
first = 0; | |
len = 1; | |
} | |
else if (c < 0x800) | |
{ | |
first = 0xc0; | |
len = 2; | |
} | |
else if (c < 0x10000) | |
{ | |
first = 0xe0; | |
len = 3; | |
} | |
else if (c < 0x200000) | |
{ | |
first = 0xf0; | |
len = 4; | |
} | |
else if (c < 0x4000000) | |
{ | |
first = 0xf8; | |
len = 5; | |
} | |
else | |
{ | |
first = 0xfc; | |
len = 6; | |
} | |
if (outbuf) | |
{ | |
for (i = len - 1; i > 0; --i) | |
{ | |
outbuf[i] = (char)((c & 0x3f) | 0x80); | |
c >>= 6; | |
} | |
outbuf[0] = c | first; | |
} | |
return len; | |
} | |
/**
 * lucene_utf8towc:
 * @pwc: receives the decoded character; left untouched on failure.
 *       May be a null pointer, in which case only the length is reported.
 * @p: pointer to the first byte of a UTF-8 encoded character
 * @n: maximum number of bytes that may be examined starting at @p
 *
 * Decodes one UTF-8 sequence (1 to 6 bytes) into a wide character.
 * A NUL byte decodes to the character 0 with length 1. Only the structure
 * of the sequence is checked (lead byte followed by the right number of
 * continuation bytes); overlong forms and surrogates are not rejected.
 *
 * Return value: the number of bytes consumed, or 0 when @n is 0, the lead
 * byte is invalid, the sequence needs more than @n bytes, or a continuation
 * byte is malformed (which includes hitting the string terminator in the
 * middle of a sequence).
 **/
size_t lucene_utf8towc(wchar_t *pwc, const char *p, size_t n)
{
  const unsigned char *bytes = (const unsigned char *) p;
  int len;
  int result;
  int i;

  if (n == 0)
    return 0;

  /* Classify the lead byte: sequence length and its payload bits. */
  if (bytes[0] < 0x80) {
    len = 1;
    result = bytes[0];
  } else if ((bytes[0] & 0xe0) == 0xc0) {
    len = 2;
    result = bytes[0] & 0x1f;
  } else if ((bytes[0] & 0xf0) == 0xe0) {
    len = 3;
    result = bytes[0] & 0x0f;
  } else if ((bytes[0] & 0xf8) == 0xf0) {
    len = 4;
    result = bytes[0] & 0x07;
  } else if ((bytes[0] & 0xfc) == 0xf8) {
    len = 5;
    result = bytes[0] & 0x03;
  } else if ((bytes[0] & 0xfe) == 0xfc) {
    len = 6;
    result = bytes[0] & 0x01;
  } else {
    return 0;
  }

  /* Honour the caller's limit instead of reading past it. */
  if ((size_t) len > n)
    return 0;

  /* Stop at the first byte that is not 10xxxxxx. Because a NUL terminator
   * fails this test, a truncated sequence is never read past its end and
   * never reported as consumed. */
  for (i = 1; i < len; ++i) {
    if ((bytes[i] & 0xc0) != 0x80)
      return 0;
    result = (result << 6) | (bytes[i] & 0x3f);
  }

  if (pwc)
    *pwc = result;
  return (size_t) len;
}
//this function was not taken from gnome
/**
 * lucene_wcstoutf8:
 * @result: destination buffer for the UTF-8 bytes
 * @str: zero-terminated wide string to convert
 * @result_length: capacity of @result in bytes, including the terminator
 *
 * Converts @str to UTF-8. Conversion stops at the end of @str or before the
 * first character whose complete UTF-8 sequence would not fit while still
 * leaving one byte for the terminating NUL, so the output is never cut in
 * the middle of a sequence and never written past @result_length.
 * If @result_length is 0 nothing is written at all.
 *
 * Return value: number of bytes stored in @result, not counting the
 * terminating NUL
 **/
size_t lucene_wcstoutf8(char * result, const wchar_t * str, size_t result_length){
  char *p = result;
  const char *limit;

  /* No room even for the terminator. */
  if (result_length == 0)
    return 0;

  /* Last slot is reserved for the terminating NUL. */
  limit = result + (result_length - 1);

  while (*str != 0) {
    /* Ask for the encoded length first (null output buffer), so a
     * multi-byte sequence is only written when it fits entirely. */
    size_t needed = lucene_wctoutf8(NULL, *str);
    if (needed > (size_t)(limit - p))
      break;
    p += lucene_wctoutf8(p, *str);
    ++str;
  }
  *p = '\0';
  return (size_t)(p - result);
}
//this function was not taken from gnome
/**
 * lucene_utf8towcs:
 * @result: destination buffer for the decoded wide characters
 * @str: zero-terminated UTF-8 string to convert
 * @result_length: capacity of @result in wide characters
 *
 * Decodes @str into @result, one wide character per UTF-8 sequence, until
 * the end of @str or until @result is full. A terminating 0 is stored only
 * if there is still room for it after the last decoded character.
 *
 * Return value: the number of bytes consumed from @str (not the number of
 * wide characters written), or 0 if a sequence could not be decoded. On
 * failure @result holds the characters decoded so far, unterminated.
 **/
size_t lucene_utf8towcs(wchar_t * result, const char * str, size_t result_length){
  const char *sp = str;
  wchar_t *rp = result;
  wchar_t *const end = result + result_length;

  while (rp < end && *sp != 0) {
    size_t k;
    /* A UTF-8 sequence handled here is at most 6 bytes long. */
    size_t r = lucene_utf8towc(rp, sp, 6);

    /* The decoder reports failure as 0; without this check the loop
     * would stop advancing through the input. */
    if (r == 0)
      return 0;

    /* Never step over the string terminator, whatever length the
     * decoder reported for a truncated sequence. */
    for (k = 1; k < r; ++k) {
      if (sp[k] == 0)
        return 0;
    }

    sp += r;
    rp++;
  }

  /* Terminate whenever a slot is left in the output buffer. */
  if (rp < end)
    *rp = 0;

  return (size_t)(sp - str);
}
//Report how many bytes the UTF-8 character starting at p occupies,
//judged from its lead byte alone (the following bytes are not examined).
//Yields 1 to 6, or (size_t)-1 when *p cannot start a UTF-8 sequence.
//this function was not taken from gnome
size_t lucene_utf8charlen(const char *p)
{
  const unsigned char lead = (unsigned char) *p;

  if (lead < 128)
    return 1;
  if ((lead & 0xe0) == 0xc0)
    return 2;
  if ((lead & 0xf0) == 0xe0)
    return 3;
  if ((lead & 0xf8) == 0xf0)
    return 4;
  if ((lead & 0xfc) == 0xf8)
    return 5;
  if ((lead & 0xfe) == 0xfc)
    return 6;

  /* Not a lead byte: -1 converted to the unsigned return type. */
  return (size_t) -1;
}