1 /* utf8.c - Operations on UTF-8 strings.
3 * Copyright (C) 2002 Timo Sirainen
5 * Based on GLib code by
7 * Copyright (C) 1999 Tom Tromey
8 * Copyright (C) 2000 Red Hat, Inc.
10 * UTF-8 width tables based on locale data from GNU libc by
12 * Copyright (C) 1991-2002 Free Software Foundation, Inc.
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2 of the License, or (at your option) any later version.
19 * This library is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * Lesser General Public License for more details.
24 * You should have received a copy of the GNU Lesser General Public
25 * License along with this library; if not, write to the
26 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 * Boston, MA 02111-1307, USA.
32 #define UTF8_COMPUTE(Char, Mask, Len) \
38 else if ((Char & 0xe0) == 0xc0) \
43 else if ((Char & 0xf0) == 0xe0) \
48 else if ((Char & 0xf8) == 0xf0) \
53 else if ((Char & 0xfc) == 0xf8) \
58 else if ((Char & 0xfe) == 0xfc) \
66 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
67 (Result) = (Chars)[0] & (Mask); \
68 for ((Count) = 1; (Count) < (Len); ++(Count)) \
70 if (((Chars)[(Count)] & 0xc0) != 0x80) \
76 (Result) |= ((Chars)[(Count)] & 0x3f); \
79 unichar get_utf8_char(const unsigned char **ptr, int len)
81 int i, result, mask, chrlen;
84 UTF8_COMPUTE(**ptr, mask, chrlen);
91 UTF8_GET(result, *ptr, i, mask, chrlen);
99 int strlen_utf8(const char *str)
101 const unsigned char *p = (const unsigned char *) str;
105 while (*p != '\0' && get_utf8_char(&p, 6) > 0) {
112 int utf16_char_to_utf8(unichar c, char *outbuf)
120 } else if (c < 0x800) {
123 } else if (c < 0x10000) {
126 } else if (c < 0x200000) {
129 } else if (c < 0x4000000) {
138 for (i = len - 1; i > 0; --i) {
139 outbuf[i] = (c & 0x3f) | 0x80;
142 outbuf[0] = c | first;
148 void utf8_to_utf16(const char *str, unichar *out)
150 const unsigned char *p = (const unsigned char *) str;
151 int i, result, mask, len;
155 UTF8_COMPUTE(*p, mask, len);
159 UTF8_GET(result, p, i, mask, len);
170 void utf16_to_utf8(const unichar *str, char *out)
174 while (*str != '\0') {
175 len = utf16_char_to_utf8(*str, out);
183 static const unichar wcc[] = {
184 0x0, 0x300, 0x34F, 0x360, 0x363, 0x483, 0x487, 0x488, 0x48A, 0x591,
185 0x5A2, 0x5A3, 0x5BA, 0x5BB, 0x5BE, 0x5BF, 0x5C0, 0x5C1, 0x5C3, 0x5C4,
186 0x5C5, 0x64B, 0x656, 0x670, 0x671, 0x6D6, 0x6E5, 0x6E7, 0x6E9, 0x6EA,
187 0x6EE, 0x70F, 0x710, 0x711, 0x712, 0x730, 0x74B, 0x7A6, 0x7B1, 0x901,
188 0x903, 0x93C, 0x93D, 0x941, 0x949, 0x94D, 0x94E, 0x951, 0x955, 0x962,
189 0x964, 0x981, 0x982, 0x9BC, 0x9BD, 0x9C1, 0x9C5, 0x9CD, 0x9CE, 0x9E2,
190 0x9E4, 0xA02, 0xA03, 0xA3C, 0xA3D, 0xA41, 0xA43, 0xA47, 0xA49, 0xA4B,
191 0xA4E, 0xA70, 0xA72, 0xA81, 0xA83, 0xABC, 0xABD, 0xAC1, 0xAC6, 0xAC7,
192 0xAC9, 0xACD, 0xACE, 0xB01, 0xB02, 0xB3C, 0xB3D, 0xB3F, 0xB40, 0xB41,
193 0xB44, 0xB4D, 0xB4E, 0xB56, 0xB57, 0xB82, 0xB83, 0xBC0, 0xBC1, 0xBCD,
194 0xBCE, 0xC3E, 0xC41, 0xC46, 0xC49, 0xC4A, 0xC4E, 0xC55, 0xC57, 0xCBF,
195 0xCC0, 0xCC6, 0xCC7, 0xCCC, 0xCCE, 0xD41, 0xD44, 0xD4D, 0xD4E, 0xDCA,
196 0xDCB, 0xDD2, 0xDD5, 0xDD6, 0xDD7, 0xE31, 0xE32, 0xE34, 0xE3B, 0xE47,
197 0xE4F, 0xEB1, 0xEB2, 0xEB4, 0xEBA, 0xEBB, 0xEBD, 0xEC8, 0xECE, 0xF18,
198 0xF1A, 0xF35, 0xF36, 0xF37, 0xF38, 0xF39, 0xF3A, 0xF71, 0xF7F, 0xF80,
199 0xF85, 0xF86, 0xF88, 0xF90, 0xF98, 0xF99, 0xFBD, 0xFC6, 0xFC7, 0x102D,
200 0x1031, 0x1032, 0x1033, 0x1036, 0x1038, 0x1039, 0x103A, 0x1058, 0x105A,
201 0x1100, 0x1160, 0x17B7, 0x17BE, 0x17C6, 0x17C7, 0x17C9, 0x17D4, 0x180B,
202 0x180F, 0x18A9, 0x18AA, 0x200B, 0x2010, 0x202A, 0x202F, 0x206A, 0x2070,
203 0x20D0, 0x20E4, 0x2E80, 0x3008, 0x300C, 0x3014, 0x3016, 0x3018, 0x301C,
204 0x302A, 0x3030, 0x303F, 0x3041, 0x3095, 0x3099, 0x309B, 0xA4C7, 0xAC00,
205 0xD7A4, 0xF8F0, 0xF900, 0xFA2E, 0xFB1E, 0xFB1F, 0xFE20, 0xFE24, 0xFE30,
206 0xFE6C, 0xFEFF, 0xFF00, 0xFF01, 0xFF5F, 0xFFE0, 0xFFE7, 0xFFF9, 0xFFFC,
208 0x1D167, 0x1D16A, 0x1D173, 0x1D183, 0x1D185, 0x1D18C, 0x1D1AA, 0x1D1AE,
209 0x20000, 0x2A6D7, 0x2F800, 0x2FA1E, 0xE0001, 0xE0002, 0xE0020, 0xE0080
213 static const int wccnum = sizeof(wcc) / sizeof(wcc[0]) - 1;
215 static const char wws[] = {
216 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
217 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
218 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
219 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
220 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
221 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
222 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
223 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2,
224 1, 2, 1, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2, 1, 0, 2, 1, 0, 1, 0, 1, 2, 1, 0,
225 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, -1
228 int utf8_width(unichar c)