1 /* utf8.c - Operations on UTF-8 strings.
3 * Copyright (C) 2002 Timo Sirainen
5 * Based on GLib code by
7 * Copyright (C) 1999 Tom Tromey
8 * Copyright (C) 2000 Red Hat, Inc.
10 * UTF-8 width tables based on locale data from GNU libc by
12 * Copyright (C) 1991-2002 Free Software Foundation, Inc.
14 * This library is free software; you can redistribute it and/or
15 * modify it under the terms of the GNU Lesser General Public
16 * License as published by the Free Software Foundation; either
17 * version 2 of the License, or (at your option) any later version.
19 * This library is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 * Lesser General Public License for more details.
24 * You should have received a copy of the GNU Lesser General Public
25 * License along with this library; if not, write to the
26 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 * Boston, MA 02111-1307, USA.
/* UTF8_COMPUTE(Char, Mask, Len): classifies the lead byte Char of a UTF-8
 * sequence by its high bits. The visible branches match the lead-byte
 * patterns 110xxxxx (0xc0), 1110xxxx (0xe0), 11110xxx (0xf0),
 * 111110xx (0xf8) and 1111110x (0xfc).
 * NOTE(review): the statements that assign Mask and Len in each branch are
 * not visible in this excerpt -- presumably Len becomes the sequence length
 * in bytes and Mask selects the payload bits of the lead byte; confirm
 * against the complete macro.
 * Char is used unparenthesized and more than once, so pass a simple,
 * side-effect-free expression. */
32 #define UTF8_COMPUTE(Char, Mask, Len) \
38 else if ((Char & 0xe0) == 0xc0) \
43 else if ((Char & 0xf0) == 0xe0) \
48 else if ((Char & 0xf8) == 0xf0) \
53 else if ((Char & 0xfc) == 0xf8) \
58 else if ((Char & 0xfe) == 0xfc) \
/* UTF8_GET(Result, Chars, Count, Mask, Len): assembles a value from the Len
 * bytes starting at Chars. Result starts as the lead byte masked with Mask;
 * Count then walks the bytes 1 .. Len-1, each of which is checked for the
 * continuation pattern 10xxxxxx ((byte & 0xc0) == 0x80) before its low six
 * bits (byte & 0x3f) are OR-ed into Result.
 * NOTE(review): what happens when a byte fails the continuation check, and
 * the shift that presumably makes room in Result for each new six bits, are
 * not visible in this excerpt -- confirm against the complete macro.
 * Count is a caller-supplied loop variable and is overwritten. */
66 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
67 (Result) = (Chars)[0] & (Mask); \
68 for ((Count) = 1; (Count) < (Len); ++(Count)) \
70 if (((Chars)[(Count)] & 0xc0) != 0x80) \
76 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* get_utf8_char: decodes the UTF-8 sequence whose lead byte is at **ptr.
 *
 * ptr   - address of a pointer to the first byte of the sequence.
 * len   - NOTE(review): presumably the number of bytes available at *ptr;
 *         its use is not visible in this excerpt -- confirm.
 * chr_r - out: receives the decoded value, cast to unichar.
 *
 * NOTE(review): the error checks, any advancement of *ptr and the return
 * statement are not visible here. strlen_utf8() keeps iterating while the
 * result is > 0, so non-positive values presumably signal "stop"; confirm
 * the exact return convention against the complete function. */
79 int get_utf8_char(const unsigned char **ptr, int len, unichar *chr_r)
81 int i, result, mask, chrlen;
/* Classify the lead byte; the macro is expected to set mask and chrlen. */
84 UTF8_COMPUTE(**ptr, mask, chrlen);
/* Fold the lead byte and its continuation bytes into result (i is the
 * macro's scratch loop counter). */
91 UTF8_GET(result, *ptr, i, mask, chrlen);
/* Hand the decoded value back to the caller. */
95 *chr_r = (unichar) result;
/* strlen_utf8: walks the NUL-terminated string str with get_utf8_char().
 *
 * str - NUL-terminated input, read as unsigned bytes.
 *
 * NOTE(review): the loop body and the return statement are not visible in
 * this excerpt -- presumably the function counts decoded characters and
 * returns that count; confirm against the complete function. */
100 int strlen_utf8(const char *str)
102 const unsigned char *p = (const unsigned char *) str;
/* Stop at the terminating NUL or as soon as get_utf8_char() returns a
 * non-positive value. 6 is passed as get_utf8_char()'s len argument. */
107 while (*p != '\0' && get_utf8_char(&p, 6, &chr_r) > 0) {
/* utf16_char_to_utf8: writes the UTF-8 encoding of the single value c into
 * outbuf.
 *
 * c      - value to encode; the visible branches split its range at 0x800,
 *          0x10000, 0x200000 and 0x4000000.
 * outbuf - destination; bytes outbuf[0] .. outbuf[len-1] are written, so
 *          the caller must provide enough room.
 *
 * NOTE(review): the per-branch assignments of `first` and `len`, and the
 * return statement, are not visible in this excerpt. Callers in this file
 * store the result in a variable named len, so the function presumably
 * returns the number of bytes written; confirm. */
114 int utf16_char_to_utf8(unichar c, char *outbuf)
122 } else if (c < 0x800) {
125 } else if (c < 0x10000) {
128 } else if (c < 0x200000) {
131 } else if (c < 0x4000000) {
/* Fill the continuation bytes from the last one back to index 1: each one
 * is 10xxxxxx carrying the low six bits of c.
 * NOTE(review): the step that shifts c between iterations is not visible
 * here -- confirm. */
140 for (i = len - 1; i > 0; --i) {
141 outbuf[i] = (c & 0x3f) | 0x80;
/* The lead byte combines what is left of c with the length marker bits. */
144 outbuf[0] = c | first;
/* utf8_to_utf16: decodes the UTF-8 string str into the unichar array out.
 *
 * str - input, read as unsigned bytes.
 * out - destination array of unichar.
 *
 * NOTE(review): the loop structure, error handling, the store into out and
 * any terminator written are not visible in this excerpt; no size is passed
 * for out, so the caller presumably must size it for the whole input --
 * confirm against the complete function. */
150 void utf8_to_utf16(const char *str, unichar *out)
152 const unsigned char *p = (const unsigned char *) str;
153 int i, result, mask, len;
/* Classify the lead byte at p; the macro is expected to set mask and len. */
157 UTF8_COMPUTE(*p, mask, len);
/* Fold the len bytes at p into result (i is the macro's scratch counter). */
161 UTF8_GET(result, p, i, mask, len);
/* utf16_to_utf8: encodes the zero-terminated unichar string str as UTF-8
 * into out, one element at a time via utf16_char_to_utf8().
 *
 * str - input, terminated by a zero element.
 * out - destination byte buffer; no size is passed, so the caller must
 *       provide enough room.
 *
 * NOTE(review): the advancement of str and out and any terminator written
 * after the loop are not visible in this excerpt -- confirm. */
172 void utf16_to_utf8(const unichar *str, char *out)
176 while (*str != '\0') {
/* Encode the current element at the current output position. */
177 len = utf16_char_to_utf8(*str, out);
/* utf16_to_utf8_with_pos: encodes the zero-terminated unichar string str as
 * UTF-8 into out and translates one position from input elements to output
 * bytes.
 *
 * str  - input, terminated by a zero element.
 * spos - position in str, counted in unichar elements from its start.
 * out  - destination byte buffer; no size is passed.
 * opos - out: set to the byte offset in out that corresponds to spos.
 *
 * NOTE(review): the declaration of ostart (presumably the initial value of
 * out), the advancement of str and out, and any handling after the loop are
 * not visible in this excerpt -- confirm, in particular whether *opos is
 * set when spos is never matched inside the loop. */
185 void utf16_to_utf8_with_pos(const unichar *str, int spos, char *out, int *opos)
/* Remember where the input began so element offsets can be computed. */
188 const unichar *sstart = str;
192 while (*str != '\0') {
193 len = utf16_char_to_utf8(*str, out);
/* When the input offset equals spos, record the matching output offset. */
197 if(str - sstart == spos)
198 *opos = out - ostart;
/* wcc: table of code points listed in ascending order, from 0x0 up to
 * 0xE0080 in the visible rows.
 * NOTE(review): presumably each entry is the first code point of a range
 * whose width class is the same-index entry of wws[]; the lookup code is
 * not visible in this excerpt -- confirm, and keep the two tables in step
 * when editing either one. */
203 static const unichar wcc[] = {
204 0x0, 0x300, 0x34F, 0x360, 0x363, 0x483, 0x487, 0x488, 0x48A, 0x591,
205 0x5A2, 0x5A3, 0x5BA, 0x5BB, 0x5BE, 0x5BF, 0x5C0, 0x5C1, 0x5C3, 0x5C4,
206 0x5C5, 0x64B, 0x656, 0x670, 0x671, 0x6D6, 0x6E5, 0x6E7, 0x6E9, 0x6EA,
207 0x6EE, 0x70F, 0x710, 0x711, 0x712, 0x730, 0x74B, 0x7A6, 0x7B1, 0x901,
208 0x903, 0x93C, 0x93D, 0x941, 0x949, 0x94D, 0x94E, 0x951, 0x955, 0x962,
209 0x964, 0x981, 0x982, 0x9BC, 0x9BD, 0x9C1, 0x9C5, 0x9CD, 0x9CE, 0x9E2,
210 0x9E4, 0xA02, 0xA03, 0xA3C, 0xA3D, 0xA41, 0xA43, 0xA47, 0xA49, 0xA4B,
211 0xA4E, 0xA70, 0xA72, 0xA81, 0xA83, 0xABC, 0xABD, 0xAC1, 0xAC6, 0xAC7,
212 0xAC9, 0xACD, 0xACE, 0xB01, 0xB02, 0xB3C, 0xB3D, 0xB3F, 0xB40, 0xB41,
213 0xB44, 0xB4D, 0xB4E, 0xB56, 0xB57, 0xB82, 0xB83, 0xBC0, 0xBC1, 0xBCD,
214 0xBCE, 0xC3E, 0xC41, 0xC46, 0xC49, 0xC4A, 0xC4E, 0xC55, 0xC57, 0xCBF,
215 0xCC0, 0xCC6, 0xCC7, 0xCCC, 0xCCE, 0xD41, 0xD44, 0xD4D, 0xD4E, 0xDCA,
216 0xDCB, 0xDD2, 0xDD5, 0xDD6, 0xDD7, 0xE31, 0xE32, 0xE34, 0xE3B, 0xE47,
217 0xE4F, 0xEB1, 0xEB2, 0xEB4, 0xEBA, 0xEBB, 0xEBD, 0xEC8, 0xECE, 0xF18,
218 0xF1A, 0xF35, 0xF36, 0xF37, 0xF38, 0xF39, 0xF3A, 0xF71, 0xF7F, 0xF80,
219 0xF85, 0xF86, 0xF88, 0xF90, 0xF98, 0xF99, 0xFBD, 0xFC6, 0xFC7, 0x102D,
220 0x1031, 0x1032, 0x1033, 0x1036, 0x1038, 0x1039, 0x103A, 0x1058, 0x105A,
221 0x1100, 0x1160, 0x17B7, 0x17BE, 0x17C6, 0x17C7, 0x17C9, 0x17D4, 0x180B,
222 0x180F, 0x18A9, 0x18AA, 0x200B, 0x2010, 0x202A, 0x202F, 0x206A, 0x2070,
223 0x20D0, 0x20E4, 0x2E80, 0x3008, 0x300C, 0x3014, 0x3016, 0x3018, 0x301C,
224 0x302A, 0x3030, 0x303F, 0x3041, 0x3095, 0x3099, 0x309B, 0xA4C7, 0xAC00,
225 0xD7A4, 0xF8F0, 0xF900, 0xFA2E, 0xFB1E, 0xFB1F, 0xFE20, 0xFE24, 0xFE30,
226 0xFE6C, 0xFEFF, 0xFF00, 0xFF01, 0xFF5F, 0xFFE0, 0xFFE7, 0xFFF9, 0xFFFC,
228 0x1D167, 0x1D16A, 0x1D173, 0x1D183, 0x1D185, 0x1D18C, 0x1D1AA, 0x1D1AE,
229 0x20000, 0x2A6D7, 0x2F800, 0x2FA1E, 0xE0001, 0xE0002, 0xE0020, 0xE0080
/* wccnum: element count of wcc[] minus one, i.e. the index of its last
 * entry. The sizeof division is evaluated at compile time, so it follows
 * the table automatically when entries are added or removed. */
233 static const int wccnum = sizeof(wcc) / sizeof(wcc[0]) - 1;
/* wws: table of small integers; the visible values are 0, 1 and 2, with a
 * single -1 as the final entry.
 * NOTE(review): presumably these are display widths in columns (0 for
 * zero-width/combining, 1 for normal, 2 for wide), indexed in parallel with
 * wcc[]; the lookup code is not visible in this excerpt -- confirm.
 * NOTE(review): the element type is plain char, whose signedness is
 * implementation-defined, yet the table stores -1. Where char is unsigned
 * that entry reads back as a positive value instead of -1; consider
 * declaring the table as signed char. */
235 static const char wws[] = {
236 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
237 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
238 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
239 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
240 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
241 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
242 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
243 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2,
244 1, 2, 1, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2, 1, 0, 2, 1, 0, 1, 0, 1, 2, 1, 0,
245 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, -1
248 int utf8_width(unichar c)