5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2004 - 2005 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
20 #include "silcincludes.h"
23 /* Encodes the string `bin' of which encoding is `bin_encoding' to the
24 UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
25 Returns the length of the UTF-8 encoded string, or zero (0) on error.
26 By default `bin_encoding' is ASCII, and the caller needs to know the
27 encoding of the input string if it is anything else. */
/* NOTE(review): some lines of this function (braces, returns and a few
   statements) are not present in this listing.  The comments added
   below describe only the statements that are visible here. */
29 SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
30 SilcStringEncoding bin_encoding,
31 unsigned char *utf8, SilcUInt32 utf8_size)
/* enclen counts the UTF-8 bytes produced so far, i indexes `bin' and
   charval holds the character value currently being encoded. */
33 SilcUInt32 enclen = 0, i, charval = 0;
/* Input declared as UTF-8: it is checked with silc_utf8_valid, its
   length is compared against `utf8_size', and it is copied to `utf8'
   with memcpy. */
38 if (bin_encoding == SILC_STRING_UTF8) {
39 if (!silc_utf8_valid(bin, bin_len))
43 if (bin_len > utf8_size)
45 memcpy(utf8, bin, bin_len);
/* Input in the current locale's encoding: when iconv and nl_langinfo
   are available at build time, the codeset name is taken from
   nl_langinfo(CODESET) and iconv converts from that codeset to UTF-8.
   Note that setlocale(LC_CTYPE, "") changes the locale of the whole
   process. */
49 if (bin_encoding == SILC_STRING_LOCALE) {
50 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
51 char *fromconv, *icp, *ocp;
55 setlocale(LC_CTYPE, "");
56 fromconv = nl_langinfo(CODESET);
57 if (fromconv && strlen(fromconv)) {
58 icd = iconv_open("UTF-8", fromconv);
63 if (icp && ocp && icd != (iconv_t)-1) {
64 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
70 if (icd != (iconv_t)-1)
75 /* Fallback to 8-bit ASCII */
76 bin_encoding = SILC_STRING_ASCII;
/* Walk the input.  Each iteration first loads one character value into
   charval according to `bin_encoding' and then writes that value to
   `utf8' as a UTF-8 sequence. */
79 for (i = 0; i < bin_len; i++) {
80 switch (bin_encoding) {
81 case SILC_STRING_ASCII:
82 case SILC_STRING_TELETEX:
85 case SILC_STRING_ASCII_ESC:
86 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
/* 16-bit input, most significant byte first. */
92 SILC_GET16_MSB(charval, bin + i);
95 case SILC_STRING_BMP_LSB:
/* 16-bit input, least significant byte first. */
98 SILC_GET16_LSB(charval, bin + i);
101 case SILC_STRING_UNIVERSAL:
/* A 32-bit value is read at bin + i, so bytes i..i+3 must lie inside
   the input. */
102 if (i + 3 >= bin_len)
104 SILC_GET32_MSB(charval, bin + i);
107 case SILC_STRING_UNIVERSAL_LSB:
108 if (i + 3 >= bin_len)
110 SILC_GET32_LSB(charval, bin + i);
113 case SILC_STRING_PRINTABLE:
114 case SILC_STRING_VISIBLE:
/* Each input byte is tested with isprint(). */
115 if (!isprint(bin[i]))
119 case SILC_STRING_NUMERICAL:
/* Each input byte is tested for being a space (0x20) or a decimal
   digit. */
120 if (bin[i] != 0x20 && !isdigit(bin[i]))
124 case SILC_STRING_LDAP_DN:
125 /* Remove any escaping */
126 if (bin[i] == '\\') {
/* The backslash must be followed by at least one more input byte. */
128 if (i + 1 >= bin_len)
131 /* If escaped character is any of the following no processing is
132 needed, otherwise it is a hex value and we need to read it. */
134 if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
135 cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
137 if (i + 1 >= bin_len)
/* NOTE(review): sscanf() treats `bin' as a NUL-terminated C string,
   but `bin' is described by `bin_len' only; confirm that two hex
   digits are guaranteed to be available here and that `hexval' has the
   unsigned int type that the X conversion requires. */
139 if (sscanf(&bin[++i], "%02X", &hexval) != 1)
141 cv = (unsigned char)hexval;
/* Write charval as UTF-8.  The sequence length follows from the value:
   below 0x80 one byte, below 0x800 two, below 0x10000 three, below
   0x200000 four, below 0x4000000 five, and six bytes for the rest.
   Every branch compares the bytes it needs against `utf8_size' before
   storing them. */
154 if (charval < 0x80) {
/* NOTE(review): this test is `enclen > utf8_size' whereas the other
   branches use `enclen + N > utf8_size'.  With enclen == utf8_size the
   store below would address utf8[utf8_size]; it looks like this should
   be `enclen + 1 > utf8_size' -- confirm. */
156 if (enclen > utf8_size)
159 utf8[enclen] = (unsigned char)charval;
162 } else if (charval < 0x800) {
164 if (enclen + 2 > utf8_size)
/* Lead byte 110xxxxx followed by one 10xxxxxx continuation byte. */
167 utf8[enclen ] = (unsigned char )(((charval >> 6) & 0x1f) | 0xc0);
168 utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
171 } else if (charval < 0x10000) {
173 if (enclen + 3 > utf8_size)
/* Lead byte 1110xxxx followed by two continuation bytes. */
176 utf8[enclen ] = (unsigned char )(((charval >> 12) & 0xf) | 0xe0);
177 utf8[enclen + 1] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
178 utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
181 } else if (charval < 0x200000) {
183 if (enclen + 4 > utf8_size)
/* Lead byte 11110xxx followed by three continuation bytes. */
186 utf8[enclen ] = (unsigned char )(((charval >> 18) & 0x7) | 0xf0);
187 utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
188 utf8[enclen + 2] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
189 utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
192 } else if (charval < 0x4000000) {
194 if (enclen + 5 > utf8_size)
/* Lead byte 111110xx followed by four continuation bytes. */
197 utf8[enclen ] = (unsigned char )(((charval >> 24) & 0x3) | 0xf8);
198 utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
199 utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
200 utf8[enclen + 3] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
201 utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
206 if (enclen + 6 > utf8_size)
/* Lead byte 1111110x followed by five continuation bytes. */
209 utf8[enclen ] = (unsigned char )(((charval >> 30) & 0x1) | 0xfc);
210 utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
211 utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
212 utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
213 utf8[enclen + 4] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
214 utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
/* Decodes UTF-8 encoded string `utf8' to string of which encoding is
   to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
   Returns the length of the decoded buffer, or zero (0) on error.
   By default `bin_encoding' is ASCII, and the caller needs to know to
   which encoding the output string is to be encoded if ASCII is not
   selected. */
/* NOTE(review): some lines of this function (braces, returns and a few
   statements) are not present in this listing.  The comments added
   below describe only the statements that are visible here. */
230 SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
231 SilcStringEncoding bin_encoding,
232 unsigned char *bin, SilcUInt32 bin_size)
/* enclen counts the output bytes produced so far, i indexes `utf8' and
   charval holds the character value decoded from the current
   sequence. */
234 SilcUInt32 enclen = 0, i, charval;
/* A NULL or empty input is tested for first. */
236 if (!utf8 || !utf8_len)
/* Output requested as UTF-8: the input is checked with silc_utf8_valid
   and copied to `bin' with memcpy. */
239 if (bin_encoding == SILC_STRING_UTF8) {
240 if (!silc_utf8_valid(utf8, utf8_len) ||
243 memcpy(bin, utf8, utf8_len);
/* Output in the current locale's encoding: when iconv and nl_langinfo
   are available at build time, the codeset name is taken from
   nl_langinfo(CODESET) and iconv converts from UTF-8 to that codeset.
   Note that setlocale(LC_CTYPE, "") changes the locale of the whole
   process. */
247 if (bin_encoding == SILC_STRING_LOCALE) {
248 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
249 char *toconv, *icp, *ocp;
251 size_t inlen, outlen;
253 setlocale(LC_CTYPE, "");
254 toconv = nl_langinfo(CODESET);
255 if (toconv && strlen(toconv)) {
256 icd = iconv_open(toconv, "UTF-8");
261 if (icp && ocp && icd != (iconv_t)-1) {
262 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
268 if (icd != (iconv_t)-1)
273 /* Fallback to 8-bit ASCII */
274 bin_encoding = SILC_STRING_ASCII;
/* Walk the input one UTF-8 sequence per iteration.  The high bits of
   the lead byte select the sequence length (one to six bytes).  For
   every multi-byte form the code tests that enough input bytes remain
   and that each trailing byte has the 10xxxxxx bit pattern, and then
   assembles the payload bits into charval. */
277 for (i = 0; i < utf8_len; i++) {
/* 0xxxxxxx: single byte. */
278 if ((utf8[i] & 0x80) == 0x00) {
279 charval = utf8[i] & 0x7f;
/* 110xxxxx: two-byte sequence. */
280 } else if ((utf8[i] & 0xe0) == 0xc0) {
281 if (i + 1 >= utf8_len)
284 if ((utf8[i + 1] & 0xc0) != 0x80)
287 charval = (utf8[i++] & 0x1f) << 6;
288 charval |= utf8[i] & 0x3f;
/* 1110xxxx: three-byte sequence. */
291 } else if ((utf8[i] & 0xf0) == 0xe0) {
292 if (i + 2 >= utf8_len)
295 if (((utf8[i + 1] & 0xc0) != 0x80) ||
296 ((utf8[i + 2] & 0xc0) != 0x80))
299 /* Surrogates not allowed (D800-DFFF) */
300 if (utf8[i] == 0xed &&
301 utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
302 utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
305 charval = (utf8[i++] & 0xf) << 12;
306 charval |= (utf8[i++] & 0x3f) << 6;
307 charval |= utf8[i] & 0x3f;
/* 11110xxx: four-byte sequence. */
310 } else if ((utf8[i] & 0xf8) == 0xf0) {
311 if (i + 3 >= utf8_len)
314 if (((utf8[i + 1] & 0xc0) != 0x80) ||
315 ((utf8[i + 2] & 0xc0) != 0x80) ||
316 ((utf8[i + 3] & 0xc0) != 0x80))
319 charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
320 charval |= (utf8[i++] & 0x3f) << 12;
321 charval |= (utf8[i++] & 0x3f) << 6;
322 charval |= utf8[i] & 0x3f;
/* A value below 0x10000 would have fit a shorter sequence; presumably
   this is the overlong-form test -- the statement it guards is not
   visible here. */
323 if (charval < 0x10000)
/* 111110xx: five-byte sequence. */
325 } else if ((utf8[i] & 0xfc) == 0xf8) {
326 if (i + 4 >= utf8_len)
329 if (((utf8[i + 1] & 0xc0) != 0x80) ||
330 ((utf8[i + 2] & 0xc0) != 0x80) ||
331 ((utf8[i + 3] & 0xc0) != 0x80) ||
332 ((utf8[i + 4] & 0xc0) != 0x80))
335 charval = ((SilcUInt32)(utf8[i++] & 0x3)) << 24;
336 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
337 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
338 charval |= (utf8[i++] & 0x3f) << 6;
339 charval |= utf8[i] & 0x3f;
/* Same kind of lower-bound test as for the four-byte form. */
340 if (charval < 0x200000)
/* 1111110x: six-byte sequence. */
342 } else if ((utf8[i] & 0xfe) == 0xfc) {
343 if (i + 5 >= utf8_len)
346 if (((utf8[i + 1] & 0xc0) != 0x80) ||
347 ((utf8[i + 2] & 0xc0) != 0x80) ||
348 ((utf8[i + 3] & 0xc0) != 0x80) ||
349 ((utf8[i + 4] & 0xc0) != 0x80) ||
350 ((utf8[i + 5] & 0xc0) != 0x80))
353 charval = ((SilcUInt32)(utf8[i++] & 0x1)) << 30;
354 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
355 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
356 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
357 charval |= (utf8[i++] & 0x3f) << 6;
358 charval |= utf8[i] & 0x3f;
/* Same kind of lower-bound test as for the four-byte form. */
359 if (charval < 0x4000000)
/* Store charval to `bin' in the requested encoding.  Every case
   compares the number of bytes it is about to write against
   `bin_size' first. */
365 switch (bin_encoding) {
366 case SILC_STRING_ASCII:
367 case SILC_STRING_PRINTABLE:
368 case SILC_STRING_VISIBLE:
369 case SILC_STRING_TELETEX:
370 case SILC_STRING_NUMERICAL:
372 if (enclen + 1 > bin_size)
/* One output byte; the cast keeps only the low eight bits of
   charval. */
375 bin[enclen] = (unsigned char)charval;
379 case SILC_STRING_ASCII_ESC:
380 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
383 case SILC_STRING_BMP:
/* Two output bytes, most significant byte first. */
385 if (enclen + 2 > bin_size)
387 SILC_PUT16_MSB(charval, bin + enclen);
391 case SILC_STRING_BMP_LSB:
/* Two output bytes, least significant byte first. */
393 if (enclen + 2 > bin_size)
395 SILC_PUT16_LSB(charval, bin + enclen);
399 case SILC_STRING_UNIVERSAL:
/* Four output bytes, most significant byte first. */
401 if (enclen + 4 > bin_size)
403 SILC_PUT32_MSB(charval, bin + enclen);
407 case SILC_STRING_UNIVERSAL_LSB:
/* Four output bytes, least significant byte first. */
409 if (enclen + 4 > bin_size)
411 SILC_PUT32_LSB(charval, bin + enclen);
415 case SILC_STRING_LDAP_DN:
417 /* XXX multibyte handling */
418 unsigned char cv = (unsigned char)charval;
420 /* If string starts with space or # escape it */
421 if (!enclen && (cv == '#' || cv == ' ')) {
423 if (enclen + 2 > bin_size)
426 bin[enclen + 1] = cv;
432 /* If string ends with space escape it */
433 if (i == utf8_len - 1 && cv == ' ') {
435 if (enclen + 2 > bin_size)
438 bin[enclen + 1] = cv;
444 /* If character is any of following then escape */
445 if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
446 cv == '>' || cv == ';') {
448 if (enclen + 2 > bin_size)
451 bin[enclen + 1] = cv;
457 /* If character is not printable escape it with hex character */
458 if (!isprint((int)cv)) {
/* NOTE(review): snprintf() with size 3 stores two hex digits and a
   terminating NUL starting at bin[enclen + 1], so it can write up to
   bin[enclen + 3], while the test below only ensures that
   enclen + 2 <= bin_size.  Confirm the required size and how far
   enclen is advanced afterwards. */
460 if (enclen + 2 > bin_size)
463 snprintf(bin + enclen + 1, 3, "%02X", cv);
/* Unescaped character: one output byte. */
470 if (enclen + 1 > bin_size)
486 /* Returns the length of UTF-8 encoded string if the `bin' of
487 encoding of `bin_encoding' is encoded with silc_utf8_encode. */
489 SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
490 SilcStringEncoding bin_encoding)
492 return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
495 /* Returns the length of decoded string if the `bin' of encoding of
496 `bin_encoding' is decoded with silc_utf8_decode. */
498 SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
499 SilcStringEncoding bin_encoding)
501 return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
504 /* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
505 UTF-8 encoded string, FALSE if it is not UTF-8 encoded string. */
507 bool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
509 return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;