5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2004 - 2007 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
23 /* Encodes the string `bin' of which encoding is `bin_encoding' to the
24 UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
25 Returns the length of the UTF-8 encoded string, or zero (0) on error.
26 By default `bin_encoding' is ASCII, and the caller needs to know the
27 encoding of the input string if it is anything else. */
29 SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
30 SilcStringEncoding bin_encoding,
31 unsigned char *utf8, SilcUInt32 utf8_size)
33 SilcUInt32 enclen = 0, i, charval = 0;
35 if (!bin || !bin_len) {
36 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
40 if (bin_encoding == SILC_STRING_UTF8) {
41 if (!silc_utf8_valid(bin, bin_len))
45 if (bin_len > utf8_size)
47 memcpy(utf8, bin, bin_len);
51 /* The SILC_STRING_LDAP_DN is alredy UTF-8 but it may be escaped. We
52 remove the escaping and we're done. */
53 if (bin_encoding == SILC_STRING_LDAP_DN ||
54 bin_encoding == SILC_STRING_UTF8_ESCAPE) {
57 for (i = 0; i < bin_len; i++) {
62 /* If escaped character is any of the following no processing is
63 needed, otherwise it is a hex value and we need to read it. */
65 if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
66 cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
70 if (sscanf(&bin[i + 1], "%02X", &hexval) != 1) {
71 silc_set_errno_posix(errno);
75 if (enclen + 1 > utf8_size)
77 utf8[enclen] = (unsigned char)hexval;
88 if (enclen + 1 > utf8_size)
90 utf8[enclen] = bin[i];
98 if (bin_encoding == SILC_STRING_LOCALE) {
99 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
100 char *fromconv, *icp, *ocp;
102 size_t inlen, outlen;
104 setlocale(LC_CTYPE, "");
105 fromconv = nl_langinfo(CODESET);
106 if (fromconv && strlen(fromconv)) {
107 icd = iconv_open("UTF-8", fromconv);
112 if (icp && ocp && icd != (iconv_t)-1) {
113 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
119 if (icd != (iconv_t)-1)
124 /* Fallback to 8-bit ASCII */
125 bin_encoding = SILC_STRING_ASCII;
128 for (i = 0; i < bin_len; i++) {
129 switch (bin_encoding) {
130 case SILC_STRING_ASCII:
131 case SILC_STRING_TELETEX:
134 case SILC_STRING_ASCII_ESC:
135 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
136 silc_set_errno(SILC_ERR_NOT_SUPPORTED);
139 case SILC_STRING_BMP:
140 if (i + 1 >= bin_len)
142 SILC_GET16_MSB(charval, bin + i);
145 case SILC_STRING_BMP_LSB:
146 if (i + 1 >= bin_len)
148 SILC_GET16_LSB(charval, bin + i);
151 case SILC_STRING_UNIVERSAL:
152 if (i + 3 >= bin_len)
154 SILC_GET32_MSB(charval, bin + i);
157 case SILC_STRING_UNIVERSAL_LSB:
158 if (i + 3 >= bin_len)
160 SILC_GET32_LSB(charval, bin + i);
163 case SILC_STRING_PRINTABLE:
164 case SILC_STRING_VISIBLE:
165 if (!isprint(bin[i])) {
166 silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
171 case SILC_STRING_NUMERICAL:
172 if (bin[i] != 0x20 && !isdigit(bin[i])) {
173 silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
179 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
184 if (charval < 0x80) {
186 if (enclen > utf8_size)
189 utf8[enclen] = (unsigned char)charval;
192 } else if (charval < 0x800) {
194 if (enclen + 2 > utf8_size)
197 utf8[enclen ] = (unsigned char )(((charval >> 6) & 0x1f) | 0xc0);
198 utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
201 } else if (charval < 0x10000) {
203 if (enclen + 3 > utf8_size)
206 utf8[enclen ] = (unsigned char )(((charval >> 12) & 0xf) | 0xe0);
207 utf8[enclen + 1] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
208 utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
211 } else if (charval < 0x200000) {
213 if (enclen + 4 > utf8_size)
216 utf8[enclen ] = (unsigned char )(((charval >> 18) & 0x7) | 0xf0);
217 utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
218 utf8[enclen + 2] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
219 utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
222 } else if (charval < 0x4000000) {
224 if (enclen + 5 > utf8_size)
227 utf8[enclen ] = (unsigned char )(((charval >> 24) & 0x3) | 0xf8);
228 utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
229 utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
230 utf8[enclen + 3] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
231 utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
236 if (enclen + 6 > utf8_size)
239 utf8[enclen ] = (unsigned char )(((charval >> 30) & 0x1) | 0xfc);
240 utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
241 utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
242 utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
243 utf8[enclen + 4] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
244 utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
253 silc_set_errno(SILC_ERR_OVERFLOW);
257 /* Decodes UTF-8 encoded string `utf8' to string of which encoding is
258 to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
259 Returns the length of the decoded buffer, or zero (0) on error.
260 By default `bin_encoding' is ASCII, and the caller needs to know to
261 which encoding the output string is to be encoded if ASCII is not
264 SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
265 SilcStringEncoding bin_encoding,
266 unsigned char *bin, SilcUInt32 bin_size)
268 SilcUInt32 enclen = 0, i, charval, bytes;
270 if (!utf8 || !utf8_len) {
271 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
275 if (bin_encoding == SILC_STRING_UTF8) {
276 if (!silc_utf8_valid(utf8, utf8_len))
278 if (utf8_len > bin_size)
280 memcpy(bin, utf8, utf8_len);
284 if (bin_encoding == SILC_STRING_LOCALE) {
285 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
286 char *toconv, *icp, *ocp;
288 size_t inlen, outlen;
290 setlocale(LC_CTYPE, "");
291 toconv = nl_langinfo(CODESET);
292 if (toconv && strlen(toconv)) {
293 icd = iconv_open(toconv, "UTF-8");
298 if (icp && ocp && icd != (iconv_t)-1) {
299 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
305 if (icd != (iconv_t)-1)
310 /* Fallback to 8-bit ASCII */
311 bin_encoding = SILC_STRING_ASCII;
314 for (i = 0; i < utf8_len; i++) {
315 if ((utf8[i] & 0x80) == 0x00) {
316 charval = utf8[i] & 0x7f;
318 } else if ((utf8[i] & 0xe0) == 0xc0) {
319 if (i + 1 >= utf8_len)
322 if ((utf8[i + 1] & 0xc0) != 0x80)
325 charval = (utf8[i++] & 0x1f) << 6;
326 charval |= utf8[i] & 0x3f;
330 } else if ((utf8[i] & 0xf0) == 0xe0) {
331 if (i + 2 >= utf8_len)
334 if (((utf8[i + 1] & 0xc0) != 0x80) ||
335 ((utf8[i + 2] & 0xc0) != 0x80))
338 /* Surrogates not allowed (D800-DFFF) */
339 if (utf8[i] == 0xed &&
340 utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
341 utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
344 charval = (utf8[i++] & 0xf) << 12;
345 charval |= (utf8[i++] & 0x3f) << 6;
346 charval |= utf8[i] & 0x3f;
350 } else if ((utf8[i] & 0xf8) == 0xf0) {
351 if (i + 3 >= utf8_len)
354 if (((utf8[i + 1] & 0xc0) != 0x80) ||
355 ((utf8[i + 2] & 0xc0) != 0x80) ||
356 ((utf8[i + 3] & 0xc0) != 0x80))
359 charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
360 charval |= (utf8[i++] & 0x3f) << 12;
361 charval |= (utf8[i++] & 0x3f) << 6;
362 charval |= utf8[i] & 0x3f;
363 if (charval < 0x10000)
366 } else if ((utf8[i] & 0xfc) == 0xf8) {
367 if (i + 4 >= utf8_len)
370 if (((utf8[i + 1] & 0xc0) != 0x80) ||
371 ((utf8[i + 2] & 0xc0) != 0x80) ||
372 ((utf8[i + 3] & 0xc0) != 0x80) ||
373 ((utf8[i + 4] & 0xc0) != 0x80))
376 charval = ((SilcUInt32)(utf8[i++] & 0x3)) << 24;
377 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
378 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
379 charval |= (utf8[i++] & 0x3f) << 6;
380 charval |= utf8[i] & 0x3f;
381 if (charval < 0x200000)
384 } else if ((utf8[i] & 0xfe) == 0xfc) {
385 if (i + 5 >= utf8_len)
388 if (((utf8[i + 1] & 0xc0) != 0x80) ||
389 ((utf8[i + 2] & 0xc0) != 0x80) ||
390 ((utf8[i + 3] & 0xc0) != 0x80) ||
391 ((utf8[i + 4] & 0xc0) != 0x80) ||
392 ((utf8[i + 5] & 0xc0) != 0x80))
395 charval = ((SilcUInt32)(utf8[i++] & 0x1)) << 30;
396 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
397 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
398 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
399 charval |= (utf8[i++] & 0x3f) << 6;
400 charval |= utf8[i] & 0x3f;
401 if (charval < 0x4000000)
408 switch (bin_encoding) {
409 case SILC_STRING_ASCII:
410 case SILC_STRING_PRINTABLE:
411 case SILC_STRING_VISIBLE:
412 case SILC_STRING_TELETEX:
413 case SILC_STRING_NUMERICAL:
415 if (enclen + 1 > bin_size)
418 bin[enclen] = (unsigned char)charval;
422 case SILC_STRING_ASCII_ESC:
423 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
426 case SILC_STRING_BMP:
428 if (enclen + 2 > bin_size)
430 SILC_PUT16_MSB(charval, bin + enclen);
434 case SILC_STRING_BMP_LSB:
436 if (enclen + 2 > bin_size)
438 SILC_PUT16_LSB(charval, bin + enclen);
442 case SILC_STRING_UNIVERSAL:
444 if (enclen + 4 > bin_size)
446 SILC_PUT32_MSB(charval, bin + enclen);
450 case SILC_STRING_UNIVERSAL_LSB:
452 if (enclen + 4 > bin_size)
454 SILC_PUT32_LSB(charval, bin + enclen);
458 case SILC_STRING_LDAP_DN:
463 /* Non-printable UTF-8 characters will be escaped, printable will
464 be as is. We take the bytes directly from the original data. */
465 for (k = 0; k < bytes; k++) {
466 cv = utf8[(i - (bytes - 1)) + k];
468 /* If string starts with space or # escape it */
469 if (!enclen && (cv == '#' || cv == ' ')) {
471 if (enclen + 2 > bin_size)
474 bin[enclen + 1] = cv;
480 /* If string ends with space escape it */
481 if (i == utf8_len - 1 && cv == ' ') {
483 if (enclen + 2 > bin_size)
486 bin[enclen + 1] = cv;
492 /* If character is any of following then escape */
493 if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
494 cv == '>' || cv == ';') {
496 if (enclen + 2 > bin_size)
499 bin[enclen + 1] = cv;
505 /* If character is not printable escape it with hex character */
506 if (!isprint((int)cv)) {
508 if (enclen + 3 > bin_size)
511 silc_snprintf(bin + enclen + 1, 3, "%02X", cv);
518 if (enclen + 1 > bin_size)
527 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
536 silc_set_errno(SILC_ERR_OVERFLOW);
540 silc_set_errno(SILC_ERR_BAD_CHAR_ENCODING);
544 /* UTF-8 to wide characters */
546 SilcUInt32 silc_utf8_c2w(const unsigned char *utf8, SilcUInt32 utf8_len,
547 SilcUInt16 *utf8_wide, SilcUInt32 utf8_wide_size)
553 tmp_len = silc_utf8_decoded_len(utf8, utf8_len, SILC_STRING_BMP);
557 if (utf8_wide_size < tmp_len / 2) {
558 silc_set_errno(SILC_ERR_OVERFLOW);
562 memset(utf8_wide, 0, utf8_wide_size * 2);
564 tmp = silc_malloc(tmp_len);
568 silc_utf8_decode(utf8, utf8_len, SILC_STRING_BMP, tmp, tmp_len);
570 for (i = 0, k = 0; i < tmp_len; i += 2, k++)
571 SILC_GET16_MSB(utf8_wide[k], tmp + i);
577 /* Wide characters to UTF-8 */
579 SilcUInt32 silc_utf8_w2c(const SilcUInt16 *wide_str,
580 SilcUInt32 wide_str_len,
581 unsigned char *utf8, SilcUInt32 utf8_size)
588 if (utf8_size < wide_str_len * 2) {
589 silc_set_errno(SILC_ERR_OVERFLOW);
593 memset(utf8, 0, utf8_size);
595 tmp = silc_malloc(wide_str_len * 2);
599 for (i = 0, k = 0; i < wide_str_len; i += 2, k++)
600 SILC_PUT16_MSB(wide_str[k], tmp + i);
602 tmp_len = silc_utf8_encode(tmp, wide_str_len * 2, SILC_STRING_BMP,
609 /* Returns the length of UTF-8 encoded string if the `bin' of
610 encoding of `bin_encoding' is encoded with silc_utf8_encode. */
612 SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
613 SilcStringEncoding bin_encoding)
615 return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
618 /* Returns the length of decoded string if the `bin' of encoding of
619 `bin_encoding' is decoded with silc_utf8_decode. */
621 SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
622 SilcStringEncoding bin_encoding)
624 return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
627 /* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
628 UTF-8 encoded string, FALSE if it is not UTF-8 encoded string. */
630 SilcBool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
632 return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;
635 /* Pretty close strcasecmp */
637 SilcBool silc_utf8_strcasecmp(const char *s1, const char *s2)
641 if (strlen(s1) != strlen(s2))
644 return silc_utf8_strncasecmp(s1, s2, strlen(s1));
647 /* Pretty close strcasecmp */
649 SilcBool silc_utf8_strncasecmp(const char *s1, const char *s2, SilcUInt32 n)
651 unsigned char *s1u, *s2u;
652 SilcUInt32 s1u_len, s2u_len;
653 SilcStringprepStatus status;
659 /* Casefold and normalize */
660 status = silc_stringprep(s1, n, SILC_STRING_UTF8,
661 SILC_IDENTIFIERC_PREP, 0, &s1u,
662 &s1u_len, SILC_STRING_UTF8);
663 if (status != SILC_STRINGPREP_OK)
666 /* Casefold and normalize */
667 status = silc_stringprep(s2, n, SILC_STRING_UTF8,
668 SILC_IDENTIFIERC_PREP, 0, &s2u,
669 &s2u_len, SILC_STRING_UTF8);
670 if (status != SILC_STRINGPREP_OK)
673 ret = !memcmp(s1u, s2u, n);