5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2004 - 2008 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
20 #include "silcruntime.h"
22 /* Encodes the string `bin' of which encoding is `bin_encoding' to the
23 UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
24 Returns the length of the UTF-8 encoded string, or zero (0) on error.
25 By default `bin_encoding' is ASCII, and the caller needs to know the
26 encoding of the input string if it is anything else.

   Error codes set by this function: SILC_ERR_INVALID_ARGUMENT (NULL or
   empty input, and once more inside the encoding switch),
   SILC_ERR_NOT_SUPPORTED (SILC_STRING_ASCII_ESC),
   SILC_ERR_PROHIBITED_CHAR (PRINTABLE, VISIBLE and NUMERICAL checks),
   SILC_ERR_OVERFLOW, and the POSIX errno when an escaped hex value
   cannot be parsed.

   silc_utf8_encoded_len() calls this function with `utf8' == NULL and
   `utf8_size' == 0 to obtain the encoded length. */
28 SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
29 SilcStringEncoding bin_encoding,
30 unsigned char *utf8, SilcUInt32 utf8_size)
/* enclen: offset of the next output byte in `utf8'; i: input byte index;
   charval: value of the character currently being encoded. */
32 SilcUInt32 enclen = 0, i, charval = 0;
/* A NULL or zero-length input is an invalid argument. */
34 if (!bin || !bin_len) {
35 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
/* Input that is already UTF-8 is validated, checked against the output
   capacity and copied through unchanged. */
39 if (bin_encoding == SILC_STRING_UTF8) {
40 if (!silc_utf8_valid(bin, bin_len))
44 if (bin_len > utf8_size)
46 memcpy(utf8, bin, bin_len);
50 /* The SILC_STRING_LDAP_DN is already UTF-8 but it may be escaped. We
51 remove the escaping and we're done. */
52 if (bin_encoding == SILC_STRING_LDAP_DN ||
53 bin_encoding == SILC_STRING_UTF8_ESCAPE) {
56 for (i = 0; i < bin_len; i++) {
61 /* If escaped character is any of the following no processing is
62 needed, otherwise it is a hex value and we need to read it. */
64 if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
65 cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
/* Parse the hex digits that follow the escape character.
   NOTE(review): `bin' is a length-delimited unsigned char buffer, while
   sscanf() expects a NUL-terminated char string; confirm the scan
   cannot read past `bin_len'. */
69 if (sscanf(&bin[i + 1], "%02X", &hexval) != 1) {
70 silc_set_errno_posix(errno);
/* The parsed hex value becomes a single output byte. */
74 if (enclen + 1 > utf8_size)
76 utf8[enclen] = (unsigned char)hexval;
/* Copy the current input byte to the output unchanged. */
87 if (enclen + 1 > utf8_size)
89 utf8[enclen] = bin[i];
/* Locale-encoded input is converted with iconv() when the build has
   iconv, nl_langinfo and CODESET. */
97 if (bin_encoding == SILC_STRING_LOCALE) {
98 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
99 char *fromconv, *icp, *ocp;
101 size_t inlen, outlen;
/* setlocale() with "" selects the locale from the environment for the
   whole process; its codeset name is the conversion source. */
103 setlocale(LC_CTYPE, "");
104 fromconv = nl_langinfo(CODESET);
105 if (fromconv && strlen(fromconv)) {
106 icd = iconv_open("UTF-8", fromconv);
/* Convert only when both buffer pointers and the descriptor are valid. */
111 if (icp && ocp && icd != (iconv_t)-1) {
112 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
118 if (icd != (iconv_t)-1)
123 /* Fallback to 8-bit ASCII */
124 bin_encoding = SILC_STRING_ASCII;
/* Generic path: obtain one character value per iteration according to
   `bin_encoding', then write it out as UTF-8. */
127 for (i = 0; i < bin_len; i++) {
128 switch (bin_encoding) {
129 case SILC_STRING_ASCII:
130 case SILC_STRING_TELETEX:
133 case SILC_STRING_ASCII_ESC:
134 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
135 silc_set_errno(SILC_ERR_NOT_SUPPORTED);
/* 16-bit characters: at least two input bytes must remain at `i'.
   Byte order follows the macro used (MSB or LSB, per the macro name). */
138 case SILC_STRING_BMP:
139 if (i + 1 >= bin_len)
141 SILC_GET16_MSB(charval, bin + i);
144 case SILC_STRING_BMP_LSB:
145 if (i + 1 >= bin_len)
147 SILC_GET16_LSB(charval, bin + i);
/* 32-bit characters: at least four input bytes must remain at `i'. */
150 case SILC_STRING_UNIVERSAL:
151 if (i + 3 >= bin_len)
153 SILC_GET32_MSB(charval, bin + i);
156 case SILC_STRING_UNIVERSAL_LSB:
157 if (i + 3 >= bin_len)
159 SILC_GET32_LSB(charval, bin + i);
/* PRINTABLE and VISIBLE accept only bytes for which isprint() is true. */
162 case SILC_STRING_PRINTABLE:
163 case SILC_STRING_VISIBLE:
164 if (!isprint(bin[i])) {
165 silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
/* NUMERICAL accepts only decimal digits and the space character (0x20). */
170 case SILC_STRING_NUMERICAL:
171 if (bin[i] != 0x20 && !isdigit(bin[i])) {
172 silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
178 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
/* Values below 0x80 are written as a single byte.
   NOTE(review): every other width compares `enclen + N' against
   `utf8_size', but this check uses plain `enclen'; when
   enclen == utf8_size the store below would be at utf8[utf8_size].
   Confirm whether `enclen + 1 > utf8_size' was intended. */
183 if (charval < 0x80) {
185 if (enclen > utf8_size)
188 utf8[enclen] = (unsigned char)charval;
/* Two bytes: 110xxxxx 10xxxxxx. */
191 } else if (charval < 0x800) {
193 if (enclen + 2 > utf8_size)
196 utf8[enclen ] = (unsigned char )(((charval >> 6) & 0x1f) | 0xc0);
197 utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
/* Three bytes: 1110xxxx followed by two 10xxxxxx bytes. */
200 } else if (charval < 0x10000) {
202 if (enclen + 3 > utf8_size)
205 utf8[enclen ] = (unsigned char )(((charval >> 12) & 0xf) | 0xe0);
206 utf8[enclen + 1] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
207 utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
/* Four bytes: 11110xxx followed by three 10xxxxxx bytes. */
210 } else if (charval < 0x200000) {
212 if (enclen + 4 > utf8_size)
215 utf8[enclen ] = (unsigned char )(((charval >> 18) & 0x7) | 0xf0);
216 utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
217 utf8[enclen + 2] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
218 utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
/* Five bytes: 111110xx followed by four 10xxxxxx bytes.  This form and
   the six-byte form below are not part of UTF-8 as defined by RFC 3629. */
221 } else if (charval < 0x4000000) {
223 if (enclen + 5 > utf8_size)
226 utf8[enclen ] = (unsigned char )(((charval >> 24) & 0x3) | 0xf8);
227 utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
228 utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
229 utf8[enclen + 3] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
230 utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
/* Six bytes: 1111110x followed by five 10xxxxxx bytes. */
235 if (enclen + 6 > utf8_size)
238 utf8[enclen ] = (unsigned char )(((charval >> 30) & 0x1) | 0xfc);
239 utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
240 utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
241 utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
242 utf8[enclen + 4] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
243 utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
/* Overflow is reported here; presumably this is the target of the
   output size checks above (confirm). */
252 silc_set_errno(SILC_ERR_OVERFLOW);
256 /* Decodes UTF-8 encoded string `utf8' to string of which encoding is
257 to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
258 Returns the length of the decoded buffer, or zero (0) on error.
259 By default `bin_encoding' is ASCII, and the caller needs to know to
260 which encoding the output string is to be encoded if ASCII is not
desired. */
/* silc_utf8_decode sets these error codes: SILC_ERR_INVALID_ARGUMENT
   (NULL or empty input, and a second site after the LDAP DN case),
   SILC_ERR_OVERFLOW and SILC_ERR_BAD_CHAR_ENCODING.
   silc_utf8_decoded_len() and silc_utf8_valid() call this function with
   `bin' == NULL and `bin_size' == 0. */
263 SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
264 SilcStringEncoding bin_encoding,
265 unsigned char *bin, SilcUInt32 bin_size)
/* enclen: offset of the next output byte in `bin'; i: input byte index;
   charval: decoded character value; bytes: used by the LDAP DN case to
   locate the bytes of the current sequence. */
267 SilcUInt32 enclen = 0, i, charval, bytes;
/* A NULL or zero-length input is an invalid argument. */
269 if (!utf8 || !utf8_len) {
270 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
/* UTF-8 output: validate the input, check that it fits and copy it. */
274 if (bin_encoding == SILC_STRING_UTF8) {
275 if (!silc_utf8_valid(utf8, utf8_len))
277 if (utf8_len > bin_size)
279 memcpy(bin, utf8, utf8_len);
/* Locale output is produced with iconv() when the build has iconv,
   nl_langinfo and CODESET. */
283 if (bin_encoding == SILC_STRING_LOCALE) {
284 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
285 char *toconv, *icp, *ocp;
287 size_t inlen, outlen;
/* setlocale() with "" selects the locale from the environment for the
   whole process; its codeset name is the conversion target. */
289 setlocale(LC_CTYPE, "");
290 toconv = nl_langinfo(CODESET);
291 if (toconv && strlen(toconv)) {
292 icd = iconv_open(toconv, "UTF-8");
297 if (icp && ocp && icd != (iconv_t)-1) {
298 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
304 if (icd != (iconv_t)-1)
309 /* Fallback to 8-bit ASCII */
310 bin_encoding = SILC_STRING_ASCII;
/* Decode one UTF-8 sequence per iteration.  The lead byte selects the
   sequence length; each branch leaves `i' on the last byte of the
   sequence. */
313 for (i = 0; i < utf8_len; i++) {
/* 0xxxxxxx: single byte. */
314 if ((utf8[i] & 0x80) == 0x00) {
315 charval = utf8[i] & 0x7f;
/* 110xxxxx: two-byte sequence; the second byte must exist and match
   10xxxxxx. */
317 } else if ((utf8[i] & 0xe0) == 0xc0) {
318 if (i + 1 >= utf8_len)
321 if ((utf8[i + 1] & 0xc0) != 0x80)
324 charval = (utf8[i++] & 0x1f) << 6;
325 charval |= utf8[i] & 0x3f;
/* 1110xxxx: three-byte sequence with two continuation bytes. */
329 } else if ((utf8[i] & 0xf0) == 0xe0) {
330 if (i + 2 >= utf8_len)
333 if (((utf8[i + 1] & 0xc0) != 0x80) ||
334 ((utf8[i + 2] & 0xc0) != 0x80))
337 /* Surrogates not allowed (D800-DFFF) */
338 if (utf8[i] == 0xed &&
339 utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
340 utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
343 charval = (utf8[i++] & 0xf) << 12;
344 charval |= (utf8[i++] & 0x3f) << 6;
345 charval |= utf8[i] & 0x3f;
/* 11110xxx: four-byte sequence with three continuation bytes. */
349 } else if ((utf8[i] & 0xf8) == 0xf0) {
350 if (i + 3 >= utf8_len)
353 if (((utf8[i + 1] & 0xc0) != 0x80) ||
354 ((utf8[i + 2] & 0xc0) != 0x80) ||
355 ((utf8[i + 3] & 0xc0) != 0x80))
358 charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
359 charval |= (utf8[i++] & 0x3f) << 12;
360 charval |= (utf8[i++] & 0x3f) << 6;
361 charval |= utf8[i] & 0x3f;
/* A value below 0x10000 fits in fewer bytes, i.e. the sequence is an
   overlong encoding. */
362 if (charval < 0x10000)
/* 111110xx: five-byte sequence.  This form and the six-byte form below
   are not part of UTF-8 as defined by RFC 3629. */
365 } else if ((utf8[i] & 0xfc) == 0xf8) {
366 if (i + 4 >= utf8_len)
369 if (((utf8[i + 1] & 0xc0) != 0x80) ||
370 ((utf8[i + 2] & 0xc0) != 0x80) ||
371 ((utf8[i + 3] & 0xc0) != 0x80) ||
372 ((utf8[i + 4] & 0xc0) != 0x80))
375 charval = ((SilcUInt32)(utf8[i++] & 0x3)) << 24;
376 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
377 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
378 charval |= (utf8[i++] & 0x3f) << 6;
379 charval |= utf8[i] & 0x3f;
/* Overlong check for the five-byte form. */
380 if (charval < 0x200000)
/* 1111110x: six-byte sequence. */
383 } else if ((utf8[i] & 0xfe) == 0xfc) {
384 if (i + 5 >= utf8_len)
387 if (((utf8[i + 1] & 0xc0) != 0x80) ||
388 ((utf8[i + 2] & 0xc0) != 0x80) ||
389 ((utf8[i + 3] & 0xc0) != 0x80) ||
390 ((utf8[i + 4] & 0xc0) != 0x80) ||
391 ((utf8[i + 5] & 0xc0) != 0x80))
394 charval = ((SilcUInt32)(utf8[i++] & 0x1)) << 30;
395 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
396 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
397 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
398 charval |= (utf8[i++] & 0x3f) << 6;
399 charval |= utf8[i] & 0x3f;
/* Overlong check for the six-byte form. */
400 if (charval < 0x4000000)
/* Store the decoded value in the requested output encoding. */
407 switch (bin_encoding) {
/* One output byte per character.
   NOTE(review): `charval' is narrowed to unsigned char by the store
   below; confirm how values above 0xFF are meant to be handled for
   these encodings. */
408 case SILC_STRING_ASCII:
409 case SILC_STRING_PRINTABLE:
410 case SILC_STRING_VISIBLE:
411 case SILC_STRING_TELETEX:
412 case SILC_STRING_NUMERICAL:
414 if (enclen + 1 > bin_size)
417 bin[enclen] = (unsigned char)charval;
421 case SILC_STRING_ASCII_ESC:
422 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
/* Two output bytes per character.
   NOTE(review): presumably only the low 16 bits of `charval' are
   stored; confirm the behavior for values above 0xFFFF. */
425 case SILC_STRING_BMP:
427 if (enclen + 2 > bin_size)
429 SILC_PUT16_MSB(charval, bin + enclen);
433 case SILC_STRING_BMP_LSB:
435 if (enclen + 2 > bin_size)
437 SILC_PUT16_LSB(charval, bin + enclen);
/* Four output bytes per character. */
441 case SILC_STRING_UNIVERSAL:
443 if (enclen + 4 > bin_size)
445 SILC_PUT32_MSB(charval, bin + enclen);
449 case SILC_STRING_UNIVERSAL_LSB:
451 if (enclen + 4 > bin_size)
453 SILC_PUT32_LSB(charval, bin + enclen);
457 case SILC_STRING_LDAP_DN:
462 /* Non-printable UTF-8 characters will be escaped, printable will
463 be as is. We take the bytes directly from the original data. */
464 for (k = 0; k < bytes; k++) {
/* `i' is on the last byte of the sequence, so this indexes the k-th
   byte counted from the start (presumably `bytes' is the sequence
   length; confirm). */
465 cv = utf8[(i - (bytes - 1)) + k];
467 /* If string starts with space or # escape it */
468 if (!enclen && (cv == '#' || cv == ' ')) {
/* The escape character plus the character itself take two bytes. */
470 if (enclen + 2 > bin_size)
473 bin[enclen + 1] = cv;
479 /* If string ends with space escape it */
480 if (i == utf8_len - 1 && cv == ' ') {
482 if (enclen + 2 > bin_size)
485 bin[enclen + 1] = cv;
491 /* If character is any of following then escape */
492 if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
493 cv == '>' || cv == ';') {
495 if (enclen + 2 > bin_size)
498 bin[enclen + 1] = cv;
504 /* If character is not printable escape it with hex character */
505 if (!isprint((int)cv)) {
/* The escape character plus two hex digits take three bytes.
   NOTE(review): if silc_snprintf() behaves like snprintf(), a size of 3
   also writes a terminating NUL at bin[enclen + 3], one byte beyond
   the range covered by the check below; confirm. */
507 if (enclen + 3 > bin_size)
510 silc_snprintf((char *)bin + enclen + 1, 3, "%02X", cv);
/* All other bytes take one output byte. */
517 if (enclen + 1 > bin_size)
526 silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
/* The two error codes below are presumably the targets of the size
   checks and of the malformed-sequence checks above (confirm). */
535 silc_set_errno(SILC_ERR_OVERFLOW);
539 silc_set_errno(SILC_ERR_BAD_CHAR_ENCODING);
543 /* UTF-8 to wide characters.  Decodes `utf8' of `utf8_len' bytes as
   SILC_STRING_BMP into a temporary buffer and stores each 16-bit value
   into `utf8_wide'.  `utf8_wide_size' is counted in SilcUInt16 elements:
   it is compared against half of the decoded byte length, and the
   destination is cleared with `utf8_wide_size' * 2 bytes.
   SILC_ERR_OVERFLOW is set when the destination is too small. */
545 SilcUInt32 silc_utf8_c2w(const unsigned char *utf8, SilcUInt32 utf8_len,
546 SilcUInt16 *utf8_wide, SilcUInt32 utf8_wide_size)
/* Length in bytes of the input once decoded to SILC_STRING_BMP. */
552 tmp_len = silc_utf8_decoded_len(utf8, utf8_len, SILC_STRING_BMP);
/* Each wide character occupies two decoded bytes. */
556 if (utf8_wide_size < tmp_len / 2) {
557 silc_set_errno(SILC_ERR_OVERFLOW);
561 memset(utf8_wide, 0, utf8_wide_size * 2);
/* Temporary buffer that receives the BMP byte string. */
563 tmp = silc_malloc(tmp_len);
/* NOTE(review): the return value of this decode call is not used here;
   confirm that a failure cannot occur after the length query above
   succeeded. */
567 silc_utf8_decode(utf8, utf8_len, SILC_STRING_BMP, tmp, tmp_len);
/* Read the temporary buffer two bytes at a time into 16-bit elements. */
569 for (i = 0, k = 0; i < tmp_len; i += 2, k++)
570 SILC_GET16_MSB(utf8_wide[k], tmp + i);
576 /* Wide characters to UTF-8.  Serializes `wide_str' into a temporary
   byte buffer of `wide_str_len' * 2 bytes and encodes that buffer as
   SILC_STRING_BMP with silc_utf8_encode().  SILC_ERR_OVERFLOW is set
   when `utf8_size' is smaller than `wide_str_len' * 2. */
578 SilcUInt32 silc_utf8_w2c(const SilcUInt16 *wide_str,
579 SilcUInt32 wide_str_len,
580 unsigned char *utf8, SilcUInt32 utf8_size)
/* Up-front capacity check: two output bytes per wide character. */
587 if (utf8_size < wide_str_len * 2) {
588 silc_set_errno(SILC_ERR_OVERFLOW);
592 memset(utf8, 0, utf8_size);
/* Temporary buffer with two bytes per wide character. */
594 tmp = silc_malloc(wide_str_len * 2);
/* NOTE(review): `i' is a byte offset advancing by 2 but is bounded by
   `wide_str_len', so only about half of the characters are written to
   `tmp', while the encode call below consumes `wide_str_len' * 2 bytes.
   The matching loop in silc_utf8_c2w() is bounded by the byte length.
   Confirm whether the bound should be `wide_str_len * 2'. */
598 for (i = 0, k = 0; i < wide_str_len; i += 2, k++)
599 SILC_PUT16_MSB(wide_str[k], tmp + i);
/* Encode the serialized characters as UTF-8. */
601 tmp_len = silc_utf8_encode(tmp, wide_str_len * 2, SILC_STRING_BMP,
608 /* Returns the length of UTF-8 encoded string if the `bin' of
609 encoding of `bin_encoding' is encoded with silc_utf8_encode.
   No output is produced: the encoder is called with a NULL buffer of
   size zero and its return value is returned as is. */
611 SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
612 SilcStringEncoding bin_encoding)
614 return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
617 /* Returns the length of decoded string if the `bin' of encoding of
618 `bin_encoding' is decoded with silc_utf8_decode.
   `bin' is passed to the decoder as its UTF-8 input and `bin_encoding'
   as the output encoding.  No output is produced: the decoder is called
   with a NULL buffer of size zero and its return value is returned as
   is. */
620 SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
621 SilcStringEncoding bin_encoding)
623 return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
626 /* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
627 UTF-8 encoded string, FALSE if it is not UTF-8 encoded string.
   Validity is decided by running silc_utf8_decode() without an output
   buffer and testing its return value against zero, so a NULL or empty
   input, for which the decoder sets SILC_ERR_INVALID_ARGUMENT, is
   presumably reported as not valid. */
629 SilcBool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
/* The output encoding is passed as literal 0.
   NOTE(review): confirm which SilcStringEncoding value 0 denotes. */
631 return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;
634 /* Pretty close strcasecmp.  Strings whose byte lengths differ are
   handled by the length check below; otherwise the result of
   silc_utf8_strncasecmp() over the whole length of `s1' is returned. */
636 SilcBool silc_utf8_strcasecmp(const char *s1, const char *s2)
/* Compare the byte lengths first. */
640 if (strlen(s1) != strlen(s2))
643 return silc_utf8_strncasecmp(s1, s2, strlen(s1));
646 /* Pretty close strncasecmp.  Both strings are run through
   silc_stringprep() with the SILC_IDENTIFIERC_PREP profile and the
   prepared results are compared with memcmp(). */
648 SilcBool silc_utf8_strncasecmp(const char *s1, const char *s2, SilcUInt32 n)
/* s1u/s2u and s1u_len/s2u_len receive the prepared strings and their
   lengths from silc_stringprep(). */
650 unsigned char *s1u, *s2u;
651 SilcUInt32 s1u_len, s2u_len;
652 SilcStringprepStatus status;
658 /* Casefold and normalize */
659 status = silc_stringprep(s1, n, SILC_STRING_UTF8,
660 SILC_IDENTIFIERC_PREP, 0, &s1u,
661 &s1u_len, SILC_STRING_UTF8);
662 if (status != SILC_STRINGPREP_OK)
665 /* Casefold and normalize */
666 status = silc_stringprep(s2, n, SILC_STRING_UTF8,
667 SILC_IDENTIFIERC_PREP, 0, &s2u,
668 &s2u_len, SILC_STRING_UTF8);
669 if (status != SILC_STRINGPREP_OK)
/* `ret' is non-zero when the first `n' bytes of both prepared strings
   are equal.
   NOTE(review): the comparison length is `n', not `s1u_len' or
   `s2u_len'; if preparation changes the byte length, this could compare
   too few bytes or read past a prepared buffer.  Confirm. */
672 ret = !memcmp(s1u, s2u, n);