5 Author: Pekka Riikonen <priikone@silcnet.org>
7 Copyright (C) 2004 - 2005 Pekka Riikonen
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; version 2 of the License.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
23 /* Encodes the string `bin' of which encoding is `bin_encoding' to the
24 UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
25 Returns the length of the UTF-8 encoded string, or zero (0) on error.
26 By default `bin_encoding' is ASCII, and the caller needs to know the
27 encoding of the input string if it is anything else. */
/* Conversion paths visible in this function:
   - SILC_STRING_UTF8: the input is checked with silc_utf8_valid() and
     copied into utf8 with memcpy() after a bin_len > utf8_size test.
   - SILC_STRING_LDAP_DN and SILC_STRING_UTF8_ESCAPE: escaping is removed;
     an escaped character that is not in the list tested below is parsed
     as a two-digit hex value with sscanf().
   - SILC_STRING_LOCALE: when HAVE_ICONV, HAVE_NL_LANGINFO and CODESET are
     defined, iconv() converts from the nl_langinfo(CODESET) charset to
     UTF-8; the fallback path assigns SILC_STRING_ASCII to bin_encoding.
   - every other encoding: one input character at a time is loaded into
     charval and emitted as a UTF-8 sequence of one to six bytes.
   NOTE(review): silc_utf8_encoded_len() calls this function with a NULL
   utf8 and utf8_size 0 to obtain only the length.  The guards that skip
   the stores in that case are not among the lines shown here; confirm
   that they cover every write below. */
29 SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
30 SilcStringEncoding bin_encoding,
31 unsigned char *utf8, SilcUInt32 utf8_size)
33 SilcUInt32 enclen = 0, i, charval = 0;
/* Input that is already UTF-8 is validated and copied unchanged. */
38 if (bin_encoding == SILC_STRING_UTF8) {
39 if (!silc_utf8_valid(bin, bin_len))
43 if (bin_len > utf8_size)
45 memcpy(utf8, bin, bin_len);
49 /* The SILC_STRING_LDAP_DN is already UTF-8 but it may be escaped. We
50 remove the escaping and we're done. */
51 if (bin_encoding == SILC_STRING_LDAP_DN ||
52 bin_encoding == SILC_STRING_UTF8_ESCAPE) {
55 for (i = 0; i < bin_len; i++) {
60 /* If escaped character is any of the following no processing is
61 needed, otherwise it is a hex value and we need to read it. */
63 if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
64 cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
/* NOTE(review): sscanf() expects a NUL-terminated char string, while bin
   is an unsigned char buffer whose extent is given by bin_len.  No check
   visible here ensures that two more bytes exist after bin[i]; confirm
   the bounds handling. */
68 if (sscanf(&bin[i + 1], "%02X", &hexval) != 1)
71 if (enclen + 1 > utf8_size)
73 utf8[enclen] = (unsigned char)hexval;
/* Unescaped bytes are copied through as they are. */
84 if (enclen + 1 > utf8_size)
86 utf8[enclen] = bin[i];
/* Locale input: convert with iconv() when the platform provides it. */
94 if (bin_encoding == SILC_STRING_LOCALE) {
95 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
96 char *fromconv, *icp, *ocp;
/* Select the locale from the environment, then ask for its charset. */
100 setlocale(LC_CTYPE, "");
101 fromconv = nl_langinfo(CODESET);
102 if (fromconv && strlen(fromconv)) {
103 icd = iconv_open("UTF-8", fromconv);
108 if (icp && ocp && icd != (iconv_t)-1) {
109 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
115 if (icd != (iconv_t)-1)
120 /* Fallback to 8-bit ASCII */
121 bin_encoding = SILC_STRING_ASCII;
/* Generic path: one input character is handled per iteration. */
124 for (i = 0; i < bin_len; i++) {
125 switch (bin_encoding) {
126 case SILC_STRING_ASCII:
127 case SILC_STRING_TELETEX:
130 case SILC_STRING_ASCII_ESC:
131 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
/* The 16-bit encodings need two bytes at bin + i; the 32-bit encodings
   further down need four.  The GET16/GET32 macros presumably read them
   in the byte order named by their MSB/LSB suffix. */
134 case SILC_STRING_BMP:
135 if (i + 1 >= bin_len)
137 SILC_GET16_MSB(charval, bin + i);
140 case SILC_STRING_BMP_LSB:
141 if (i + 1 >= bin_len)
143 SILC_GET16_LSB(charval, bin + i);
146 case SILC_STRING_UNIVERSAL:
147 if (i + 3 >= bin_len)
149 SILC_GET32_MSB(charval, bin + i);
152 case SILC_STRING_UNIVERSAL_LSB:
153 if (i + 3 >= bin_len)
155 SILC_GET32_LSB(charval, bin + i);
/* Restricted character sets: each byte is tested before it is used. */
158 case SILC_STRING_PRINTABLE:
159 case SILC_STRING_VISIBLE:
160 if (!isprint(bin[i]))
164 case SILC_STRING_NUMERICAL:
165 if (bin[i] != 0x20 && !isdigit(bin[i]))
/* Emit charval as UTF-8; the sequence length follows from its magnitude. */
174 if (charval < 0x80) {
/* NOTE(review): the multi-byte branches below compare enclen + N with
   utf8_size before storing N bytes, but this test is enclen > utf8_size
   ahead of a store to utf8[enclen], so enclen == utf8_size would pass.
   Looks like an off-by-one (enclen + 1 > utf8_size expected); confirm. */
176 if (enclen > utf8_size)
179 utf8[enclen] = (unsigned char)charval;
/* Two bytes: 110xxxxx 10xxxxxx */
182 } else if (charval < 0x800) {
184 if (enclen + 2 > utf8_size)
187 utf8[enclen ] = (unsigned char )(((charval >> 6) & 0x1f) | 0xc0);
188 utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
/* Three bytes: 1110xxxx followed by two continuation bytes */
191 } else if (charval < 0x10000) {
193 if (enclen + 3 > utf8_size)
196 utf8[enclen ] = (unsigned char )(((charval >> 12) & 0xf) | 0xe0);
197 utf8[enclen + 1] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
198 utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
/* Four bytes: 11110xxx followed by three continuation bytes */
201 } else if (charval < 0x200000) {
203 if (enclen + 4 > utf8_size)
206 utf8[enclen ] = (unsigned char )(((charval >> 18) & 0x7) | 0xf0);
207 utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
208 utf8[enclen + 2] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
209 utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
/* Five bytes: 111110xx followed by four continuation bytes */
212 } else if (charval < 0x4000000) {
214 if (enclen + 5 > utf8_size)
217 utf8[enclen ] = (unsigned char )(((charval >> 24) & 0x3) | 0xf8);
218 utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
219 utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
220 utf8[enclen + 3] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
221 utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
/* Six bytes: 1111110x followed by five continuation bytes */
226 if (enclen + 6 > utf8_size)
229 utf8[enclen ] = (unsigned char )(((charval >> 30) & 0x1) | 0xfc);
230 utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
231 utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
232 utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
233 utf8[enclen + 4] = (unsigned char )(((charval >> 6) & 0x3f) | 0x80);
234 utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
243 /* Decodes UTF-8 encoded string `utf8' to string of which encoding is
244 to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
245 Returns the length of the decoded buffer, or zero (0) on error.
246 By default `bin_encoding' is ASCII, and the caller needs to know to
247 which encoding the output string is to be encoded if ASCII is not
desired. */
/* silc_utf8_decode walks utf8 one sequence at a time, rebuilds the code
   point in charval and stores it into bin in the form selected by
   bin_encoding.  The pattern of the leading byte selects a sequence of
   one to six bytes, and every following byte must match 10xxxxxx.
   SILC_STRING_UTF8 as the target validates and copies the input, and
   SILC_STRING_LOCALE converts with iconv() when it is available.
   NOTE(review): silc_utf8_decoded_len() and silc_utf8_valid() call this
   function with a NULL bin and bin_size 0; the guards that skip the
   stores in that case are not among the lines shown here. */
250 SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
251 SilcStringEncoding bin_encoding,
252 unsigned char *bin, SilcUInt32 bin_size)
254 SilcUInt32 enclen = 0, i, charval, bytes;
/* NULL or zero-length input is special-cased before anything else. */
256 if (!utf8 || !utf8_len)
/* UTF-8 requested as the output: validate and copy through. */
259 if (bin_encoding == SILC_STRING_UTF8) {
260 if (!silc_utf8_valid(utf8, utf8_len) ||
263 memcpy(bin, utf8, utf8_len);
/* Locale output: convert from UTF-8 to the charset of the locale. */
267 if (bin_encoding == SILC_STRING_LOCALE) {
268 #if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
269 char *toconv, *icp, *ocp;
271 size_t inlen, outlen;
273 setlocale(LC_CTYPE, "");
274 toconv = nl_langinfo(CODESET);
275 if (toconv && strlen(toconv)) {
276 icd = iconv_open(toconv, "UTF-8");
281 if (icp && ocp && icd != (iconv_t)-1) {
282 if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
288 if (icd != (iconv_t)-1)
293 /* Fallback to 8-bit ASCII */
294 bin_encoding = SILC_STRING_ASCII;
/* Generic path.  Inside each branch i is advanced while the sequence is
   consumed and is left on its last byte, so the i++ of the loop moves on
   to the next sequence. */
297 for (i = 0; i < utf8_len; i++) {
/* One byte: 0xxxxxxx */
298 if ((utf8[i] & 0x80) == 0x00) {
299 charval = utf8[i] & 0x7f;
/* Two bytes: 110xxxxx 10xxxxxx */
301 } else if ((utf8[i] & 0xe0) == 0xc0) {
302 if (i + 1 >= utf8_len)
305 if ((utf8[i + 1] & 0xc0) != 0x80)
308 charval = (utf8[i++] & 0x1f) << 6;
309 charval |= utf8[i] & 0x3f;
/* Three bytes: 1110xxxx followed by two continuation bytes */
313 } else if ((utf8[i] & 0xf0) == 0xe0) {
314 if (i + 2 >= utf8_len)
317 if (((utf8[i + 1] & 0xc0) != 0x80) ||
318 ((utf8[i + 2] & 0xc0) != 0x80))
321 /* Surrogates not allowed (D800-DFFF) */
322 if (utf8[i] == 0xed &&
323 utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
324 utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
327 charval = (utf8[i++] & 0xf) << 12;
328 charval |= (utf8[i++] & 0x3f) << 6;
329 charval |= utf8[i] & 0x3f;
/* Four bytes: 11110xxx followed by three continuation bytes */
333 } else if ((utf8[i] & 0xf8) == 0xf0) {
334 if (i + 3 >= utf8_len)
337 if (((utf8[i + 1] & 0xc0) != 0x80) ||
338 ((utf8[i + 2] & 0xc0) != 0x80) ||
339 ((utf8[i + 3] & 0xc0) != 0x80))
342 charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
343 charval |= (utf8[i++] & 0x3f) << 12;
344 charval |= (utf8[i++] & 0x3f) << 6;
345 charval |= utf8[i] & 0x3f;
/* A value that would have fit in a shorter sequence is an overlong
   encoding; presumably the branch rejects it (its body is not shown). */
346 if (charval < 0x10000)
/* Five bytes: 111110xx followed by four continuation bytes */
349 } else if ((utf8[i] & 0xfc) == 0xf8) {
350 if (i + 4 >= utf8_len)
353 if (((utf8[i + 1] & 0xc0) != 0x80) ||
354 ((utf8[i + 2] & 0xc0) != 0x80) ||
355 ((utf8[i + 3] & 0xc0) != 0x80) ||
356 ((utf8[i + 4] & 0xc0) != 0x80))
359 charval = ((SilcUInt32)(utf8[i++] & 0x3)) << 24;
360 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
361 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
362 charval |= (utf8[i++] & 0x3f) << 6;
363 charval |= utf8[i] & 0x3f;
364 if (charval < 0x200000)
/* Six bytes: 1111110x followed by five continuation bytes */
367 } else if ((utf8[i] & 0xfe) == 0xfc) {
368 if (i + 5 >= utf8_len)
371 if (((utf8[i + 1] & 0xc0) != 0x80) ||
372 ((utf8[i + 2] & 0xc0) != 0x80) ||
373 ((utf8[i + 3] & 0xc0) != 0x80) ||
374 ((utf8[i + 4] & 0xc0) != 0x80) ||
375 ((utf8[i + 5] & 0xc0) != 0x80))
378 charval = ((SilcUInt32)(utf8[i++] & 0x1)) << 30;
379 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
380 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
381 charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
382 charval |= (utf8[i++] & 0x3f) << 6;
383 charval |= utf8[i] & 0x3f;
384 if (charval < 0x4000000)
/* Store the decoded character in the requested output encoding. */
391 switch (bin_encoding) {
/* Single-byte outputs: charval is truncated to one unsigned char. */
392 case SILC_STRING_ASCII:
393 case SILC_STRING_PRINTABLE:
394 case SILC_STRING_VISIBLE:
395 case SILC_STRING_TELETEX:
396 case SILC_STRING_NUMERICAL:
398 if (enclen + 1 > bin_size)
401 bin[enclen] = (unsigned char)charval;
405 case SILC_STRING_ASCII_ESC:
406 SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
/* 16-bit outputs take two bytes per character. */
409 case SILC_STRING_BMP:
411 if (enclen + 2 > bin_size)
413 SILC_PUT16_MSB(charval, bin + enclen);
417 case SILC_STRING_BMP_LSB:
419 if (enclen + 2 > bin_size)
421 SILC_PUT16_LSB(charval, bin + enclen);
/* 32-bit outputs take four bytes per character. */
425 case SILC_STRING_UNIVERSAL:
427 if (enclen + 4 > bin_size)
429 SILC_PUT32_MSB(charval, bin + enclen);
433 case SILC_STRING_UNIVERSAL_LSB:
435 if (enclen + 4 > bin_size)
437 SILC_PUT32_LSB(charval, bin + enclen);
441 case SILC_STRING_LDAP_DN:
446 /* Non-printable UTF-8 characters will be escaped, printable will
447 be as is. We take the bytes directly from the original data. */
/* i sits on the last byte of the sequence, so i - (bytes - 1) is its
   first byte; bytes presumably holds the sequence length (its assignment
   is not among the lines shown here). */
448 for (k = 0; k < bytes; k++) {
449 cv = utf8[(i - (bytes - 1)) + k];
451 /* If string starts with space or # escape it */
452 if (!enclen && (cv == '#' || cv == ' ')) {
454 if (enclen + 2 > bin_size)
457 bin[enclen + 1] = cv;
463 /* If string ends with space escape it */
464 if (i == utf8_len - 1 && cv == ' ') {
466 if (enclen + 2 > bin_size)
469 bin[enclen + 1] = cv;
475 /* If character is any of following then escape */
476 if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
477 cv == '>' || cv == ';') {
479 if (enclen + 2 > bin_size)
482 bin[enclen + 1] = cv;
488 /* If character is not printable escape it with hex character */
489 if (!isprint((int)cv)) {
491 if (enclen + 3 > bin_size)
/* NOTE(review): snprintf() with size 3 stores two hex digits plus a
   terminating NUL starting at bin[enclen + 1], so it touches
   bin[enclen + 3].  The test above only guarantees
   enclen + 3 <= bin_size (assuming a failed test leaves the function),
   so the NUL may land one byte past the buffer; confirm.  bin is also
   unsigned char * where snprintf() takes char *. */
494 snprintf(bin + enclen + 1, 3, "%02X", cv);
/* Printable, unescaped byte: one output byte is needed. */
501 if (enclen + 1 > bin_size)
518 /* Returns the length of UTF-8 encoded string if the `bin' of
519 encoding of `bin_encoding' is encoded with silc_utf8_encode. */
/* Implemented by calling silc_utf8_encode() with a NULL output buffer and
   an output size of 0, and returning its result unchanged.
   NOTE(review): silc_utf8_encode() is documented to return zero on error,
   so a zero result here presumably also means the input could not be
   encoded. */
521 SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
522 SilcStringEncoding bin_encoding)
524 return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
527 /* Returns the length of decoded string if the `bin' of encoding of
528 `bin_encoding' is decoded with silc_utf8_decode. */
/* Here bin is passed as the UTF-8 input argument of silc_utf8_decode()
   and bin_encoding as the encoding to decode into; the output buffer is
   NULL with size 0, and the result of the call is returned unchanged. */
530 SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
531 SilcStringEncoding bin_encoding)
533 return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
536 /* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
537 UTF-8 encoded string, FALSE if it is not UTF-8 encoded string. */
/* Validation is a trial decode: silc_utf8_decode() is called with
   encoding value 0 and no output buffer, and a non-zero result counts as
   valid.
   NOTE(review): which SilcStringEncoding the literal 0 denotes is not
   visible here.  silc_utf8_decode() special-cases NULL or zero-length
   input, so such input presumably reports FALSE; confirm. */
539 SilcBool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
541 return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;
544 /* Pretty close strcasecmp */
/* Compares the byte lengths of s1 and s2 first, then delegates to
   silc_utf8_strncasecmp() over strlen(s1) bytes.
   NOTE(review): strlen() is applied to both arguments; any NULL checks
   are not among the lines shown here.  The comparison happens after
   case folding and normalization, so strings that differ in raw byte
   length are presumably treated as unequal even if they would fold to
   the same text; confirm that this is intended. */
546 SilcBool silc_utf8_strcasecmp(const char *s1, const char *s2)
550 if (strlen(s1) != strlen(s2))
553 return silc_utf8_strncasecmp(s1, s2, strlen(s1));
556 /* Pretty close strncasecmp */
/* Runs the first n bytes of s1 and of s2 through silc_stringprep() with
   SILC_IDENTIFIERC_PREP, UTF-8 in and UTF-8 out, and then compares the
   two prepared buffers with memcmp().  The cleanup and the return are
   not among the lines shown here. */
558 SilcBool silc_utf8_strncasecmp(const char *s1, const char *s2, SilcUInt32 n)
560 unsigned char *s1u, *s2u;
561 SilcUInt32 s1u_len, s2u_len;
562 SilcStringprepStatus status;
568 /* Casefold and normalize */
569 status = silc_stringprep(s1, n, SILC_STRING_UTF8,
570 SILC_IDENTIFIERC_PREP, 0, &s1u,
571 &s1u_len, SILC_STRING_UTF8);
572 if (status != SILC_STRINGPREP_OK)
575 /* Casefold and normalize */
576 status = silc_stringprep(s2, n, SILC_STRING_UTF8,
577 SILC_IDENTIFIERC_PREP, 0, &s2u,
578 &s2u_len, SILC_STRING_UTF8);
/* NOTE(review): s1u has already been produced at this point; confirm
   that the failure path of this check releases it. */
579 if (status != SILC_STRINGPREP_OK)
/* NOTE(review): memcmp() reads n bytes from each prepared buffer, but the
   lengths reported by silc_stringprep() (s1u_len, s2u_len) are not
   consulted in the visible code.  If preparation changes the length,
   this may read past a shorter buffer or ignore a length difference;
   confirm. */
582 ret = !memcmp(s1u, s2u, n);