/*

  silcutf8.c

  Author: Pekka Riikonen

  Copyright (C) 2004 - 2007 Pekka Riikonen

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; version 2 of the License.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

*/

#include "silc.h"
#include "silcutf8.h"

/* Encodes the string `bin' of which encoding is `bin_encoding' to the
   UTF-8 encoding into the buffer `utf8' which is of size of `utf8_size'.
   Returns the length of the UTF-8 encoded string, or zero (0) on error.
   By default `bin_encoding' is ASCII, and the caller needs to know the
   encoding of the input string if it is anything else.

   If `utf8' is NULL nothing is written and only the length that would
   be produced is returned.  For SILC_STRING_LOCALE the iconv conversion
   is skipped when `utf8' is NULL and the length is computed with the
   8-bit ASCII fallback instead.

   On error the SILC errno is set: SILC_ERR_INVALID_ARGUMENT for missing
   input or unknown encoding, SILC_ERR_OVERFLOW when the input is
   truncated or the output buffer is too small, SILC_ERR_PROHIBITED_CHAR
   for characters not allowed by the PRINTABLE/VISIBLE/NUMERICAL
   encodings and SILC_ERR_NOT_SUPPORTED for SILC_STRING_ASCII_ESC. */

SilcUInt32 silc_utf8_encode(const unsigned char *bin, SilcUInt32 bin_len,
                            SilcStringEncoding bin_encoding,
                            unsigned char *utf8, SilcUInt32 utf8_size)
{
  SilcUInt32 enclen = 0, i, charval = 0;

  if (!bin || !bin_len) {
    silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
    return 0;
  }

  /* Input is already UTF-8: validate and copy as is */
  if (bin_encoding == SILC_STRING_UTF8) {
    if (!silc_utf8_valid(bin, bin_len))
      return 0;
    if (!utf8)
      return bin_len;
    if (bin_len > utf8_size)
      goto overflow;
    memcpy(utf8, bin, bin_len);
    return bin_len;
  }

  /* The SILC_STRING_LDAP_DN is already UTF-8 but it may be escaped.  We
     remove the escaping and we're done. */
  if (bin_encoding == SILC_STRING_LDAP_DN ||
      bin_encoding == SILC_STRING_UTF8_ESCAPE) {
    unsigned char cv;

    for (i = 0; i < bin_len; i++) {
      if (bin[i] == '\\') {
        if (i + 1 >= bin_len)
          goto overflow;

        /* If escaped character is any of the following no processing is
           needed, otherwise it is a hex value and we need to read it. */
        cv = bin[i + 1];
        if (cv != ',' && cv != '+' && cv != '"' && cv != '\\' && cv != '<' &&
            cv != '>' && cv != ';' && cv != ' ' && cv != '#') {
          unsigned int hexval;
          char hex[3];

          if (i + 2 >= bin_len)
            goto overflow;

          /* The input is not NUL terminated, so parse the two hex digits
             from a terminated copy to keep sscanf inside the input. */
          hex[0] = (char)bin[i + 1];
          hex[1] = (char)bin[i + 2];
          hex[2] = '\0';
          if (sscanf(hex, "%02X", &hexval) != 1) {
            silc_set_errno_posix(errno);
            return 0;
          }
          if (utf8) {
            if (enclen + 1 > utf8_size)
              goto overflow;
            utf8[enclen] = (unsigned char)hexval;
          }

          /* Skip the two hex digits; the loop increment skips the '\' */
          i += 2;
          enclen++;
          continue;
        }

        /* Skip the backslash, the escaped character is copied below */
        i++;
      }

      if (utf8) {
        if (enclen + 1 > utf8_size)
          goto overflow;
        utf8[enclen] = bin[i];
      }
      enclen++;
    }

    return enclen;
  }

  if (bin_encoding == SILC_STRING_LOCALE) {
#if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
    char *fromconv, *icp, *ocp;
    iconv_t icd;
    size_t inlen, outlen;

    setlocale(LC_CTYPE, "");
    fromconv = nl_langinfo(CODESET);
    if (fromconv && strlen(fromconv)) {
      icd = iconv_open("UTF-8", fromconv);
      icp = (char *)bin;
      ocp = (char *)utf8;
      inlen = bin_len;
      outlen = utf8_size;
      if (icp && ocp && icd != (iconv_t)-1) {
        if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
          /* `outlen' is the unused space left in the output buffer */
          utf8_size -= outlen;
          iconv_close(icd);
          return utf8_size;
        }
      }
      if (icd != (iconv_t)-1)
        iconv_close(icd);
    }
#endif

    /* Fallback to 8-bit ASCII */
    bin_encoding = SILC_STRING_ASCII;
  }

  for (i = 0; i < bin_len; i++) {
    /* Read one character value from the input according to encoding */
    switch (bin_encoding) {
    case SILC_STRING_ASCII:
    case SILC_STRING_TELETEX:
      charval = bin[i];
      break;
    case SILC_STRING_ASCII_ESC:
      SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
      silc_set_errno(SILC_ERR_NOT_SUPPORTED);
      return 0;
      break;
    case SILC_STRING_BMP:
      if (i + 1 >= bin_len)
        goto overflow;
      SILC_GET16_MSB(charval, bin + i);
      i += 1;
      break;
    case SILC_STRING_BMP_LSB:
      if (i + 1 >= bin_len)
        goto overflow;
      SILC_GET16_LSB(charval, bin + i);
      i += 1;
      break;
    case SILC_STRING_UNIVERSAL:
      if (i + 3 >= bin_len)
        goto overflow;
      SILC_GET32_MSB(charval, bin + i);
      i += 3;
      break;
    case SILC_STRING_UNIVERSAL_LSB:
      if (i + 3 >= bin_len)
        goto overflow;
      SILC_GET32_LSB(charval, bin + i);
      i += 3;
      break;
    case SILC_STRING_PRINTABLE:
    case SILC_STRING_VISIBLE:
      if (!isprint(bin[i])) {
        silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
        return 0;
      }
      charval = bin[i];
      break;
    case SILC_STRING_NUMERICAL:
      if (bin[i] != 0x20 && !isdigit(bin[i])) {
        silc_set_errno(SILC_ERR_PROHIBITED_CHAR);
        return 0;
      }
      charval = bin[i];
      break;
    default:
      silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
      return 0;
      break;
    }

    /* Emit the character value as a 1 to 6 byte UTF-8 sequence */
    if (charval < 0x80) {
      if (utf8) {
        /* Need room for one more byte; `enclen == utf8_size' is full */
        if (enclen + 1 > utf8_size)
          goto overflow;

        utf8[enclen] = (unsigned char)charval;
      }
      enclen++;
    } else if (charval < 0x800) {
      if (utf8) {
        if (enclen + 2 > utf8_size)
          goto overflow;

        utf8[enclen    ] = (unsigned char )(((charval >> 6)  & 0x1f) | 0xc0);
        utf8[enclen + 1] = (unsigned char )((charval & 0x3f) | 0x80);
      }
      enclen += 2;
    } else if (charval < 0x10000) {
      if (utf8) {
        if (enclen + 3 > utf8_size)
          goto overflow;

        utf8[enclen    ] = (unsigned char )(((charval >> 12) & 0xf)  | 0xe0);
        utf8[enclen + 1] = (unsigned char )(((charval >> 6)  & 0x3f) | 0x80);
        utf8[enclen + 2] = (unsigned char )((charval & 0x3f) | 0x80);
      }
      enclen += 3;
    } else if (charval < 0x200000) {
      if (utf8) {
        if (enclen + 4 > utf8_size)
          goto overflow;

        utf8[enclen    ] = (unsigned char )(((charval >> 18) & 0x7)  | 0xf0);
        utf8[enclen + 1] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
        utf8[enclen + 2] = (unsigned char )(((charval >> 6)  & 0x3f) | 0x80);
        utf8[enclen + 3] = (unsigned char )((charval & 0x3f) | 0x80);
      }
      enclen += 4;
    } else if (charval < 0x4000000) {
      if (utf8) {
        if (enclen + 5 > utf8_size)
          goto overflow;

        utf8[enclen    ] = (unsigned char )(((charval >> 24) & 0x3)  | 0xf8);
        utf8[enclen + 1] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
        utf8[enclen + 2] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
        utf8[enclen + 3] = (unsigned char )(((charval >> 6)  & 0x3f) | 0x80);
        utf8[enclen + 4] = (unsigned char )((charval & 0x3f) | 0x80);
      }
      enclen += 5;
    } else {
      if (utf8) {
        if (enclen + 6 > utf8_size)
          goto overflow;

        utf8[enclen    ] = (unsigned char )(((charval >> 30) & 0x1)  | 0xfc);
        utf8[enclen + 1] = (unsigned char )(((charval >> 24) & 0x3f) | 0x80);
        utf8[enclen + 2] = (unsigned char )(((charval >> 18) & 0x3f) | 0x80);
        utf8[enclen + 3] = (unsigned char )(((charval >> 12) & 0x3f) | 0x80);
        utf8[enclen + 4] = (unsigned char )(((charval >> 6)  & 0x3f) | 0x80);
        utf8[enclen + 5] = (unsigned char )((charval & 0x3f) | 0x80);
      }
      enclen += 6;
    }
  }

  return enclen;

 overflow:
  silc_set_errno(SILC_ERR_OVERFLOW);
  return 0;
}

/* Decodes UTF-8 encoded string `utf8' to string of which encoding is
   to be `bin_encoding', into the `bin' buffer of size of `bin_size'.
   Returns the length of the decoded buffer, or zero (0) on error.
   By default `bin_encoding' is ASCII, and the caller needs to know to
   which encoding the output string is to be encoded if ASCII is not
   desired.

   If `bin' is NULL nothing is written and only the length that would
   be produced is returned.  For SILC_STRING_LOCALE the iconv conversion
   is skipped when `bin' is NULL and the length is computed with the
   8-bit ASCII fallback instead.

   On error the SILC errno is set: SILC_ERR_INVALID_ARGUMENT for missing
   input or unsupported output encoding, SILC_ERR_OVERFLOW when a
   sequence is truncated or the output buffer is too small,
   SILC_ERR_BAD_CHAR_ENCODING for malformed, overlong or surrogate
   sequences and SILC_ERR_NOT_SUPPORTED for SILC_STRING_ASCII_ESC. */

SilcUInt32 silc_utf8_decode(const unsigned char *utf8, SilcUInt32 utf8_len,
                            SilcStringEncoding bin_encoding,
                            unsigned char *bin, SilcUInt32 bin_size)
{
  SilcUInt32 enclen = 0, i, charval, bytes;

  if (!utf8 || !utf8_len) {
    silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
    return 0;
  }

  /* Output is UTF-8 as well: validate and copy as is */
  if (bin_encoding == SILC_STRING_UTF8) {
    if (!silc_utf8_valid(utf8, utf8_len))
      return 0;
    /* Length query only, same as silc_utf8_encode does */
    if (!bin)
      return utf8_len;
    if (utf8_len > bin_size)
      goto overflow;
    memcpy(bin, utf8, utf8_len);
    return utf8_len;
  }

  if (bin_encoding == SILC_STRING_LOCALE) {
#if defined(HAVE_ICONV) && defined(HAVE_NL_LANGINFO) && defined(CODESET)
    char *toconv, *icp, *ocp;
    iconv_t icd;
    size_t inlen, outlen;

    setlocale(LC_CTYPE, "");
    toconv = nl_langinfo(CODESET);
    if (toconv && strlen(toconv)) {
      icd = iconv_open(toconv, "UTF-8");
      icp = (char *)utf8;
      ocp = (char *)bin;
      inlen = utf8_len;
      outlen = bin_size;
      if (icp && ocp && icd != (iconv_t)-1) {
        if (iconv(icd, &icp, &inlen, &ocp, &outlen) != -1) {
          /* `outlen' is the unused space left in the output buffer */
          bin_size -= outlen;
          iconv_close(icd);
          return bin_size;
        }
      }
      if (icd != (iconv_t)-1)
        iconv_close(icd);
    }
#endif

    /* Fallback to 8-bit ASCII */
    bin_encoding = SILC_STRING_ASCII;
  }

  for (i = 0; i < utf8_len; i++) {
    /* Decode one UTF-8 sequence into `charval'.  After each branch `i'
       points to the last byte of the sequence and `bytes' is the length
       of the sequence.  Overlong encodings are rejected. */
    if ((utf8[i] & 0x80) == 0x00) {
      charval = utf8[i] & 0x7f;
      bytes = 1;
    } else if ((utf8[i] & 0xe0) == 0xc0) {
      if (i + 1 >= utf8_len)
        goto overflow;

      if ((utf8[i + 1] & 0xc0) != 0x80)
        goto bad_char;

      charval = (utf8[i++] & 0x1f) << 6;
      charval |= utf8[i] & 0x3f;
      if (charval < 0x80)
        goto bad_char;
      bytes = 2;
    } else if ((utf8[i] & 0xf0) == 0xe0) {
      if (i + 2 >= utf8_len)
        goto overflow;

      if (((utf8[i + 1] & 0xc0) != 0x80) ||
          ((utf8[i + 2] & 0xc0) != 0x80))
        goto bad_char;

      /* Surrogates not allowed (D800-DFFF) */
      if (utf8[i] == 0xed &&
          utf8[i + 1] >= 0xa0 && utf8[i + 1] <= 0xbf &&
          utf8[i + 2] >= 0x80 && utf8[i + 2] <= 0xbf)
        goto bad_char;

      charval = (utf8[i++]  & 0xf)  << 12;
      charval |= (utf8[i++] & 0x3f) << 6;
      charval |= utf8[i] & 0x3f;
      if (charval < 0x800)
        goto bad_char;
      bytes = 3;
    } else if ((utf8[i] & 0xf8) == 0xf0) {
      if (i + 3 >= utf8_len)
        goto overflow;

      if (((utf8[i + 1] & 0xc0) != 0x80) ||
          ((utf8[i + 2] & 0xc0) != 0x80) ||
          ((utf8[i + 3] & 0xc0) != 0x80))
        goto bad_char;

      charval = ((SilcUInt32)(utf8[i++] & 0x7)) << 18;
      charval |= (utf8[i++] & 0x3f) << 12;
      charval |= (utf8[i++] & 0x3f) << 6;
      charval |= utf8[i] & 0x3f;
      if (charval < 0x10000)
        goto bad_char;
      bytes = 4;
    } else if ((utf8[i] & 0xfc) == 0xf8) {
      if (i + 4 >= utf8_len)
        goto overflow;

      if (((utf8[i + 1] & 0xc0) != 0x80) ||
          ((utf8[i + 2] & 0xc0) != 0x80) ||
          ((utf8[i + 3] & 0xc0) != 0x80) ||
          ((utf8[i + 4] & 0xc0) != 0x80))
        goto bad_char;

      charval = ((SilcUInt32)(utf8[i++] & 0x3))   << 24;
      charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
      charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
      charval |= (utf8[i++] & 0x3f) << 6;
      charval |= utf8[i] & 0x3f;
      if (charval < 0x200000)
        goto bad_char;
      bytes = 5;
    } else if ((utf8[i] & 0xfe) == 0xfc) {
      if (i + 5 >= utf8_len)
        goto overflow;

      if (((utf8[i + 1] & 0xc0) != 0x80) ||
          ((utf8[i + 2] & 0xc0) != 0x80) ||
          ((utf8[i + 3] & 0xc0) != 0x80) ||
          ((utf8[i + 4] & 0xc0) != 0x80) ||
          ((utf8[i + 5] & 0xc0) != 0x80))
        goto bad_char;

      charval = ((SilcUInt32)(utf8[i++] & 0x1))   << 30;
      charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 24;
      charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 18;
      charval |= ((SilcUInt32)(utf8[i++] & 0x3f)) << 12;
      charval |= (utf8[i++] & 0x3f) << 6;
      charval |= utf8[i] & 0x3f;
      if (charval < 0x4000000)
        goto bad_char;
      bytes = 6;
    } else {
      goto bad_char;
    }

    /* Write the character value in the requested output encoding */
    switch (bin_encoding) {
    case SILC_STRING_ASCII:
    case SILC_STRING_PRINTABLE:
    case SILC_STRING_VISIBLE:
    case SILC_STRING_TELETEX:
    case SILC_STRING_NUMERICAL:
      if (bin) {
        if (enclen + 1 > bin_size)
          goto overflow;

        /* Only the low 8 bits of the character value are stored */
        bin[enclen] = (unsigned char)charval;
      }
      enclen++;
      break;
    case SILC_STRING_ASCII_ESC:
      SILC_NOT_IMPLEMENTED("SILC_STRING_ASCII_ESC");
      silc_set_errno(SILC_ERR_NOT_SUPPORTED);
      return 0;
      break;
    case SILC_STRING_BMP:
      if (bin) {
        if (enclen + 2 > bin_size)
          goto overflow;
        SILC_PUT16_MSB(charval, bin + enclen);
      }
      enclen += 2;
      break;
    case SILC_STRING_BMP_LSB:
      if (bin) {
        if (enclen + 2 > bin_size)
          goto overflow;
        SILC_PUT16_LSB(charval, bin + enclen);
      }
      enclen += 2;
      break;
    case SILC_STRING_UNIVERSAL:
      if (bin) {
        if (enclen + 4 > bin_size)
          goto overflow;
        SILC_PUT32_MSB(charval, bin + enclen);
      }
      enclen += 4;
      break;
    case SILC_STRING_UNIVERSAL_LSB:
      if (bin) {
        if (enclen + 4 > bin_size)
          goto overflow;
        SILC_PUT32_LSB(charval, bin + enclen);
      }
      enclen += 4;
      break;
    case SILC_STRING_LDAP_DN:
      {
        static const char hexdigits[] = "0123456789ABCDEF";
        SilcUInt32 k;
        unsigned char cv;

        /* Non-printable UTF-8 characters will be escaped, printable will
           be as is.  We take the bytes directly from the original data. */
        for (k = 0; k < bytes; k++) {
          /* `i' is at the last byte of the sequence, step back to its
             first byte and walk forward */
          cv = utf8[(i - (bytes - 1)) + k];

          /* If string starts with space or # escape it */
          if (!enclen && (cv == '#' || cv == ' ')) {
            if (bin) {
              if (enclen + 2 > bin_size)
                goto overflow;
              bin[enclen] = '\\';
              bin[enclen + 1] = cv;
            }
            enclen += 2;
            continue;
          }

          /* If string ends with space escape it */
          if (i == utf8_len - 1 && cv == ' ') {
            if (bin) {
              if (enclen + 2 > bin_size)
                goto overflow;
              bin[enclen] = '\\';
              bin[enclen + 1] = cv;
            }
            enclen += 2;
            continue;
          }

          /* If character is any of following then escape */
          if (cv == ',' || cv == '+' || cv == '"' || cv == '\\' || cv == '<' ||
              cv == '>' || cv == ';') {
            if (bin) {
              if (enclen + 2 > bin_size)
                goto overflow;
              bin[enclen] = '\\';
              bin[enclen + 1] = cv;
            }
            enclen += 2;
            continue;
          }

          /* If character is not printable escape it with hex character */
          if (!isprint((int)cv)) {
            if (bin) {
              if (enclen + 3 > bin_size)
                goto overflow;

              /* Exactly three bytes are written.  A formatted print
                 would append a terminating NUL as a fourth byte, which
                 the check above does not reserve room for. */
              bin[enclen] = '\\';
              bin[enclen + 1] = hexdigits[(cv >> 4) & 0xf];
              bin[enclen + 2] = hexdigits[cv & 0xf];
            }
            enclen += 3;
            continue;
          }

          if (bin) {
            if (enclen + 1 > bin_size)
              goto overflow;
            bin[enclen] = cv;
          }
          enclen++;
        }
      }
      break;
    default:
      silc_set_errno(SILC_ERR_INVALID_ARGUMENT);
      return 0;
      break;
    }
  }

  return enclen;

 overflow:
  silc_set_errno(SILC_ERR_OVERFLOW);
  return 0;

 bad_char:
  silc_set_errno(SILC_ERR_BAD_CHAR_ENCODING);
  return 0;
}

/* UTF-8 to wide characters.  Decodes `utf8' into 16-bit characters in
   `utf8_wide' which has room for `utf8_wide_size' characters.  The
   output buffer is zeroed before the conversion.  Returns zero (0) on
   error, otherwise the number of characters written plus one.
   NOTE(review): the extra one presumably accounts for a terminating
   zero character; confirm against callers before relying on it. */

SilcUInt32 silc_utf8_c2w(const unsigned char *utf8, SilcUInt32 utf8_len,
                         SilcUInt16 *utf8_wide, SilcUInt32 utf8_wide_size)
{
  unsigned char *tmp;
  SilcUInt32 tmp_len, i, k;

  /* Length of the string as big-endian 16-bit characters, in bytes */
  tmp_len = silc_utf8_decoded_len(utf8, utf8_len, SILC_STRING_BMP);
  if (!tmp_len)
    return 0;

  if (utf8_wide_size < tmp_len / 2) {
    silc_set_errno(SILC_ERR_OVERFLOW);
    return 0;
  }

  memset(utf8_wide, 0, utf8_wide_size * 2);

  tmp = silc_malloc(tmp_len);
  if (!tmp)
    return 0;

  /* Do not read `tmp' if the conversion unexpectedly fails */
  if (!silc_utf8_decode(utf8, utf8_len, SILC_STRING_BMP, tmp, tmp_len)) {
    silc_free(tmp);
    return 0;
  }

  /* Convert the byte pairs into host order 16-bit characters */
  for (i = 0, k = 0; i < tmp_len; i += 2, k++)
    SILC_GET16_MSB(utf8_wide[k], tmp + i);

  silc_free(tmp);
  return k + 1;
}

/* Wide characters to UTF-8.  Encodes `wide_str_len' 16-bit characters
   from `wide_str' as UTF-8 into `utf8' of size `utf8_size'.  The output
   buffer is zeroed before the conversion.  Returns the length of the
   UTF-8 string, or zero (0) on error. */

SilcUInt32 silc_utf8_w2c(const SilcUInt16 *wide_str,
                         SilcUInt32 wide_str_len,
                         unsigned char *utf8, SilcUInt32 utf8_size)
{
  unsigned char *tmp;
  SilcUInt32 tmp_len, i, k;

  if (utf8_size < wide_str_len * 2) {
    silc_set_errno(SILC_ERR_OVERFLOW);
    return 0;
  }

  memset(utf8, 0, utf8_size);

  tmp = silc_malloc(wide_str_len * 2);
  if (!tmp)
    return 0;

  /* Serialize every wide character as two big-endian bytes.  All
     `wide_str_len' characters must be written, as the whole of `tmp'
     is given to the encoder below. */
  for (i = 0, k = 0; k < wide_str_len; i += 2, k++)
    SILC_PUT16_MSB(wide_str[k], tmp + i);

  tmp_len = silc_utf8_encode(tmp, wide_str_len * 2, SILC_STRING_BMP,
                             utf8, utf8_size);

  silc_free(tmp);
  return tmp_len;
}

/* Returns the length of UTF-8 encoded string if the `bin' of
   encoding of `bin_encoding' is encoded with silc_utf8_encode.
   Returns zero (0) if the string cannot be encoded. */

SilcUInt32 silc_utf8_encoded_len(const unsigned char *bin, SilcUInt32 bin_len,
                                 SilcStringEncoding bin_encoding)
{
  return silc_utf8_encode(bin, bin_len, bin_encoding, NULL, 0);
}

/* Returns the length of decoded string if the `bin' of encoding of
   `bin_encoding' is decoded with silc_utf8_decode.  Returns zero (0)
   if the string cannot be decoded. */

SilcUInt32 silc_utf8_decoded_len(const unsigned char *bin, SilcUInt32 bin_len,
                                 SilcStringEncoding bin_encoding)
{
  return silc_utf8_decode(bin, bin_len, bin_encoding, NULL, 0);
}

/* Returns TRUE if the `utf8' string of length of `utf8_len' is valid
   UTF-8 encoded string, FALSE if it is not UTF-8 encoded string.  The
   check is done by running the decoder without an output buffer. */

SilcBool silc_utf8_valid(const unsigned char *utf8, SilcUInt32 utf8_len)
{
  /* NOTE(review): encoding value 0 is presumably the default (ASCII)
     encoding; it must not be SILC_STRING_UTF8 as that path calls back
     into this function. */
  return silc_utf8_decode(utf8, utf8_len, 0, NULL, 0) != 0;
}

/* Pretty close strcasecmp.  Returns TRUE if the NUL terminated UTF-8
   strings `s1' and `s2' match after preparation, FALSE otherwise.
   Returns FALSE if only one of the strings is NULL. */

SilcBool silc_utf8_strcasecmp(const char *s1, const char *s2)
{
  if (s1 == s2)
    return TRUE;
  if (!s1 || !s2)
    return FALSE;
  if (strlen(s1) != strlen(s2))
    return FALSE;

  return silc_utf8_strncasecmp(s1, s2, strlen(s1));
}

/* Pretty close strncasecmp.  Prepares the first `n' bytes of both
   strings with silc_stringprep using the SILC_IDENTIFIERC_PREP profile
   and compares the results.  Returns TRUE if they match, FALSE if they
   do not or if either string could not be prepared. */

SilcBool silc_utf8_strncasecmp(const char *s1, const char *s2, SilcUInt32 n)
{
  unsigned char *s1u, *s2u;
  SilcUInt32 s1u_len, s2u_len;
  SilcStringprepStatus status;
  SilcBool ret;

  if (s1 == s2)
    return TRUE;

  /* Casefold and normalize */
  status = silc_stringprep(s1, n, SILC_STRING_UTF8,
                           SILC_IDENTIFIERC_PREP, 0, &s1u,
                           &s1u_len, SILC_STRING_UTF8);
  if (status != SILC_STRINGPREP_OK)
    return FALSE;

  /* Casefold and normalize */
  status = silc_stringprep(s2, n, SILC_STRING_UTF8,
                           SILC_IDENTIFIERC_PREP, 0, &s2u,
                           &s2u_len, SILC_STRING_UTF8);
  if (status != SILC_STRINGPREP_OK) {
    /* First prepared string is no longer needed */
    silc_free(s1u);
    return FALSE;
  }

  /* Compare the prepared strings by their own lengths.  Preparation may
     change the length, so comparing `n' bytes could read past the end of
     either buffer.  Assumes silc_stringprep returns the output length in
     its length argument - TODO confirm. */
  ret = (s1u_len == s2u_len && !memcmp(s1u, s2u, s1u_len));

  silc_free(s1u);
  silc_free(s2u);

  return ret;
}