1 /* nfkc.c Unicode normalization utilities.
2 * Copyright (C) 2002, 2003, 2004, 2005 Simon Josefsson
4 * This file is part of GNU Libidn.
6 * GNU Libidn is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * GNU Libidn is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with GNU Libidn; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 #include "stringprep.h"
27 /* This file contains functions from GLIB, including gutf8.c and
28 * gunidecomp.c, all licensed under LGPL and copyright held by:
30 * Copyright (C) 1999, 2000 Tom Tromey
31 * Copyright 2000 Red Hat, Inc.
34 /* Hacks to make syncing with GLIB code easier. */
/* Map GLib scalar type names onto plain C / <stdint.h> types so the code
 * copied from GLib builds without the GLib headers. */
37 #define guchar unsigned char
40 #define guint unsigned int
41 #define gushort unsigned short
42 #define gint16 int16_t
43 #define guint16 uint16_t
44 #define gunichar uint32_t
46 #define gssize ssize_t
/* Allocation maps directly onto malloc(), so g_malloc()/g_new() can yield
 * NULL in this file and callers have to cope with that. */
47 #define g_malloc malloc
/* Error reporting is compiled out: every g_set_error() call is a no-op and
 * its arguments are never evaluated. */
50 #define g_set_error(a,b,c,d) ((void) 0)
/* Typed allocation of n_structs elements.  The size multiplication is not
 * checked for overflow. */
51 #define g_new(struct_type, n_structs) \
52 ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* Compiler-dependent wrappers that turn a brace block into a single
 * statement; only used by g_return_val_if_fail below. */
53 # if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
54 # define G_STMT_START (void)(
57 # if (defined (sun) || defined (__sun__))
58 # define G_STMT_START if (1)
59 # define G_STMT_END else (void)0
61 # define G_STMT_START do
62 # define G_STMT_END while (0)
/* Precondition checks are disabled: the macro expands to an empty statement,
 * so expr is never evaluated and val is never returned. */
65 #define g_return_val_if_fail(expr,val) G_STMT_START{ (void)0; }G_STMT_END
/* Element count of a true array (not valid for pointers). */
66 #define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
70 /* Code from GLIB gunicode.h starts here. */
/* Each Unicode form name is an alias of a GLib-style enumerator:
 * NFD = DEFAULT, NFC = DEFAULT_COMPOSE, NFKD = ALL, NFKC = ALL_COMPOSE. */
75 G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
76 G_NORMALIZE_DEFAULT_COMPOSE,
77 G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
79 G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
80 G_NORMALIZE_ALL_COMPOSE,
81 G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
85 /* Code from GLIB gutf8.c starts here. */
/* UTF8_COMPUTE classifies the lead byte Char by its high-bit pattern; the
 * visible branches test 110xxxxx, 1110xxxx, 11110xxx, 111110xx and 1111110x
 * in turn.
 * NOTE(review): the branch bodies that assign Mask and Len are not visible
 * in this excerpt; g_utf8_get_char() consumes the values they produce. */
87 #define UTF8_COMPUTE(Char, Mask, Len) \
93 else if ((Char & 0xe0) == 0xc0) \
98 else if ((Char & 0xf0) == 0xe0) \
103 else if ((Char & 0xf8) == 0xf0) \
108 else if ((Char & 0xfc) == 0xf8) \
113 else if ((Char & 0xfe) == 0xfc) \
/* Number of bytes (1..6) needed to encode Char, using the extended 6-byte
 * UTF-8 scheme.  Char is evaluated several times, so the argument must not
 * have side effects. */
121 #define UTF8_LENGTH(Char) \
122 ((Char) < 0x80 ? 1 : \
123 ((Char) < 0x800 ? 2 : \
124 ((Char) < 0x10000 ? 3 : \
125 ((Char) < 0x200000 ? 4 : \
126 ((Char) < 0x4000000 ? 5 : 6)))))
/* UTF8_GET decodes one character from Chars: Result starts as the lead byte
 * masked with Mask, then each of the following Len - 1 bytes is tested for
 * the 10xxxxxx continuation pattern and its low six bits are OR-ed in.
 * NOTE(review): the shift of Result and the action taken when the
 * continuation test fails are on lines not visible in this excerpt. */
129 #define UTF8_GET(Result, Chars, Count, Mask, Len) \
130 (Result) = (Chars)[0] & (Mask); \
131 for ((Count) = 1; (Count) < (Len); ++(Count)) \
133 if (((Chars)[(Count)] & 0xc0) != 0x80) \
139 (Result) |= ((Chars)[(Count)] & 0x3f); \
/* True when Char is below 0x110000, is not a surrogate (0xD800..0xDFFF), is
 * not in 0xFDD0..0xFDEF, and its low 16 bits are neither 0xFFFE nor 0xFFFF.
 * Char is evaluated several times. */
142 #define UNICODE_VALID(Char) \
143 ((Char) < 0x110000 && \
144 (((Char) & 0xFFFFF800) != 0xD800) && \
145 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
146 ((Char) & 0xFFFE) != 0xFFFE)
/* Table indexed by the first byte of a UTF-8 sequence.  g_utf8_next_char()
 * adds the looked-up value to the pointer, so each entry is the number of
 * bytes to skip to reach the next character. */
149 static const gchar utf8_skip_data[256] = {
150 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
156 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
158 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
160 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
162 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
164 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
/* Pointer alias for the skip table, kept under the GLib name. */
168 static const gchar *const g_utf8_skip = utf8_skip_data;
/* Advance p by the skip-table entry for the byte it points at.  The result
 * is cast to plain char *, so constness of p is dropped; p is evaluated
 * twice.  No bounds or validity check is made on the bytes skipped. */
170 #define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
174 * @p: pointer to the start of a UTF-8 encoded string.
175 * @max: the maximum number of bytes to examine. If @max
176 * is less than 0, then the string is assumed to be
177 * nul-terminated. If @max is 0, @p will not be examined and
180 * Returns the length of the string in characters.
182 * Return value: the length of the string in characters
/* Counts the characters in p; max < 0 means "until the terminating nul",
 * otherwise at most max bytes are examined (see the doc comment above).
 * Note that g_return_val_if_fail is a no-op in this file, so a NULL p is
 * not rejected by the check below. */
185 g_utf8_strlen (const gchar * p, gssize max)
188 const gchar *start = p;
189 g_return_val_if_fail (p != NULL || max == 0, 0);
195 p = g_utf8_next_char (p);
204 p = g_utf8_next_char (p);
206 while (p - start < max && *p)
209 p = g_utf8_next_char (p);
/* The final test must be <=, not ==.  When the terminating nul is reached
 * before max bytes have been consumed, p - start is smaller than max and
 * the last character is complete and has to be counted.  Only a character
 * that extends past max (p - start > max) is partial and is left out. */
212 /* only do the last len increment if we got a complete
213 * char (don't count partial chars)
215 if (p - start <= max)
224 * @p: a pointer to Unicode character encoded as UTF-8
226 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
227 * If @p does not point to a valid UTF-8 encoded character, results are
228 * undefined. If you are not sure that the bytes are complete
229 * valid Unicode characters, you should use g_utf8_get_char_validated()
232 * Return value: the resulting character
/* Decodes the single UTF-8 character at p using the UTF8_COMPUTE and
 * UTF8_GET macros. */
235 g_utf8_get_char (const gchar * p)
237 int i, mask = 0, len;
239 unsigned char c = (unsigned char) *p;
/* Derive the payload mask and the sequence length from the lead byte. */
241 UTF8_COMPUTE (c, mask, len);
/* (gunichar) -1 is the failure value.
 * NOTE(review): the condition guarding this return is not visible here. */
243 return (gunichar) - 1;
/* Assemble the code point from the lead byte and its continuation bytes. */
244 UTF8_GET (result, p, i, mask, len);
251 * @c: a ISO10646 character code
252 * @outbuf: output buffer, must have at least 6 bytes of space.
253 * If %NULL, the length will be computed and returned
254 * and nothing will be written to @outbuf.
256 * Converts a single character to UTF-8.
258 * Return value: number of bytes written
/* Encodes c as UTF-8 into outbuf (see the doc comment above for the NULL
 * outbuf convention). */
261 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
/* These thresholds are the same 3-, 4- and 5-byte boundaries used by
 * UTF8_LENGTH.  NOTE(review): the assignments made in each branch are not
 * visible in this excerpt. */
277 else if (c < 0x10000)
282 else if (c < 0x200000)
287 else if (c < 0x4000000)
/* Fill the continuation bytes from last to first; each carries the low six
 * bits of c tagged with the 10xxxxxx marker.
 * NOTE(review): the shift of c between iterations is not visible here. */
300 for (i = len - 1; i > 0; --i)
302 outbuf[i] = (c & 0x3f) | 0x80;
/* Lead byte: what is left of c combined with the length marker `first`. */
305 outbuf[0] = c | first;
312 * g_utf8_to_ucs4_fast:
313 * @str: a UTF-8 encoded string
314 * @len: the maximum length of @str to use. If @len < 0, then
315 * the string is nul-terminated.
316 * @items_written: location to store the number of characters in the
319 * Convert a string from UTF-8 to a 32-bit fixed width
320 * representation as UCS-4, assuming valid UTF-8 input.
321 * This function is roughly twice as fast as g_utf8_to_ucs4()
322 * but does no error checking on the input.
324 * Return value: a pointer to a newly allocated UCS-4 string.
325 * This value must be freed with g_free().
/* Two-pass conversion: first count the characters, then allocate and
 * decode.  Input is assumed to be valid UTF-8; nothing here validates it. */
328 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
/* No-op in this build: g_return_val_if_fail is stubbed out in this file, so
 * a NULL str is not rejected here. */
335 g_return_val_if_fail (str != NULL, NULL);
343 p = g_utf8_next_char (p);
/* Bounded count: stop at str + len or at a nul byte, whichever is first. */
349 while (p < str + len && *p)
351 p = g_utf8_next_char (p);
/* n_chars + 1 elements, presumably leaving room for a terminating 0.
 * g_new is malloc-based here, so result may be NULL.
 * NOTE(review): no NULL check is visible before the fill loop below. */
356 result = g_new (gunichar, n_chars + 1);
361 for (i = 0; i < n_chars; i++)
/* Start from the raw lead byte, read as unsigned. */
363 gunichar wc = ((unsigned char *) p)[0];
/* Merge the low six bits of every continuation byte. */
398 for (j = 1; j < charlen; j++)
401 wc |= ((unsigned char *) p)[j] & 0x3f;
418 * @str: a UCS-4 encoded string
419 * @len: the maximum length of @str to use. If @len < 0, then
420 * the string is terminated with a 0 character.
421 * @items_read: location to store number of characters read, or %NULL.
422 * @items_written: location to store number of bytes written or %NULL.
423 * The value here stored does not include the trailing 0
425 * @error: location to store the error occurring, or %NULL to ignore
426 * errors. Any of the errors in #GConvertError other than
427 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
429 * Convert a string from a 32-bit fixed width representation as UCS-4
430 * to UTF-8. The result will be terminated with a 0 byte.
432 * Return value: a pointer to a newly allocated UTF-8 string.
433 * This value must be freed with g_free(). If an
434 * error occurs, %NULL will be returned and
/* Two-pass conversion: measure the encoded length, allocate, then encode. */
438 g_ucs4_to_utf8 (const gunichar * str,
440 glong * items_read, glong * items_written, GError ** error)
443 gchar *result = NULL;
/* Pass 1: sum the encoded size.  A negative len removes the upper bound on
 * i; NOTE(review): the terminator test that ends the loop in that case is
 * not visible in this excerpt. */
448 for (i = 0; len < 0 || i < len; i++)
/* Values with the top bit set are rejected as out of range. */
453 if (str[i] >= 0x80000000)
/* g_set_error is a no-op macro in this file, so `error` is never written. */
458 g_set_error (error, G_CONVERT_ERROR,
459 G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
460 _("Character out of range for UTF-8"));
464 result_length += UTF8_LENGTH (str[i]);
/* One extra byte beyond the encoded length (the doc comment promises a
 * terminating 0 byte).  g_malloc is plain malloc here and may return NULL;
 * NOTE(review): no NULL check is visible before the encode loop. */
467 result = g_malloc (result_length + 1);
/* Pass 2: encode characters until the measured length has been written. */
473 while (p < result + result_length)
474 p += g_unichar_to_utf8 (str[i++], p);
/* Byte count written, not counting the terminator. */
479 *items_written = p - result;
488 /* Code from GLIB gunidecomp.c starts here. */
490 #include "gunidecomp.h"
491 #include "gunicomp.h"
/* Two-level combining-class lookup.  A page-table entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); a smaller entry is a row index
 * into cclass_data, which is then indexed by the low byte of the character. */
493 #define CC_PART1(Page, Char) \
494 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
495 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
496 : (cclass_data[combining_class_table_part1[Page]][Char]))
/* Same scheme as CC_PART1, using the second page table. */
498 #define CC_PART2(Page, Char) \
499 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
500 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
501 : (cclass_data[combining_class_table_part2[Page]][Char]))
/* Characters up to G_UNICODE_LAST_CHAR_PART1 use the first table; characters
 * in 0xe0000..G_UNICODE_LAST_CHAR use the second table, rebased to 0xe0000.
 * NOTE(review): the value for all other characters is on a line not visible
 * in this excerpt.  Char is evaluated several times. */
503 #define COMBINING_CLASS(Char) \
504 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
505 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
506 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
507 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
510 /* constants for hangul syllable [de]composition */
/* NCount: vowel/trailing-consonant combinations per leading consonant.
 * SCount: total number of syllables covered (LCount leading consonants). */
518 #define NCount (VCount * TCount)
519 #define SCount (LCount * NCount)
522 * g_unicode_canonical_ordering:
523 * @string: a UCS-4 encoded string.
524 * @len: the maximum length of @string to use.
526 * Computes the canonical ordering of a string in-place.
527 * This rearranges decomposed characters in the string
528 * according to their combining classes. See the Unicode
529 * manual for more information.
/* Reorders string in place so that adjacent non-starters appear in
 * non-decreasing combining-class order. */
532 g_unicode_canonical_ordering (gunichar * string, gsize len)
/* NOTE(review): string[0] is read unconditionally and `len - 1` is computed
 * on an unsigned gsize, so len == 0 would read out of bounds and wrap the
 * loop bound.  Callers must pass len >= 1 unless a guard exists on a line
 * not visible in this excerpt. */
541 last = COMBINING_CLASS (string[0]);
542 for (i = 0; i < len - 1; ++i)
544 int next = COMBINING_CLASS (string[i + 1]);
/* Out of order: a non-starter (class != 0) follows a character whose
 * combining class is higher. */
545 if (next != 0 && last > next)
548 /* Percolate item leftward through string. */
549 for (j = i + 1; j > 0; --j)
/* Stop shifting once the left neighbour's class is not greater. */
552 if (COMBINING_CLASS (string[j - 1]) <= next)
555 string[j] = string[j - 1];
559 /* We're re-entering the loop looking at the old
568 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
569 * r should be null or have sufficient space. Calling with r == NULL will
570 * only calculate the result_len; however, a buffer with space for three
571 * characters will always be big enough. */
573 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
/* Offset of s from the first syllable, held in a signed int so that values
 * below SBase show up as negative. */
575 gint SIndex = s - SBase;
577 /* not a hangul syllable */
578 if (SIndex < 0 || SIndex >= SCount)
/* Split the syllable index arithmetically: NCount syllables per leading
 * consonant, TCount per (leading consonant, vowel) pair. */
586 gunichar L = LBase + SIndex / NCount;
587 gunichar V = VBase + (SIndex % NCount) / TCount;
588 gunichar T = TBase + SIndex % TCount;
607 /* returns a pointer to a null-terminated UTF-8 string */
609 find_decomposition (gunichar ch, gboolean compat)
612 int end = G_N_ELEMENTS (decomp_table);
/* Quick reject: only search when ch lies between the first and last table
 * keys.  The search below halves [start, end), which presumes decomp_table
 * is sorted by ch. */
614 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
618 int half = (start + end) / 2;
619 if (ch == decomp_table[half].ch)
/* Prefer the compatibility mapping and fall back to the canonical one when
 * no compatibility mapping is recorded.  NOTE(review): presumably this is
 * the `compat` branch; the selecting condition is not visible here. */
625 offset = decomp_table[half].compat_offset;
626 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
627 offset = decomp_table[half].canon_offset;
/* Canonical mapping only. */
631 offset = decomp_table[half].canon_offset;
632 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
/* The offset indexes the shared string pool of expansions. */
636 return &(decomp_expansion_string[offset]);
/* half == start means the interval can shrink no further. */
638 else if (half == start)
640 else if (ch > decomp_table[half].ch)
650 /* L,V => LV and LV,T => LVT */
/* Tries to compose the pair (a, b) arithmetically and stores the composed
 * syllable in *result on success.
 *
 * For the LV,T case the trailing-consonant index must satisfy
 * 0 < TIndex < TCount.  TIndex == 0 (b == TBase) would store `a` unchanged
 * while still reporting a successful composition, so the caller would drop
 * b from the text.  TIndex == TCount would store a + TCount, which is the
 * next LV syllable rather than an LVT form of a. */
652 combine_hangul (gunichar a, gunichar b, gunichar * result)
654 gint LIndex = a - LBase;
655 gint SIndex = a - SBase;
657 gint VIndex = b - VBase;
658 gint TIndex = b - TBase;
/* a is a leading consonant and b a vowel: build the LV syllable. */
660 if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
662 *result = SBase + (LIndex * VCount + VIndex) * TCount;
/* a is an LV syllable (no trailing consonant yet) and b a real trailing
 * consonant: build the LVT syllable. */
665 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
666 && 0 < TIndex && TIndex < TCount)
668 *result = a + TIndex;
/* Two-level lookup of a character's composition index.  A page entry at or
 * above G_UNICODE_MAX_TABLE_INDEX encodes one value for the whole page;
 * a smaller entry selects a row of compose_data indexed by the low byte. */
675 #define CI(Page, Char) \
676 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
677 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
678 : (compose_data[compose_table[Page]][Char]))
/* Characters whose page lies beyond COMPOSE_TABLE_LAST get index 0. */
680 #define COMPOSE_INDEX(Char) \
681 ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
/* Tries to compose the pair (a, b) into a single character stored in
 * *result.  Composition indices fall into consecutive ranges
 * (FIRST, FIRST_SINGLE, SECOND, SECOND_SINGLE), each handled below. */
684 combine (gunichar a, gunichar b, gunichar * result)
686 gushort index_a, index_b;
/* Hangul is composed arithmetically, before any table lookup. */
688 if (combine_hangul (a, b, result))
691 index_a = COMPOSE_INDEX (a);
/* a is in the FIRST_SINGLE range: it has exactly one partner, stored in
 * column 0, with the composed character in column 1. */
693 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
695 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
698 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
705 index_b = COMPOSE_INDEX (b);
/* b is in the SECOND_SINGLE range: same layout, keyed on b. */
707 if (index_b >= COMPOSE_SECOND_SINGLE_START)
710 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
713 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
/* General case: both indices fall in the ranges covered by the
 * two-dimensional compose_array. */
720 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
721 && index_b >= COMPOSE_SECOND_START
722 && index_b < COMPOSE_SECOND_SINGLE_START)
725 compose_array[index_a - COMPOSE_FIRST_START][index_b -
726 COMPOSE_SECOND_START];
/* Normalizes UTF-8 input into a newly allocated gunichar buffer in three
 * stages: count, decompose with canonical reordering, and optional
 * composition. */
739 _g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
/* The K forms use compatibility decompositions; the C forms compose. */
745 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
746 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
/* Pass 1: compute how many code points the decomposed text needs. */
753 while ((max_len < 0 || p < str + max_len) && *p)
756 gunichar wc = g_utf8_get_char (p);
/* Pre-filter only: decompose_hangul() does its own SIndex range check. */
758 if (wc >= 0xac00 && wc <= 0xd7af)
/* NULL output buffer: only the resulting length is requested. */
761 decompose_hangul (wc, NULL, &result_len);
766 decomp = find_decomposition (wc, do_compat);
769 n_wc += g_utf8_strlen (decomp, -1);
774 p = g_utf8_next_char (p);
/* g_new is malloc-based in this file, so wc_buffer may be NULL.
 * NOTE(review): no NULL check is visible before the buffer is written. */
777 wc_buffer = g_new (gunichar, n_wc + 1);
/* Pass 2: write the decomposition of every input character. */
784 while ((max_len < 0 || p < str + max_len) && *p)
786 gunichar wc = g_utf8_get_char (p);
/* Remember where this character's output begins. */
789 gsize old_n_wc = n_wc;
791 if (wc >= 0xac00 && wc <= 0xd7af)
794 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
799 decomp = find_decomposition (wc, do_compat);
/* Copy the stored UTF-8 expansion as individual code points. */
804 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
805 wc_buffer[n_wc++] = g_utf8_get_char (pd);
808 wc_buffer[n_wc++] = wc;
/* Combining class of the first code point emitted for this character. */
813 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
817 g_unicode_canonical_ordering (wc_buffer + last_start,
819 last_start = old_n_wc;
823 p = g_utf8_next_char (p);
828 g_unicode_canonical_ordering (wc_buffer + last_start,
835 /* All decomposed and reordered */
837 if (do_compose && n_wc > 0)
843 for (i = 0; i < n_wc; i++)
845 int cc = COMBINING_CLASS (wc_buffer[i]);
848 (last_cc == 0 || last_cc != cc) &&
849 combine (wc_buffer[last_start], wc_buffer[i],
850 &wc_buffer[last_start]))
/* The character at i was absorbed: shift the tail left by one slot. */
852 for (j = i + 1; j < n_wc; j++)
853 wc_buffer[j - 1] = wc_buffer[j];
860 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
879 * @str: a UTF-8 encoded string.
880 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
881 * @mode: the type of normalization to perform.
883 * Converts a string into canonical form, standardizing
884 * such issues as whether a character with an accent
885 * is represented as a base character and combining
886 * accent or as a single precomposed character. You
887 * should generally call g_utf8_normalize() before
888 * comparing two Unicode strings.
890 * The normalization mode %G_NORMALIZE_DEFAULT only
891 * standardizes differences that do not affect the
892 * text content, such as the above-mentioned accent
893 * representation. %G_NORMALIZE_ALL also standardizes
894 * the "compatibility" characters in Unicode, such
895 * as SUPERSCRIPT THREE to the standard forms
896 * (in this case DIGIT THREE). Formatting information
897 * may be lost but for most text operations such
898 * characters should be considered the same.
899 * For example, g_utf8_collate() normalizes
900 * with %G_NORMALIZE_ALL as its first step.
902 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
903 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
904 * but return a result with composed forms rather
905 * than a maximally decomposed form. This is often
906 * useful if you intend to convert the string to
907 * a legacy encoding or pass it to a system with
908 * less capable Unicode handling.
910 * Return value: a newly allocated string, that is the
911 * normalized form of @str.
/* Normalizes to a code-point buffer, then re-encodes that buffer as UTF-8. */
914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
916 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
/* len -1: result_wc is read up to its 0 terminator.
 * NOTE(review): result_wc comes from a malloc-based allocation; no NULL
 * check is visible before it is passed on. */
919 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
925 /* Public Libidn API starts here. */
928 * stringprep_utf8_to_unichar:
929 * @p: a pointer to Unicode character encoded as UTF-8
931 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
932 * If @p does not point to a valid UTF-8 encoded character, results are
935 * Return value: the resulting character.
/* Public wrapper: forwards directly to g_utf8_get_char(). */
938 stringprep_utf8_to_unichar (const char *p)
940 return g_utf8_get_char (p);
944 * stringprep_unichar_to_utf8:
945 * @c: a ISO10646 character code
946 * @outbuf: output buffer, must have at least 6 bytes of space.
947 * If %NULL, the length will be computed and returned
948 * and nothing will be written to @outbuf.
950 * Converts a single character to UTF-8.
952 * Return value: number of bytes written.
/* Public wrapper: forwards directly to g_unichar_to_utf8(). */
955 stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
957 return g_unichar_to_utf8 (c, outbuf);
961 * stringprep_utf8_to_ucs4:
962 * @str: a UTF-8 encoded string
963 * @len: the maximum length of @str to use. If @len < 0, then
964 * the string is nul-terminated.
965 * @items_written: location to store the number of characters in the
968 * Convert a string from UTF-8 to a 32-bit fixed width
969 * representation as UCS-4, assuming valid UTF-8 input.
970 * This function does no error checking on the input.
972 * Return value: a pointer to a newly allocated UCS-4 string.
973 * This value must be freed with free().
/* Public wrapper around g_utf8_to_ucs4_fast(). */
976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
/* NOTE(review): items_written is a size_t * reinterpreted as glong *; this
 * relies on the two types having the same size and representation. */
978 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
982 * stringprep_ucs4_to_utf8:
983 * @str: a UCS-4 encoded string
984 * @len: the maximum length of @str to use. If @len < 0, then
985 * the string is terminated with a 0 character.
986 * @items_read: location to store number of characters read, or %NULL.
987 * @items_written: location to store number of bytes written or %NULL.
988 * The value here stored does not include the trailing 0
991 * Convert a string from a 32-bit fixed width representation as UCS-4
992 * to UTF-8. The result will be terminated with a 0 byte.
994 * Return value: a pointer to a newly allocated UTF-8 string.
995 * This value must be freed with free(). If an
996 * error occurs, %NULL will be returned and
/* Public wrapper around g_ucs4_to_utf8().  No error object is passed, so
 * failure is reported through the return value only. */
1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001 size_t * items_read, size_t * items_written)
/* NOTE(review): the size_t * arguments are reinterpreted as glong *; this
 * relies on the two types having the same size and representation. */
1003 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004 (glong *) items_written, NULL);
1008 * stringprep_utf8_nfkc_normalize:
1009 * @str: a UTF-8 encoded string.
1010 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1012 * Converts a string into canonical form, standardizing
1013 * such issues as whether a character with an accent
1014 * is represented as a base character and combining
1015 * accent or as a single precomposed character.
1017 * The normalization mode is NFKC (ALL COMPOSE). It standardizes
1018 * differences that do not affect the text content, such as the
1019 * above-mentioned accent representation. It standardizes the
1020 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
1021 * the standard forms (in this case DIGIT THREE). Formatting
1022 * information may be lost but for most text operations such
1023 * characters should be considered the same. It returns a result with
1024 * composed forms rather than a maximally decomposed form.
1026 * Return value: a newly allocated string, that is the
1027 * NFKC normalized form of @str.
/* Public wrapper: g_utf8_normalize() with the mode fixed to NFKC. */
1030 stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1032 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1036 * stringprep_ucs4_nfkc_normalize:
1037 * @str: a Unicode string.
1038 * @len: length of @str array, or -1 if @str is nul-terminated.
1040 * Converts UCS4 string into UTF-8 and runs
1041 * stringprep_utf8_nfkc_normalize().
1043 * Return value: a newly allocated Unicode string, that is the NFKC
1044 * normalized form of @str.
1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1050 uint32_t *result_wc;
1052 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);