lib/contrib/nfkc.c

   1 /* nfkc.c       Unicode normalization utilities.
   2  * Copyright (C) 2002, 2003, 2004, 2005  Simon Josefsson
   3  *
   4  * This file is part of GNU Libidn.
   5  *
   6  * GNU Libidn is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * GNU Libidn is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with GNU Libidn; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  */
  21
  22 #include <stdlib.h>
  23 #include <string.h>
  24
  25 #include "stringprep.h"
  26
/* This file contains functions from GLIB, including gutf8.c and
 * gunidecomp.c, all licensed under LGPL and copyright held by:
  29  *
  30  *  Copyright (C) 1999, 2000 Tom Tromey
  31  *  Copyright 2000 Red Hat, Inc.
  32  */
  33
/* Hacks to make syncing with GLIB code easier: the GLIB type and helper
 * names used by the code imported below are mapped onto plain C
 * equivalents so that code can stay close to its upstream form. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
/* Error reporting is compiled out: GError is only ever handled through
 * pointers, and g_set_error() expands to a no-op, so its arguments are
 * never evaluated. */
#define GError void
#define g_set_error(a,b,c,d) ((void) 0)
/* Allocates room for n_structs objects of struct_type with g_malloc().
 * The memory is not initialized and the size multiplication is not
 * checked for overflow. */
#define g_new(struct_type, n_structs)                                   \
  ((struct_type *) g_malloc (((gsize) sizeof (struct_type)) * ((gsize) (n_structs))))
/* G_STMT_START / G_STMT_END bracket a braced block so that a
 * multi-statement macro can be used as a single statement; the exact
 * spelling is chosen per compiler. */
#  if defined (__GNUC__) && !defined (__STRICT_ANSI__) && !defined (__cplusplus)
#    define G_STMT_START        (void)(
#    define G_STMT_END          )
#  else
#    if (defined (sun) || defined (__sun__))
#      define G_STMT_START      if (1)
#      define G_STMT_END        else (void)0
#    else
#      define G_STMT_START      do
#      define G_STMT_END        while (0)
#    endif
#  endif
/* Precondition checks are disabled in this file: the macro expands to an
 * empty statement and evaluates neither expr nor val. */
#define g_return_val_if_fail(expr,val)          G_STMT_START{ (void)0; }G_STMT_END
/* Element count of a true array (not valid for pointers). */
#define G_N_ELEMENTS(arr)               (sizeof (arr) / sizeof ((arr)[0]))
#define TRUE 1
#define FALSE 0
  69
  70 /* Code from GLIB gunicode.h starts here. */
  71
/* Normalization modes accepted by the normalization routines in this
 * file.  Each mode has a GLIB-style name and an alias named after the
 * Unicode normalization form; both share the same value. */
typedef enum
{
  /* Canonical decomposition only. */
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  /* Canonical decomposition followed by composition. */
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  /* Compatibility decomposition only. */
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  /* Compatibility decomposition followed by composition. */
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
  84
  85 /* Code from GLIB gutf8.c starts here. */
  86
/* UTF8_COMPUTE: classify the lead byte Char of a UTF-8 sequence.
 * Sets Len to the sequence length in bytes (1..6) and Mask to the bits of
 * the lead byte that carry payload.  If Char cannot start a sequence
 * (0x80..0xBF, 0xFE, 0xFF), Len is set to -1 and Mask is left untouched.
 * The expansion is a bare if/else chain that already ends in `;', so use
 * it only as a complete statement and never as the body of an unbraced
 * if/else. */
#define UTF8_COMPUTE(Char, Mask, Len)           \
  if (Char < 128)                               \
    {                                           \
      Len = 1;                                  \
      Mask = 0x7f;                              \
    }                                           \
  else if ((Char & 0xe0) == 0xc0)               \
    {                                           \
      Len = 2;                                  \
      Mask = 0x1f;                              \
    }                                           \
  else if ((Char & 0xf0) == 0xe0)               \
    {                                           \
      Len = 3;                                  \
      Mask = 0x0f;                              \
    }                                           \
  else if ((Char & 0xf8) == 0xf0)               \
    {                                           \
      Len = 4;                                  \
      Mask = 0x07;                              \
    }                                           \
  else if ((Char & 0xfc) == 0xf8)               \
    {                                           \
      Len = 5;                                  \
      Mask = 0x03;                              \
    }                                           \
  else if ((Char & 0xfe) == 0xfc)               \
    {                                           \
      Len = 6;                                  \
      Mask = 0x01;                              \
    }                                           \
  else                                          \
    Len = -1;

/* UTF8_LENGTH: number of bytes (1..6) needed to encode the code point
 * Char.  Char is evaluated several times. */
#define UTF8_LENGTH(Char)                       \
  ((Char) < 0x80 ? 1 :                          \
   ((Char) < 0x800 ? 2 :                        \
    ((Char) < 0x10000 ? 3 :                     \
     ((Char) < 0x200000 ? 4 :                   \
      ((Char) < 0x4000000 ? 5 : 6)))))


/* UTF8_GET: decode the Len-byte sequence at Chars into Result, using the
 * Mask obtained from UTF8_COMPUTE and Count as the loop index.  As soon
 * as a byte that is not of the form 10xxxxxx is met, Result is set to -1
 * and decoding stops.  Expands to several statements, so do not use it
 * as the body of an unbraced if/else. */
#define UTF8_GET(Result, Chars, Count, Mask, Len)       \
  (Result) = (Chars)[0] & (Mask);                       \
  for ((Count) = 1; (Count) < (Len); ++(Count))         \
    {                                                   \
      if (((Chars)[(Count)] & 0xc0) != 0x80)            \
        {                                               \
          (Result) = -1;                                \
          break;                                        \
        }                                               \
      (Result) <<= 6;                                   \
      (Result) |= ((Chars)[(Count)] & 0x3f);            \
    }

/* UNICODE_VALID: true when Char is below 0x110000, is not in the
 * surrogate range 0xD800..0xDFFF, is not in 0xFDD0..0xFDEF, and its low
 * 16 bits are neither 0xFFFE nor 0xFFFF. */
#define UNICODE_VALID(Char)                     \
  ((Char) < 0x110000 &&                         \
   (((Char) & 0xFFFFF800) != 0xD800) &&         \
   ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&      \
   ((Char) & 0xFFFE) != 0xFFFE)
 147
 148
/* Sequence length implied by each possible lead byte: 1 for 0x00..0xBF
 * (ASCII and bare continuation bytes), 2 for 0xC0..0xDF, 3 for
 * 0xE0..0xEF, 4 for 0xF0..0xF7, 5 for 0xF8..0xFB, 6 for 0xFC..0xFD, and
 * 1 again for 0xFE and 0xFF. */
static const gchar utf8_skip_data[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
  5, 5, 5, 6, 6, 1, 1
};

/* Alias under which the GLIB-derived code refers to the table. */
static const gchar *const g_utf8_skip = utf8_skip_data;

/* Advance p by the length implied by its lead byte.  The bytes in between
 * are not inspected, so on a truncated sequence the result can point past
 * the string's terminator.  The const qualifier of p is cast away. */
#define g_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(guchar *)(p)])
 171
 172 /*
 173  * g_utf8_strlen:
 174  * @p: pointer to the start of a UTF-8 encoded string.
 175  * @max: the maximum number of bytes to examine. If @max
 176  *       is less than 0, then the string is assumed to be
 177  *       nul-terminated. If @max is 0, @p will not be examined and
 178  *       may be %NULL.
 179  *
 180  * Returns the length of the string in characters.
 181  *
 182  * Return value: the length of the string in characters
 183  **/
 184 static glong
 185 g_utf8_strlen (const gchar * p, gssize max)
 186 {
 187   glong len = 0;
 188   const gchar *start = p;
 189   g_return_val_if_fail (p != NULL || max == 0, 0);
 190
 191   if (max < 0)
 192     {
 193       while (*p)
 194         {
 195           p = g_utf8_next_char (p);
 196           ++len;
 197         }
 198     }
 199   else
 200     {
 201       if (max == 0 || !*p)
 202         return 0;
 203
 204       p = g_utf8_next_char (p);
 205
 206       while (p - start < max && *p)
 207         {
 208           ++len;
 209           p = g_utf8_next_char (p);
 210         }
 211
 212       /* only do the last len increment if we got a complete
 213        * char (don't count partial chars)
 214        */
 215       if (p - start == max)
 216         ++len;
 217     }
 218
 219   return len;
 220 }
 221
 222 /*
 223  * g_utf8_get_char:
 224  * @p: a pointer to Unicode character encoded as UTF-8
 225  *
 226  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 227  * If @p does not point to a valid UTF-8 encoded character, results are
 228  * undefined. If you are not sure that the bytes are complete
 229  * valid Unicode characters, you should use g_utf8_get_char_validated()
 230  * instead.
 231  *
 232  * Return value: the resulting character
 233  **/
 234 static gunichar
 235 g_utf8_get_char (const gchar * p)
 236 {
 237   int i, mask = 0, len;
 238   gunichar result;
 239   unsigned char c = (unsigned char) *p;
 240
 241   UTF8_COMPUTE (c, mask, len);
 242   if (len == -1)
 243     return (gunichar) - 1;
 244   UTF8_GET (result, p, i, mask, len);
 245
 246   return result;
 247 }
 248
 249 /*
 250  * g_unichar_to_utf8:
 251  * @c: a ISO10646 character code
 252  * @outbuf: output buffer, must have at least 6 bytes of space.
 253  *       If %NULL, the length will be computed and returned
 254  *       and nothing will be written to @outbuf.
 255  *
 256  * Converts a single character to UTF-8.
 257  *
 258  * Return value: number of bytes written
 259  **/
 260 static int
 261 g_unichar_to_utf8 (gunichar c, gchar * outbuf)
 262 {
 263   guint len = 0;
 264   int first;
 265   int i;
 266
 267   if (c < 0x80)
 268     {
 269       first = 0;
 270       len = 1;
 271     }
 272   else if (c < 0x800)
 273     {
 274       first = 0xc0;
 275       len = 2;
 276     }
 277   else if (c < 0x10000)
 278     {
 279       first = 0xe0;
 280       len = 3;
 281     }
 282   else if (c < 0x200000)
 283     {
 284       first = 0xf0;
 285       len = 4;
 286     }
 287   else if (c < 0x4000000)
 288     {
 289       first = 0xf8;
 290       len = 5;
 291     }
 292   else
 293     {
 294       first = 0xfc;
 295       len = 6;
 296     }
 297
 298   if (outbuf)
 299     {
 300       for (i = len - 1; i > 0; --i)
 301         {
 302           outbuf[i] = (c & 0x3f) | 0x80;
 303           c >>= 6;
 304         }
 305       outbuf[0] = c | first;
 306     }
 307
 308   return len;
 309 }
 310
 311 /*
 312  * g_utf8_to_ucs4_fast:
 313  * @str: a UTF-8 encoded string
 314  * @len: the maximum length of @str to use. If @len < 0, then
 315  *       the string is nul-terminated.
 316  * @items_written: location to store the number of characters in the
 317  *                 result, or %NULL.
 318  *
 319  * Convert a string from UTF-8 to a 32-bit fixed width
 320  * representation as UCS-4, assuming valid UTF-8 input.
 321  * This function is roughly twice as fast as g_utf8_to_ucs4()
 322  * but does no error checking on the input.
 323  *
 324  * Return value: a pointer to a newly allocated UCS-4 string.
 325  *               This value must be freed with g_free().
 326  **/
 327 static gunichar *
 328 g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
 329 {
 330   gint j, charlen;
 331   gunichar *result;
 332   gint n_chars, i;
 333   const gchar *p;
 334
 335   g_return_val_if_fail (str != NULL, NULL);
 336
 337   p = str;
 338   n_chars = 0;
 339   if (len < 0)
 340     {
 341       while (*p)
 342         {
 343           p = g_utf8_next_char (p);
 344           ++n_chars;
 345         }
 346     }
 347   else
 348     {
 349       while (p < str + len && *p)
 350         {
 351           p = g_utf8_next_char (p);
 352           ++n_chars;
 353         }
 354     }
 355
 356   result = g_new (gunichar, n_chars + 1);
 357   if (!result)
 358     return NULL;
 359
 360   p = str;
 361   for (i = 0; i < n_chars; i++)
 362     {
 363       gunichar wc = ((unsigned char *) p)[0];
 364
 365       if (wc < 0x80)
 366         {
 367           result[i] = wc;
 368           p++;
 369         }
 370       else
 371         {
 372           if (wc < 0xe0)
 373             {
 374               charlen = 2;
 375               wc &= 0x1f;
 376             }
 377           else if (wc < 0xf0)
 378             {
 379               charlen = 3;
 380               wc &= 0x0f;
 381             }
 382           else if (wc < 0xf8)
 383             {
 384               charlen = 4;
 385               wc &= 0x07;
 386             }
 387           else if (wc < 0xfc)
 388             {
 389               charlen = 5;
 390               wc &= 0x03;
 391             }
 392           else
 393             {
 394               charlen = 6;
 395               wc &= 0x01;
 396             }
 397
 398           for (j = 1; j < charlen; j++)
 399             {
 400               wc <<= 6;
 401               wc |= ((unsigned char *) p)[j] & 0x3f;
 402             }
 403
 404           result[i] = wc;
 405           p += charlen;
 406         }
 407     }
 408   result[i] = 0;
 409
 410   if (items_written)
 411     *items_written = i;
 412
 413   return result;
 414 }
 415
 416 /*
 417  * g_ucs4_to_utf8:
 418  * @str: a UCS-4 encoded string
 419  * @len: the maximum length of @str to use. If @len < 0, then
 420  *       the string is terminated with a 0 character.
 * @items_read: location to store number of characters read, or %NULL.
 422  * @items_written: location to store number of bytes written or %NULL.
 423  *                 The value here stored does not include the trailing 0
 424  *                 byte.
 * @error: location to store the error occurring, or %NULL to ignore
 426  *         errors. Any of the errors in #GConvertError other than
 427  *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
 428  *
 * Convert a string from a 32-bit fixed width representation as UCS-4
 * to UTF-8. The result will be terminated with a 0 byte.
 431  *
 432  * Return value: a pointer to a newly allocated UTF-8 string.
 433  *               This value must be freed with g_free(). If an
 434  *               error occurs, %NULL will be returned and
 435  *               @error set.
 436  **/
 437 static gchar *
 438 g_ucs4_to_utf8 (const gunichar * str,
 439                 glong len,
 440                 glong * items_read, glong * items_written, GError ** error)
 441 {
 442   gint result_length;
 443   gchar *result = NULL;
 444   gchar *p;
 445   gint i;
 446
 447   result_length = 0;
 448   for (i = 0; len < 0 || i < len; i++)
 449     {
 450       if (!str[i])
 451         break;
 452
 453       if (str[i] >= 0x80000000)
 454         {
 455           if (items_read)
 456             *items_read = i;
 457
 458           g_set_error (error, G_CONVERT_ERROR,
 459                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
 460                        _("Character out of range for UTF-8"));
 461           goto err_out;
 462         }
 463
 464       result_length += UTF8_LENGTH (str[i]);
 465     }
 466
 467   result = g_malloc (result_length + 1);
 468   if (!result)
 469     return NULL;
 470   p = result;
 471
 472   i = 0;
 473   while (p < result + result_length)
 474     p += g_unichar_to_utf8 (str[i++], p);
 475
 476   *p = '\0';
 477
 478   if (items_written)
 479     *items_written = p - result;
 480
 481 err_out:
 482   if (items_read)
 483     *items_read = i;
 484
 485   return result;
 486 }
 487
 488 /* Code from GLIB gunidecomp.c starts here. */
 489
 490 #include "gunidecomp.h"
 491 #include "gunicomp.h"
 492
/* Combining class lookup for the first table part.  Page selects a
 * 256-character page.  A page-table entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one class shared by the whole page
 * (entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects the
 * row of cclass_data that is then indexed by Char. */
#define CC_PART1(Page, Char) \
  ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[Page]][Char]))

/* Same scheme as CC_PART1, for the second table part. */
#define CC_PART2(Page, Char) \
  ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[Page]][Char]))

/* Combining class of Char: part 1 covers code points up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000..G_UNICODE_LAST_CHAR,
 * and everything else has class 0.  Char is evaluated several times. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))

/* constants for hangul syllable [de]composition:
 * SBase is the first precomposed syllable; LBase, VBase and TBase are
 * the bases of the L, V and T parts.  A T index of 0 (TBase itself)
 * means "no T part".  NCount is the number of syllables per L and
 * SCount the total number of precomposed syllables. */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
 520
 521 /*
 522  * g_unicode_canonical_ordering:
 523  * @string: a UCS-4 encoded string.
 524  * @len: the maximum length of @string to use.
 525  *
 526  * Computes the canonical ordering of a string in-place.
 527  * This rearranges decomposed characters in the string
 528  * according to their combining classes.  See the Unicode
 529  * manual for more information.
 530  **/
 531 static void
 532 g_unicode_canonical_ordering (gunichar * string, gsize len)
 533 {
 534   gsize i;
 535   int swap = 1;
 536
 537   while (swap)
 538     {
 539       int last;
 540       swap = 0;
 541       last = COMBINING_CLASS (string[0]);
 542       for (i = 0; i < len - 1; ++i)
 543         {
 544           int next = COMBINING_CLASS (string[i + 1]);
 545           if (next != 0 && last > next)
 546             {
 547               gsize j;
 548               /* Percolate item leftward through string.  */
 549               for (j = i + 1; j > 0; --j)
 550                 {
 551                   gunichar t;
 552                   if (COMBINING_CLASS (string[j - 1]) <= next)
 553                     break;
 554                   t = string[j];
 555                   string[j] = string[j - 1];
 556                   string[j - 1] = t;
 557                   swap = 1;
 558                 }
 559               /* We're re-entering the loop looking at the old
 560                  character again.  */
 561               next = last;
 562             }
 563           last = next;
 564         }
 565     }
 566 }
 567
 568 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
 569  * r should be null or have sufficient space. Calling with r == NULL will
 570  * only calculate the result_len; however, a buffer with space for three
 571  * characters will always be big enough. */
 572 static void
 573 decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
 574 {
 575   gint SIndex = s - SBase;
 576
 577   /* not a hangul syllable */
 578   if (SIndex < 0 || SIndex >= SCount)
 579     {
 580       if (r)
 581         r[0] = s;
 582       *result_len = 1;
 583     }
 584   else
 585     {
 586       gunichar L = LBase + SIndex / NCount;
 587       gunichar V = VBase + (SIndex % NCount) / TCount;
 588       gunichar T = TBase + SIndex % TCount;
 589
 590       if (r)
 591         {
 592           r[0] = L;
 593           r[1] = V;
 594         }
 595
 596       if (T != TBase)
 597         {
 598           if (r)
 599             r[2] = T;
 600           *result_len = 3;
 601         }
 602       else
 603         *result_len = 2;
 604     }
 605 }
 606
 607 /* returns a pointer to a null-terminated UTF-8 string */
 608 static const gchar *
 609 find_decomposition (gunichar ch, gboolean compat)
 610 {
 611   int start = 0;
 612   int end = G_N_ELEMENTS (decomp_table);
 613
 614   if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
 615     {
 616       while (TRUE)
 617         {
 618           int half = (start + end) / 2;
 619           if (ch == decomp_table[half].ch)
 620             {
 621               int offset;
 622
 623               if (compat)
 624                 {
 625                   offset = decomp_table[half].compat_offset;
 626                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 627                     offset = decomp_table[half].canon_offset;
 628                 }
 629               else
 630                 {
 631                   offset = decomp_table[half].canon_offset;
 632                   if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
 633                     return NULL;
 634                 }
 635
 636               return &(decomp_expansion_string[offset]);
 637             }
 638           else if (half == start)
 639             break;
 640           else if (ch > decomp_table[half].ch)
 641             start = half;
 642           else
 643             end = half;
 644         }
 645     }
 646
 647   return NULL;
 648 }
 649
 650 /* L,V => LV and LV,T => LVT  */
 651 static gboolean
 652 combine_hangul (gunichar a, gunichar b, gunichar * result)
 653 {
 654   gint LIndex = a - LBase;
 655   gint SIndex = a - SBase;
 656
 657   gint VIndex = b - VBase;
 658   gint TIndex = b - TBase;
 659
 660   if (0 <= LIndex && LIndex < LCount && 0 <= VIndex && VIndex < VCount)
 661     {
 662       *result = SBase + (LIndex * VCount + VIndex) * TCount;
 663       return TRUE;
 664     }
 665   else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
 666            && 0 <= TIndex && TIndex <= TCount)
 667     {
 668       *result = a + TIndex;
 669       return TRUE;
 670     }
 671
 672   return FALSE;
 673 }
 674
/* Compose-index lookup for one 256-character page.  A compose_table entry
 * at or above G_UNICODE_MAX_TABLE_INDEX encodes one index shared by the
 * whole page (entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry
 * selects the row of compose_data that is then indexed by Char. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Compose index of Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Char is evaluated several times. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
 682
 683 static gboolean
 684 combine (gunichar a, gunichar b, gunichar * result)
 685 {
 686   gushort index_a, index_b;
 687
 688   if (combine_hangul (a, b, result))
 689     return TRUE;
 690
 691   index_a = COMPOSE_INDEX (a);
 692
 693   if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
 694     {
 695       if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
 696         {
 697           *result =
 698             compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
 699           return TRUE;
 700         }
 701       else
 702         return FALSE;
 703     }
 704
 705   index_b = COMPOSE_INDEX (b);
 706
 707   if (index_b >= COMPOSE_SECOND_SINGLE_START)
 708     {
 709       if (a ==
 710           compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
 711         {
 712           *result =
 713             compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
 714           return TRUE;
 715         }
 716       else
 717         return FALSE;
 718     }
 719
 720   if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
 721       && index_b >= COMPOSE_SECOND_START
 722       && index_b < COMPOSE_SECOND_SINGLE_START)
 723     {
 724       gunichar res =
 725         compose_array[index_a - COMPOSE_FIRST_START][index_b -
 726                                                      COMPOSE_SECOND_START];
 727
 728       if (res)
 729         {
 730           *result = res;
 731           return TRUE;
 732         }
 733     }
 734
 735   return FALSE;
 736 }
 737
/*
 * Normalizes the UTF-8 string str according to mode and returns the
 * result as a newly allocated array of code points terminated by a 0
 * element.  At most max_len bytes of str are used; a negative max_len
 * means str is NUL-terminated.  Returns NULL if str is NULL or if memory
 * cannot be allocated.  The array comes from g_new() and is released
 * with g_free() by the callers in this file.
 *
 * NOTE(review): the input is walked with g_utf8_next_char(), which steps
 * by the length implied by each lead byte without inspecting the bytes
 * in between, so str is assumed to be valid UTF-8.
 */
static gunichar *
_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
{
  gsize n_wc;
  gunichar *wc_buffer;
  const char *p;
  gsize last_start;
  gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
  gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);

  if (!str)
    return NULL;

  /* Pass 1: count the code points of the fully decomposed string so the
   * output buffer can be allocated in one go. */
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      const gchar *decomp;
      gunichar wc = g_utf8_get_char (p);

      /* Range handed to decompose_hangul(), which itself passes through
       * anything that is not a precomposed syllable. */
      if (wc >= 0xac00 && wc <= 0xd7af)
        {
          gsize result_len;
          decompose_hangul (wc, NULL, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            n_wc += g_utf8_strlen (decomp, -1);
          else
            n_wc++;
        }

      p = g_utf8_next_char (p);
    }

  /* One extra element for the terminating 0. */
  wc_buffer = g_new (gunichar, n_wc + 1);
  if (!wc_buffer)
    return NULL;

  /* Pass 2: decompose into wc_buffer.  last_start is the index where the
   * current run (a class-0 character plus what follows it) begins. */
  last_start = 0;
  n_wc = 0;
  p = str;
  while ((max_len < 0 || p < str + max_len) && *p)
    {
      gunichar wc = g_utf8_get_char (p);
      const gchar *decomp;
      int cc;
      gsize old_n_wc = n_wc;

      if (wc >= 0xac00 && wc <= 0xd7af)
        {
          gsize result_len;
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
          n_wc += result_len;
        }
      else
        {
          decomp = find_decomposition (wc, do_compat);

          if (decomp)
            {
              const char *pd;
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
            }
          else
            wc_buffer[n_wc++] = wc;
        }

      if (n_wc > 0)
        {
          /* Class of the first code point just appended. */
          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);

          /* A class-0 character starts a new run: put everything from
           * the start of the previous run into canonical order and
           * remember where the new run begins. */
          if (cc == 0)
            {
              g_unicode_canonical_ordering (wc_buffer + last_start,
                                            n_wc - last_start);
              last_start = old_n_wc;
            }
        }

      p = g_utf8_next_char (p);
    }

  /* Order the final run, which the loop above leaves pending. */
  if (n_wc > 0)
    {
      g_unicode_canonical_ordering (wc_buffer + last_start,
                                    n_wc - last_start);
      last_start = n_wc;
    }

  wc_buffer[n_wc] = 0;

  /* All decomposed and reordered */

  /* Pass 3 (composing modes only): combine in place.  last_start is now
   * the index of the character that following characters are combined
   * into. */
  if (do_compose && n_wc > 0)
    {
      gsize i, j;
      int last_cc = 0;
      last_start = 0;

      for (i = 0; i < n_wc; i++)
        {
          int cc = COMBINING_CLASS (wc_buffer[i]);

          /* Combination is attempted only if the previous character has
           * class 0 or a class different from this one. */
          if (i > 0 &&
              (last_cc == 0 || last_cc != cc) &&
              combine (wc_buffer[last_start], wc_buffer[i],
                       &wc_buffer[last_start]))
            {
              /* The character at i was absorbed: close the gap and look
               * at the same index again on the next iteration. */
              for (j = i + 1; j < n_wc; j++)
                wc_buffer[j - 1] = wc_buffer[j];
              n_wc--;
              i--;

              if (i == last_start)
                last_cc = 0;
              else
                last_cc = COMBINING_CLASS (wc_buffer[i - 1]);

              continue;
            }

          if (cc == 0)
            last_start = i;

          last_cc = cc;
        }
    }

  /* Terminate again: composition may have shortened the string. */
  wc_buffer[n_wc] = 0;

  return wc_buffer;
}
 876
 877 /*
 878  * g_utf8_normalize:
 879  * @str: a UTF-8 encoded string.
 880  * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 881  * @mode: the type of normalization to perform.
 882  *
 883  * Converts a string into canonical form, standardizing
 884  * such issues as whether a character with an accent
 885  * is represented as a base character and combining
 886  * accent or as a single precomposed character. You
 887  * should generally call g_utf8_normalize() before
 888  * comparing two Unicode strings.
 889  *
 890  * The normalization mode %G_NORMALIZE_DEFAULT only
 891  * standardizes differences that do not affect the
 892  * text content, such as the above-mentioned accent
 893  * representation. %G_NORMALIZE_ALL also standardizes
 894  * the "compatibility" characters in Unicode, such
 895  * as SUPERSCRIPT THREE to the standard forms
 896  * (in this case DIGIT THREE). Formatting information
 897  * may be lost but for most text operations such
 898  * characters should be considered the same.
 899  * For example, g_utf8_collate() normalizes
 900  * with %G_NORMALIZE_ALL as its first step.
 901  *
 902  * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
 903  * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
 904  * but returned a result with composed forms rather
 905  * than a maximally decomposed form. This is often
 906  * useful if you intend to convert the string to
 907  * a legacy encoding or pass it to a system with
 908  * less capable Unicode handling.
 909  *
 910  * Return value: a newly allocated string, that is the
 911  *   normalized form of @str.
 912  **/
 913 static gchar *
 914 g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
 915 {
 916   gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
 917   gchar *result;
 918
 919   result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
 920   g_free (result_wc);
 921
 922   return result;
 923 }
 924
 925 /* Public Libidn API starts here. */
 926
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8; must not be %NULL.
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
 * If @p does not point to a valid UTF-8 encoded character, results are
 * undefined.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  /* Thin public wrapper around the GLIB-derived decoder. */
  return g_utf8_get_char (p);
}
 942
 943 /**
 944  * stringprep_unichar_to_utf8:
 945  * @c: a ISO10646 character code
 946  * @outbuf: output buffer, must have at least 6 bytes of space.
 947  *       If %NULL, the length will be computed and returned
 948  *       and nothing will be written to @outbuf.
 949  *
 950  * Converts a single character to UTF-8.
 951  *
 952  * Return value: number of bytes written.
 953  **/
 954 int
 955 stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
 956 {
 957   return g_unichar_to_utf8 (c, outbuf);
 958 }
 959
 960 /**
 961  * stringprep_utf8_to_ucs4:
 962  * @str: a UTF-8 encoded string
 963  * @len: the maximum length of @str to use. If @len < 0, then
 964  *       the string is nul-terminated.
 965  * @items_written: location to store the number of characters in the
 966  *                 result, or %NULL.
 967  *
 968  * Convert a string from UTF-8 to a 32-bit fixed width
 969  * representation as UCS-4, assuming valid UTF-8 input.
 970  * This function does no error checking on the input.
 971  *
 972  * Return value: a pointer to a newly allocated UCS-4 string.
 973  *               This value must be freed with free().
 974  **/
 975 uint32_t *
 976 stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t * items_written)
 977 {
 978   return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
 979 }
 980
 981 /**
 982  * stringprep_ucs4_to_utf8:
 983  * @str: a UCS-4 encoded string
 984  * @len: the maximum length of @str to use. If @len < 0, then
 985  *       the string is terminated with a 0 character.
 986  * @items_read: location to store number of characters read read, or %NULL.
 987  * @items_written: location to store number of bytes written or %NULL.
 988  *                 The value here stored does not include the trailing 0
 989  *                 byte.
 990  *
 991  * Convert a string from a 32-bit fixed width representation as UCS-4.
 992  * to UTF-8. The result will be terminated with a 0 byte.
 993  *
 994  * Return value: a pointer to a newly allocated UTF-8 string.
 995  *               This value must be freed with free(). If an
 996  *               error occurs, %NULL will be returned and
 997  *               @error set.
 998  **/
 999 char *
1000 stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1001                          size_t * items_read, size_t * items_written)
1002 {
1003   return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1004                          (glong *) items_written, NULL);
1005 }
1006
/**
 * stringprep_utf8_nfkc_normalize:
 * @str: a UTF-8 encoded string.
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Converts a string into canonical form, standardizing
 * such issues as whether a character with an accent
 * is represented as a base character and combining
 * accent or as a single precomposed character.
 *
 * The normalization mode is NFKC (ALL COMPOSE).  It standardizes
 * differences that do not affect the text content, such as the
 * above-mentioned accent representation. It standardizes the
 * "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to
 * the standard forms (in this case DIGIT THREE). Formatting
 * information may be lost but for most text operations such
 * characters should be considered the same. It returns a result with
 * composed forms rather than a maximally decomposed form.
 *
 * Return value: a newly allocated string, that is the
 *   NFKC normalized form of @str.  This value must be freed
 *   with free().
 **/
char *
stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
{
  /* Thin public wrapper: fixes the mode to NFKC. */
  return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
}
1034
1035 /**
1036  * stringprep_ucs4_nfkc_normalize:
1037  * @str: a Unicode string.
1038  * @len: length of @str array, or -1 if @str is nul-terminated.
1039  *
1040  * Converts UCS4 string into UTF-8 and runs
1041  * stringprep_utf8_nfkc_normalize().
1042  *
1043  * Return value: a newly allocated Unicode string, that is the NFKC
1044  *   normalized form of @str.
1045  **/
1046 uint32_t *
1047 stringprep_ucs4_nfkc_normalize (uint32_t * str, ssize_t len)
1048 {
1049   char *p;
1050   uint32_t *result_wc;
1051
1052   p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1053   result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1054   free (p);
1055
1056   return result_wc;
1057 }