| 
sl@0
 | 
     1  | 
/* decomp.c - Character decomposition.
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 * Portions copyright (c) 2006-2009 Nokia Corporation.  All rights reserved.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */
  | 
| 
sl@0
 | 
    22  | 
  | 
| 
sl@0
 | 
    23  | 
#include "config.h"
  | 
| 
sl@0
 | 
    24  | 
  | 
| 
sl@0
 | 
    25  | 
#include <stdlib.h>
  | 
| 
sl@0
 | 
    26  | 
  | 
| 
sl@0
 | 
    27  | 
#include "glib.h"
  | 
| 
sl@0
 | 
    28  | 
#include "gunidecomp.h"
  | 
| 
sl@0
 | 
    29  | 
#include "gunicomp.h"
  | 
| 
sl@0
 | 
    30  | 
#include "gunicodeprivate.h"
  | 
| 
sl@0
 | 
    31  | 
#include "galias.h"
  | 
| 
sl@0
 | 
    32  | 
  | 
| 
sl@0
 | 
    33  | 
  | 
| 
sl@0
 | 
    34  | 
/* Shared lookup used by both halves of the combining-class page table.
 * A page entry that is >= G_UNICODE_MAX_TABLE_INDEX encodes the class
 * directly (entry minus that constant) for the page; any smaller entry
 * is a row number in cclass_data, which is then indexed by the
 * character's offset within the page.  */
#define CC_LOOKUP(table, page, ch) \
  (((table)[(page)] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((table)[(page)] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(table)[(page)]][(ch)]))

/* Lookup for code points up to G_UNICODE_LAST_CHAR_PART1.  */
#define CC_PART1(page, ch) \
  CC_LOOKUP (combining_class_table_part1, page, ch)

/* Lookup for code points from 0xe0000 up to G_UNICODE_LAST_CHAR; the
 * page number passed in is relative to 0xe0000.  */
#define CC_PART2(page, ch) \
  CC_LOOKUP (combining_class_table_part2, page, ch)

/* Combining class of a code point.  The high bits (c >> 8) select the
 * page and the low byte selects the entry within it.  Code points that
 * fall in neither covered range yield 0.  Note that the argument is
 * evaluated more than once.  */
#define COMBINING_CLASS(c) \
  (((c) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((c) >> 8, (c) & 0xff) \
   : (((c) >= 0xe0000 && (c) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((c) - 0xe0000) >> 8, (c) & 0xff) \
      : 0))
  | 
| 
sl@0
 | 
    50  | 
  | 
| 
sl@0
 | 
    51  | 
/**
  | 
| 
sl@0
 | 
    52  | 
 * g_unichar_combining_class:
  | 
| 
sl@0
 | 
    53  | 
 * @uc: a Unicode character
  | 
| 
sl@0
 | 
    54  | 
 * 
  | 
| 
sl@0
 | 
    55  | 
 * Determines the canonical combining class of a Unicode character.
  | 
| 
sl@0
 | 
    56  | 
 * 
  | 
| 
sl@0
 | 
    57  | 
 * Return value: the combining class of the character
  | 
| 
sl@0
 | 
    58  | 
 *
  | 
| 
sl@0
 | 
    59  | 
 * Since: 2.14
  | 
| 
sl@0
 | 
    60  | 
 **/
  | 
| 
sl@0
 | 
    61  | 
EXPORT_C gint
  | 
| 
sl@0
 | 
    62  | 
g_unichar_combining_class (gunichar uc)
  | 
| 
sl@0
 | 
    63  | 
{
 | 
| 
sl@0
 | 
    64  | 
  return COMBINING_CLASS (uc);
  | 
| 
sl@0
 | 
    65  | 
}
  | 
| 
sl@0
 | 
    66  | 
  | 
| 
sl@0
 | 
    67  | 
/* Constants for arithmetic Hangul syllable [de]composition.
 *
 * A precomposed syllable S is split into a leading part L, a vowel V
 * and an optional trailing part T using integer division on its index
 * (S - SBase); see decompose_hangul() and combine_hangul().  */

/* First code point of each range.  A trailing index of 0 (i.e. TBase
 * itself) means "no trailing part".  */
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7

/* Number of distinct values for each component.  */
#define LCount 19
#define VCount 21
#define TCount 28

/* Syllables per leading part, and total number of syllables.  */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
  | 
| 
sl@0
 | 
    77  | 
  | 
| 
sl@0
 | 
    78  | 
/**
  | 
| 
sl@0
 | 
    79  | 
 * g_unicode_canonical_ordering:
  | 
| 
sl@0
 | 
    80  | 
 * @string: a UCS-4 encoded string.
  | 
| 
sl@0
 | 
    81  | 
 * @len: the maximum length of @string to use.
  | 
| 
sl@0
 | 
    82  | 
 *
  | 
| 
sl@0
 | 
    83  | 
 * Computes the canonical ordering of a string in-place.  
  | 
| 
sl@0
 | 
    84  | 
 * This rearranges decomposed characters in the string 
  | 
| 
sl@0
 | 
    85  | 
 * according to their combining classes.  See the Unicode 
  | 
| 
sl@0
 | 
    86  | 
 * manual for more information. 
  | 
| 
sl@0
 | 
    87  | 
 **/
  | 
| 
sl@0
 | 
    88  | 
EXPORT_C void
  | 
| 
sl@0
 | 
    89  | 
g_unicode_canonical_ordering (gunichar *string,
  | 
| 
sl@0
 | 
    90  | 
			      gsize     len)
  | 
| 
sl@0
 | 
    91  | 
{
 | 
| 
sl@0
 | 
    92  | 
  gsize i;
  | 
| 
sl@0
 | 
    93  | 
  int swap = 1;
  | 
| 
sl@0
 | 
    94  | 
  | 
| 
sl@0
 | 
    95  | 
  while (swap)
  | 
| 
sl@0
 | 
    96  | 
    {
 | 
| 
sl@0
 | 
    97  | 
      int last;
  | 
| 
sl@0
 | 
    98  | 
      swap = 0;
  | 
| 
sl@0
 | 
    99  | 
      last = COMBINING_CLASS (string[0]);
  | 
| 
sl@0
 | 
   100  | 
      for (i = 0; i < len - 1; ++i)
  | 
| 
sl@0
 | 
   101  | 
	{
 | 
| 
sl@0
 | 
   102  | 
	  int next = COMBINING_CLASS (string[i + 1]);
  | 
| 
sl@0
 | 
   103  | 
	  if (next != 0 && last > next)
  | 
| 
sl@0
 | 
   104  | 
	    {
 | 
| 
sl@0
 | 
   105  | 
	      gsize j;
  | 
| 
sl@0
 | 
   106  | 
	      /* Percolate item leftward through string.  */
  | 
| 
sl@0
 | 
   107  | 
	      for (j = i + 1; j > 0; --j)
  | 
| 
sl@0
 | 
   108  | 
		{
 | 
| 
sl@0
 | 
   109  | 
		  gunichar t;
  | 
| 
sl@0
 | 
   110  | 
		  if (COMBINING_CLASS (string[j - 1]) <= next)
  | 
| 
sl@0
 | 
   111  | 
		    break;
  | 
| 
sl@0
 | 
   112  | 
		  t = string[j];
  | 
| 
sl@0
 | 
   113  | 
		  string[j] = string[j - 1];
  | 
| 
sl@0
 | 
   114  | 
		  string[j - 1] = t;
  | 
| 
sl@0
 | 
   115  | 
		  swap = 1;
  | 
| 
sl@0
 | 
   116  | 
		}
  | 
| 
sl@0
 | 
   117  | 
	      /* We're re-entering the loop looking at the old
  | 
| 
sl@0
 | 
   118  | 
		 character again.  */
  | 
| 
sl@0
 | 
   119  | 
	      next = last;
  | 
| 
sl@0
 | 
   120  | 
	    }
  | 
| 
sl@0
 | 
   121  | 
	  last = next;
  | 
| 
sl@0
 | 
   122  | 
	}
  | 
| 
sl@0
 | 
   123  | 
    }
  | 
| 
sl@0
 | 
   124  | 
}
  | 
| 
sl@0
 | 
   125  | 
  | 
| 
sl@0
 | 
   126  | 
/* http://www.unicode.org/unicode/reports/tr15/#Hangul
  | 
| 
sl@0
 | 
   127  | 
 * r should be null or have sufficient space. Calling with r == NULL will
  | 
| 
sl@0
 | 
   128  | 
 * only calculate the result_len; however, a buffer with space for three
  | 
| 
sl@0
 | 
   129  | 
 * characters will always be big enough. */
  | 
| 
sl@0
 | 
   130  | 
static void
  | 
| 
sl@0
 | 
   131  | 
decompose_hangul (gunichar s, 
  | 
| 
sl@0
 | 
   132  | 
                  gunichar *r,
  | 
| 
sl@0
 | 
   133  | 
                  gsize *result_len)
  | 
| 
sl@0
 | 
   134  | 
{
 | 
| 
sl@0
 | 
   135  | 
  gint SIndex = s - SBase;
  | 
| 
sl@0
 | 
   136  | 
  | 
| 
sl@0
 | 
   137  | 
  /* not a hangul syllable */
  | 
| 
sl@0
 | 
   138  | 
  if (SIndex < 0 || SIndex >= SCount)
  | 
| 
sl@0
 | 
   139  | 
    {
 | 
| 
sl@0
 | 
   140  | 
      if (r)
  | 
| 
sl@0
 | 
   141  | 
        r[0] = s;
  | 
| 
sl@0
 | 
   142  | 
      *result_len = 1;
  | 
| 
sl@0
 | 
   143  | 
    }
  | 
| 
sl@0
 | 
   144  | 
  else
  | 
| 
sl@0
 | 
   145  | 
    {
 | 
| 
sl@0
 | 
   146  | 
      gunichar L = LBase + SIndex / NCount;
  | 
| 
sl@0
 | 
   147  | 
      gunichar V = VBase + (SIndex % NCount) / TCount;
  | 
| 
sl@0
 | 
   148  | 
      gunichar T = TBase + SIndex % TCount;
  | 
| 
sl@0
 | 
   149  | 
  | 
| 
sl@0
 | 
   150  | 
      if (r)
  | 
| 
sl@0
 | 
   151  | 
        {
 | 
| 
sl@0
 | 
   152  | 
          r[0] = L;
  | 
| 
sl@0
 | 
   153  | 
          r[1] = V;
  | 
| 
sl@0
 | 
   154  | 
        }
  | 
| 
sl@0
 | 
   155  | 
  | 
| 
sl@0
 | 
   156  | 
      if (T != TBase) 
  | 
| 
sl@0
 | 
   157  | 
        {
 | 
| 
sl@0
 | 
   158  | 
          if (r)
  | 
| 
sl@0
 | 
   159  | 
            r[2] = T;
  | 
| 
sl@0
 | 
   160  | 
          *result_len = 3;
  | 
| 
sl@0
 | 
   161  | 
        }
  | 
| 
sl@0
 | 
   162  | 
      else
  | 
| 
sl@0
 | 
   163  | 
        *result_len = 2;
  | 
| 
sl@0
 | 
   164  | 
    }
  | 
| 
sl@0
 | 
   165  | 
}
  | 
| 
sl@0
 | 
   166  | 
  | 
| 
sl@0
 | 
   167  | 
/* returns a pointer to a null-terminated UTF-8 string */
  | 
| 
sl@0
 | 
   168  | 
static const gchar *
  | 
| 
sl@0
 | 
   169  | 
find_decomposition (gunichar ch,
  | 
| 
sl@0
 | 
   170  | 
		    gboolean compat)
  | 
| 
sl@0
 | 
   171  | 
{
 | 
| 
sl@0
 | 
   172  | 
  int start = 0;
  | 
| 
sl@0
 | 
   173  | 
  int end = G_N_ELEMENTS (decomp_table);
  | 
| 
sl@0
 | 
   174  | 
  
  | 
| 
sl@0
 | 
   175  | 
  if (ch >= decomp_table[start].ch &&
  | 
| 
sl@0
 | 
   176  | 
      ch <= decomp_table[end - 1].ch)
  | 
| 
sl@0
 | 
   177  | 
    {
 | 
| 
sl@0
 | 
   178  | 
      while (TRUE)
  | 
| 
sl@0
 | 
   179  | 
	{
 | 
| 
sl@0
 | 
   180  | 
	  int half = (start + end) / 2;
  | 
| 
sl@0
 | 
   181  | 
	  if (ch == decomp_table[half].ch)
  | 
| 
sl@0
 | 
   182  | 
	    {
 | 
| 
sl@0
 | 
   183  | 
	      int offset;
  | 
| 
sl@0
 | 
   184  | 
  | 
| 
sl@0
 | 
   185  | 
	      if (compat)
  | 
| 
sl@0
 | 
   186  | 
		{
 | 
| 
sl@0
 | 
   187  | 
		  offset = decomp_table[half].compat_offset;
  | 
| 
sl@0
 | 
   188  | 
		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
  | 
| 
sl@0
 | 
   189  | 
		    offset = decomp_table[half].canon_offset;
  | 
| 
sl@0
 | 
   190  | 
		}
  | 
| 
sl@0
 | 
   191  | 
	      else
  | 
| 
sl@0
 | 
   192  | 
		{
 | 
| 
sl@0
 | 
   193  | 
		  offset = decomp_table[half].canon_offset;
  | 
| 
sl@0
 | 
   194  | 
		  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
  | 
| 
sl@0
 | 
   195  | 
		    return NULL;
  | 
| 
sl@0
 | 
   196  | 
		}
  | 
| 
sl@0
 | 
   197  | 
	      
  | 
| 
sl@0
 | 
   198  | 
	      return &(decomp_expansion_string[offset]);
  | 
| 
sl@0
 | 
   199  | 
	    }
  | 
| 
sl@0
 | 
   200  | 
	  else if (half == start)
  | 
| 
sl@0
 | 
   201  | 
	    break;
  | 
| 
sl@0
 | 
   202  | 
	  else if (ch > decomp_table[half].ch)
  | 
| 
sl@0
 | 
   203  | 
	    start = half;
  | 
| 
sl@0
 | 
   204  | 
	  else
  | 
| 
sl@0
 | 
   205  | 
	    end = half;
  | 
| 
sl@0
 | 
   206  | 
	}
  | 
| 
sl@0
 | 
   207  | 
    }
  | 
| 
sl@0
 | 
   208  | 
  | 
| 
sl@0
 | 
   209  | 
  return NULL;
  | 
| 
sl@0
 | 
   210  | 
}
  | 
| 
sl@0
 | 
   211  | 
  | 
| 
sl@0
 | 
   212  | 
/**
  | 
| 
sl@0
 | 
   213  | 
 * g_unicode_canonical_decomposition:
  | 
| 
sl@0
 | 
   214  | 
 * @ch: a Unicode character.
  | 
| 
sl@0
 | 
   215  | 
 * @result_len: location to store the length of the return value.
  | 
| 
sl@0
 | 
   216  | 
 *
  | 
| 
sl@0
 | 
   217  | 
 * Computes the canonical decomposition of a Unicode character.  
  | 
| 
sl@0
 | 
   218  | 
 * 
  | 
| 
sl@0
 | 
   219  | 
 * Return value: a newly allocated string of Unicode characters.
  | 
| 
sl@0
 | 
   220  | 
 *   @result_len is set to the resulting length of the string.
  | 
| 
sl@0
 | 
   221  | 
 **/
  | 
| 
sl@0
 | 
   222  | 
EXPORT_C gunichar *
  | 
| 
sl@0
 | 
   223  | 
g_unicode_canonical_decomposition (gunichar ch,
  | 
| 
sl@0
 | 
   224  | 
				   gsize   *result_len)
  | 
| 
sl@0
 | 
   225  | 
{
 | 
| 
sl@0
 | 
   226  | 
  const gchar *decomp;
  | 
| 
sl@0
 | 
   227  | 
  const gchar *p;
  | 
| 
sl@0
 | 
   228  | 
  gunichar *r;
  | 
| 
sl@0
 | 
   229  | 
  | 
| 
sl@0
 | 
   230  | 
  /* Hangul syllable */
  | 
| 
sl@0
 | 
   231  | 
  if (ch >= 0xac00 && ch <= 0xd7a3)
  | 
| 
sl@0
 | 
   232  | 
    {
 | 
| 
sl@0
 | 
   233  | 
      decompose_hangul (ch, NULL, result_len);
  | 
| 
sl@0
 | 
   234  | 
      r = g_malloc (*result_len * sizeof (gunichar));
  | 
| 
sl@0
 | 
   235  | 
      decompose_hangul (ch, r, result_len);
  | 
| 
sl@0
 | 
   236  | 
    }
  | 
| 
sl@0
 | 
   237  | 
  else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
  | 
| 
sl@0
 | 
   238  | 
    {
 | 
| 
sl@0
 | 
   239  | 
      /* Found it.  */
  | 
| 
sl@0
 | 
   240  | 
      int i;
  | 
| 
sl@0
 | 
   241  | 
      
  | 
| 
sl@0
 | 
   242  | 
      *result_len = g_utf8_strlen (decomp, -1);
  | 
| 
sl@0
 | 
   243  | 
      r = g_malloc (*result_len * sizeof (gunichar));
  | 
| 
sl@0
 | 
   244  | 
      
  | 
| 
sl@0
 | 
   245  | 
      for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
  | 
| 
sl@0
 | 
   246  | 
        r[i] = g_utf8_get_char (p);
  | 
| 
sl@0
 | 
   247  | 
    }
  | 
| 
sl@0
 | 
   248  | 
  else
  | 
| 
sl@0
 | 
   249  | 
    {
 | 
| 
sl@0
 | 
   250  | 
      /* Not in our table.  */
  | 
| 
sl@0
 | 
   251  | 
      r = g_malloc (sizeof (gunichar));
  | 
| 
sl@0
 | 
   252  | 
      *r = ch;
  | 
| 
sl@0
 | 
   253  | 
      *result_len = 1;
  | 
| 
sl@0
 | 
   254  | 
    }
  | 
| 
sl@0
 | 
   255  | 
  | 
| 
sl@0
 | 
   256  | 
  /* Supposedly following the Unicode 2.1.9 table means that the
  | 
| 
sl@0
 | 
   257  | 
     decompositions come out in canonical order.  I haven't tested
  | 
| 
sl@0
 | 
   258  | 
     this, but we rely on it here.  */
  | 
| 
sl@0
 | 
   259  | 
  return r;
  | 
| 
sl@0
 | 
   260  | 
}
  | 
| 
sl@0
 | 
   261  | 
  | 
| 
sl@0
 | 
   262  | 
/* L,V => LV and LV,T => LVT  */
  | 
| 
sl@0
 | 
   263  | 
static gboolean
  | 
| 
sl@0
 | 
   264  | 
combine_hangul (gunichar a,
  | 
| 
sl@0
 | 
   265  | 
                gunichar b,
  | 
| 
sl@0
 | 
   266  | 
                gunichar *result)
  | 
| 
sl@0
 | 
   267  | 
{
 | 
| 
sl@0
 | 
   268  | 
  gint LIndex = a - LBase;
  | 
| 
sl@0
 | 
   269  | 
  gint SIndex = a - SBase;
  | 
| 
sl@0
 | 
   270  | 
  | 
| 
sl@0
 | 
   271  | 
  gint VIndex = b - VBase;
  | 
| 
sl@0
 | 
   272  | 
  gint TIndex = b - TBase;
  | 
| 
sl@0
 | 
   273  | 
  | 
| 
sl@0
 | 
   274  | 
  if (0 <= LIndex && LIndex < LCount
  | 
| 
sl@0
 | 
   275  | 
      && 0 <= VIndex && VIndex < VCount)
  | 
| 
sl@0
 | 
   276  | 
    {
 | 
| 
sl@0
 | 
   277  | 
      *result = SBase + (LIndex * VCount + VIndex) * TCount;
  | 
| 
sl@0
 | 
   278  | 
      return TRUE;
  | 
| 
sl@0
 | 
   279  | 
    }
  | 
| 
sl@0
 | 
   280  | 
  else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
  | 
| 
sl@0
 | 
   281  | 
           && 0 < TIndex && TIndex < TCount)
  | 
| 
sl@0
 | 
   282  | 
    {
 | 
| 
sl@0
 | 
   283  | 
      *result = a + TIndex;
  | 
| 
sl@0
 | 
   284  | 
      return TRUE;
  | 
| 
sl@0
 | 
   285  | 
    }
  | 
| 
sl@0
 | 
   286  | 
  | 
| 
sl@0
 | 
   287  | 
  return FALSE;
  | 
| 
sl@0
 | 
   288  | 
}
  | 
| 
sl@0
 | 
   289  | 
  | 
| 
sl@0
 | 
   290  | 
/* Composition-index lookup within one page.  A compose_table entry that
 * is >= G_UNICODE_MAX_TABLE_INDEX encodes the index directly (entry
 * minus that constant); any smaller entry is a row number in
 * compose_data, indexed by the character's offset within the page.  */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of a code point, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Every use of the parameter is parenthesized so
 * that an expression argument is shifted as a whole rather than being
 * split by operator precedence.  The argument is evaluated more than
 * once.  */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
  | 
| 
sl@0
 | 
   297  | 
  | 
| 
sl@0
 | 
   298  | 
static gboolean
  | 
| 
sl@0
 | 
   299  | 
combine (gunichar  a,
  | 
| 
sl@0
 | 
   300  | 
	 gunichar  b,
  | 
| 
sl@0
 | 
   301  | 
	 gunichar *result)
  | 
| 
sl@0
 | 
   302  | 
{
 | 
| 
sl@0
 | 
   303  | 
  gushort index_a, index_b;
  | 
| 
sl@0
 | 
   304  | 
  | 
| 
sl@0
 | 
   305  | 
  if (combine_hangul (a, b, result))
  | 
| 
sl@0
 | 
   306  | 
    return TRUE;
  | 
| 
sl@0
 | 
   307  | 
  | 
| 
sl@0
 | 
   308  | 
  index_a = COMPOSE_INDEX(a);
  | 
| 
sl@0
 | 
   309  | 
  | 
| 
sl@0
 | 
   310  | 
  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
  | 
| 
sl@0
 | 
   311  | 
    {
 | 
| 
sl@0
 | 
   312  | 
      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
  | 
| 
sl@0
 | 
   313  | 
	{
 | 
| 
sl@0
 | 
   314  | 
	  *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
  | 
| 
sl@0
 | 
   315  | 
	  return TRUE;
  | 
| 
sl@0
 | 
   316  | 
	}
  | 
| 
sl@0
 | 
   317  | 
      else
  | 
| 
sl@0
 | 
   318  | 
        return FALSE;
  | 
| 
sl@0
 | 
   319  | 
    }
  | 
| 
sl@0
 | 
   320  | 
  
  | 
| 
sl@0
 | 
   321  | 
  index_b = COMPOSE_INDEX(b);
  | 
| 
sl@0
 | 
   322  | 
  | 
| 
sl@0
 | 
   323  | 
  if (index_b >= COMPOSE_SECOND_SINGLE_START)
  | 
| 
sl@0
 | 
   324  | 
    {
 | 
| 
sl@0
 | 
   325  | 
      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
  | 
| 
sl@0
 | 
   326  | 
	{
 | 
| 
sl@0
 | 
   327  | 
	  *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
  | 
| 
sl@0
 | 
   328  | 
	  return TRUE;
  | 
| 
sl@0
 | 
   329  | 
	}
  | 
| 
sl@0
 | 
   330  | 
      else
  | 
| 
sl@0
 | 
   331  | 
        return FALSE;
  | 
| 
sl@0
 | 
   332  | 
    }
  | 
| 
sl@0
 | 
   333  | 
  | 
| 
sl@0
 | 
   334  | 
  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
  | 
| 
sl@0
 | 
   335  | 
      index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
  | 
| 
sl@0
 | 
   336  | 
    {
 | 
| 
sl@0
 | 
   337  | 
      gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
  | 
| 
sl@0
 | 
   338  | 
  | 
| 
sl@0
 | 
   339  | 
      if (res)
  | 
| 
sl@0
 | 
   340  | 
	{
 | 
| 
sl@0
 | 
   341  | 
	  *result = res;
  | 
| 
sl@0
 | 
   342  | 
	  return TRUE;
  | 
| 
sl@0
 | 
   343  | 
	}
  | 
| 
sl@0
 | 
   344  | 
    }
  | 
| 
sl@0
 | 
   345  | 
  | 
| 
sl@0
 | 
   346  | 
  return FALSE;
  | 
| 
sl@0
 | 
   347  | 
}
  | 
| 
sl@0
 | 
   348  | 
  | 
| 
sl@0
 | 
   349  | 
gunichar *
  | 
| 
sl@0
 | 
   350  | 
_g_utf8_normalize_wc (const gchar    *str,
  | 
| 
sl@0
 | 
   351  | 
		      gssize          max_len,
  | 
| 
sl@0
 | 
   352  | 
		      GNormalizeMode  mode)
  | 
| 
sl@0
 | 
   353  | 
{
 | 
| 
sl@0
 | 
   354  | 
  gsize n_wc;
  | 
| 
sl@0
 | 
   355  | 
  gunichar *wc_buffer;
  | 
| 
sl@0
 | 
   356  | 
  const char *p;
  | 
| 
sl@0
 | 
   357  | 
  gsize last_start;
  | 
| 
sl@0
 | 
   358  | 
  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
  | 
| 
sl@0
 | 
   359  | 
			mode == G_NORMALIZE_NFKD);
  | 
| 
sl@0
 | 
   360  | 
  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
  | 
| 
sl@0
 | 
   361  | 
			 mode == G_NORMALIZE_NFKC);
  | 
| 
sl@0
 | 
   362  | 
  | 
| 
sl@0
 | 
   363  | 
  n_wc = 0;
  | 
| 
sl@0
 | 
   364  | 
  p = str;
  | 
| 
sl@0
 | 
   365  | 
  while ((max_len < 0 || p < str + max_len) && *p)
  | 
| 
sl@0
 | 
   366  | 
    {
 | 
| 
sl@0
 | 
   367  | 
      const gchar *decomp;
  | 
| 
sl@0
 | 
   368  | 
      gunichar wc = g_utf8_get_char (p);
  | 
| 
sl@0
 | 
   369  | 
  | 
| 
sl@0
 | 
   370  | 
      if (wc >= 0xac00 && wc <= 0xd7a3)
  | 
| 
sl@0
 | 
   371  | 
        {
 | 
| 
sl@0
 | 
   372  | 
          gsize result_len;
  | 
| 
sl@0
 | 
   373  | 
          decompose_hangul (wc, NULL, &result_len);
  | 
| 
sl@0
 | 
   374  | 
          n_wc += result_len;
  | 
| 
sl@0
 | 
   375  | 
        }
  | 
| 
sl@0
 | 
   376  | 
      else 
  | 
| 
sl@0
 | 
   377  | 
        {
 | 
| 
sl@0
 | 
   378  | 
          decomp = find_decomposition (wc, do_compat);
  | 
| 
sl@0
 | 
   379  | 
  | 
| 
sl@0
 | 
   380  | 
          if (decomp)
  | 
| 
sl@0
 | 
   381  | 
            n_wc += g_utf8_strlen (decomp, -1);
  | 
| 
sl@0
 | 
   382  | 
          else
  | 
| 
sl@0
 | 
   383  | 
            n_wc++;
  | 
| 
sl@0
 | 
   384  | 
        }
  | 
| 
sl@0
 | 
   385  | 
  | 
| 
sl@0
 | 
   386  | 
      p = g_utf8_next_char (p);
  | 
| 
sl@0
 | 
   387  | 
    }
  | 
| 
sl@0
 | 
   388  | 
  | 
| 
sl@0
 | 
   389  | 
  wc_buffer = g_new (gunichar, n_wc + 1);
  | 
| 
sl@0
 | 
   390  | 
  | 
| 
sl@0
 | 
   391  | 
  last_start = 0;
  | 
| 
sl@0
 | 
   392  | 
  n_wc = 0;
  | 
| 
sl@0
 | 
   393  | 
  p = str;
  | 
| 
sl@0
 | 
   394  | 
  while ((max_len < 0 || p < str + max_len) && *p)
  | 
| 
sl@0
 | 
   395  | 
    {
 | 
| 
sl@0
 | 
   396  | 
      gunichar wc = g_utf8_get_char (p);
  | 
| 
sl@0
 | 
   397  | 
      const gchar *decomp;
  | 
| 
sl@0
 | 
   398  | 
      int cc;
  | 
| 
sl@0
 | 
   399  | 
      gsize old_n_wc = n_wc;
  | 
| 
sl@0
 | 
   400  | 
	  
  | 
| 
sl@0
 | 
   401  | 
      if (wc >= 0xac00 && wc <= 0xd7a3)
  | 
| 
sl@0
 | 
   402  | 
        {
 | 
| 
sl@0
 | 
   403  | 
          gsize result_len;
  | 
| 
sl@0
 | 
   404  | 
          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
  | 
| 
sl@0
 | 
   405  | 
          n_wc += result_len;
  | 
| 
sl@0
 | 
   406  | 
        }
  | 
| 
sl@0
 | 
   407  | 
      else
  | 
| 
sl@0
 | 
   408  | 
        {
 | 
| 
sl@0
 | 
   409  | 
          decomp = find_decomposition (wc, do_compat);
  | 
| 
sl@0
 | 
   410  | 
          
  | 
| 
sl@0
 | 
   411  | 
          if (decomp)
  | 
| 
sl@0
 | 
   412  | 
            {
 | 
| 
sl@0
 | 
   413  | 
              const char *pd;
  | 
| 
sl@0
 | 
   414  | 
              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
  | 
| 
sl@0
 | 
   415  | 
                wc_buffer[n_wc++] = g_utf8_get_char (pd);
  | 
| 
sl@0
 | 
   416  | 
            }
  | 
| 
sl@0
 | 
   417  | 
          else
  | 
| 
sl@0
 | 
   418  | 
            wc_buffer[n_wc++] = wc;
  | 
| 
sl@0
 | 
   419  | 
        }
  | 
| 
sl@0
 | 
   420  | 
  | 
| 
sl@0
 | 
   421  | 
      if (n_wc > 0)
  | 
| 
sl@0
 | 
   422  | 
	{
 | 
| 
sl@0
 | 
   423  | 
	  cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
  | 
| 
sl@0
 | 
   424  | 
  | 
| 
sl@0
 | 
   425  | 
	  if (cc == 0)
  | 
| 
sl@0
 | 
   426  | 
	    {
 | 
| 
sl@0
 | 
   427  | 
	      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
  | 
| 
sl@0
 | 
   428  | 
	      last_start = old_n_wc;
  | 
| 
sl@0
 | 
   429  | 
	    }
  | 
| 
sl@0
 | 
   430  | 
	}
  | 
| 
sl@0
 | 
   431  | 
      
  | 
| 
sl@0
 | 
   432  | 
      p = g_utf8_next_char (p);
  | 
| 
sl@0
 | 
   433  | 
    }
  | 
| 
sl@0
 | 
   434  | 
  | 
| 
sl@0
 | 
   435  | 
  if (n_wc > 0)
  | 
| 
sl@0
 | 
   436  | 
    {
 | 
| 
sl@0
 | 
   437  | 
      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
  | 
| 
sl@0
 | 
   438  | 
      last_start = n_wc;
  | 
| 
sl@0
 | 
   439  | 
    }
  | 
| 
sl@0
 | 
   440  | 
	  
  | 
| 
sl@0
 | 
   441  | 
  wc_buffer[n_wc] = 0;
  | 
| 
sl@0
 | 
   442  | 
  | 
| 
sl@0
 | 
   443  | 
  /* All decomposed and reordered */ 
  | 
| 
sl@0
 | 
   444  | 
  | 
| 
sl@0
 | 
   445  | 
  if (do_compose && n_wc > 0)
  | 
| 
sl@0
 | 
   446  | 
    {
 | 
| 
sl@0
 | 
   447  | 
      gsize i, j;
  | 
| 
sl@0
 | 
   448  | 
      int last_cc = 0;
  | 
| 
sl@0
 | 
   449  | 
      last_start = 0;
  | 
| 
sl@0
 | 
   450  | 
      
  | 
| 
sl@0
 | 
   451  | 
      for (i = 0; i < n_wc; i++)
  | 
| 
sl@0
 | 
   452  | 
	{
 | 
| 
sl@0
 | 
   453  | 
	  int cc = COMBINING_CLASS (wc_buffer[i]);
  | 
| 
sl@0
 | 
   454  | 
  | 
| 
sl@0
 | 
   455  | 
	  if (i > 0 &&
  | 
| 
sl@0
 | 
   456  | 
	      (last_cc == 0 || last_cc < cc) &&
  | 
| 
sl@0
 | 
   457  | 
	      combine (wc_buffer[last_start], wc_buffer[i],
  | 
| 
sl@0
 | 
   458  | 
		       &wc_buffer[last_start]))
  | 
| 
sl@0
 | 
   459  | 
	    {
 | 
| 
sl@0
 | 
   460  | 
	      for (j = i + 1; j < n_wc; j++)
  | 
| 
sl@0
 | 
   461  | 
		wc_buffer[j-1] = wc_buffer[j];
  | 
| 
sl@0
 | 
   462  | 
	      n_wc--;
  | 
| 
sl@0
 | 
   463  | 
	      i--;
  | 
| 
sl@0
 | 
   464  | 
	      
  | 
| 
sl@0
 | 
   465  | 
	      if (i == last_start)
  | 
| 
sl@0
 | 
   466  | 
		last_cc = 0;
  | 
| 
sl@0
 | 
   467  | 
	      else
  | 
| 
sl@0
 | 
   468  | 
		last_cc = COMBINING_CLASS (wc_buffer[i-1]);
  | 
| 
sl@0
 | 
   469  | 
	      
  | 
| 
sl@0
 | 
   470  | 
	      continue;
  | 
| 
sl@0
 | 
   471  | 
	    }
  | 
| 
sl@0
 | 
   472  | 
  | 
| 
sl@0
 | 
   473  | 
	  if (cc == 0)
  | 
| 
sl@0
 | 
   474  | 
	    last_start = i;
  | 
| 
sl@0
 | 
   475  | 
  | 
| 
sl@0
 | 
   476  | 
	  last_cc = cc;
  | 
| 
sl@0
 | 
   477  | 
	}
  | 
| 
sl@0
 | 
   478  | 
    }
  | 
| 
sl@0
 | 
   479  | 
  | 
| 
sl@0
 | 
   480  | 
  wc_buffer[n_wc] = 0;
  | 
| 
sl@0
 | 
   481  | 
  | 
| 
sl@0
 | 
   482  | 
  return wc_buffer;
  | 
| 
sl@0
 | 
   483  | 
}
  | 
| 
sl@0
 | 
   484  | 
  | 
| 
sl@0
 | 
   485  | 
/**
  | 
| 
sl@0
 | 
   486  | 
 * g_utf8_normalize:
  | 
| 
sl@0
 | 
   487  | 
 * @str: a UTF-8 encoded string.
  | 
| 
sl@0
 | 
   488  | 
 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
  | 
| 
sl@0
 | 
   489  | 
 * @mode: the type of normalization to perform.
  | 
| 
sl@0
 | 
   490  | 
 *
  | 
| 
sl@0
 | 
   491  | 
 * Converts a string into canonical form, standardizing
  | 
| 
sl@0
 | 
   492  | 
 * such issues as whether a character with an accent
  | 
| 
sl@0
 | 
   493  | 
 * is represented as a base character and combining
  | 
| 
sl@0
 | 
   494  | 
 * accent or as a single precomposed character. The
  | 
| 
sl@0
 | 
   495  | 
 * string has to be valid UTF-8, otherwise %NULL is
  | 
| 
sl@0
 | 
   496  | 
 * returned. You should generally call g_utf8_normalize()
  | 
| 
sl@0
 | 
   497  | 
 * before comparing two Unicode strings.
  | 
| 
sl@0
 | 
   498  | 
 *
  | 
| 
sl@0
 | 
   499  | 
 * The normalization mode %G_NORMALIZE_DEFAULT only
  | 
| 
sl@0
 | 
   500  | 
 * standardizes differences that do not affect the
  | 
| 
sl@0
 | 
   501  | 
 * text content, such as the above-mentioned accent
  | 
| 
sl@0
 | 
   502  | 
 * representation. %G_NORMALIZE_ALL also standardizes
  | 
| 
sl@0
 | 
   503  | 
 * the "compatibility" characters in Unicode, such
  | 
| 
sl@0
 | 
   504  | 
 * as SUPERSCRIPT THREE to the standard forms
  | 
| 
sl@0
 | 
   505  | 
 * (in this case DIGIT THREE). Formatting information
  | 
| 
sl@0
 | 
   506  | 
 * may be lost but for most text operations such
  | 
| 
sl@0
 | 
   507  | 
 * characters should be considered the same.
  | 
| 
sl@0
 | 
   508  | 
 *
  | 
| 
sl@0
 | 
   509  | 
 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
  | 
| 
sl@0
 | 
   510  | 
 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
  | 
| 
sl@0
 | 
   511  | 
 * but returned a result with composed forms rather
  | 
| 
sl@0
 | 
   512  | 
 * than a maximally decomposed form. This is often
  | 
| 
sl@0
 | 
   513  | 
 * useful if you intend to convert the string to
  | 
| 
sl@0
 | 
   514  | 
 * a legacy encoding or pass it to a system with
  | 
| 
sl@0
 | 
   515  | 
 * less capable Unicode handling.
  | 
| 
sl@0
 | 
   516  | 
 *
  | 
| 
sl@0
 | 
   517  | 
 * Return value: a newly allocated string, that is the
  | 
| 
sl@0
 | 
   518  | 
 *   normalized form of @str, or %NULL if @str is not
  | 
| 
sl@0
 | 
   519  | 
 *   valid UTF-8.
  | 
| 
sl@0
 | 
   520  | 
 **/
  | 
| 
sl@0
 | 
   521  | 
EXPORT_C gchar *
  | 
| 
sl@0
 | 
   522  | 
g_utf8_normalize (const gchar    *str,
  | 
| 
sl@0
 | 
   523  | 
		  gssize          len,
  | 
| 
sl@0
 | 
   524  | 
		  GNormalizeMode  mode)
  | 
| 
sl@0
 | 
   525  | 
{
 | 
| 
sl@0
 | 
   526  | 
  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
  | 
| 
sl@0
 | 
   527  | 
  gchar *result;
  | 
| 
sl@0
 | 
   528  | 
  | 
| 
sl@0
 | 
   529  | 
  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
  | 
| 
sl@0
 | 
   530  | 
  g_free (result_wc);
  | 
| 
sl@0
 | 
   531  | 
  | 
| 
sl@0
 | 
   532  | 
  return result;
  | 
| 
sl@0
 | 
   533  | 
}
  | 
| 
sl@0
 | 
   534  | 
  | 
| 
sl@0
 | 
   535  | 
/* NOTE(review): presumably tells galiasdef.c which file is including
 * it, so that only this file's alias definitions are emitted -- confirm
 * against galiasdef.c.  Must stay after all function definitions.  */
#define __G_UNIDECOMP_C__
#include "galiasdef.c"
  |