os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 # uniParse.tcl --
     2 #
     3 #	This program parses the UnicodeData file and generates the
     4 #	corresponding tclUniData.c file with compressed character
     5 #	data tables.  The input to this program should be the latest
     6 #	UnicodeData file from:
     7 #	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
     8 #
     9 # Copyright (c) 1998-1999 by Scriptics Corporation.
    10 # All rights reserved.
    11 # 
    12 # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
    13 
    14 
    15 namespace eval uni {
    16     set shift 5;		# number of bits of data within a page
    17 				# This value can be adjusted to find the
    18 				# best split to minimize table size
    19 
    20     variable pMap;		# map from page to page index, each entry is
    21 				# an index into the pages table, indexed by
    22 				# page number
    23     variable pages;		# map from page index to page info, each
    24 				# entry is a list of indices into the groups
    25 				# table, the list is indexed by the offset
    26     variable groups;		# list of character info values, indexed by
    27 				# group number, initialized with the
    28 				# unassigned character group
    29 
    30     variable categories {
    31 	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
    32 	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
    33     };				# Ordered list of character categories, must
    34 				# match the enumeration in the header file.
    35 
    36     variable titleCount 0;	# Count of the number of title case
    37 				# characters.  This value is used in the
    38 				# regular expression code to allocate enough
    39 				# space for the title case variants.
    40 }
    41 
    42 proc uni::getValue {items index} {
    43     variable categories
    44     variable titleCount
    45 
    46     # Extract character info
    47 
    48     set category [lindex $items 2]
    49     if {[scan [lindex $items 12] %4x toupper] == 1} {
    50 	set toupper [expr {$index - $toupper}]
    51     } else {
    52 	set toupper {}
    53     }
    54     if {[scan [lindex $items 13] %4x tolower] == 1} {
    55 	set tolower [expr {$tolower - $index}]
    56     } else {
    57 	set tolower {}
    58     }
    59     if {[scan [lindex $items 14] %4x totitle] == 1} {
    60 	set totitle [expr {$index - $totitle}]
    61     } else {
    62 	set totitle {}
    63     }
    64 
    65     set categoryIndex [lsearch -exact $categories $category]
    66     if {$categoryIndex < 0} {
    67 	puts "Unexpected character category: $index($category)"
    68 	set categoryIndex 0
    69     } elseif {$category == "Lt"} {
    70 	incr titleCount
    71     }
    72 
    73     return "$categoryIndex,$toupper,$tolower,$totitle"
    74 }
    75 
    76 proc uni::getGroup {value} {
    77     variable groups
    78 
    79     set gIndex [lsearch -exact $groups $value]
    80     if {$gIndex == -1} {
    81 	set gIndex [llength $groups]
    82 	lappend groups $value
    83     }
    84     return $gIndex
    85 }
    86 
    87 proc uni::addPage {info} {
    88     variable pMap
    89     variable pages
    90     
    91     set pIndex [lsearch -exact $pages $info]
    92     if {$pIndex == -1} {
    93 	set pIndex [llength $pages]
    94 	lappend pages $info
    95     }
    96     lappend pMap $pIndex
    97     return
    98 }
    99     
   100 proc uni::buildTables {data} {
   101     variable shift
   102 
   103     variable pMap {}
   104     variable pages {}
   105     variable groups {{0,,,}}
   106     set info {}			;# temporary page info
   107     
   108     set mask [expr {(1 << $shift) - 1}]
   109 
   110     set next 0
   111 
   112     foreach line [split $data \n] {
   113 	if {$line == ""} {
   114 	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
   115 	}
   116 
   117 	set items [split $line \;]
   118 
   119 	scan [lindex $items 0] %4x index
   120 	set index [format 0x%0.4x $index]
   121 	
   122 	set gIndex [getGroup [getValue $items $index]]
   123 
   124 	# Since the input table omits unassigned characters, these will
   125 	# show up as gaps in the index sequence.  There are a few special cases
   126 	# where the gaps correspond to a uniform block of assigned characters.
   127 	# These are indicated as such in the character name.
   128 
   129 	# Enter all unassigned characters up to the current character.
   130 	if {($index > $next) \
   131 		&& ![regexp "Last>$" [lindex $items 1]]} {
   132 	    for {} {$next < $index} {incr next} {
   133 		lappend info 0
   134 		if {($next & $mask) == $mask} {
   135 		    addPage $info
   136 		    set info {}
   137 		}
   138 	    }
   139 	}
   140 
   141 	# Enter all assigned characters up to the current character
   142 	for {set i $next} {$i <= $index} {incr i} {
   143 	    # Split character index into offset and page number
   144 	    set offset [expr {$i & $mask}]
   145 	    set page [expr {($i >> $shift)}]
   146 
   147 	    # Add the group index to the info for the current page
   148 	    lappend info $gIndex
   149 
   150 	    # If this is the last entry in the page, add the page
   151 	    if {$offset == $mask} {
   152 		addPage $info
   153 		set info {}
   154 	    }
   155 	}
   156 	set next [expr {$index + 1}]
   157     }
   158     return
   159 }
   160 
   161 proc uni::main {} {
   162     global argc argv0 argv
   163     variable pMap
   164     variable pages
   165     variable groups
   166     variable shift
   167     variable titleCount
   168 
   169     if {$argc != 2} {
   170 	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
   171 	exit 1
   172     }
   173     set f [open [lindex $argv 0] r]
   174     set data [read $f]
   175     close $f
   176 
   177     buildTables $data
   178     puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
   179     set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
   180     puts "shift = 6, space = $size"
   181     puts "title case count = $titleCount"
   182 
   183     set f [open [file join [lindex $argv 1] tclUniData.c] w]
   184     fconfigure $f -translation lf
   185     puts $f "/*
   186  * tclUniData.c --
   187  *
   188  *	Declarations of Unicode character information tables.  This file is
   189  *	automatically generated by the tools/uniParse.tcl script.  Do not
   190  *	modify this file by hand.
   191  *
   192  * Copyright (c) 1998 by Scriptics Corporation.
   193  * All rights reserved.
   194  *
   195  * RCS: @(#) \$Id\$
   196  */
   197 
   198 /*
   199  * A 16-bit Unicode character is split into two parts in order to index
   200  * into the following tables.  The lower OFFSET_BITS comprise an offset
   201  * into a page of characters.  The upper bits comprise the page number.
   202  */
   203 
   204 #define OFFSET_BITS $shift
   205 
   206 /*
   207  * The pageMap is indexed by page number and returns an alternate page number
   208  * that identifies a unique page of characters.  Many Unicode characters map
   209  * to the same alternate page number.
   210  */
   211 
   212 static unsigned char pageMap\[\] = {"
   213     set line "    "
   214     set last [expr {[llength $pMap] - 1}]
   215     for {set i 0} {$i <= $last} {incr i} {
   216 	append line [lindex $pMap $i]
   217 	if {$i != $last} {
   218 	    append line ", "
   219 	}
   220 	if {[string length $line] > 70} {
   221 	    puts $f $line
   222 	    set line "    "
   223 	}
   224     }
   225     puts $f $line
   226     puts $f "};
   227 
   228 /*
   229  * The groupMap is indexed by combining the alternate page number with
   230  * the page offset and returns a group number that identifies a unique
   231  * set of character attributes.
   232  */
   233 
   234 static unsigned char groupMap\[\] = {"
   235     set line "    "
   236     set lasti [expr {[llength $pages] - 1}]
   237     for {set i 0} {$i <= $lasti} {incr i} {
   238 	set page [lindex $pages $i]
   239 	set lastj [expr {[llength $page] - 1}]
   240 	for {set j 0} {$j <= $lastj} {incr j} {
   241 	    append line [lindex $page $j]
   242 	    if {$j != $lastj || $i != $lasti} {
   243 		append line ", "
   244 	    }
   245 	    if {[string length $line] > 70} {
   246 		puts $f $line
   247 		set line "    "
   248 	    }
   249 	}
   250     }
   251     puts $f $line
   252     puts $f "};
   253 
   254 /*
   255  * Each group represents a unique set of character attributes.  The attributes
   256  * are encoded into a 32-bit value as follows:
   257  *
   258  * Bits 0-4	Character category: see the constants listed below.
   259  *
   260  * Bits 5-7	Case delta type: 000 = identity
   261  *				 010 = add delta for lower
   262  *				 011 = add delta for lower, add 1 for title
   263  *				 100 = sutract delta for title/upper
   264  *				 101 = sub delta for upper, sub 1 for title
   265  *				 110 = sub delta for upper, add delta for lower
   266  *
   267  * Bits 8-21	Reserved for future use.
   268  *
   269  * Bits 22-31	Case delta: delta for case conversions.  This should be the
   270  *			    highest field so we can easily sign extend.
   271  */
   272 
   273 static int groups\[\] = {"
   274     set line "    "
   275     set last [expr {[llength $groups] - 1}]
   276     for {set i 0} {$i <= $last} {incr i} {
   277 	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
   278 	
   279 	# Compute the case conversion type and delta
   280 
   281 	if {$totitle != ""} {
   282 	    if {$totitle == $toupper} {
   283 		# subtract delta for title or upper
   284 		set case 4
   285 		set delta $toupper
   286 	    } elseif {$toupper != ""} {
   287 		# subtract delta for upper, subtract 1 for title
   288 		set case 5
   289 		set delta $toupper
   290 	    } else {
   291 		# add delta for lower, add 1 for title
   292 		set case 3
   293 		set delta $tolower
   294 	    }
   295 	} elseif {$toupper != ""} {
   296 	    # subtract delta for upper, add delta for lower
   297 	    set case 6
   298 	    set delta $toupper
   299 	} elseif {$tolower != ""} {
   300 	    # add delta for lower
   301 	    set case 2
   302 	    set delta $tolower
   303 	} else {
   304 	    # noop
   305 	    set case 0
   306 	    set delta 0
   307 	}
   308 
   309 	set val [expr {($delta << 22) | ($case << 5) | $type}]
   310 
   311 	append line [format "%d" $val]
   312 	if {$i != $last} {
   313 	    append line ", "
   314 	}
   315 	if {[string length $line] > 65} {
   316 	    puts $f $line
   317 	    set line "    "
   318 	}
   319     }
   320     puts $f $line
   321     puts $f "};
   322 
   323 /*
   324  * The following constants are used to determine the category of a
   325  * Unicode character.
   326  */
   327 
   328 #define UNICODE_CATEGORY_MASK 0X1F
   329 
   330 enum {
   331     UNASSIGNED,
   332     UPPERCASE_LETTER,
   333     LOWERCASE_LETTER,
   334     TITLECASE_LETTER,
   335     MODIFIER_LETTER,
   336     OTHER_LETTER,
   337     NON_SPACING_MARK,
   338     ENCLOSING_MARK,
   339     COMBINING_SPACING_MARK,
   340     DECIMAL_DIGIT_NUMBER,
   341     LETTER_NUMBER,
   342     OTHER_NUMBER,
   343     SPACE_SEPARATOR,
   344     LINE_SEPARATOR,
   345     PARAGRAPH_SEPARATOR,
   346     CONTROL,
   347     FORMAT,
   348     PRIVATE_USE,
   349     SURROGATE,
   350     CONNECTOR_PUNCTUATION,
   351     DASH_PUNCTUATION,
   352     OPEN_PUNCTUATION,
   353     CLOSE_PUNCTUATION,
   354     INITIAL_QUOTE_PUNCTUATION,
   355     FINAL_QUOTE_PUNCTUATION,
   356     OTHER_PUNCTUATION,
   357     MATH_SYMBOL,
   358     CURRENCY_SYMBOL,
   359     MODIFIER_SYMBOL,
   360     OTHER_SYMBOL
   361 };
   362 
   363 /*
   364  * The following macros extract the fields of the character info.  The
   365  * GetDelta() macro is complicated because we can't rely on the C compiler
   366  * to do sign extension on right shifts.
   367  */
   368 
   369 #define GetCaseType(info) (((info) & 0xE0) >> 5)
   370 #define GetCategory(info) ((info) & 0x1F)
   371 #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
   372 
   373 /*
   374  * This macro extracts the information about a character from the
   375  * Unicode character tables.
   376  */
   377 
   378 #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
   379 "
   380 
   381     close $f
   382 }
   383 
   384 uni::main
   385 
   386 return