os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl
Update contrib.
3 # This program parses the UnicodeData file and generates the
4 # corresponding tclUniData.c file with compressed character
5 # data tables. The input to this program should be the latest
6 # UnicodeData file from:
7 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
9 # Copyright (c) 1998-1999 by Scriptics Corporation.
10 # All rights reserved.
12 # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
16 set shift 5; # number of bits of data within a page
17 # This value can be adjusted to find the
18 # best split to minimize table size
20 variable pMap; # map from page to page index, each entry is
21 # an index into the pages table, indexed by
23 variable pages; # map from page index to page info, each
24 # entry is a list of indices into the groups
25 # table, the list is indexed by the offset
26 variable groups; # list of character info values, indexed by
27 # group number, initialized with the
28 # unassigned character group
31 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
32 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
33 }; # Ordered list of character categories, must
34 # match the enumeration in the header file.
36 variable titleCount 0; # Count of the number of title case
37 # characters. This value is used in the
38 # regular expression code to allocate enough
39 # space for the title case variants.
42 proc uni::getValue {items index} {
46 # Extract character info
48 set category [lindex $items 2]
49 if {[scan [lindex $items 12] %4x toupper] == 1} {
50 set toupper [expr {$index - $toupper}]
54 if {[scan [lindex $items 13] %4x tolower] == 1} {
55 set tolower [expr {$tolower - $index}]
59 if {[scan [lindex $items 14] %4x totitle] == 1} {
60 set totitle [expr {$index - $totitle}]
65 set categoryIndex [lsearch -exact $categories $category]
66 if {$categoryIndex < 0} {
67 puts "Unexpected character category: $index($category)"
69 } elseif {$category == "Lt"} {
73 return "$categoryIndex,$toupper,$tolower,$totitle"
76 proc uni::getGroup {value} {
79 set gIndex [lsearch -exact $groups $value]
81 set gIndex [llength $groups]
87 proc uni::addPage {info} {
91 set pIndex [lsearch -exact $pages $info]
93 set pIndex [llength $pages]
100 proc uni::buildTables {data} {
105 variable groups {{0,,,}}
106 set info {} ;# temporary page info
108 set mask [expr {(1 << $shift) - 1}]
112 foreach line [split $data \n] {
114 set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
117 set items [split $line \;]
119 scan [lindex $items 0] %4x index
120 set index [format 0x%0.4x $index]
122 set gIndex [getGroup [getValue $items $index]]
124 # Since the input table omits unassigned characters, these will
125 # show up as gaps in the index sequence. There are a few special cases
126 # where the gaps correspond to a uniform block of assigned characters.
127 # These are indicated as such in the character name.
129 # Enter all unassigned characters up to the current character.
130 if {($index > $next) \
131 && ![regexp "Last>$" [lindex $items 1]]} {
132 for {} {$next < $index} {incr next} {
134 if {($next & $mask) == $mask} {
141 # Enter all assigned characters up to the current character
142 for {set i $next} {$i <= $index} {incr i} {
143 # Split character index into offset and page number
144 set offset [expr {$i & $mask}]
145 set page [expr {($i >> $shift)}]
147 # Add the group index to the info for the current page
150 # If this is the last entry in the page, add the page
151 if {$offset == $mask} {
156 set next [expr {$index + 1}]
162 global argc argv0 argv
170 puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
173 set f [open [lindex $argv 0] r]
178 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
179 set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
180 puts "shift = 6, space = $size"
181 puts "title case count = $titleCount"
183 set f [open [file join [lindex $argv 1] tclUniData.c] w]
184 fconfigure $f -translation lf
188 * Declarations of Unicode character information tables. This file is
189 * automatically generated by the tools/uniParse.tcl script. Do not
190 * modify this file by hand.
192 * Copyright (c) 1998 by Scriptics Corporation.
193 * All rights reserved.
199 * A 16-bit Unicode character is split into two parts in order to index
200 * into the following tables. The lower OFFSET_BITS comprise an offset
201 * into a page of characters. The upper bits comprise the page number.
204 #define OFFSET_BITS $shift
207 * The pageMap is indexed by page number and returns an alternate page number
208 * that identifies a unique page of characters. Many Unicode characters map
209 * to the same alternate page number.
212 static unsigned char pageMap\[\] = {"
214 set last [expr {[llength $pMap] - 1}]
215 for {set i 0} {$i <= $last} {incr i} {
216 append line [lindex $pMap $i]
220 if {[string length $line] > 70} {
229 * The groupMap is indexed by combining the alternate page number with
230 * the page offset and returns a group number that identifies a unique
231 * set of character attributes.
234 static unsigned char groupMap\[\] = {"
236 set lasti [expr {[llength $pages] - 1}]
237 for {set i 0} {$i <= $lasti} {incr i} {
238 set page [lindex $pages $i]
239 set lastj [expr {[llength $page] - 1}]
240 for {set j 0} {$j <= $lastj} {incr j} {
241 append line [lindex $page $j]
242 if {$j != $lastj || $i != $lasti} {
245 if {[string length $line] > 70} {
255 * Each group represents a unique set of character attributes. The attributes
256 * are encoded into a 32-bit value as follows:
258 * Bits 0-4 Character category: see the constants listed below.
260 * Bits 5-7 Case delta type: 000 = identity
261 * 010 = add delta for lower
262 * 011 = add delta for lower, add 1 for title
263 * 100 = sutract delta for title/upper
264 * 101 = sub delta for upper, sub 1 for title
265 * 110 = sub delta for upper, add delta for lower
267 * Bits 8-21 Reserved for future use.
269 * Bits 22-31 Case delta: delta for case conversions. This should be the
270 * highest field so we can easily sign extend.
273 static int groups\[\] = {"
275 set last [expr {[llength $groups] - 1}]
276 for {set i 0} {$i <= $last} {incr i} {
277 foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
279 # Compute the case conversion type and delta
281 if {$totitle != ""} {
282 if {$totitle == $toupper} {
283 # subtract delta for title or upper
286 } elseif {$toupper != ""} {
287 # subtract delta for upper, subtract 1 for title
291 # add delta for lower, add 1 for title
295 } elseif {$toupper != ""} {
296 # subtract delta for upper, add delta for lower
299 } elseif {$tolower != ""} {
300 # add delta for lower
309 set val [expr {($delta << 22) | ($case << 5) | $type}]
311 append line [format "%d" $val]
315 if {[string length $line] > 65} {
324 * The following constants are used to determine the category of a
328 #define UNICODE_CATEGORY_MASK 0X1F
339 COMBINING_SPACING_MARK,
340 DECIMAL_DIGIT_NUMBER,
350 CONNECTOR_PUNCTUATION,
354 INITIAL_QUOTE_PUNCTUATION,
355 FINAL_QUOTE_PUNCTUATION,
364 * The following macros extract the fields of the character info. The
365 * GetDelta() macro is complicated because we can't rely on the C compiler
366 * to do sign extension on right shifts.
369 #define GetCaseType(info) (((info) & 0xE0) >> 5)
370 #define GetCategory(info) ((info) & 0x1F)
371 #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
374 * This macro extracts the information about a character from the
375 * Unicode character tables.
378 #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])