os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,386 @@
1.4 +# uniParse.tcl --
1.5 +#
1.6 +# This program parses the UnicodeData file and generates the
1.7 +# corresponding tclUniData.c file with compressed character
1.8 +# data tables. The input to this program should be the latest
1.9 +# UnicodeData file from:
1.10 +# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
1.11 +#
1.12 +# Copyright (c) 1998-1999 by Scriptics Corporation.
1.13 +# All rights reserved.
1.14 +#
1.15 +# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
1.16 +
1.17 +
1.18 +namespace eval uni {
1.19 + set shift 5; # number of bits of data within a page
1.20 + # This value can be adjusted to find the
1.21 + # best split to minimize table size
1.22 +
1.23 + variable pMap; # map from page to page index, each entry is
1.24 + # an index into the pages table, indexed by
1.25 + # page number
1.26 + variable pages; # map from page index to page info, each
1.27 + # entry is a list of indices into the groups
1.28 + # table, the list is indexed by the offset
1.29 + variable groups; # list of character info values, indexed by
1.30 + # group number, initialized with the
1.31 + # unassigned character group
1.32 +
1.33 + variable categories {
1.34 + Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
1.35 + Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
1.36 + }; # Ordered list of character categories, must
1.37 + # match the enumeration in the header file.
1.38 +
1.39 + variable titleCount 0; # Count of the number of title case
1.40 + # characters. This value is used in the
1.41 + # regular expression code to allocate enough
1.42 + # space for the title case variants.
1.43 +}
1.44 +
1.45 +proc uni::getValue {items index} {
1.46 + variable categories
1.47 + variable titleCount
1.48 +
1.49 + # Extract character info
1.50 +
1.51 + set category [lindex $items 2]
1.52 + if {[scan [lindex $items 12] %4x toupper] == 1} {
1.53 + set toupper [expr {$index - $toupper}]
1.54 + } else {
1.55 + set toupper {}
1.56 + }
1.57 + if {[scan [lindex $items 13] %4x tolower] == 1} {
1.58 + set tolower [expr {$tolower - $index}]
1.59 + } else {
1.60 + set tolower {}
1.61 + }
1.62 + if {[scan [lindex $items 14] %4x totitle] == 1} {
1.63 + set totitle [expr {$index - $totitle}]
1.64 + } else {
1.65 + set totitle {}
1.66 + }
1.67 +
1.68 + set categoryIndex [lsearch -exact $categories $category]
1.69 + if {$categoryIndex < 0} {
1.70 + puts "Unexpected character category: $index($category)"
1.71 + set categoryIndex 0
1.72 + } elseif {$category == "Lt"} {
1.73 + incr titleCount
1.74 + }
1.75 +
1.76 + return "$categoryIndex,$toupper,$tolower,$totitle"
1.77 +}
1.78 +
1.79 +proc uni::getGroup {value} {
1.80 + variable groups
1.81 +
1.82 + set gIndex [lsearch -exact $groups $value]
1.83 + if {$gIndex == -1} {
1.84 + set gIndex [llength $groups]
1.85 + lappend groups $value
1.86 + }
1.87 + return $gIndex
1.88 +}
1.89 +
1.90 +proc uni::addPage {info} {
1.91 + variable pMap
1.92 + variable pages
1.93 +
1.94 + set pIndex [lsearch -exact $pages $info]
1.95 + if {$pIndex == -1} {
1.96 + set pIndex [llength $pages]
1.97 + lappend pages $info
1.98 + }
1.99 + lappend pMap $pIndex
1.100 + return
1.101 +}
1.102 +
1.103 +proc uni::buildTables {data} {
1.104 + variable shift
1.105 +
1.106 + variable pMap {}
1.107 + variable pages {}
1.108 + variable groups {{0,,,}}
1.109 + set info {} ;# temporary page info
1.110 +
1.111 + set mask [expr {(1 << $shift) - 1}]
1.112 +
1.113 + set next 0
1.114 +
1.115 + foreach line [split $data \n] {
1.116 + if {$line == ""} {
1.117 + set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
1.118 + }
1.119 +
1.120 + set items [split $line \;]
1.121 +
1.122 + scan [lindex $items 0] %4x index
1.123 + set index [format 0x%0.4x $index]
1.124 +
1.125 + set gIndex [getGroup [getValue $items $index]]
1.126 +
1.127 + # Since the input table omits unassigned characters, these will
1.128 + # show up as gaps in the index sequence. There are a few special cases
1.129 + # where the gaps correspond to a uniform block of assigned characters.
1.130 + # These are indicated as such in the character name.
1.131 +
1.132 + # Enter all unassigned characters up to the current character.
1.133 + if {($index > $next) \
1.134 + && ![regexp "Last>$" [lindex $items 1]]} {
1.135 + for {} {$next < $index} {incr next} {
1.136 + lappend info 0
1.137 + if {($next & $mask) == $mask} {
1.138 + addPage $info
1.139 + set info {}
1.140 + }
1.141 + }
1.142 + }
1.143 +
1.144 + # Enter all assigned characters up to the current character
1.145 + for {set i $next} {$i <= $index} {incr i} {
1.146 + # Split character index into offset and page number
1.147 + set offset [expr {$i & $mask}]
1.148 + set page [expr {($i >> $shift)}]
1.149 +
1.150 + # Add the group index to the info for the current page
1.151 + lappend info $gIndex
1.152 +
1.153 + # If this is the last entry in the page, add the page
1.154 + if {$offset == $mask} {
1.155 + addPage $info
1.156 + set info {}
1.157 + }
1.158 + }
1.159 + set next [expr {$index + 1}]
1.160 + }
1.161 + return
1.162 +}
1.163 +
1.164 +proc uni::main {} {
1.165 + global argc argv0 argv
1.166 + variable pMap
1.167 + variable pages
1.168 + variable groups
1.169 + variable shift
1.170 + variable titleCount
1.171 +
1.172 + if {$argc != 2} {
1.173 + puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
1.174 + exit 1
1.175 + }
1.176 + set f [open [lindex $argv 0] r]
1.177 + set data [read $f]
1.178 + close $f
1.179 +
1.180 + buildTables $data
1.181 + puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
1.182 + set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
1.183 + puts "shift = 6, space = $size"
1.184 + puts "title case count = $titleCount"
1.185 +
1.186 + set f [open [file join [lindex $argv 1] tclUniData.c] w]
1.187 + fconfigure $f -translation lf
1.188 + puts $f "/*
1.189 + * tclUniData.c --
1.190 + *
1.191 + * Declarations of Unicode character information tables. This file is
1.192 + * automatically generated by the tools/uniParse.tcl script. Do not
1.193 + * modify this file by hand.
1.194 + *
1.195 + * Copyright (c) 1998 by Scriptics Corporation.
1.196 + * All rights reserved.
1.197 + *
1.198 + * RCS: @(#) \$Id\$
1.199 + */
1.200 +
1.201 +/*
1.202 + * A 16-bit Unicode character is split into two parts in order to index
1.203 + * into the following tables. The lower OFFSET_BITS comprise an offset
1.204 + * into a page of characters. The upper bits comprise the page number.
1.205 + */
1.206 +
1.207 +#define OFFSET_BITS $shift
1.208 +
1.209 +/*
1.210 + * The pageMap is indexed by page number and returns an alternate page number
1.211 + * that identifies a unique page of characters. Many Unicode characters map
1.212 + * to the same alternate page number.
1.213 + */
1.214 +
1.215 +static unsigned char pageMap\[\] = {"
1.216 + set line " "
1.217 + set last [expr {[llength $pMap] - 1}]
1.218 + for {set i 0} {$i <= $last} {incr i} {
1.219 + append line [lindex $pMap $i]
1.220 + if {$i != $last} {
1.221 + append line ", "
1.222 + }
1.223 + if {[string length $line] > 70} {
1.224 + puts $f $line
1.225 + set line " "
1.226 + }
1.227 + }
1.228 + puts $f $line
1.229 + puts $f "};
1.230 +
1.231 +/*
1.232 + * The groupMap is indexed by combining the alternate page number with
1.233 + * the page offset and returns a group number that identifies a unique
1.234 + * set of character attributes.
1.235 + */
1.236 +
1.237 +static unsigned char groupMap\[\] = {"
1.238 + set line " "
1.239 + set lasti [expr {[llength $pages] - 1}]
1.240 + for {set i 0} {$i <= $lasti} {incr i} {
1.241 + set page [lindex $pages $i]
1.242 + set lastj [expr {[llength $page] - 1}]
1.243 + for {set j 0} {$j <= $lastj} {incr j} {
1.244 + append line [lindex $page $j]
1.245 + if {$j != $lastj || $i != $lasti} {
1.246 + append line ", "
1.247 + }
1.248 + if {[string length $line] > 70} {
1.249 + puts $f $line
1.250 + set line " "
1.251 + }
1.252 + }
1.253 + }
1.254 + puts $f $line
1.255 + puts $f "};
1.256 +
1.257 +/*
1.258 + * Each group represents a unique set of character attributes. The attributes
1.259 + * are encoded into a 32-bit value as follows:
1.260 + *
1.261 + * Bits 0-4 Character category: see the constants listed below.
1.262 + *
1.263 + * Bits 5-7 Case delta type: 000 = identity
1.264 + * 010 = add delta for lower
1.265 + * 011 = add delta for lower, add 1 for title
1.266 + * 100 = sutract delta for title/upper
1.267 + * 101 = sub delta for upper, sub 1 for title
1.268 + * 110 = sub delta for upper, add delta for lower
1.269 + *
1.270 + * Bits 8-21 Reserved for future use.
1.271 + *
1.272 + * Bits 22-31 Case delta: delta for case conversions. This should be the
1.273 + * highest field so we can easily sign extend.
1.274 + */
1.275 +
1.276 +static int groups\[\] = {"
1.277 + set line " "
1.278 + set last [expr {[llength $groups] - 1}]
1.279 + for {set i 0} {$i <= $last} {incr i} {
1.280 + foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
1.281 +
1.282 + # Compute the case conversion type and delta
1.283 +
1.284 + if {$totitle != ""} {
1.285 + if {$totitle == $toupper} {
1.286 + # subtract delta for title or upper
1.287 + set case 4
1.288 + set delta $toupper
1.289 + } elseif {$toupper != ""} {
1.290 + # subtract delta for upper, subtract 1 for title
1.291 + set case 5
1.292 + set delta $toupper
1.293 + } else {
1.294 + # add delta for lower, add 1 for title
1.295 + set case 3
1.296 + set delta $tolower
1.297 + }
1.298 + } elseif {$toupper != ""} {
1.299 + # subtract delta for upper, add delta for lower
1.300 + set case 6
1.301 + set delta $toupper
1.302 + } elseif {$tolower != ""} {
1.303 + # add delta for lower
1.304 + set case 2
1.305 + set delta $tolower
1.306 + } else {
1.307 + # noop
1.308 + set case 0
1.309 + set delta 0
1.310 + }
1.311 +
1.312 + set val [expr {($delta << 22) | ($case << 5) | $type}]
1.313 +
1.314 + append line [format "%d" $val]
1.315 + if {$i != $last} {
1.316 + append line ", "
1.317 + }
1.318 + if {[string length $line] > 65} {
1.319 + puts $f $line
1.320 + set line " "
1.321 + }
1.322 + }
1.323 + puts $f $line
1.324 + puts $f "};
1.325 +
1.326 +/*
1.327 + * The following constants are used to determine the category of a
1.328 + * Unicode character.
1.329 + */
1.330 +
1.331 +#define UNICODE_CATEGORY_MASK 0X1F
1.332 +
1.333 +enum {
1.334 + UNASSIGNED,
1.335 + UPPERCASE_LETTER,
1.336 + LOWERCASE_LETTER,
1.337 + TITLECASE_LETTER,
1.338 + MODIFIER_LETTER,
1.339 + OTHER_LETTER,
1.340 + NON_SPACING_MARK,
1.341 + ENCLOSING_MARK,
1.342 + COMBINING_SPACING_MARK,
1.343 + DECIMAL_DIGIT_NUMBER,
1.344 + LETTER_NUMBER,
1.345 + OTHER_NUMBER,
1.346 + SPACE_SEPARATOR,
1.347 + LINE_SEPARATOR,
1.348 + PARAGRAPH_SEPARATOR,
1.349 + CONTROL,
1.350 + FORMAT,
1.351 + PRIVATE_USE,
1.352 + SURROGATE,
1.353 + CONNECTOR_PUNCTUATION,
1.354 + DASH_PUNCTUATION,
1.355 + OPEN_PUNCTUATION,
1.356 + CLOSE_PUNCTUATION,
1.357 + INITIAL_QUOTE_PUNCTUATION,
1.358 + FINAL_QUOTE_PUNCTUATION,
1.359 + OTHER_PUNCTUATION,
1.360 + MATH_SYMBOL,
1.361 + CURRENCY_SYMBOL,
1.362 + MODIFIER_SYMBOL,
1.363 + OTHER_SYMBOL
1.364 +};
1.365 +
1.366 +/*
1.367 + * The following macros extract the fields of the character info. The
1.368 + * GetDelta() macro is complicated because we can't rely on the C compiler
1.369 + * to do sign extension on right shifts.
1.370 + */
1.371 +
1.372 +#define GetCaseType(info) (((info) & 0xE0) >> 5)
1.373 +#define GetCategory(info) ((info) & 0x1F)
1.374 +#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
1.375 +
1.376 +/*
1.377 + * This macro extracts the information about a character from the
1.378 + * Unicode character tables.
1.379 + */
1.380 +
1.381 +#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
1.382 +"
1.383 +
1.384 + close $f
1.385 +}
1.386 +
1.387 +uni::main
1.388 +
1.389 +return