os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,386 @@
     1.4 +# uniParse.tcl --
     1.5 +#
     1.6 +#	This program parses the UnicodeData file and generates the
     1.7 +#	corresponding tclUniData.c file with compressed character
     1.8 +#	data tables.  The input to this program should be the latest
     1.9 +#	UnicodeData file from:
    1.10 +#	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
    1.11 +#
    1.12 +# Copyright (c) 1998-1999 by Scriptics Corporation.
    1.13 +# All rights reserved.
    1.14 +# 
    1.15 +# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
    1.16 +
    1.17 +
    1.18 +namespace eval uni {
    1.19 +    set shift 5;		# number of bits of data within a page
    1.20 +				# This value can be adjusted to find the
    1.21 +				# best split to minimize table size
    1.22 +
    1.23 +    variable pMap;		# map from page to page index, each entry is
    1.24 +				# an index into the pages table, indexed by
    1.25 +				# page number
    1.26 +    variable pages;		# map from page index to page info, each
    1.27 +				# entry is a list of indices into the groups
    1.28 +				# table, the list is indexed by the offset
    1.29 +    variable groups;		# list of character info values, indexed by
    1.30 +				# group number, initialized with the
    1.31 +				# unassigned character group
    1.32 +
    1.33 +    variable categories {
    1.34 +	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
    1.35 +	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
    1.36 +    };				# Ordered list of character categories, must
    1.37 +				# match the enumeration in the header file.
    1.38 +
    1.39 +    variable titleCount 0;	# Count of the number of title case
    1.40 +				# characters.  This value is used in the
    1.41 +				# regular expression code to allocate enough
    1.42 +				# space for the title case variants.
    1.43 +}
    1.44 +
    1.45 +proc uni::getValue {items index} {
    1.46 +    variable categories
    1.47 +    variable titleCount
    1.48 +
    1.49 +    # Extract character info
    1.50 +
    1.51 +    set category [lindex $items 2]
    1.52 +    if {[scan [lindex $items 12] %4x toupper] == 1} {
    1.53 +	set toupper [expr {$index - $toupper}]
    1.54 +    } else {
    1.55 +	set toupper {}
    1.56 +    }
    1.57 +    if {[scan [lindex $items 13] %4x tolower] == 1} {
    1.58 +	set tolower [expr {$tolower - $index}]
    1.59 +    } else {
    1.60 +	set tolower {}
    1.61 +    }
    1.62 +    if {[scan [lindex $items 14] %4x totitle] == 1} {
    1.63 +	set totitle [expr {$index - $totitle}]
    1.64 +    } else {
    1.65 +	set totitle {}
    1.66 +    }
    1.67 +
    1.68 +    set categoryIndex [lsearch -exact $categories $category]
    1.69 +    if {$categoryIndex < 0} {
    1.70 +	puts "Unexpected character category: $index($category)"
    1.71 +	set categoryIndex 0
    1.72 +    } elseif {$category == "Lt"} {
    1.73 +	incr titleCount
    1.74 +    }
    1.75 +
    1.76 +    return "$categoryIndex,$toupper,$tolower,$totitle"
    1.77 +}
    1.78 +
    1.79 +proc uni::getGroup {value} {
    1.80 +    variable groups
    1.81 +
    1.82 +    set gIndex [lsearch -exact $groups $value]
    1.83 +    if {$gIndex == -1} {
    1.84 +	set gIndex [llength $groups]
    1.85 +	lappend groups $value
    1.86 +    }
    1.87 +    return $gIndex
    1.88 +}
    1.89 +
    1.90 +proc uni::addPage {info} {
    1.91 +    variable pMap
    1.92 +    variable pages
    1.93 +    
    1.94 +    set pIndex [lsearch -exact $pages $info]
    1.95 +    if {$pIndex == -1} {
    1.96 +	set pIndex [llength $pages]
    1.97 +	lappend pages $info
    1.98 +    }
    1.99 +    lappend pMap $pIndex
   1.100 +    return
   1.101 +}
   1.102 +    
   1.103 +proc uni::buildTables {data} {
   1.104 +    variable shift
   1.105 +
   1.106 +    variable pMap {}
   1.107 +    variable pages {}
   1.108 +    variable groups {{0,,,}}
   1.109 +    set info {}			;# temporary page info
   1.110 +    
   1.111 +    set mask [expr {(1 << $shift) - 1}]
   1.112 +
   1.113 +    set next 0
   1.114 +
   1.115 +    foreach line [split $data \n] {
   1.116 +	if {$line == ""} {
   1.117 +	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
   1.118 +	}
   1.119 +
   1.120 +	set items [split $line \;]
   1.121 +
   1.122 +	scan [lindex $items 0] %4x index
   1.123 +	set index [format 0x%0.4x $index]
   1.124 +	
   1.125 +	set gIndex [getGroup [getValue $items $index]]
   1.126 +
   1.127 +	# Since the input table omits unassigned characters, these will
   1.128 +	# show up as gaps in the index sequence.  There are a few special cases
   1.129 +	# where the gaps correspond to a uniform block of assigned characters.
   1.130 +	# These are indicated as such in the character name.
   1.131 +
   1.132 +	# Enter all unassigned characters up to the current character.
   1.133 +	if {($index > $next) \
   1.134 +		&& ![regexp "Last>$" [lindex $items 1]]} {
   1.135 +	    for {} {$next < $index} {incr next} {
   1.136 +		lappend info 0
   1.137 +		if {($next & $mask) == $mask} {
   1.138 +		    addPage $info
   1.139 +		    set info {}
   1.140 +		}
   1.141 +	    }
   1.142 +	}
   1.143 +
   1.144 +	# Enter all assigned characters up to the current character
   1.145 +	for {set i $next} {$i <= $index} {incr i} {
   1.146 +	    # Split character index into offset and page number
   1.147 +	    set offset [expr {$i & $mask}]
   1.148 +	    set page [expr {($i >> $shift)}]
   1.149 +
   1.150 +	    # Add the group index to the info for the current page
   1.151 +	    lappend info $gIndex
   1.152 +
   1.153 +	    # If this is the last entry in the page, add the page
   1.154 +	    if {$offset == $mask} {
   1.155 +		addPage $info
   1.156 +		set info {}
   1.157 +	    }
   1.158 +	}
   1.159 +	set next [expr {$index + 1}]
   1.160 +    }
   1.161 +    return
   1.162 +}
   1.163 +
   1.164 +proc uni::main {} {
   1.165 +    global argc argv0 argv
   1.166 +    variable pMap
   1.167 +    variable pages
   1.168 +    variable groups
   1.169 +    variable shift
   1.170 +    variable titleCount
   1.171 +
   1.172 +    if {$argc != 2} {
   1.173 +	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
   1.174 +	exit 1
   1.175 +    }
   1.176 +    set f [open [lindex $argv 0] r]
   1.177 +    set data [read $f]
   1.178 +    close $f
   1.179 +
   1.180 +    buildTables $data
   1.181 +    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
   1.182 +    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
   1.183 +    puts "shift = 6, space = $size"
   1.184 +    puts "title case count = $titleCount"
   1.185 +
   1.186 +    set f [open [file join [lindex $argv 1] tclUniData.c] w]
   1.187 +    fconfigure $f -translation lf
   1.188 +    puts $f "/*
   1.189 + * tclUniData.c --
   1.190 + *
   1.191 + *	Declarations of Unicode character information tables.  This file is
   1.192 + *	automatically generated by the tools/uniParse.tcl script.  Do not
   1.193 + *	modify this file by hand.
   1.194 + *
   1.195 + * Copyright (c) 1998 by Scriptics Corporation.
   1.196 + * All rights reserved.
   1.197 + *
   1.198 + * RCS: @(#) \$Id\$
   1.199 + */
   1.200 +
   1.201 +/*
   1.202 + * A 16-bit Unicode character is split into two parts in order to index
   1.203 + * into the following tables.  The lower OFFSET_BITS comprise an offset
   1.204 + * into a page of characters.  The upper bits comprise the page number.
   1.205 + */
   1.206 +
   1.207 +#define OFFSET_BITS $shift
   1.208 +
   1.209 +/*
   1.210 + * The pageMap is indexed by page number and returns an alternate page number
   1.211 + * that identifies a unique page of characters.  Many Unicode characters map
   1.212 + * to the same alternate page number.
   1.213 + */
   1.214 +
   1.215 +static unsigned char pageMap\[\] = {"
   1.216 +    set line "    "
   1.217 +    set last [expr {[llength $pMap] - 1}]
   1.218 +    for {set i 0} {$i <= $last} {incr i} {
   1.219 +	append line [lindex $pMap $i]
   1.220 +	if {$i != $last} {
   1.221 +	    append line ", "
   1.222 +	}
   1.223 +	if {[string length $line] > 70} {
   1.224 +	    puts $f $line
   1.225 +	    set line "    "
   1.226 +	}
   1.227 +    }
   1.228 +    puts $f $line
   1.229 +    puts $f "};
   1.230 +
   1.231 +/*
   1.232 + * The groupMap is indexed by combining the alternate page number with
   1.233 + * the page offset and returns a group number that identifies a unique
   1.234 + * set of character attributes.
   1.235 + */
   1.236 +
   1.237 +static unsigned char groupMap\[\] = {"
   1.238 +    set line "    "
   1.239 +    set lasti [expr {[llength $pages] - 1}]
   1.240 +    for {set i 0} {$i <= $lasti} {incr i} {
   1.241 +	set page [lindex $pages $i]
   1.242 +	set lastj [expr {[llength $page] - 1}]
   1.243 +	for {set j 0} {$j <= $lastj} {incr j} {
   1.244 +	    append line [lindex $page $j]
   1.245 +	    if {$j != $lastj || $i != $lasti} {
   1.246 +		append line ", "
   1.247 +	    }
   1.248 +	    if {[string length $line] > 70} {
   1.249 +		puts $f $line
   1.250 +		set line "    "
   1.251 +	    }
   1.252 +	}
   1.253 +    }
   1.254 +    puts $f $line
   1.255 +    puts $f "};
   1.256 +
   1.257 +/*
   1.258 + * Each group represents a unique set of character attributes.  The attributes
   1.259 + * are encoded into a 32-bit value as follows:
   1.260 + *
   1.261 + * Bits 0-4	Character category: see the constants listed below.
   1.262 + *
   1.263 + * Bits 5-7	Case delta type: 000 = identity
   1.264 + *				 010 = add delta for lower
   1.265 + *				 011 = add delta for lower, add 1 for title
   1.266 + *				 100 = sutract delta for title/upper
   1.267 + *				 101 = sub delta for upper, sub 1 for title
   1.268 + *				 110 = sub delta for upper, add delta for lower
   1.269 + *
   1.270 + * Bits 8-21	Reserved for future use.
   1.271 + *
   1.272 + * Bits 22-31	Case delta: delta for case conversions.  This should be the
   1.273 + *			    highest field so we can easily sign extend.
   1.274 + */
   1.275 +
   1.276 +static int groups\[\] = {"
   1.277 +    set line "    "
   1.278 +    set last [expr {[llength $groups] - 1}]
   1.279 +    for {set i 0} {$i <= $last} {incr i} {
   1.280 +	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
   1.281 +	
   1.282 +	# Compute the case conversion type and delta
   1.283 +
   1.284 +	if {$totitle != ""} {
   1.285 +	    if {$totitle == $toupper} {
   1.286 +		# subtract delta for title or upper
   1.287 +		set case 4
   1.288 +		set delta $toupper
   1.289 +	    } elseif {$toupper != ""} {
   1.290 +		# subtract delta for upper, subtract 1 for title
   1.291 +		set case 5
   1.292 +		set delta $toupper
   1.293 +	    } else {
   1.294 +		# add delta for lower, add 1 for title
   1.295 +		set case 3
   1.296 +		set delta $tolower
   1.297 +	    }
   1.298 +	} elseif {$toupper != ""} {
   1.299 +	    # subtract delta for upper, add delta for lower
   1.300 +	    set case 6
   1.301 +	    set delta $toupper
   1.302 +	} elseif {$tolower != ""} {
   1.303 +	    # add delta for lower
   1.304 +	    set case 2
   1.305 +	    set delta $tolower
   1.306 +	} else {
   1.307 +	    # noop
   1.308 +	    set case 0
   1.309 +	    set delta 0
   1.310 +	}
   1.311 +
   1.312 +	set val [expr {($delta << 22) | ($case << 5) | $type}]
   1.313 +
   1.314 +	append line [format "%d" $val]
   1.315 +	if {$i != $last} {
   1.316 +	    append line ", "
   1.317 +	}
   1.318 +	if {[string length $line] > 65} {
   1.319 +	    puts $f $line
   1.320 +	    set line "    "
   1.321 +	}
   1.322 +    }
   1.323 +    puts $f $line
   1.324 +    puts $f "};
   1.325 +
   1.326 +/*
   1.327 + * The following constants are used to determine the category of a
   1.328 + * Unicode character.
   1.329 + */
   1.330 +
   1.331 +#define UNICODE_CATEGORY_MASK 0X1F
   1.332 +
   1.333 +enum {
   1.334 +    UNASSIGNED,
   1.335 +    UPPERCASE_LETTER,
   1.336 +    LOWERCASE_LETTER,
   1.337 +    TITLECASE_LETTER,
   1.338 +    MODIFIER_LETTER,
   1.339 +    OTHER_LETTER,
   1.340 +    NON_SPACING_MARK,
   1.341 +    ENCLOSING_MARK,
   1.342 +    COMBINING_SPACING_MARK,
   1.343 +    DECIMAL_DIGIT_NUMBER,
   1.344 +    LETTER_NUMBER,
   1.345 +    OTHER_NUMBER,
   1.346 +    SPACE_SEPARATOR,
   1.347 +    LINE_SEPARATOR,
   1.348 +    PARAGRAPH_SEPARATOR,
   1.349 +    CONTROL,
   1.350 +    FORMAT,
   1.351 +    PRIVATE_USE,
   1.352 +    SURROGATE,
   1.353 +    CONNECTOR_PUNCTUATION,
   1.354 +    DASH_PUNCTUATION,
   1.355 +    OPEN_PUNCTUATION,
   1.356 +    CLOSE_PUNCTUATION,
   1.357 +    INITIAL_QUOTE_PUNCTUATION,
   1.358 +    FINAL_QUOTE_PUNCTUATION,
   1.359 +    OTHER_PUNCTUATION,
   1.360 +    MATH_SYMBOL,
   1.361 +    CURRENCY_SYMBOL,
   1.362 +    MODIFIER_SYMBOL,
   1.363 +    OTHER_SYMBOL
   1.364 +};
   1.365 +
   1.366 +/*
   1.367 + * The following macros extract the fields of the character info.  The
   1.368 + * GetDelta() macro is complicated because we can't rely on the C compiler
   1.369 + * to do sign extension on right shifts.
   1.370 + */
   1.371 +
   1.372 +#define GetCaseType(info) (((info) & 0xE0) >> 5)
   1.373 +#define GetCategory(info) ((info) & 0x1F)
   1.374 +#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
   1.375 +
   1.376 +/*
   1.377 + * This macro extracts the information about a character from the
   1.378 + * Unicode character tables.
   1.379 + */
   1.380 +
   1.381 +#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
   1.382 +"
   1.383 +
   1.384 +    close $f
   1.385 +}
   1.386 +
   1.387 +uni::main
   1.388 +
   1.389 +return