sl@0: # uniParse.tcl -- sl@0: # sl@0: # This program parses the UnicodeData file and generates the sl@0: # corresponding tclUniData.c file with compressed character sl@0: # data tables. The input to this program should be the latest sl@0: # UnicodeData file from: sl@0: # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt sl@0: # sl@0: # Copyright (c) 1998-1999 by Scriptics Corporation. sl@0: # All rights reserved. sl@0: # sl@0: # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ sl@0: sl@0: sl@0: namespace eval uni { sl@0: set shift 5; # number of bits of data within a page sl@0: # This value can be adjusted to find the sl@0: # best split to minimize table size sl@0: sl@0: variable pMap; # map from page to page index, each entry is sl@0: # an index into the pages table, indexed by sl@0: # page number sl@0: variable pages; # map from page index to page info, each sl@0: # entry is a list of indices into the groups sl@0: # table, the list is indexed by the offset sl@0: variable groups; # list of character info values, indexed by sl@0: # group number, initialized with the sl@0: # unassigned character group sl@0: sl@0: variable categories { sl@0: Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp sl@0: Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So sl@0: }; # Ordered list of character categories, must sl@0: # match the enumeration in the header file. sl@0: sl@0: variable titleCount 0; # Count of the number of title case sl@0: # characters. This value is used in the sl@0: # regular expression code to allocate enough sl@0: # space for the title case variants. sl@0: } sl@0: sl@0: proc uni::getValue {items index} { sl@0: variable categories sl@0: variable titleCount sl@0: sl@0: # Extract character info sl@0: sl@0: set category [lindex $items 2] sl@0: if {[scan [lindex $items 12] %4x toupper] == 1} { sl@0: set toupper [expr {$index - $toupper}] sl@0: } else { sl@0: set toupper {} sl@0: } sl@0: if {[scan [lindex $items 13] %4x tolower] == 1} { sl@0: set tolower [expr {$tolower - $index}] sl@0: } else { sl@0: set tolower {} sl@0: } sl@0: if {[scan [lindex $items 14] %4x totitle] == 1} { sl@0: set totitle [expr {$index - $totitle}] sl@0: } else { sl@0: set totitle {} sl@0: } sl@0: sl@0: set categoryIndex [lsearch -exact $categories $category] sl@0: if {$categoryIndex < 0} { sl@0: puts "Unexpected character category: $index($category)" sl@0: set categoryIndex 0 sl@0: } elseif {$category == "Lt"} { sl@0: incr titleCount sl@0: } sl@0: sl@0: return "$categoryIndex,$toupper,$tolower,$totitle" sl@0: } sl@0: sl@0: proc uni::getGroup {value} { sl@0: variable groups sl@0: sl@0: set gIndex [lsearch -exact $groups $value] sl@0: if {$gIndex == -1} { sl@0: set gIndex [llength $groups] sl@0: lappend groups $value sl@0: } sl@0: return $gIndex sl@0: } sl@0: sl@0: proc uni::addPage {info} { sl@0: variable pMap sl@0: variable pages sl@0: sl@0: set pIndex [lsearch -exact $pages $info] sl@0: if {$pIndex == -1} { sl@0: set pIndex [llength $pages] sl@0: lappend pages $info sl@0: } sl@0: lappend pMap $pIndex sl@0: return sl@0: } sl@0: sl@0: proc uni::buildTables {data} { sl@0: variable shift sl@0: sl@0: variable pMap {} sl@0: variable pages {} sl@0: variable groups {{0,,,}} sl@0: set info {} ;# temporary page info sl@0: sl@0: set mask [expr {(1 << $shift) - 1}] sl@0: sl@0: set next 0 sl@0: sl@0: foreach line [split $data \n] { sl@0: if {$line == ""} { sl@0: set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" sl@0: } sl@0: sl@0: set items [split $line \;] sl@0: sl@0: scan [lindex $items 0] %4x index sl@0: set index [format 0x%0.4x $index] sl@0: sl@0: set gIndex [getGroup [getValue $items $index]] sl@0: sl@0: # Since the input table omits unassigned characters, these will sl@0: # show up as gaps in the index sequence. There are a few special cases sl@0: # where the gaps correspond to a uniform block of assigned characters. sl@0: # These are indicated as such in the character name. sl@0: sl@0: # Enter all unassigned characters up to the current character. sl@0: if {($index > $next) \ sl@0: && ![regexp "Last>$" [lindex $items 1]]} { sl@0: for {} {$next < $index} {incr next} { sl@0: lappend info 0 sl@0: if {($next & $mask) == $mask} { sl@0: addPage $info sl@0: set info {} sl@0: } sl@0: } sl@0: } sl@0: sl@0: # Enter all assigned characters up to the current character sl@0: for {set i $next} {$i <= $index} {incr i} { sl@0: # Split character index into offset and page number sl@0: set offset [expr {$i & $mask}] sl@0: set page [expr {($i >> $shift)}] sl@0: sl@0: # Add the group index to the info for the current page sl@0: lappend info $gIndex sl@0: sl@0: # If this is the last entry in the page, add the page sl@0: if {$offset == $mask} { sl@0: addPage $info sl@0: set info {} sl@0: } sl@0: } sl@0: set next [expr {$index + 1}] sl@0: } sl@0: return sl@0: } sl@0: sl@0: proc uni::main {} { sl@0: global argc argv0 argv sl@0: variable pMap sl@0: variable pages sl@0: variable groups sl@0: variable shift sl@0: variable titleCount sl@0: sl@0: if {$argc != 2} { sl@0: puts stderr "\nusage: $argv0 \n" sl@0: exit 1 sl@0: } sl@0: set f [open [lindex $argv 0] r] sl@0: set data [read $f] sl@0: close $f sl@0: sl@0: buildTables $data sl@0: puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" sl@0: set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] sl@0: puts "shift = 6, space = $size" sl@0: puts "title case count = $titleCount" sl@0: sl@0: set f [open [file join [lindex $argv 1] tclUniData.c] w] sl@0: fconfigure $f -translation lf sl@0: puts $f "/* sl@0: * tclUniData.c -- sl@0: * sl@0: * Declarations of Unicode character information tables. This file is sl@0: * automatically generated by the tools/uniParse.tcl script. Do not sl@0: * modify this file by hand. sl@0: * sl@0: * Copyright (c) 1998 by Scriptics Corporation. sl@0: * All rights reserved. sl@0: * sl@0: * RCS: @(#) \$Id\$ sl@0: */ sl@0: sl@0: /* sl@0: * A 16-bit Unicode character is split into two parts in order to index sl@0: * into the following tables. The lower OFFSET_BITS comprise an offset sl@0: * into a page of characters. The upper bits comprise the page number. sl@0: */ sl@0: sl@0: #define OFFSET_BITS $shift sl@0: sl@0: /* sl@0: * The pageMap is indexed by page number and returns an alternate page number sl@0: * that identifies a unique page of characters. Many Unicode characters map sl@0: * to the same alternate page number. sl@0: */ sl@0: sl@0: static unsigned char pageMap\[\] = {" sl@0: set line " " sl@0: set last [expr {[llength $pMap] - 1}] sl@0: for {set i 0} {$i <= $last} {incr i} { sl@0: append line [lindex $pMap $i] sl@0: if {$i != $last} { sl@0: append line ", " sl@0: } sl@0: if {[string length $line] > 70} { sl@0: puts $f $line sl@0: set line " " sl@0: } sl@0: } sl@0: puts $f $line sl@0: puts $f "}; sl@0: sl@0: /* sl@0: * The groupMap is indexed by combining the alternate page number with sl@0: * the page offset and returns a group number that identifies a unique sl@0: * set of character attributes. sl@0: */ sl@0: sl@0: static unsigned char groupMap\[\] = {" sl@0: set line " " sl@0: set lasti [expr {[llength $pages] - 1}] sl@0: for {set i 0} {$i <= $lasti} {incr i} { sl@0: set page [lindex $pages $i] sl@0: set lastj [expr {[llength $page] - 1}] sl@0: for {set j 0} {$j <= $lastj} {incr j} { sl@0: append line [lindex $page $j] sl@0: if {$j != $lastj || $i != $lasti} { sl@0: append line ", " sl@0: } sl@0: if {[string length $line] > 70} { sl@0: puts $f $line sl@0: set line " " sl@0: } sl@0: } sl@0: } sl@0: puts $f $line sl@0: puts $f "}; sl@0: sl@0: /* sl@0: * Each group represents a unique set of character attributes. The attributes sl@0: * are encoded into a 32-bit value as follows: sl@0: * sl@0: * Bits 0-4 Character category: see the constants listed below. sl@0: * sl@0: * Bits 5-7 Case delta type: 000 = identity sl@0: * 010 = add delta for lower sl@0: * 011 = add delta for lower, add 1 for title sl@0: * 100 = sutract delta for title/upper sl@0: * 101 = sub delta for upper, sub 1 for title sl@0: * 110 = sub delta for upper, add delta for lower sl@0: * sl@0: * Bits 8-21 Reserved for future use. sl@0: * sl@0: * Bits 22-31 Case delta: delta for case conversions. This should be the sl@0: * highest field so we can easily sign extend. sl@0: */ sl@0: sl@0: static int groups\[\] = {" sl@0: set line " " sl@0: set last [expr {[llength $groups] - 1}] sl@0: for {set i 0} {$i <= $last} {incr i} { sl@0: foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} sl@0: sl@0: # Compute the case conversion type and delta sl@0: sl@0: if {$totitle != ""} { sl@0: if {$totitle == $toupper} { sl@0: # subtract delta for title or upper sl@0: set case 4 sl@0: set delta $toupper sl@0: } elseif {$toupper != ""} { sl@0: # subtract delta for upper, subtract 1 for title sl@0: set case 5 sl@0: set delta $toupper sl@0: } else { sl@0: # add delta for lower, add 1 for title sl@0: set case 3 sl@0: set delta $tolower sl@0: } sl@0: } elseif {$toupper != ""} { sl@0: # subtract delta for upper, add delta for lower sl@0: set case 6 sl@0: set delta $toupper sl@0: } elseif {$tolower != ""} { sl@0: # add delta for lower sl@0: set case 2 sl@0: set delta $tolower sl@0: } else { sl@0: # noop sl@0: set case 0 sl@0: set delta 0 sl@0: } sl@0: sl@0: set val [expr {($delta << 22) | ($case << 5) | $type}] sl@0: sl@0: append line [format "%d" $val] sl@0: if {$i != $last} { sl@0: append line ", " sl@0: } sl@0: if {[string length $line] > 65} { sl@0: puts $f $line sl@0: set line " " sl@0: } sl@0: } sl@0: puts $f $line sl@0: puts $f "}; sl@0: sl@0: /* sl@0: * The following constants are used to determine the category of a sl@0: * Unicode character. sl@0: */ sl@0: sl@0: #define UNICODE_CATEGORY_MASK 0X1F sl@0: sl@0: enum { sl@0: UNASSIGNED, sl@0: UPPERCASE_LETTER, sl@0: LOWERCASE_LETTER, sl@0: TITLECASE_LETTER, sl@0: MODIFIER_LETTER, sl@0: OTHER_LETTER, sl@0: NON_SPACING_MARK, sl@0: ENCLOSING_MARK, sl@0: COMBINING_SPACING_MARK, sl@0: DECIMAL_DIGIT_NUMBER, sl@0: LETTER_NUMBER, sl@0: OTHER_NUMBER, sl@0: SPACE_SEPARATOR, sl@0: LINE_SEPARATOR, sl@0: PARAGRAPH_SEPARATOR, sl@0: CONTROL, sl@0: FORMAT, sl@0: PRIVATE_USE, sl@0: SURROGATE, sl@0: CONNECTOR_PUNCTUATION, sl@0: DASH_PUNCTUATION, sl@0: OPEN_PUNCTUATION, sl@0: CLOSE_PUNCTUATION, sl@0: INITIAL_QUOTE_PUNCTUATION, sl@0: FINAL_QUOTE_PUNCTUATION, sl@0: OTHER_PUNCTUATION, sl@0: MATH_SYMBOL, sl@0: CURRENCY_SYMBOL, sl@0: MODIFIER_SYMBOL, sl@0: OTHER_SYMBOL sl@0: }; sl@0: sl@0: /* sl@0: * The following macros extract the fields of the character info. The sl@0: * GetDelta() macro is complicated because we can't rely on the C compiler sl@0: * to do sign extension on right shifts. sl@0: */ sl@0: sl@0: #define GetCaseType(info) (((info) & 0xE0) >> 5) sl@0: #define GetCategory(info) ((info) & 0x1F) sl@0: #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) sl@0: sl@0: /* sl@0: * This macro extracts the information about a character from the sl@0: * Unicode character tables. sl@0: */ sl@0: sl@0: #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) sl@0: " sl@0: sl@0: close $f sl@0: } sl@0: sl@0: uni::main sl@0: sl@0: return