sl@0: # uniParse.tcl --
sl@0: #
sl@0: #	This program parses the UnicodeData file and generates the
sl@0: #	corresponding tclUniData.c file with compressed character
sl@0: #	data tables.  The input to this program should be the latest
sl@0: #	UnicodeData file from:
sl@0: #	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
sl@0: #
sl@0: # Copyright (c) 1998-1999 by Scriptics Corporation.
sl@0: # All rights reserved.
sl@0: # 
sl@0: # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
sl@0: 
sl@0: 
sl@0: namespace eval uni {
sl@0:     set shift 5;		# number of bits of data within a page
sl@0: 				# This value can be adjusted to find the
sl@0: 				# best split to minimize table size
sl@0: 
sl@0:     variable pMap;		# map from page to page index, each entry is
sl@0: 				# an index into the pages table, indexed by
sl@0: 				# page number
sl@0:     variable pages;		# map from page index to page info, each
sl@0: 				# entry is a list of indices into the groups
sl@0: 				# table, the list is indexed by the offset
sl@0:     variable groups;		# list of character info values, indexed by
sl@0: 				# group number, initialized with the
sl@0: 				# unassigned character group
sl@0: 
sl@0:     variable categories {
sl@0: 	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
sl@0: 	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
sl@0:     };				# Ordered list of character categories, must
sl@0: 				# match the enumeration in the header file.
sl@0: 
sl@0:     variable titleCount 0;	# Count of the number of title case
sl@0: 				# characters.  This value is used in the
sl@0: 				# regular expression code to allocate enough
sl@0: 				# space for the title case variants.
sl@0: }
sl@0: 
sl@0: proc uni::getValue {items index} {
sl@0:     variable categories
sl@0:     variable titleCount
sl@0: 
sl@0:     # Extract character info
sl@0: 
sl@0:     set category [lindex $items 2]
sl@0:     if {[scan [lindex $items 12] %4x toupper] == 1} {
sl@0: 	set toupper [expr {$index - $toupper}]
sl@0:     } else {
sl@0: 	set toupper {}
sl@0:     }
sl@0:     if {[scan [lindex $items 13] %4x tolower] == 1} {
sl@0: 	set tolower [expr {$tolower - $index}]
sl@0:     } else {
sl@0: 	set tolower {}
sl@0:     }
sl@0:     if {[scan [lindex $items 14] %4x totitle] == 1} {
sl@0: 	set totitle [expr {$index - $totitle}]
sl@0:     } else {
sl@0: 	set totitle {}
sl@0:     }
sl@0: 
sl@0:     set categoryIndex [lsearch -exact $categories $category]
sl@0:     if {$categoryIndex < 0} {
sl@0: 	puts "Unexpected character category: $index($category)"
sl@0: 	set categoryIndex 0
sl@0:     } elseif {$category == "Lt"} {
sl@0: 	incr titleCount
sl@0:     }
sl@0: 
sl@0:     return "$categoryIndex,$toupper,$tolower,$totitle"
sl@0: }
sl@0: 
sl@0: proc uni::getGroup {value} {
sl@0:     variable groups
sl@0: 
sl@0:     set gIndex [lsearch -exact $groups $value]
sl@0:     if {$gIndex == -1} {
sl@0: 	set gIndex [llength $groups]
sl@0: 	lappend groups $value
sl@0:     }
sl@0:     return $gIndex
sl@0: }
sl@0: 
sl@0: proc uni::addPage {info} {
sl@0:     variable pMap
sl@0:     variable pages
sl@0:     
sl@0:     set pIndex [lsearch -exact $pages $info]
sl@0:     if {$pIndex == -1} {
sl@0: 	set pIndex [llength $pages]
sl@0: 	lappend pages $info
sl@0:     }
sl@0:     lappend pMap $pIndex
sl@0:     return
sl@0: }
sl@0:     
sl@0: proc uni::buildTables {data} {
sl@0:     variable shift
sl@0: 
sl@0:     variable pMap {}
sl@0:     variable pages {}
sl@0:     variable groups {{0,,,}}
sl@0:     set info {}			;# temporary page info
sl@0:     
sl@0:     set mask [expr {(1 << $shift) - 1}]
sl@0: 
sl@0:     set next 0
sl@0: 
sl@0:     foreach line [split $data \n] {
sl@0: 	if {$line == ""} {
sl@0: 	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
sl@0: 	}
sl@0: 
sl@0: 	set items [split $line \;]
sl@0: 
sl@0: 	scan [lindex $items 0] %4x index
sl@0: 	set index [format 0x%0.4x $index]
sl@0: 	
sl@0: 	set gIndex [getGroup [getValue $items $index]]
sl@0: 
sl@0: 	# Since the input table omits unassigned characters, these will
sl@0: 	# show up as gaps in the index sequence.  There are a few special cases
sl@0: 	# where the gaps correspond to a uniform block of assigned characters.
sl@0: 	# These are indicated as such in the character name.
sl@0: 
sl@0: 	# Enter all unassigned characters up to the current character.
sl@0: 	if {($index > $next) \
sl@0: 		&& ![regexp "Last>$" [lindex $items 1]]} {
sl@0: 	    for {} {$next < $index} {incr next} {
sl@0: 		lappend info 0
sl@0: 		if {($next & $mask) == $mask} {
sl@0: 		    addPage $info
sl@0: 		    set info {}
sl@0: 		}
sl@0: 	    }
sl@0: 	}
sl@0: 
sl@0: 	# Enter all assigned characters up to the current character
sl@0: 	for {set i $next} {$i <= $index} {incr i} {
sl@0: 	    # Split character index into offset and page number
sl@0: 	    set offset [expr {$i & $mask}]
sl@0: 	    set page [expr {($i >> $shift)}]
sl@0: 
sl@0: 	    # Add the group index to the info for the current page
sl@0: 	    lappend info $gIndex
sl@0: 
sl@0: 	    # If this is the last entry in the page, add the page
sl@0: 	    if {$offset == $mask} {
sl@0: 		addPage $info
sl@0: 		set info {}
sl@0: 	    }
sl@0: 	}
sl@0: 	set next [expr {$index + 1}]
sl@0:     }
sl@0:     return
sl@0: }
sl@0: 
sl@0: proc uni::main {} {
sl@0:     global argc argv0 argv
sl@0:     variable pMap
sl@0:     variable pages
sl@0:     variable groups
sl@0:     variable shift
sl@0:     variable titleCount
sl@0: 
sl@0:     if {$argc != 2} {
sl@0: 	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
sl@0: 	exit 1
sl@0:     }
sl@0:     set f [open [lindex $argv 0] r]
sl@0:     set data [read $f]
sl@0:     close $f
sl@0: 
sl@0:     buildTables $data
sl@0:     puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
sl@0:     set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
sl@0:     puts "shift = 6, space = $size"
sl@0:     puts "title case count = $titleCount"
sl@0: 
sl@0:     set f [open [file join [lindex $argv 1] tclUniData.c] w]
sl@0:     fconfigure $f -translation lf
sl@0:     puts $f "/*
sl@0:  * tclUniData.c --
sl@0:  *
sl@0:  *	Declarations of Unicode character information tables.  This file is
sl@0:  *	automatically generated by the tools/uniParse.tcl script.  Do not
sl@0:  *	modify this file by hand.
sl@0:  *
sl@0:  * Copyright (c) 1998 by Scriptics Corporation.
sl@0:  * All rights reserved.
sl@0:  *
sl@0:  * RCS: @(#) \$Id\$
sl@0:  */
sl@0: 
sl@0: /*
sl@0:  * A 16-bit Unicode character is split into two parts in order to index
sl@0:  * into the following tables.  The lower OFFSET_BITS comprise an offset
sl@0:  * into a page of characters.  The upper bits comprise the page number.
sl@0:  */
sl@0: 
sl@0: #define OFFSET_BITS $shift
sl@0: 
sl@0: /*
sl@0:  * The pageMap is indexed by page number and returns an alternate page number
sl@0:  * that identifies a unique page of characters.  Many Unicode characters map
sl@0:  * to the same alternate page number.
sl@0:  */
sl@0: 
sl@0: static unsigned char pageMap\[\] = {"
sl@0:     set line "    "
sl@0:     set last [expr {[llength $pMap] - 1}]
sl@0:     for {set i 0} {$i <= $last} {incr i} {
sl@0: 	append line [lindex $pMap $i]
sl@0: 	if {$i != $last} {
sl@0: 	    append line ", "
sl@0: 	}
sl@0: 	if {[string length $line] > 70} {
sl@0: 	    puts $f $line
sl@0: 	    set line "    "
sl@0: 	}
sl@0:     }
sl@0:     puts $f $line
sl@0:     puts $f "};
sl@0: 
sl@0: /*
sl@0:  * The groupMap is indexed by combining the alternate page number with
sl@0:  * the page offset and returns a group number that identifies a unique
sl@0:  * set of character attributes.
sl@0:  */
sl@0: 
sl@0: static unsigned char groupMap\[\] = {"
sl@0:     set line "    "
sl@0:     set lasti [expr {[llength $pages] - 1}]
sl@0:     for {set i 0} {$i <= $lasti} {incr i} {
sl@0: 	set page [lindex $pages $i]
sl@0: 	set lastj [expr {[llength $page] - 1}]
sl@0: 	for {set j 0} {$j <= $lastj} {incr j} {
sl@0: 	    append line [lindex $page $j]
sl@0: 	    if {$j != $lastj || $i != $lasti} {
sl@0: 		append line ", "
sl@0: 	    }
sl@0: 	    if {[string length $line] > 70} {
sl@0: 		puts $f $line
sl@0: 		set line "    "
sl@0: 	    }
sl@0: 	}
sl@0:     }
sl@0:     puts $f $line
sl@0:     puts $f "};
sl@0: 
sl@0: /*
sl@0:  * Each group represents a unique set of character attributes.  The attributes
sl@0:  * are encoded into a 32-bit value as follows:
sl@0:  *
sl@0:  * Bits 0-4	Character category: see the constants listed below.
sl@0:  *
sl@0:  * Bits 5-7	Case delta type: 000 = identity
sl@0:  *				 010 = add delta for lower
sl@0:  *				 011 = add delta for lower, add 1 for title
sl@0:  *				 100 = sutract delta for title/upper
sl@0:  *				 101 = sub delta for upper, sub 1 for title
sl@0:  *				 110 = sub delta for upper, add delta for lower
sl@0:  *
sl@0:  * Bits 8-21	Reserved for future use.
sl@0:  *
sl@0:  * Bits 22-31	Case delta: delta for case conversions.  This should be the
sl@0:  *			    highest field so we can easily sign extend.
sl@0:  */
sl@0: 
sl@0: static int groups\[\] = {"
sl@0:     set line "    "
sl@0:     set last [expr {[llength $groups] - 1}]
sl@0:     for {set i 0} {$i <= $last} {incr i} {
sl@0: 	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
sl@0: 	
sl@0: 	# Compute the case conversion type and delta
sl@0: 
sl@0: 	if {$totitle != ""} {
sl@0: 	    if {$totitle == $toupper} {
sl@0: 		# subtract delta for title or upper
sl@0: 		set case 4
sl@0: 		set delta $toupper
sl@0: 	    } elseif {$toupper != ""} {
sl@0: 		# subtract delta for upper, subtract 1 for title
sl@0: 		set case 5
sl@0: 		set delta $toupper
sl@0: 	    } else {
sl@0: 		# add delta for lower, add 1 for title
sl@0: 		set case 3
sl@0: 		set delta $tolower
sl@0: 	    }
sl@0: 	} elseif {$toupper != ""} {
sl@0: 	    # subtract delta for upper, add delta for lower
sl@0: 	    set case 6
sl@0: 	    set delta $toupper
sl@0: 	} elseif {$tolower != ""} {
sl@0: 	    # add delta for lower
sl@0: 	    set case 2
sl@0: 	    set delta $tolower
sl@0: 	} else {
sl@0: 	    # noop
sl@0: 	    set case 0
sl@0: 	    set delta 0
sl@0: 	}
sl@0: 
sl@0: 	set val [expr {($delta << 22) | ($case << 5) | $type}]
sl@0: 
sl@0: 	append line [format "%d" $val]
sl@0: 	if {$i != $last} {
sl@0: 	    append line ", "
sl@0: 	}
sl@0: 	if {[string length $line] > 65} {
sl@0: 	    puts $f $line
sl@0: 	    set line "    "
sl@0: 	}
sl@0:     }
sl@0:     puts $f $line
sl@0:     puts $f "};
sl@0: 
sl@0: /*
sl@0:  * The following constants are used to determine the category of a
sl@0:  * Unicode character.
sl@0:  */
sl@0: 
sl@0: #define UNICODE_CATEGORY_MASK 0X1F
sl@0: 
sl@0: enum {
sl@0:     UNASSIGNED,
sl@0:     UPPERCASE_LETTER,
sl@0:     LOWERCASE_LETTER,
sl@0:     TITLECASE_LETTER,
sl@0:     MODIFIER_LETTER,
sl@0:     OTHER_LETTER,
sl@0:     NON_SPACING_MARK,
sl@0:     ENCLOSING_MARK,
sl@0:     COMBINING_SPACING_MARK,
sl@0:     DECIMAL_DIGIT_NUMBER,
sl@0:     LETTER_NUMBER,
sl@0:     OTHER_NUMBER,
sl@0:     SPACE_SEPARATOR,
sl@0:     LINE_SEPARATOR,
sl@0:     PARAGRAPH_SEPARATOR,
sl@0:     CONTROL,
sl@0:     FORMAT,
sl@0:     PRIVATE_USE,
sl@0:     SURROGATE,
sl@0:     CONNECTOR_PUNCTUATION,
sl@0:     DASH_PUNCTUATION,
sl@0:     OPEN_PUNCTUATION,
sl@0:     CLOSE_PUNCTUATION,
sl@0:     INITIAL_QUOTE_PUNCTUATION,
sl@0:     FINAL_QUOTE_PUNCTUATION,
sl@0:     OTHER_PUNCTUATION,
sl@0:     MATH_SYMBOL,
sl@0:     CURRENCY_SYMBOL,
sl@0:     MODIFIER_SYMBOL,
sl@0:     OTHER_SYMBOL
sl@0: };
sl@0: 
sl@0: /*
sl@0:  * The following macros extract the fields of the character info.  The
sl@0:  * GetDelta() macro is complicated because we can't rely on the C compiler
sl@0:  * to do sign extension on right shifts.
sl@0:  */
sl@0: 
sl@0: #define GetCaseType(info) (((info) & 0xE0) >> 5)
sl@0: #define GetCategory(info) ((info) & 0x1F)
sl@0: #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
sl@0: 
sl@0: /*
sl@0:  * This macro extracts the information about a character from the
sl@0:  * Unicode character tables.
sl@0:  */
sl@0: 
sl@0: #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
sl@0: "
sl@0: 
sl@0:     close $f
sl@0: }
sl@0: 
sl@0: uni::main
sl@0: 
sl@0: return