Symaptic: os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl@260cb5ec6c19

     1 # uniParse.tcl --

2 #

     3 #	This program parses the UnicodeData file and generates the

     4 #	corresponding tclUniData.c file with compressed character

     5 #	data tables.  The input to this program should be the latest

     6 #	UnicodeData file from:

     7 #	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt

8 #

     9 # Copyright (c) 1998-1999 by Scriptics Corporation.

    10 # All rights reserved.

    11 #

    12 # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $

    15 namespace eval uni {

    16     set shift 5;		# number of bits of data within a page

    17 				# This value can be adjusted to find the

    18 				# best split to minimize table size

    20     variable pMap;		# map from page to page index, each entry is

    21 				# an index into the pages table, indexed by

    22 				# page number

    23     variable pages;		# map from page index to page info, each

    24 				# entry is a list of indices into the groups

    25 				# table, the list is indexed by the offset

    26     variable groups;		# list of character info values, indexed by

    27 				# group number, initialized with the

    28 				# unassigned character group

    30     variable categories {

    31 	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp

    32 	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So

    33     };				# Ordered list of character categories, must

    34 				# match the enumeration in the header file.

    36     variable titleCount 0;	# Count of the number of title case

    37 				# characters.  This value is used in the

    38 				# regular expression code to allocate enough

    39 				# space for the title case variants.

    40 }

    42 proc uni::getValue {items index} {

    43     variable categories

    44     variable titleCount

    46     # Extract character info

    48     set category [lindex $items 2]

    49     if {[scan [lindex $items 12] %4x toupper] == 1} {

    50 	set toupper [expr {$index - $toupper}]

    51     } else {

    52 	set toupper {}

    53     }

    54     if {[scan [lindex $items 13] %4x tolower] == 1} {

    55 	set tolower [expr {$tolower - $index}]

    56     } else {

    57 	set tolower {}

    58     }

    59     if {[scan [lindex $items 14] %4x totitle] == 1} {

    60 	set totitle [expr {$index - $totitle}]

    61     } else {

    62 	set totitle {}

    63     }

    65     set categoryIndex [lsearch -exact $categories $category]

    66     if {$categoryIndex < 0} {

    67 	puts "Unexpected character category: $index($category)"

    68 	set categoryIndex 0

    69     } elseif {$category == "Lt"} {

    70 	incr titleCount

    71     }

    73     return "$categoryIndex,$toupper,$tolower,$totitle"

    74 }

    76 proc uni::getGroup {value} {

    77     variable groups

    79     set gIndex [lsearch -exact $groups $value]

    80     if {$gIndex == -1} {

    81 	set gIndex [llength $groups]

    82 	lappend groups $value

    83     }

    84     return $gIndex

    85 }

    87 proc uni::addPage {info} {

    88     variable pMap

    89     variable pages

    91     set pIndex [lsearch -exact $pages $info]

    92     if {$pIndex == -1} {

    93 	set pIndex [llength $pages]

    94 	lappend pages $info

    95     }

    96     lappend pMap $pIndex

    97     return

    98 }

   100 proc uni::buildTables {data} {

   101     variable shift

   103     variable pMap {}

   104     variable pages {}

   105     variable groups {{0,,,}}

   106     set info {}			;# temporary page info

   108     set mask [expr {(1 << $shift) - 1}]

   110     set next 0

   112     foreach line [split $data \n] {

   113 	if {$line == ""} {

   114 	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"

   115 	}

   117 	set items [split $line \;]

   119 	scan [lindex $items 0] %4x index

   120 	set index [format 0x%0.4x $index]

   122 	set gIndex [getGroup [getValue $items $index]]

   124 	# Since the input table omits unassigned characters, these will

   125 	# show up as gaps in the index sequence.  There are a few special cases

   126 	# where the gaps correspond to a uniform block of assigned characters.

   127 	# These are indicated as such in the character name.

   129 	# Enter all unassigned characters up to the current character.

   130 	if {($index > $next) \

   131 		&& ![regexp "Last>$" [lindex $items 1]]} {

   132 	    for {} {$next < $index} {incr next} {

   133 		lappend info 0

   134 		if {($next & $mask) == $mask} {

   135 		    addPage $info

   136 		    set info {}

   137 		}

   138 	    }

   139 	}

   141 	# Enter all assigned characters up to the current character

   142 	for {set i $next} {$i <= $index} {incr i} {

   143 	    # Split character index into offset and page number

   144 	    set offset [expr {$i & $mask}]

   145 	    set page [expr {($i >> $shift)}]

   147 	    # Add the group index to the info for the current page

   148 	    lappend info $gIndex

   150 	    # If this is the last entry in the page, add the page

   151 	    if {$offset == $mask} {

   152 		addPage $info

   153 		set info {}

   154 	    }

   155 	}

   156 	set next [expr {$index + 1}]

   157     }

   158     return

   159 }

   161 proc uni::main {} {

   162     global argc argv0 argv

   163     variable pMap

   164     variable pages

   165     variable groups

   166     variable shift

   167     variable titleCount

   169     if {$argc != 2} {

   170 	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"

   171 	exit 1

   172     }

   173     set f [open [lindex $argv 0] r]

   174     set data [read $f]

   175     close $f

   177     buildTables $data

   178     puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"

   179     set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]

   180     puts "shift = 6, space = $size"

   181     puts "title case count = $titleCount"

   183     set f [open [file join [lindex $argv 1] tclUniData.c] w]

   184     fconfigure $f -translation lf

   185     puts $f "/*

   186  * tclUniData.c --

   187  *

   188  *	Declarations of Unicode character information tables.  This file is

   189  *	automatically generated by the tools/uniParse.tcl script.  Do not

   190  *	modify this file by hand.

   191  *

   192  * Copyright (c) 1998 by Scriptics Corporation.

   193  * All rights reserved.

   194  *

   195  * RCS: @(#) \$Id\$

   196  */

   198 /*

   199  * A 16-bit Unicode character is split into two parts in order to index

   200  * into the following tables.  The lower OFFSET_BITS comprise an offset

   201  * into a page of characters.  The upper bits comprise the page number.

   202  */

   204 #define OFFSET_BITS $shift

   206 /*

   207  * The pageMap is indexed by page number and returns an alternate page number

   208  * that identifies a unique page of characters.  Many Unicode characters map

   209  * to the same alternate page number.

   210  */

   212 static unsigned char pageMap\[\] = {"

   213     set line "    "

   214     set last [expr {[llength $pMap] - 1}]

   215     for {set i 0} {$i <= $last} {incr i} {

   216 	append line [lindex $pMap $i]

   217 	if {$i != $last} {

   218 	    append line ", "

   219 	}

   220 	if {[string length $line] > 70} {

   221 	    puts $f $line

   222 	    set line "    "

   223 	}

   224     }

   225     puts $f $line

   226     puts $f "};

   228 /*

   229  * The groupMap is indexed by combining the alternate page number with

   230  * the page offset and returns a group number that identifies a unique

   231  * set of character attributes.

   232  */

   234 static unsigned char groupMap\[\] = {"

   235     set line "    "

   236     set lasti [expr {[llength $pages] - 1}]

   237     for {set i 0} {$i <= $lasti} {incr i} {

   238 	set page [lindex $pages $i]

   239 	set lastj [expr {[llength $page] - 1}]

   240 	for {set j 0} {$j <= $lastj} {incr j} {

   241 	    append line [lindex $page $j]

   242 	    if {$j != $lastj || $i != $lasti} {

   243 		append line ", "

   244 	    }

   245 	    if {[string length $line] > 70} {

   246 		puts $f $line

   247 		set line "    "

   248 	    }

   249 	}

   250     }

   251     puts $f $line

   252     puts $f "};

   254 /*

   255  * Each group represents a unique set of character attributes.  The attributes

   256  * are encoded into a 32-bit value as follows:

   257  *

   258  * Bits 0-4	Character category: see the constants listed below.

   259  *

   260  * Bits 5-7	Case delta type: 000 = identity

   261  *				 010 = add delta for lower

   262  *				 011 = add delta for lower, add 1 for title

   263  *				 100 = sutract delta for title/upper

   264  *				 101 = sub delta for upper, sub 1 for title

   265  *				 110 = sub delta for upper, add delta for lower

   266  *

   267  * Bits 8-21	Reserved for future use.

   268  *

   269  * Bits 22-31	Case delta: delta for case conversions.  This should be the

   270  *			    highest field so we can easily sign extend.

   271  */

   273 static int groups\[\] = {"

   274     set line "    "

   275     set last [expr {[llength $groups] - 1}]

   276     for {set i 0} {$i <= $last} {incr i} {

   277 	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}

   279 	# Compute the case conversion type and delta

   281 	if {$totitle != ""} {

   282 	    if {$totitle == $toupper} {

   283 		# subtract delta for title or upper

   284 		set case 4

   285 		set delta $toupper

   286 	    } elseif {$toupper != ""} {

   287 		# subtract delta for upper, subtract 1 for title

   288 		set case 5

   289 		set delta $toupper

   290 	    } else {

   291 		# add delta for lower, add 1 for title

   292 		set case 3

   293 		set delta $tolower

   294 	    }

   295 	} elseif {$toupper != ""} {

   296 	    # subtract delta for upper, add delta for lower

   297 	    set case 6

   298 	    set delta $toupper

   299 	} elseif {$tolower != ""} {

   300 	    # add delta for lower

   301 	    set case 2

   302 	    set delta $tolower

   303 	} else {

   304 	    # noop

   305 	    set case 0

   306 	    set delta 0

   307 	}

   309 	set val [expr {($delta << 22) | ($case << 5) | $type}]

   311 	append line [format "%d" $val]

   312 	if {$i != $last} {

   313 	    append line ", "

   314 	}

   315 	if {[string length $line] > 65} {

   316 	    puts $f $line

   317 	    set line "    "

   318 	}

   319     }

   320     puts $f $line

   321     puts $f "};

   323 /*

   324  * The following constants are used to determine the category of a

   325  * Unicode character.

   326  */

   328 #define UNICODE_CATEGORY_MASK 0X1F

   330 enum {

   331     UNASSIGNED,

   332     UPPERCASE_LETTER,

   333     LOWERCASE_LETTER,

   334     TITLECASE_LETTER,

   335     MODIFIER_LETTER,

   336     OTHER_LETTER,

   337     NON_SPACING_MARK,

   338     ENCLOSING_MARK,

   339     COMBINING_SPACING_MARK,

   340     DECIMAL_DIGIT_NUMBER,

   341     LETTER_NUMBER,

   342     OTHER_NUMBER,

   343     SPACE_SEPARATOR,

   344     LINE_SEPARATOR,

   345     PARAGRAPH_SEPARATOR,

   346     CONTROL,

   347     FORMAT,

   348     PRIVATE_USE,

   349     SURROGATE,

   350     CONNECTOR_PUNCTUATION,

   351     DASH_PUNCTUATION,

   352     OPEN_PUNCTUATION,

   353     CLOSE_PUNCTUATION,

   354     INITIAL_QUOTE_PUNCTUATION,

   355     FINAL_QUOTE_PUNCTUATION,

   356     OTHER_PUNCTUATION,

   357     MATH_SYMBOL,

   358     CURRENCY_SYMBOL,

   359     MODIFIER_SYMBOL,

   360     OTHER_SYMBOL

   361 };

   363 /*

   364  * The following macros extract the fields of the character info.  The

   365  * GetDelta() macro is complicated because we can't rely on the C compiler

   366  * to do sign extension on right shifts.

   367  */

   369 #define GetCaseType(info) (((info) & 0xE0) >> 5)

   370 #define GetCategory(info) ((info) & 0x1F)

   371 #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))

   373 /*

   374  * This macro extracts the information about a character from the

   375  * Unicode character tables.

   376  */

   378 #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])

   379 "

   381     close $f

   382 }

   384 uni::main

   386 return

author	sl
	Tue, 10 Jun 2014 14:32:02 +0200
changeset 1	260cb5ec6c19
permissions	-rw-r--r--