os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
# uniParse.tcl --
sl@0
     2
#
sl@0
     3
#	This program parses the UnicodeData file and generates the
sl@0
     4
#	corresponding tclUniData.c file with compressed character
sl@0
     5
#	data tables.  The input to this program should be the latest
sl@0
     6
#	UnicodeData file from:
sl@0
     7
#	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
sl@0
     8
#
sl@0
     9
# Copyright (c) 1998-1999 by Scriptics Corporation.
sl@0
    10
# All rights reserved.
sl@0
    11
# 
sl@0
    12
# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
sl@0
    13
sl@0
    14
sl@0
    15
namespace eval uni {
sl@0
    16
    set shift 5;		# number of bits of data within a page
sl@0
    17
				# This value can be adjusted to find the
sl@0
    18
				# best split to minimize table size
sl@0
    19
sl@0
    20
    variable pMap;		# map from page to page index, each entry is
sl@0
    21
				# an index into the pages table, indexed by
sl@0
    22
				# page number
sl@0
    23
    variable pages;		# map from page index to page info, each
sl@0
    24
				# entry is a list of indices into the groups
sl@0
    25
				# table, the list is indexed by the offset
sl@0
    26
    variable groups;		# list of character info values, indexed by
sl@0
    27
				# group number, initialized with the
sl@0
    28
				# unassigned character group
sl@0
    29
sl@0
    30
    variable categories {
sl@0
    31
	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
sl@0
    32
	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
sl@0
    33
    };				# Ordered list of character categories, must
sl@0
    34
				# match the enumeration in the header file.
sl@0
    35
sl@0
    36
    variable titleCount 0;	# Count of the number of title case
sl@0
    37
				# characters.  This value is used in the
sl@0
    38
				# regular expression code to allocate enough
sl@0
    39
				# space for the title case variants.
sl@0
    40
}
sl@0
    41
sl@0
    42
proc uni::getValue {items index} {
sl@0
    43
    variable categories
sl@0
    44
    variable titleCount
sl@0
    45
sl@0
    46
    # Extract character info
sl@0
    47
sl@0
    48
    set category [lindex $items 2]
sl@0
    49
    if {[scan [lindex $items 12] %4x toupper] == 1} {
sl@0
    50
	set toupper [expr {$index - $toupper}]
sl@0
    51
    } else {
sl@0
    52
	set toupper {}
sl@0
    53
    }
sl@0
    54
    if {[scan [lindex $items 13] %4x tolower] == 1} {
sl@0
    55
	set tolower [expr {$tolower - $index}]
sl@0
    56
    } else {
sl@0
    57
	set tolower {}
sl@0
    58
    }
sl@0
    59
    if {[scan [lindex $items 14] %4x totitle] == 1} {
sl@0
    60
	set totitle [expr {$index - $totitle}]
sl@0
    61
    } else {
sl@0
    62
	set totitle {}
sl@0
    63
    }
sl@0
    64
sl@0
    65
    set categoryIndex [lsearch -exact $categories $category]
sl@0
    66
    if {$categoryIndex < 0} {
sl@0
    67
	puts "Unexpected character category: $index($category)"
sl@0
    68
	set categoryIndex 0
sl@0
    69
    } elseif {$category == "Lt"} {
sl@0
    70
	incr titleCount
sl@0
    71
    }
sl@0
    72
sl@0
    73
    return "$categoryIndex,$toupper,$tolower,$totitle"
sl@0
    74
}
sl@0
    75
sl@0
    76
proc uni::getGroup {value} {
sl@0
    77
    variable groups
sl@0
    78
sl@0
    79
    set gIndex [lsearch -exact $groups $value]
sl@0
    80
    if {$gIndex == -1} {
sl@0
    81
	set gIndex [llength $groups]
sl@0
    82
	lappend groups $value
sl@0
    83
    }
sl@0
    84
    return $gIndex
sl@0
    85
}
sl@0
    86
sl@0
    87
proc uni::addPage {info} {
sl@0
    88
    variable pMap
sl@0
    89
    variable pages
sl@0
    90
    
sl@0
    91
    set pIndex [lsearch -exact $pages $info]
sl@0
    92
    if {$pIndex == -1} {
sl@0
    93
	set pIndex [llength $pages]
sl@0
    94
	lappend pages $info
sl@0
    95
    }
sl@0
    96
    lappend pMap $pIndex
sl@0
    97
    return
sl@0
    98
}
sl@0
    99
    
sl@0
   100
proc uni::buildTables {data} {
sl@0
   101
    variable shift
sl@0
   102
sl@0
   103
    variable pMap {}
sl@0
   104
    variable pages {}
sl@0
   105
    variable groups {{0,,,}}
sl@0
   106
    set info {}			;# temporary page info
sl@0
   107
    
sl@0
   108
    set mask [expr {(1 << $shift) - 1}]
sl@0
   109
sl@0
   110
    set next 0
sl@0
   111
sl@0
   112
    foreach line [split $data \n] {
sl@0
   113
	if {$line == ""} {
sl@0
   114
	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
sl@0
   115
	}
sl@0
   116
sl@0
   117
	set items [split $line \;]
sl@0
   118
sl@0
   119
	scan [lindex $items 0] %4x index
sl@0
   120
	set index [format 0x%0.4x $index]
sl@0
   121
	
sl@0
   122
	set gIndex [getGroup [getValue $items $index]]
sl@0
   123
sl@0
   124
	# Since the input table omits unassigned characters, these will
sl@0
   125
	# show up as gaps in the index sequence.  There are a few special cases
sl@0
   126
	# where the gaps correspond to a uniform block of assigned characters.
sl@0
   127
	# These are indicated as such in the character name.
sl@0
   128
sl@0
   129
	# Enter all unassigned characters up to the current character.
sl@0
   130
	if {($index > $next) \
sl@0
   131
		&& ![regexp "Last>$" [lindex $items 1]]} {
sl@0
   132
	    for {} {$next < $index} {incr next} {
sl@0
   133
		lappend info 0
sl@0
   134
		if {($next & $mask) == $mask} {
sl@0
   135
		    addPage $info
sl@0
   136
		    set info {}
sl@0
   137
		}
sl@0
   138
	    }
sl@0
   139
	}
sl@0
   140
sl@0
   141
	# Enter all assigned characters up to the current character
sl@0
   142
	for {set i $next} {$i <= $index} {incr i} {
sl@0
   143
	    # Split character index into offset and page number
sl@0
   144
	    set offset [expr {$i & $mask}]
sl@0
   145
	    set page [expr {($i >> $shift)}]
sl@0
   146
sl@0
   147
	    # Add the group index to the info for the current page
sl@0
   148
	    lappend info $gIndex
sl@0
   149
sl@0
   150
	    # If this is the last entry in the page, add the page
sl@0
   151
	    if {$offset == $mask} {
sl@0
   152
		addPage $info
sl@0
   153
		set info {}
sl@0
   154
	    }
sl@0
   155
	}
sl@0
   156
	set next [expr {$index + 1}]
sl@0
   157
    }
sl@0
   158
    return
sl@0
   159
}
sl@0
   160
sl@0
   161
proc uni::main {} {
sl@0
   162
    global argc argv0 argv
sl@0
   163
    variable pMap
sl@0
   164
    variable pages
sl@0
   165
    variable groups
sl@0
   166
    variable shift
sl@0
   167
    variable titleCount
sl@0
   168
sl@0
   169
    if {$argc != 2} {
sl@0
   170
	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
sl@0
   171
	exit 1
sl@0
   172
    }
sl@0
   173
    set f [open [lindex $argv 0] r]
sl@0
   174
    set data [read $f]
sl@0
   175
    close $f
sl@0
   176
sl@0
   177
    buildTables $data
sl@0
   178
    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
sl@0
   179
    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
sl@0
   180
    puts "shift = 6, space = $size"
sl@0
   181
    puts "title case count = $titleCount"
sl@0
   182
sl@0
   183
    set f [open [file join [lindex $argv 1] tclUniData.c] w]
sl@0
   184
    fconfigure $f -translation lf
sl@0
   185
    puts $f "/*
sl@0
   186
 * tclUniData.c --
sl@0
   187
 *
sl@0
   188
 *	Declarations of Unicode character information tables.  This file is
sl@0
   189
 *	automatically generated by the tools/uniParse.tcl script.  Do not
sl@0
   190
 *	modify this file by hand.
sl@0
   191
 *
sl@0
   192
 * Copyright (c) 1998 by Scriptics Corporation.
sl@0
   193
 * All rights reserved.
sl@0
   194
 *
sl@0
   195
 * RCS: @(#) \$Id\$
sl@0
   196
 */
sl@0
   197
sl@0
   198
/*
sl@0
   199
 * A 16-bit Unicode character is split into two parts in order to index
sl@0
   200
 * into the following tables.  The lower OFFSET_BITS comprise an offset
sl@0
   201
 * into a page of characters.  The upper bits comprise the page number.
sl@0
   202
 */
sl@0
   203
sl@0
   204
#define OFFSET_BITS $shift
sl@0
   205
sl@0
   206
/*
sl@0
   207
 * The pageMap is indexed by page number and returns an alternate page number
sl@0
   208
 * that identifies a unique page of characters.  Many Unicode characters map
sl@0
   209
 * to the same alternate page number.
sl@0
   210
 */
sl@0
   211
sl@0
   212
static unsigned char pageMap\[\] = {"
sl@0
   213
    set line "    "
sl@0
   214
    set last [expr {[llength $pMap] - 1}]
sl@0
   215
    for {set i 0} {$i <= $last} {incr i} {
sl@0
   216
	append line [lindex $pMap $i]
sl@0
   217
	if {$i != $last} {
sl@0
   218
	    append line ", "
sl@0
   219
	}
sl@0
   220
	if {[string length $line] > 70} {
sl@0
   221
	    puts $f $line
sl@0
   222
	    set line "    "
sl@0
   223
	}
sl@0
   224
    }
sl@0
   225
    puts $f $line
sl@0
   226
    puts $f "};
sl@0
   227
sl@0
   228
/*
sl@0
   229
 * The groupMap is indexed by combining the alternate page number with
sl@0
   230
 * the page offset and returns a group number that identifies a unique
sl@0
   231
 * set of character attributes.
sl@0
   232
 */
sl@0
   233
sl@0
   234
static unsigned char groupMap\[\] = {"
sl@0
   235
    set line "    "
sl@0
   236
    set lasti [expr {[llength $pages] - 1}]
sl@0
   237
    for {set i 0} {$i <= $lasti} {incr i} {
sl@0
   238
	set page [lindex $pages $i]
sl@0
   239
	set lastj [expr {[llength $page] - 1}]
sl@0
   240
	for {set j 0} {$j <= $lastj} {incr j} {
sl@0
   241
	    append line [lindex $page $j]
sl@0
   242
	    if {$j != $lastj || $i != $lasti} {
sl@0
   243
		append line ", "
sl@0
   244
	    }
sl@0
   245
	    if {[string length $line] > 70} {
sl@0
   246
		puts $f $line
sl@0
   247
		set line "    "
sl@0
   248
	    }
sl@0
   249
	}
sl@0
   250
    }
sl@0
   251
    puts $f $line
sl@0
   252
    puts $f "};
sl@0
   253
sl@0
   254
/*
sl@0
   255
 * Each group represents a unique set of character attributes.  The attributes
sl@0
   256
 * are encoded into a 32-bit value as follows:
sl@0
   257
 *
sl@0
   258
 * Bits 0-4	Character category: see the constants listed below.
sl@0
   259
 *
sl@0
   260
 * Bits 5-7	Case delta type: 000 = identity
sl@0
   261
 *				 010 = add delta for lower
sl@0
   262
 *				 011 = add delta for lower, add 1 for title
sl@0
   263
 *				 100 = sutract delta for title/upper
sl@0
   264
 *				 101 = sub delta for upper, sub 1 for title
sl@0
   265
 *				 110 = sub delta for upper, add delta for lower
sl@0
   266
 *
sl@0
   267
 * Bits 8-21	Reserved for future use.
sl@0
   268
 *
sl@0
   269
 * Bits 22-31	Case delta: delta for case conversions.  This should be the
sl@0
   270
 *			    highest field so we can easily sign extend.
sl@0
   271
 */
sl@0
   272
sl@0
   273
static int groups\[\] = {"
sl@0
   274
    set line "    "
sl@0
   275
    set last [expr {[llength $groups] - 1}]
sl@0
   276
    for {set i 0} {$i <= $last} {incr i} {
sl@0
   277
	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
sl@0
   278
	
sl@0
   279
	# Compute the case conversion type and delta
sl@0
   280
sl@0
   281
	if {$totitle != ""} {
sl@0
   282
	    if {$totitle == $toupper} {
sl@0
   283
		# subtract delta for title or upper
sl@0
   284
		set case 4
sl@0
   285
		set delta $toupper
sl@0
   286
	    } elseif {$toupper != ""} {
sl@0
   287
		# subtract delta for upper, subtract 1 for title
sl@0
   288
		set case 5
sl@0
   289
		set delta $toupper
sl@0
   290
	    } else {
sl@0
   291
		# add delta for lower, add 1 for title
sl@0
   292
		set case 3
sl@0
   293
		set delta $tolower
sl@0
   294
	    }
sl@0
   295
	} elseif {$toupper != ""} {
sl@0
   296
	    # subtract delta for upper, add delta for lower
sl@0
   297
	    set case 6
sl@0
   298
	    set delta $toupper
sl@0
   299
	} elseif {$tolower != ""} {
sl@0
   300
	    # add delta for lower
sl@0
   301
	    set case 2
sl@0
   302
	    set delta $tolower
sl@0
   303
	} else {
sl@0
   304
	    # noop
sl@0
   305
	    set case 0
sl@0
   306
	    set delta 0
sl@0
   307
	}
sl@0
   308
sl@0
   309
	set val [expr {($delta << 22) | ($case << 5) | $type}]
sl@0
   310
sl@0
   311
	append line [format "%d" $val]
sl@0
   312
	if {$i != $last} {
sl@0
   313
	    append line ", "
sl@0
   314
	}
sl@0
   315
	if {[string length $line] > 65} {
sl@0
   316
	    puts $f $line
sl@0
   317
	    set line "    "
sl@0
   318
	}
sl@0
   319
    }
sl@0
   320
    puts $f $line
sl@0
   321
    puts $f "};
sl@0
   322
sl@0
   323
/*
sl@0
   324
 * The following constants are used to determine the category of a
sl@0
   325
 * Unicode character.
sl@0
   326
 */
sl@0
   327
sl@0
   328
#define UNICODE_CATEGORY_MASK 0X1F
sl@0
   329
sl@0
   330
enum {
sl@0
   331
    UNASSIGNED,
sl@0
   332
    UPPERCASE_LETTER,
sl@0
   333
    LOWERCASE_LETTER,
sl@0
   334
    TITLECASE_LETTER,
sl@0
   335
    MODIFIER_LETTER,
sl@0
   336
    OTHER_LETTER,
sl@0
   337
    NON_SPACING_MARK,
sl@0
   338
    ENCLOSING_MARK,
sl@0
   339
    COMBINING_SPACING_MARK,
sl@0
   340
    DECIMAL_DIGIT_NUMBER,
sl@0
   341
    LETTER_NUMBER,
sl@0
   342
    OTHER_NUMBER,
sl@0
   343
    SPACE_SEPARATOR,
sl@0
   344
    LINE_SEPARATOR,
sl@0
   345
    PARAGRAPH_SEPARATOR,
sl@0
   346
    CONTROL,
sl@0
   347
    FORMAT,
sl@0
   348
    PRIVATE_USE,
sl@0
   349
    SURROGATE,
sl@0
   350
    CONNECTOR_PUNCTUATION,
sl@0
   351
    DASH_PUNCTUATION,
sl@0
   352
    OPEN_PUNCTUATION,
sl@0
   353
    CLOSE_PUNCTUATION,
sl@0
   354
    INITIAL_QUOTE_PUNCTUATION,
sl@0
   355
    FINAL_QUOTE_PUNCTUATION,
sl@0
   356
    OTHER_PUNCTUATION,
sl@0
   357
    MATH_SYMBOL,
sl@0
   358
    CURRENCY_SYMBOL,
sl@0
   359
    MODIFIER_SYMBOL,
sl@0
   360
    OTHER_SYMBOL
sl@0
   361
};
sl@0
   362
sl@0
   363
/*
sl@0
   364
 * The following macros extract the fields of the character info.  The
sl@0
   365
 * GetDelta() macro is complicated because we can't rely on the C compiler
sl@0
   366
 * to do sign extension on right shifts.
sl@0
   367
 */
sl@0
   368
sl@0
   369
#define GetCaseType(info) (((info) & 0xE0) >> 5)
sl@0
   370
#define GetCategory(info) ((info) & 0x1F)
sl@0
   371
#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
sl@0
   372
sl@0
   373
/*
sl@0
   374
 * This macro extracts the information about a character from the
sl@0
   375
 * Unicode character tables.
sl@0
   376
 */
sl@0
   377
sl@0
   378
#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
sl@0
   379
"
sl@0
   380
sl@0
   381
    close $f
sl@0
   382
}
sl@0
   383
sl@0
   384
uni::main
sl@0
   385
sl@0
   386
return