Symaptic: os/persistentdata/persistentstorage/sqlite3api/TEST/TCL/tcldistribution/tools/uniParse.tcl@bde4ae8d615e (annotated)

sl@0	1	# uniParse.tcl --
sl@0	2	#
sl@0	3	# This program parses the UnicodeData file and generates the
sl@0	4	# corresponding tclUniData.c file with compressed character
sl@0	5	# data tables. The input to this program should be the latest
sl@0	6	# UnicodeData file from:
sl@0	7	# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
sl@0	8	#
sl@0	9	# Copyright (c) 1998-1999 by Scriptics Corporation.
sl@0	10	# All rights reserved.
sl@0	11	#
sl@0	12	# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
sl@0	13
sl@0	14
sl@0	15	namespace eval uni {
sl@0	16	set shift 5; # number of bits of data within a page
sl@0	17	# This value can be adjusted to find the
sl@0	18	# best split to minimize table size
sl@0	19
sl@0	20	variable pMap; # map from page to page index, each entry is
sl@0	21	# an index into the pages table, indexed by
sl@0	22	# page number
sl@0	23	variable pages; # map from page index to page info, each
sl@0	24	# entry is a list of indices into the groups
sl@0	25	# table, the list is indexed by the offset
sl@0	26	variable groups; # list of character info values, indexed by
sl@0	27	# group number, initialized with the
sl@0	28	# unassigned character group
sl@0	29
sl@0	30	variable categories {
sl@0	31	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
sl@0	32	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
sl@0	33	}; # Ordered list of character categories, must
sl@0	34	# match the enumeration in the header file.
sl@0	35
sl@0	36	variable titleCount 0; # Count of the number of title case
sl@0	37	# characters. This value is used in the
sl@0	38	# regular expression code to allocate enough
sl@0	39	# space for the title case variants.
sl@0	40	}
sl@0	41
sl@0	42	proc uni::getValue {items index} {
sl@0	43	variable categories
sl@0	44	variable titleCount
sl@0	45
sl@0	46	# Extract character info
sl@0	47
sl@0	48	set category [lindex $items 2]
sl@0	49	if {[scan [lindex $items 12] %4x toupper] == 1} {
sl@0	50	set toupper [expr {$index - $toupper}]
sl@0	51	} else {
sl@0	52	set toupper {}
sl@0	53	}
sl@0	54	if {[scan [lindex $items 13] %4x tolower] == 1} {
sl@0	55	set tolower [expr {$tolower - $index}]
sl@0	56	} else {
sl@0	57	set tolower {}
sl@0	58	}
sl@0	59	if {[scan [lindex $items 14] %4x totitle] == 1} {
sl@0	60	set totitle [expr {$index - $totitle}]
sl@0	61	} else {
sl@0	62	set totitle {}
sl@0	63	}
sl@0	64
sl@0	65	set categoryIndex [lsearch -exact $categories $category]
sl@0	66	if {$categoryIndex < 0} {
sl@0	67	puts "Unexpected character category: $index($category)"
sl@0	68	set categoryIndex 0
sl@0	69	} elseif {$category == "Lt"} {
sl@0	70	incr titleCount
sl@0	71	}
sl@0	72
sl@0	73	return "$categoryIndex,$toupper,$tolower,$totitle"
sl@0	74	}
sl@0	75
sl@0	76	proc uni::getGroup {value} {
sl@0	77	variable groups
sl@0	78
sl@0	79	set gIndex [lsearch -exact $groups $value]
sl@0	80	if {$gIndex == -1} {
sl@0	81	set gIndex [llength $groups]
sl@0	82	lappend groups $value
sl@0	83	}
sl@0	84	return $gIndex
sl@0	85	}
sl@0	86
sl@0	87	proc uni::addPage {info} {
sl@0	88	variable pMap
sl@0	89	variable pages
sl@0	90
sl@0	91	set pIndex [lsearch -exact $pages $info]
sl@0	92	if {$pIndex == -1} {
sl@0	93	set pIndex [llength $pages]
sl@0	94	lappend pages $info
sl@0	95	}
sl@0	96	lappend pMap $pIndex
sl@0	97	return
sl@0	98	}
sl@0	99
sl@0	100	proc uni::buildTables {data} {
sl@0	101	variable shift
sl@0	102
sl@0	103	variable pMap {}
sl@0	104	variable pages {}
sl@0	105	variable groups {{0,,,}}
sl@0	106	set info {} ;# temporary page info
sl@0	107
sl@0	108	set mask [expr {(1 << $shift) - 1}]
sl@0	109
sl@0	110	set next 0
sl@0	111
sl@0	112	foreach line [split $data \n] {
sl@0	113	if {$line == ""} {
sl@0	114	set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
sl@0	115	}
sl@0	116
sl@0	117	set items [split $line \;]
sl@0	118
sl@0	119	scan [lindex $items 0] %4x index
sl@0	120	set index [format 0x%0.4x $index]
sl@0	121
sl@0	122	set gIndex [getGroup [getValue $items $index]]
sl@0	123
sl@0	124	# Since the input table omits unassigned characters, these will
sl@0	125	# show up as gaps in the index sequence. There are a few special cases
sl@0	126	# where the gaps correspond to a uniform block of assigned characters.
sl@0	127	# These are indicated as such in the character name.
sl@0	128
sl@0	129	# Enter all unassigned characters up to the current character.
sl@0	130	if {($index > $next) \
sl@0	131	&& ![regexp "Last>$" [lindex $items 1]]} {
sl@0	132	for {} {$next < $index} {incr next} {
sl@0	133	lappend info 0
sl@0	134	if {($next & $mask) == $mask} {
sl@0	135	addPage $info
sl@0	136	set info {}
sl@0	137	}
sl@0	138	}
sl@0	139	}
sl@0	140
sl@0	141	# Enter all assigned characters up to the current character
sl@0	142	for {set i $next} {$i <= $index} {incr i} {
sl@0	143	# Split character index into offset and page number
sl@0	144	set offset [expr {$i & $mask}]
sl@0	145	set page [expr {($i >> $shift)}]
sl@0	146
sl@0	147	# Add the group index to the info for the current page
sl@0	148	lappend info $gIndex
sl@0	149
sl@0	150	# If this is the last entry in the page, add the page
sl@0	151	if {$offset == $mask} {
sl@0	152	addPage $info
sl@0	153	set info {}
sl@0	154	}
sl@0	155	}
sl@0	156	set next [expr {$index + 1}]
sl@0	157	}
sl@0	158	return
sl@0	159	}
sl@0	160
sl@0	161	proc uni::main {} {
sl@0	162	global argc argv0 argv
sl@0	163	variable pMap
sl@0	164	variable pages
sl@0	165	variable groups
sl@0	166	variable shift
sl@0	167	variable titleCount
sl@0	168
sl@0	169	if {$argc != 2} {
sl@0	170	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
sl@0	171	exit 1
sl@0	172	}
sl@0	173	set f [open [lindex $argv 0] r]
sl@0	174	set data [read $f]
sl@0	175	close $f
sl@0	176
sl@0	177	buildTables $data
sl@0	178	puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
sl@0	179	set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
sl@0	180	puts "shift = 6, space = $size"
sl@0	181	puts "title case count = $titleCount"
sl@0	182
sl@0	183	set f [open [file join [lindex $argv 1] tclUniData.c] w]
sl@0	184	fconfigure $f -translation lf
sl@0	185	puts $f "/*
sl@0	186	* tclUniData.c --
sl@0	187	*
sl@0	188	* Declarations of Unicode character information tables. This file is
sl@0	189	* automatically generated by the tools/uniParse.tcl script. Do not
sl@0	190	* modify this file by hand.
sl@0	191	*
sl@0	192	* Copyright (c) 1998 by Scriptics Corporation.
sl@0	193	* All rights reserved.
sl@0	194	*
sl@0	195	* RCS: @(#) \$Id\$
sl@0	196	*/
sl@0	197
sl@0	198	/*
sl@0	199	* A 16-bit Unicode character is split into two parts in order to index
sl@0	200	* into the following tables. The lower OFFSET_BITS comprise an offset
sl@0	201	* into a page of characters. The upper bits comprise the page number.
sl@0	202	*/
sl@0	203
sl@0	204	#define OFFSET_BITS $shift
sl@0	205
sl@0	206	/*
sl@0	207	* The pageMap is indexed by page number and returns an alternate page number
sl@0	208	* that identifies a unique page of characters. Many Unicode characters map
sl@0	209	* to the same alternate page number.
sl@0	210	*/
sl@0	211
sl@0	212	static unsigned char pageMap\[\] = {"
sl@0	213	set line " "
sl@0	214	set last [expr {[llength $pMap] - 1}]
sl@0	215	for {set i 0} {$i <= $last} {incr i} {
sl@0	216	append line [lindex $pMap $i]
sl@0	217	if {$i != $last} {
sl@0	218	append line ", "
sl@0	219	}
sl@0	220	if {[string length $line] > 70} {
sl@0	221	puts $f $line
sl@0	222	set line " "
sl@0	223	}
sl@0	224	}
sl@0	225	puts $f $line
sl@0	226	puts $f "};
sl@0	227
sl@0	228	/*
sl@0	229	* The groupMap is indexed by combining the alternate page number with
sl@0	230	* the page offset and returns a group number that identifies a unique
sl@0	231	* set of character attributes.
sl@0	232	*/
sl@0	233
sl@0	234	static unsigned char groupMap\[\] = {"
sl@0	235	set line " "
sl@0	236	set lasti [expr {[llength $pages] - 1}]
sl@0	237	for {set i 0} {$i <= $lasti} {incr i} {
sl@0	238	set page [lindex $pages $i]
sl@0	239	set lastj [expr {[llength $page] - 1}]
sl@0	240	for {set j 0} {$j <= $lastj} {incr j} {
sl@0	241	append line [lindex $page $j]
sl@0	242	if {$j != $lastj \|\| $i != $lasti} {
sl@0	243	append line ", "
sl@0	244	}
sl@0	245	if {[string length $line] > 70} {
sl@0	246	puts $f $line
sl@0	247	set line " "
sl@0	248	}
sl@0	249	}
sl@0	250	}
sl@0	251	puts $f $line
sl@0	252	puts $f "};
sl@0	253
sl@0	254	/*
sl@0	255	* Each group represents a unique set of character attributes. The attributes
sl@0	256	* are encoded into a 32-bit value as follows:
sl@0	257	*
sl@0	258	* Bits 0-4 Character category: see the constants listed below.
sl@0	259	*
sl@0	260	* Bits 5-7 Case delta type: 000 = identity
sl@0	261	* 010 = add delta for lower
sl@0	262	* 011 = add delta for lower, add 1 for title
sl@0	263	* 100 = sutract delta for title/upper
sl@0	264	* 101 = sub delta for upper, sub 1 for title
sl@0	265	* 110 = sub delta for upper, add delta for lower
sl@0	266	*
sl@0	267	* Bits 8-21 Reserved for future use.
sl@0	268	*
sl@0	269	* Bits 22-31 Case delta: delta for case conversions. This should be the
sl@0	270	* highest field so we can easily sign extend.
sl@0	271	*/
sl@0	272
sl@0	273	static int groups\[\] = {"
sl@0	274	set line " "
sl@0	275	set last [expr {[llength $groups] - 1}]
sl@0	276	for {set i 0} {$i <= $last} {incr i} {
sl@0	277	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
sl@0	278
sl@0	279	# Compute the case conversion type and delta
sl@0	280
sl@0	281	if {$totitle != ""} {
sl@0	282	if {$totitle == $toupper} {
sl@0	283	# subtract delta for title or upper
sl@0	284	set case 4
sl@0	285	set delta $toupper
sl@0	286	} elseif {$toupper != ""} {
sl@0	287	# subtract delta for upper, subtract 1 for title
sl@0	288	set case 5
sl@0	289	set delta $toupper
sl@0	290	} else {
sl@0	291	# add delta for lower, add 1 for title
sl@0	292	set case 3
sl@0	293	set delta $tolower
sl@0	294	}
sl@0	295	} elseif {$toupper != ""} {
sl@0	296	# subtract delta for upper, add delta for lower
sl@0	297	set case 6
sl@0	298	set delta $toupper
sl@0	299	} elseif {$tolower != ""} {
sl@0	300	# add delta for lower
sl@0	301	set case 2
sl@0	302	set delta $tolower
sl@0	303	} else {
sl@0	304	# noop
sl@0	305	set case 0
sl@0	306	set delta 0
sl@0	307	}
sl@0	308
sl@0	309	set val [expr {($delta << 22) \| ($case << 5) \| $type}]
sl@0	310
sl@0	311	append line [format "%d" $val]
sl@0	312	if {$i != $last} {
sl@0	313	append line ", "
sl@0	314	}
sl@0	315	if {[string length $line] > 65} {
sl@0	316	puts $f $line
sl@0	317	set line " "
sl@0	318	}
sl@0	319	}
sl@0	320	puts $f $line
sl@0	321	puts $f "};
sl@0	322
sl@0	323	/*
sl@0	324	* The following constants are used to determine the category of a
sl@0	325	* Unicode character.
sl@0	326	*/
sl@0	327
sl@0	328	#define UNICODE_CATEGORY_MASK 0X1F
sl@0	329
sl@0	330	enum {
sl@0	331	UNASSIGNED,
sl@0	332	UPPERCASE_LETTER,
sl@0	333	LOWERCASE_LETTER,
sl@0	334	TITLECASE_LETTER,
sl@0	335	MODIFIER_LETTER,
sl@0	336	OTHER_LETTER,
sl@0	337	NON_SPACING_MARK,
sl@0	338	ENCLOSING_MARK,
sl@0	339	COMBINING_SPACING_MARK,
sl@0	340	DECIMAL_DIGIT_NUMBER,
sl@0	341	LETTER_NUMBER,
sl@0	342	OTHER_NUMBER,
sl@0	343	SPACE_SEPARATOR,
sl@0	344	LINE_SEPARATOR,
sl@0	345	PARAGRAPH_SEPARATOR,
sl@0	346	CONTROL,
sl@0	347	FORMAT,
sl@0	348	PRIVATE_USE,
sl@0	349	SURROGATE,
sl@0	350	CONNECTOR_PUNCTUATION,
sl@0	351	DASH_PUNCTUATION,
sl@0	352	OPEN_PUNCTUATION,
sl@0	353	CLOSE_PUNCTUATION,
sl@0	354	INITIAL_QUOTE_PUNCTUATION,
sl@0	355	FINAL_QUOTE_PUNCTUATION,
sl@0	356	OTHER_PUNCTUATION,
sl@0	357	MATH_SYMBOL,
sl@0	358	CURRENCY_SYMBOL,
sl@0	359	MODIFIER_SYMBOL,
sl@0	360	OTHER_SYMBOL
sl@0	361	};
sl@0	362
sl@0	363	/*
sl@0	364	* The following macros extract the fields of the character info. The
sl@0	365	* GetDelta() macro is complicated because we can't rely on the C compiler
sl@0	366	* to do sign extension on right shifts.
sl@0	367	*/
sl@0	368
sl@0	369	#define GetCaseType(info) (((info) & 0xE0) >> 5)
sl@0	370	#define GetCategory(info) ((info) & 0x1F)
sl@0	371	#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
sl@0	372
sl@0	373	/*
sl@0	374	* This macro extracts the information about a character from the
sl@0	375	* Unicode character tables.
sl@0	376	*/
sl@0	377
sl@0	378	#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) \| ((ch) & ((1 << OFFSET_BITS)-1))\]\])
sl@0	379	"
sl@0	380
sl@0	381	close $f
sl@0	382	}
sl@0	383
sl@0	384	uni::main
sl@0	385
sl@0	386	return

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--