sl@0
|
1 |
# uniParse.tcl --
|
sl@0
|
2 |
#
|
sl@0
|
3 |
# This program parses the UnicodeData file and generates the
|
sl@0
|
4 |
# corresponding tclUniData.c file with compressed character
|
sl@0
|
5 |
# data tables. The input to this program should be the latest
|
sl@0
|
6 |
# UnicodeData file from:
|
sl@0
|
7 |
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt
|
sl@0
|
8 |
#
|
sl@0
|
9 |
# Copyright (c) 1998-1999 by Scriptics Corporation.
|
sl@0
|
10 |
# All rights reserved.
|
sl@0
|
11 |
#
|
sl@0
|
12 |
# RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $
|
sl@0
|
13 |
|
sl@0
|
14 |
|
sl@0
|
15 |
namespace eval uni {
|
sl@0
|
16 |
set shift 5; # number of bits of data within a page
|
sl@0
|
17 |
# This value can be adjusted to find the
|
sl@0
|
18 |
# best split to minimize table size
|
sl@0
|
19 |
|
sl@0
|
20 |
variable pMap; # map from page to page index, each entry is
|
sl@0
|
21 |
# an index into the pages table, indexed by
|
sl@0
|
22 |
# page number
|
sl@0
|
23 |
variable pages; # map from page index to page info, each
|
sl@0
|
24 |
# entry is a list of indices into the groups
|
sl@0
|
25 |
# table, the list is indexed by the offset
|
sl@0
|
26 |
variable groups; # list of character info values, indexed by
|
sl@0
|
27 |
# group number, initialized with the
|
sl@0
|
28 |
# unassigned character group
|
sl@0
|
29 |
|
sl@0
|
30 |
variable categories {
|
sl@0
|
31 |
Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
|
sl@0
|
32 |
Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
|
sl@0
|
33 |
}; # Ordered list of character categories, must
|
sl@0
|
34 |
# match the enumeration in the header file.
|
sl@0
|
35 |
|
sl@0
|
36 |
variable titleCount 0; # Count of the number of title case
|
sl@0
|
37 |
# characters. This value is used in the
|
sl@0
|
38 |
# regular expression code to allocate enough
|
sl@0
|
39 |
# space for the title case variants.
|
sl@0
|
40 |
}
|
sl@0
|
41 |
|
sl@0
|
42 |
proc uni::getValue {items index} {
|
sl@0
|
43 |
variable categories
|
sl@0
|
44 |
variable titleCount
|
sl@0
|
45 |
|
sl@0
|
46 |
# Extract character info
|
sl@0
|
47 |
|
sl@0
|
48 |
set category [lindex $items 2]
|
sl@0
|
49 |
if {[scan [lindex $items 12] %4x toupper] == 1} {
|
sl@0
|
50 |
set toupper [expr {$index - $toupper}]
|
sl@0
|
51 |
} else {
|
sl@0
|
52 |
set toupper {}
|
sl@0
|
53 |
}
|
sl@0
|
54 |
if {[scan [lindex $items 13] %4x tolower] == 1} {
|
sl@0
|
55 |
set tolower [expr {$tolower - $index}]
|
sl@0
|
56 |
} else {
|
sl@0
|
57 |
set tolower {}
|
sl@0
|
58 |
}
|
sl@0
|
59 |
if {[scan [lindex $items 14] %4x totitle] == 1} {
|
sl@0
|
60 |
set totitle [expr {$index - $totitle}]
|
sl@0
|
61 |
} else {
|
sl@0
|
62 |
set totitle {}
|
sl@0
|
63 |
}
|
sl@0
|
64 |
|
sl@0
|
65 |
set categoryIndex [lsearch -exact $categories $category]
|
sl@0
|
66 |
if {$categoryIndex < 0} {
|
sl@0
|
67 |
puts "Unexpected character category: $index($category)"
|
sl@0
|
68 |
set categoryIndex 0
|
sl@0
|
69 |
} elseif {$category == "Lt"} {
|
sl@0
|
70 |
incr titleCount
|
sl@0
|
71 |
}
|
sl@0
|
72 |
|
sl@0
|
73 |
return "$categoryIndex,$toupper,$tolower,$totitle"
|
sl@0
|
74 |
}
|
sl@0
|
75 |
|
sl@0
|
76 |
proc uni::getGroup {value} {
|
sl@0
|
77 |
variable groups
|
sl@0
|
78 |
|
sl@0
|
79 |
set gIndex [lsearch -exact $groups $value]
|
sl@0
|
80 |
if {$gIndex == -1} {
|
sl@0
|
81 |
set gIndex [llength $groups]
|
sl@0
|
82 |
lappend groups $value
|
sl@0
|
83 |
}
|
sl@0
|
84 |
return $gIndex
|
sl@0
|
85 |
}
|
sl@0
|
86 |
|
sl@0
|
87 |
proc uni::addPage {info} {
|
sl@0
|
88 |
variable pMap
|
sl@0
|
89 |
variable pages
|
sl@0
|
90 |
|
sl@0
|
91 |
set pIndex [lsearch -exact $pages $info]
|
sl@0
|
92 |
if {$pIndex == -1} {
|
sl@0
|
93 |
set pIndex [llength $pages]
|
sl@0
|
94 |
lappend pages $info
|
sl@0
|
95 |
}
|
sl@0
|
96 |
lappend pMap $pIndex
|
sl@0
|
97 |
return
|
sl@0
|
98 |
}
|
sl@0
|
99 |
|
sl@0
|
100 |
proc uni::buildTables {data} {
|
sl@0
|
101 |
variable shift
|
sl@0
|
102 |
|
sl@0
|
103 |
variable pMap {}
|
sl@0
|
104 |
variable pages {}
|
sl@0
|
105 |
variable groups {{0,,,}}
|
sl@0
|
106 |
set info {} ;# temporary page info
|
sl@0
|
107 |
|
sl@0
|
108 |
set mask [expr {(1 << $shift) - 1}]
|
sl@0
|
109 |
|
sl@0
|
110 |
set next 0
|
sl@0
|
111 |
|
sl@0
|
112 |
foreach line [split $data \n] {
|
sl@0
|
113 |
if {$line == ""} {
|
sl@0
|
114 |
set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
|
sl@0
|
115 |
}
|
sl@0
|
116 |
|
sl@0
|
117 |
set items [split $line \;]
|
sl@0
|
118 |
|
sl@0
|
119 |
scan [lindex $items 0] %4x index
|
sl@0
|
120 |
set index [format 0x%0.4x $index]
|
sl@0
|
121 |
|
sl@0
|
122 |
set gIndex [getGroup [getValue $items $index]]
|
sl@0
|
123 |
|
sl@0
|
124 |
# Since the input table omits unassigned characters, these will
|
sl@0
|
125 |
# show up as gaps in the index sequence. There are a few special cases
|
sl@0
|
126 |
# where the gaps correspond to a uniform block of assigned characters.
|
sl@0
|
127 |
# These are indicated as such in the character name.
|
sl@0
|
128 |
|
sl@0
|
129 |
# Enter all unassigned characters up to the current character.
|
sl@0
|
130 |
if {($index > $next) \
|
sl@0
|
131 |
&& ![regexp "Last>$" [lindex $items 1]]} {
|
sl@0
|
132 |
for {} {$next < $index} {incr next} {
|
sl@0
|
133 |
lappend info 0
|
sl@0
|
134 |
if {($next & $mask) == $mask} {
|
sl@0
|
135 |
addPage $info
|
sl@0
|
136 |
set info {}
|
sl@0
|
137 |
}
|
sl@0
|
138 |
}
|
sl@0
|
139 |
}
|
sl@0
|
140 |
|
sl@0
|
141 |
# Enter all assigned characters up to the current character
|
sl@0
|
142 |
for {set i $next} {$i <= $index} {incr i} {
|
sl@0
|
143 |
# Split character index into offset and page number
|
sl@0
|
144 |
set offset [expr {$i & $mask}]
|
sl@0
|
145 |
set page [expr {($i >> $shift)}]
|
sl@0
|
146 |
|
sl@0
|
147 |
# Add the group index to the info for the current page
|
sl@0
|
148 |
lappend info $gIndex
|
sl@0
|
149 |
|
sl@0
|
150 |
# If this is the last entry in the page, add the page
|
sl@0
|
151 |
if {$offset == $mask} {
|
sl@0
|
152 |
addPage $info
|
sl@0
|
153 |
set info {}
|
sl@0
|
154 |
}
|
sl@0
|
155 |
}
|
sl@0
|
156 |
set next [expr {$index + 1}]
|
sl@0
|
157 |
}
|
sl@0
|
158 |
return
|
sl@0
|
159 |
}
|
sl@0
|
160 |
|
sl@0
|
161 |
proc uni::main {} {
|
sl@0
|
162 |
global argc argv0 argv
|
sl@0
|
163 |
variable pMap
|
sl@0
|
164 |
variable pages
|
sl@0
|
165 |
variable groups
|
sl@0
|
166 |
variable shift
|
sl@0
|
167 |
variable titleCount
|
sl@0
|
168 |
|
sl@0
|
169 |
if {$argc != 2} {
|
sl@0
|
170 |
puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
|
sl@0
|
171 |
exit 1
|
sl@0
|
172 |
}
|
sl@0
|
173 |
set f [open [lindex $argv 0] r]
|
sl@0
|
174 |
set data [read $f]
|
sl@0
|
175 |
close $f
|
sl@0
|
176 |
|
sl@0
|
177 |
buildTables $data
|
sl@0
|
178 |
puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
|
sl@0
|
179 |
set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
|
sl@0
|
180 |
puts "shift = 6, space = $size"
|
sl@0
|
181 |
puts "title case count = $titleCount"
|
sl@0
|
182 |
|
sl@0
|
183 |
set f [open [file join [lindex $argv 1] tclUniData.c] w]
|
sl@0
|
184 |
fconfigure $f -translation lf
|
sl@0
|
185 |
puts $f "/*
|
sl@0
|
186 |
* tclUniData.c --
|
sl@0
|
187 |
*
|
sl@0
|
188 |
* Declarations of Unicode character information tables. This file is
|
sl@0
|
189 |
* automatically generated by the tools/uniParse.tcl script. Do not
|
sl@0
|
190 |
* modify this file by hand.
|
sl@0
|
191 |
*
|
sl@0
|
192 |
* Copyright (c) 1998 by Scriptics Corporation.
|
sl@0
|
193 |
* All rights reserved.
|
sl@0
|
194 |
*
|
sl@0
|
195 |
* RCS: @(#) \$Id\$
|
sl@0
|
196 |
*/
|
sl@0
|
197 |
|
sl@0
|
198 |
/*
|
sl@0
|
199 |
* A 16-bit Unicode character is split into two parts in order to index
|
sl@0
|
200 |
* into the following tables. The lower OFFSET_BITS comprise an offset
|
sl@0
|
201 |
* into a page of characters. The upper bits comprise the page number.
|
sl@0
|
202 |
*/
|
sl@0
|
203 |
|
sl@0
|
204 |
#define OFFSET_BITS $shift
|
sl@0
|
205 |
|
sl@0
|
206 |
/*
|
sl@0
|
207 |
* The pageMap is indexed by page number and returns an alternate page number
|
sl@0
|
208 |
* that identifies a unique page of characters. Many Unicode characters map
|
sl@0
|
209 |
* to the same alternate page number.
|
sl@0
|
210 |
*/
|
sl@0
|
211 |
|
sl@0
|
212 |
static unsigned char pageMap\[\] = {"
|
sl@0
|
213 |
set line " "
|
sl@0
|
214 |
set last [expr {[llength $pMap] - 1}]
|
sl@0
|
215 |
for {set i 0} {$i <= $last} {incr i} {
|
sl@0
|
216 |
append line [lindex $pMap $i]
|
sl@0
|
217 |
if {$i != $last} {
|
sl@0
|
218 |
append line ", "
|
sl@0
|
219 |
}
|
sl@0
|
220 |
if {[string length $line] > 70} {
|
sl@0
|
221 |
puts $f $line
|
sl@0
|
222 |
set line " "
|
sl@0
|
223 |
}
|
sl@0
|
224 |
}
|
sl@0
|
225 |
puts $f $line
|
sl@0
|
226 |
puts $f "};
|
sl@0
|
227 |
|
sl@0
|
228 |
/*
|
sl@0
|
229 |
* The groupMap is indexed by combining the alternate page number with
|
sl@0
|
230 |
* the page offset and returns a group number that identifies a unique
|
sl@0
|
231 |
* set of character attributes.
|
sl@0
|
232 |
*/
|
sl@0
|
233 |
|
sl@0
|
234 |
static unsigned char groupMap\[\] = {"
|
sl@0
|
235 |
set line " "
|
sl@0
|
236 |
set lasti [expr {[llength $pages] - 1}]
|
sl@0
|
237 |
for {set i 0} {$i <= $lasti} {incr i} {
|
sl@0
|
238 |
set page [lindex $pages $i]
|
sl@0
|
239 |
set lastj [expr {[llength $page] - 1}]
|
sl@0
|
240 |
for {set j 0} {$j <= $lastj} {incr j} {
|
sl@0
|
241 |
append line [lindex $page $j]
|
sl@0
|
242 |
if {$j != $lastj || $i != $lasti} {
|
sl@0
|
243 |
append line ", "
|
sl@0
|
244 |
}
|
sl@0
|
245 |
if {[string length $line] > 70} {
|
sl@0
|
246 |
puts $f $line
|
sl@0
|
247 |
set line " "
|
sl@0
|
248 |
}
|
sl@0
|
249 |
}
|
sl@0
|
250 |
}
|
sl@0
|
251 |
puts $f $line
|
sl@0
|
252 |
puts $f "};
|
sl@0
|
253 |
|
sl@0
|
254 |
/*
|
sl@0
|
255 |
* Each group represents a unique set of character attributes. The attributes
|
sl@0
|
256 |
* are encoded into a 32-bit value as follows:
|
sl@0
|
257 |
*
|
sl@0
|
258 |
* Bits 0-4 Character category: see the constants listed below.
|
sl@0
|
259 |
*
|
sl@0
|
260 |
* Bits 5-7 Case delta type: 000 = identity
|
sl@0
|
261 |
* 010 = add delta for lower
|
sl@0
|
262 |
* 011 = add delta for lower, add 1 for title
|
sl@0
|
263 |
* 100 = sutract delta for title/upper
|
sl@0
|
264 |
* 101 = sub delta for upper, sub 1 for title
|
sl@0
|
265 |
* 110 = sub delta for upper, add delta for lower
|
sl@0
|
266 |
*
|
sl@0
|
267 |
* Bits 8-21 Reserved for future use.
|
sl@0
|
268 |
*
|
sl@0
|
269 |
* Bits 22-31 Case delta: delta for case conversions. This should be the
|
sl@0
|
270 |
* highest field so we can easily sign extend.
|
sl@0
|
271 |
*/
|
sl@0
|
272 |
|
sl@0
|
273 |
static int groups\[\] = {"
|
sl@0
|
274 |
set line " "
|
sl@0
|
275 |
set last [expr {[llength $groups] - 1}]
|
sl@0
|
276 |
for {set i 0} {$i <= $last} {incr i} {
|
sl@0
|
277 |
foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}
|
sl@0
|
278 |
|
sl@0
|
279 |
# Compute the case conversion type and delta
|
sl@0
|
280 |
|
sl@0
|
281 |
if {$totitle != ""} {
|
sl@0
|
282 |
if {$totitle == $toupper} {
|
sl@0
|
283 |
# subtract delta for title or upper
|
sl@0
|
284 |
set case 4
|
sl@0
|
285 |
set delta $toupper
|
sl@0
|
286 |
} elseif {$toupper != ""} {
|
sl@0
|
287 |
# subtract delta for upper, subtract 1 for title
|
sl@0
|
288 |
set case 5
|
sl@0
|
289 |
set delta $toupper
|
sl@0
|
290 |
} else {
|
sl@0
|
291 |
# add delta for lower, add 1 for title
|
sl@0
|
292 |
set case 3
|
sl@0
|
293 |
set delta $tolower
|
sl@0
|
294 |
}
|
sl@0
|
295 |
} elseif {$toupper != ""} {
|
sl@0
|
296 |
# subtract delta for upper, add delta for lower
|
sl@0
|
297 |
set case 6
|
sl@0
|
298 |
set delta $toupper
|
sl@0
|
299 |
} elseif {$tolower != ""} {
|
sl@0
|
300 |
# add delta for lower
|
sl@0
|
301 |
set case 2
|
sl@0
|
302 |
set delta $tolower
|
sl@0
|
303 |
} else {
|
sl@0
|
304 |
# noop
|
sl@0
|
305 |
set case 0
|
sl@0
|
306 |
set delta 0
|
sl@0
|
307 |
}
|
sl@0
|
308 |
|
sl@0
|
309 |
set val [expr {($delta << 22) | ($case << 5) | $type}]
|
sl@0
|
310 |
|
sl@0
|
311 |
append line [format "%d" $val]
|
sl@0
|
312 |
if {$i != $last} {
|
sl@0
|
313 |
append line ", "
|
sl@0
|
314 |
}
|
sl@0
|
315 |
if {[string length $line] > 65} {
|
sl@0
|
316 |
puts $f $line
|
sl@0
|
317 |
set line " "
|
sl@0
|
318 |
}
|
sl@0
|
319 |
}
|
sl@0
|
320 |
puts $f $line
|
sl@0
|
321 |
puts $f "};
|
sl@0
|
322 |
|
sl@0
|
323 |
/*
|
sl@0
|
324 |
* The following constants are used to determine the category of a
|
sl@0
|
325 |
* Unicode character.
|
sl@0
|
326 |
*/
|
sl@0
|
327 |
|
sl@0
|
328 |
#define UNICODE_CATEGORY_MASK 0X1F
|
sl@0
|
329 |
|
sl@0
|
330 |
enum {
|
sl@0
|
331 |
UNASSIGNED,
|
sl@0
|
332 |
UPPERCASE_LETTER,
|
sl@0
|
333 |
LOWERCASE_LETTER,
|
sl@0
|
334 |
TITLECASE_LETTER,
|
sl@0
|
335 |
MODIFIER_LETTER,
|
sl@0
|
336 |
OTHER_LETTER,
|
sl@0
|
337 |
NON_SPACING_MARK,
|
sl@0
|
338 |
ENCLOSING_MARK,
|
sl@0
|
339 |
COMBINING_SPACING_MARK,
|
sl@0
|
340 |
DECIMAL_DIGIT_NUMBER,
|
sl@0
|
341 |
LETTER_NUMBER,
|
sl@0
|
342 |
OTHER_NUMBER,
|
sl@0
|
343 |
SPACE_SEPARATOR,
|
sl@0
|
344 |
LINE_SEPARATOR,
|
sl@0
|
345 |
PARAGRAPH_SEPARATOR,
|
sl@0
|
346 |
CONTROL,
|
sl@0
|
347 |
FORMAT,
|
sl@0
|
348 |
PRIVATE_USE,
|
sl@0
|
349 |
SURROGATE,
|
sl@0
|
350 |
CONNECTOR_PUNCTUATION,
|
sl@0
|
351 |
DASH_PUNCTUATION,
|
sl@0
|
352 |
OPEN_PUNCTUATION,
|
sl@0
|
353 |
CLOSE_PUNCTUATION,
|
sl@0
|
354 |
INITIAL_QUOTE_PUNCTUATION,
|
sl@0
|
355 |
FINAL_QUOTE_PUNCTUATION,
|
sl@0
|
356 |
OTHER_PUNCTUATION,
|
sl@0
|
357 |
MATH_SYMBOL,
|
sl@0
|
358 |
CURRENCY_SYMBOL,
|
sl@0
|
359 |
MODIFIER_SYMBOL,
|
sl@0
|
360 |
OTHER_SYMBOL
|
sl@0
|
361 |
};
|
sl@0
|
362 |
|
sl@0
|
363 |
/*
|
sl@0
|
364 |
* The following macros extract the fields of the character info. The
|
sl@0
|
365 |
* GetDelta() macro is complicated because we can't rely on the C compiler
|
sl@0
|
366 |
* to do sign extension on right shifts.
|
sl@0
|
367 |
*/
|
sl@0
|
368 |
|
sl@0
|
369 |
#define GetCaseType(info) (((info) & 0xE0) >> 5)
|
sl@0
|
370 |
#define GetCategory(info) ((info) & 0x1F)
|
sl@0
|
371 |
#define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22)))
|
sl@0
|
372 |
|
sl@0
|
373 |
/*
|
sl@0
|
374 |
* This macro extracts the information about a character from the
|
sl@0
|
375 |
* Unicode character tables.
|
sl@0
|
376 |
*/
|
sl@0
|
377 |
|
sl@0
|
378 |
#define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
|
sl@0
|
379 |
"
|
sl@0
|
380 |
|
sl@0
|
381 |
close $f
|
sl@0
|
382 |
}
|
sl@0
|
383 |
|
sl@0
|
384 |
uni::main
|
sl@0
|
385 |
|
sl@0
|
386 |
return
|