1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
| | #!/bin/sh
# Copyright (C) 2015-2019 Free Software Foundation, Inc.
# Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
# National Institute of Advanced Industrial Science and Technology (AIST)
# Registration Number H13PRO009
# This file is part of GNU Emacs.
# GNU Emacs is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# GNU Emacs is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
# Commentary:
# Convert charset map of various format into this:
# 0xXX 0xYYYY
# where,
# XX is a code point of the charset in hexa-decimal,
# YYYY is the corresponding Unicode character code in hexa-decimal.
# Arguments are:
# $1: source map file
# $2: address pattern for sed (optionally with substitution command)
# $3: format of source map file
# GLIBC-1 GLIBC-2 GLIBC-2-7 CZYBORRA IANA UNICODE UNICODE2 YASUOKA
# $4: awk script
## So that eg [A-F] as used by KANJI-DATABASE branch below works as expected.
## Otherwise with LANG=en_US.utf8, CNS-6.map was generated with a
## bogus entry. By experiment, LC_COLLATE=C was not enough.
export LC_ALL=C
BASE=`expr "$1" : '.*/\(.*\)' '|' "$1"` # basename
FILE="admin/charsets/mapfiles/$BASE"
BASE=`expr "$BASE" : '\(.*\)\.gz$' '|' "$BASE"` # remove any .gz suffix
AWK=${AWK:-awk}
case "$3" in
GLIBC*)
FILE="$BASE in localedata/charmaps of glibc";
SOURCE="";;
CZYBORRA)
BASE="$BASE.gz";
SOURCE="http://czyborra.com/charsets/${BASE}";;
IANA)
SOURCE="http://www.iana.org/assignments/charset-reg/${BASE}";;
UNICODE)
SOURCE="http://www.unicode.org/Public/MAPPINGS/VENDORS/ADOBE/${BASE}";;
UNICODE2)
SOURCE="http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/${BASE}";;
YASUOKA)
BASE="$BASE.Z";
SOURCE="http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/ftp/CJKtable/${BASE}";;
KANJI-DATABASE)
SOURCE="http://kanji-database.cvs.sourceforge.net/viewvc/*checkout*/kanji-database/kanji-database/data/cns2ucsdkw.txt?revision=1.4";;
*)
printf 'Unknown file type: %s\n' "$3"
exit 1;;
esac
if [ -n "$SOURCE" ] ; then
echo "# Generated from $FILE which is a copy of";
echo "# $SOURCE"
else
echo "# Generated from $FILE"
fi
if [ -n "$4" ] ; then
if [ -f "$4" ] ; then
AWKPROG="$AWK -f $4"
else
echo "Awk program does not exist: $4"
exit 1
fi
else
AWKPROG=cat
fi
if [ "$3" = "GLIBC-1" ] ; then
# Source format is:
# <UYYYY> /xXX
gunzip -c $1 | sed -n -e "${2}p" \
| sed -e 's,<U\([^>]*\)>[ ]*/x\(..\).*,0x\2 0x\1,' \
| sort | ${AWKPROG}
elif [ "$3" = "GLIBC-2" ] ; then
# Source format is:
# <UYYYY> /xXX/xZZ
gunzip -c $1 | sed -n -e "${2}p" \
| sed -e 's,<U\([^>]*\)>[ ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
| sort | ${AWKPROG}
elif [ "$3" = "GLIBC-2-7" ] ; then
# Source format is:
# <UYYYY> /xXX/xZZ
# We must drop MSBs of XX and ZZ
gunzip -c $1 | sed -n -e "${2}p" \
| sed -e 's/xa/x2/g' -e 's/xb/x3/g' -e 's/xc/x4/g' \
-e 's/xd/x5/g' -e 's/xe/x6/g' -e 's/xf/x7/g' \
-e 's,<U\([^>]*\)>[ ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
| sort | ${AWKPROG}
elif [ "$3" = "CZYBORRA" ] ; then
# Source format is:
# =XX U+YYYY
sed -n -e "${2}p" < $1 \
| sed -e 's/=\(..\)[^U]*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
| sort | ${AWKPROG}
elif [ "$3" = "IANA" ] ; then
# Source format is:
# 0xXX 0xYYYY
sed -n -e "${2}p" < $1 \
| sed -e 's/\(0x[[:xdigit:]]*\)[^0]*\(0x[[:xdigit:]]*\).*/\1 \2/' \
| sort | ${AWKPROG}
elif [ "$3" = "UNICODE" ] ; then
# Source format is:
# YYYY XX
# We perform reverse sort to prefer the first one in the
# duplicated mappings (e.g. 0x20->U+0020, 0x20->U+00A0).
sed -n -e "${2}p" < $1 \
| sed -e 's/\([0-9A-F]*\)[^0-9A-F]*\([0-9A-F]*\).*/0x\2 0x\1/' \
| sort -r
elif [ "$3" = "UNICODE2" ] ; then
# Source format is:
# 0xXXXX 0xYYYY # ...
sed -n -e "${2}p" < $1 \
| sed -e 's/\([0-9A-Fx]*\)[^0]*\([0-9A-Fx]*\).*/\1 \2/' \
| ${AWKPROG} | sort -n -k 4,4
elif [ "$3" = "YASUOKA" ] ; then
# Source format is:
# YYYY 0-XXXX (XXXX is a Kuten code)
sed -n -e "${2}p" < $1 \
| sed -e 's/\([0-9A-F]*\)[^0]*0-\([0-9]*\).*/0x\2 0x\1/' \
| sort | ${AWKPROG}
elif [ "$3" = "KANJI-DATABASE" ] ; then
# Source format is:
# C?-XXXX U+YYYYY .....
sed -n -e "${2}p" < $1 \
| sed -e 's/...\(....\) U+\([0-9A-F]*\).*/0x\1 0x\2/' \
| sort | ${AWKPROG}
else
printf 'Invalid arguments: %s\n' "$3"
exit 1
fi
|