From b3c8be22f8ab5f4cc852cd56f960079ed4e84c49 Mon Sep 17 00:00:00 2001 From: Timothy Sample Date: Wed, 16 Mar 2022 21:13:45 -0600 Subject: [PATCH 1/2] Reimplement 'unidata_to_charset.pl' in Awk. * libguile/unidata_to_charset.pl: Delete file. * libguile/unidata_to_charset.awk: New file. * libguile/Makefile.am (EXTRA_DIST): Adjust accordingly. --- libguile/Makefile.am | 2 +- libguile/unidata_to_charset.awk | 409 ++++++++++++++++++++++++++++++++ libguile/unidata_to_charset.pl | 401 ------------------------------- 3 files changed, 410 insertions(+), 402 deletions(-) create mode 100644 libguile/unidata_to_charset.awk delete mode 100755 libguile/unidata_to_charset.pl diff --git a/libguile/Makefile.am b/libguile/Makefile.am index 40619d379..b2a7d1c51 100644 --- a/libguile/Makefile.am +++ b/libguile/Makefile.am @@ -728,7 +728,7 @@ EXTRA_DIST = ChangeLog-scm ChangeLog-threads \ guile-func-name-check \ cpp-E.syms cpp-E.c cpp-SIG.syms cpp-SIG.c \ c-tokenize.lex \ - scmconfig.h.top libgettext.h unidata_to_charset.pl libguile.map \ + scmconfig.h.top libgettext.h unidata_to_charset.awk libguile.map \ vm-operations.h libguile-@GUILE_EFFECTIVE_VERSION@-gdb.scm \ $(lightening_c_files) $(lightening_extra_files) # $(DOT_DOC_FILES) $(EXTRA_DOT_DOC_FILES) \ diff --git a/libguile/unidata_to_charset.awk b/libguile/unidata_to_charset.awk new file mode 100644 index 000000000..11dfb2686 --- /dev/null +++ b/libguile/unidata_to_charset.awk @@ -0,0 +1,409 @@ +# unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt +# +# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 3 of the License, or (at your option) any later version. 
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+# Utilities
+###########
+
+# Print MESSAGE to standard error, and exit with STATUS.
+function die(status, message) {
+    print "unidata_to_charset.awk:", message | "cat 1>&2";
+    exit_status = status;
+    exit exit_status;
+}
+
+# Parse the string S as a hexadecimal number. Note that I, R, C, and B
+# are local variables that need not be set by callers. Most Awk
+# implementations have an 'strtonum' function that we could use, but it
+# is not part of POSIX.
+function hex(s, i, r, c, b) {
+    if (length(s) == 0) {
+        die(1, "Cannot parse empty string as hexadecimal.");
+    }
+    r = 0;
+    for (i = 1; i <= length(s); i++) {
+        c = substr(s, i, 1);
+        b = 0;
+        if (c == "0") { b = 0; }
+        else if (c == "1") { b = 1; }
+        else if (c == "2") { b = 2; }
+        else if (c == "3") { b = 3; }
+        else if (c == "4") { b = 4; }
+        else if (c == "5") { b = 5; }
+        else if (c == "6") { b = 6; }
+        else if (c == "7") { b = 7; }
+        else if (c == "8") { b = 8; }
+        else if (c == "9") { b = 9; }
+        else if (c == "A") { b = 10; }
+        else if (c == "B") { b = 11; }
+        else if (c == "C") { b = 12; }
+        else if (c == "D") { b = 13; }
+        else if (c == "E") { b = 14; }
+        else if (c == "F") { b = 15; }
+        else { die(1, "Invalid hexadecimal character: " c); }
+        r *= 16;
+        r += b;
+    }
+    return r;
+}
+
+# Program initialization
+########################
+
+BEGIN {
+    # The columns are separated by semicolons.
+    FS = ";";
+
+    # This will help us handle errors.
+    exit_status = 0;
+
+    # List of charsets.
+    all_charsets_count = 0;
+    all_charsets[all_charsets_count++] = "lower_case";
+    all_charsets[all_charsets_count++] = "upper_case";
+    all_charsets[all_charsets_count++] = "title_case";
+    all_charsets[all_charsets_count++] = "letter";
+    all_charsets[all_charsets_count++] = "digit";
+    all_charsets[all_charsets_count++] = "hex_digit";
+    all_charsets[all_charsets_count++] = "letter_plus_digit";
+    all_charsets[all_charsets_count++] = "graphic";
+    all_charsets[all_charsets_count++] = "whitespace";
+    all_charsets[all_charsets_count++] = "printing";
+    all_charsets[all_charsets_count++] = "iso_control";
+    all_charsets[all_charsets_count++] = "punctuation";
+    all_charsets[all_charsets_count++] = "symbol";
+    all_charsets[all_charsets_count++] = "blank";
+    all_charsets[all_charsets_count++] = "ascii";
+    all_charsets[all_charsets_count++] = "empty";
+    all_charsets[all_charsets_count++] = "designated";
+
+    # Initialize charset state table.
+    for (i in all_charsets) {
+        cs = all_charsets[i];
+        state[cs, "start"] = -1;
+        state[cs, "end"] = -1;
+        state[cs, "count"] = 0;
+    }
+}
+
+# Record initialization
+#######################
+
+# In this block we give names to each field, and do some basic
+# initialization.
+{
+    codepoint = hex($1);
+    name = $2;
+    category = $3;
+    uppercase = $13;
+    lowercase = $14;
+
+    codepoint_end = codepoint;
+    charset_count = 0;
+}
+
+# Some pairs of lines in UnicodeData.txt delimit ranges of
+# characters.
+name ~ /First>$/ { + getline; + last_name = name; + sub(/First>$/, "Last>", last_name); + if (last_name != $2) { + die(1, "Invalid range in Unicode data."); + exit_status = 1; + exit 1; + } + codepoint_end = hex($1); +} + +# Character set predicates +########################## + +## The lower_case character set +############################### + +# For Unicode, we follow Java's specification: a character is +# lowercase if +# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and +# * the Unicode attribute table does not give a lowercase mapping +# for it, and +# * at least one of the following is true: +# o the Unicode attribute table gives a mapping to uppercase +# for the character, or +# o the name for the character in the Unicode attribute table +# contains the words "SMALL LETTER" or "SMALL LIGATURE". + +(codepoint < 8192 || codepoint > 12287) && +lowercase == "" && +(uppercase != "" || name ~ /(SMALL LETTER|SMALL LIGATURE)/) { + charsets[charset_count++] = "lower_case"; +} + +## The upper_case character set +############################### + +# For Unicode, we follow Java's specification: a character is +# uppercase if +# * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and +# * the Unicode attribute table does not give an uppercase mapping +# for it (this excludes titlecase characters), and +# * at least one of the following is true: +# o the Unicode attribute table gives a mapping to lowercase +# for the character, or +# o the name for the character in the Unicode attribute table +# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE". + +(codepoint < 8192 || codepoint > 12287) && +uppercase == "" && +(lowercase != "" || name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) { + charsets[charset_count++] = "upper_case"; +} + +## The title_case character set +############################### + +# A character is titlecase if it has the category Lt in the character +# attribute database. 
+ +category == "Lt" { + charsets[charset_count++] = "title_case"; +} + +## The letter character set +########################### + +# A letter is any character with one of the letter categories (Lu, Ll, +# Lt, Lm, Lo) in the Unicode character database. + +category == "Lu" || +category == "Ll" || +category == "Lt" || +category == "Lm" || +category == "Lo" { + charsets[charset_count++] = "letter"; + charsets[charset_count++] = "letter_plus_digit"; +} + +## The digit character set +########################## + +# A character is a digit if it has the category Nd in the character +# attribute database. In Latin-1 and ASCII, the only such characters +# are 0123456789. In Unicode, there are other digit characters in +# other code blocks, such as Gujarati digits and Tibetan digits. + +category == "Nd" { + charsets[charset_count++] = "digit"; + charsets[charset_count++] = "letter_plus_digit"; +} + +## The hex_digit character set +############################## + +# The only hex digits are 0123456789abcdefABCDEF. + +(codepoint >= 48 && codepoint <= 57) || +(codepoint >= 65 && codepoint <= 70) || +(codepoint >= 97 && codepoint <= 102) { + charsets[charset_count++] = "hex_digit"; +} + +## The graphic character set +############################ + +# Characters that would 'use ink' when printed + +category ~ /L|M|N|P|S/ { + charsets[charset_count++] = "graphic"; + charsets[charset_count++] = "printing"; +} + +## The whitespace character set +############################### + +# A whitespace character is either +# * a character with one of the space, line, or paragraph separator +# categories (Zs, Zl or Zp) of the Unicode character database. 
+# * U+0009 (09) Horizontal tabulation (\t control-I) +# * U+000A (10) Line feed (\n control-J) +# * U+000B (11) Vertical tabulation (\v control-K) +# * U+000C (12) Form feed (\f control-L) +# * U+000D (13) Carriage return (\r control-M) + +category ~ /Zs|Zl|Zp/ || +(codepoint >= 9 && codepoint <= 13) { + charsets[charset_count++] = "whitespace"; + charsets[charset_count++] = "printing"; +} + +## The iso_control character set +################################ + +# The ISO control characters are the Unicode/Latin-1 characters in the +# ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]). + +(codepoint >= 0 && codepoint <= 31) || +(codepoint >= 127 && codepoint <= 159) { + charsets[charset_count++] = "iso_control"; +} + +## The punctuation character set +################################ + +# A punctuation character is any character that has one of the +# punctuation categories in the Unicode character database (Pc, Pd, +# Ps, Pe, Pi, Pf, or Po.) + +# Note that srfi-14 gives conflicting requirements!! It claims that +# only the Unicode punctuation is necessary, but, explicitly calls out +# the soft hyphen character (U+00AD) as punctuation. Current versions +# of Unicode consider U+00AD to be a formatting character, not +# punctuation. + +category ~ /P/ { + charsets[charset_count++] = "punctuation"; +} + +## The symbol character set +########################### + +# A symbol is any character that has one of the symbol categories in +# the Unicode character database (Sm, Sc, Sk, or So). + +category ~ /S/ { + charsets[charset_count++] = "symbol"; +} + +## The blank character set +########################## + +# Blank chars are horizontal whitespace. A blank character is either +# * a character with the space separator category (Zs) in the +# Unicode character database. 
+# * U+0009 (9) Horizontal tabulation (\t control-I) + +category ~ /Zs/ || codepoint == 9 { + charsets[charset_count++] = "blank"; +} + +## The ascii character set +########################## + +codepoint <= 127 { + charsets[charset_count++] = "ascii"; +} + +## The designated character set +############################### + +# Designated -- All characters except for the surrogates + +category !~ /Cs/ { + charsets[charset_count++] = "designated"; +} + +## Other character sets +####################### + +# Note that the "letter_plus_digit" and "printing" character sets, which +# are unions of other character sets, are included in the patterns +# matching their constituent parts (i.e., the "letter_plus_digit" +# character set is included as part of the "letter" and "digit" +# patterns). +# +# Also, the "empty" character set is computed by doing precisely nothing! + +# Keeping track of state +######################## + +# Update the state for each charset. +{ + for (i = 0; i < charset_count; i++) { + cs = charsets[i]; + if (state[cs, "start"] == -1) { + state[cs, "start"] = codepoint; + state[cs, "end"] = codepoint_end; + } else if (state[cs, "end"] + 1 == codepoint) { + state[cs, "end"] = codepoint_end; + } else { + count = state[cs, "count"]; + state[cs, "count"]++; + state[cs, "ranges", count, 0] = state[cs, "start"]; + state[cs, "ranges", count, 1] = state[cs, "end"]; + state[cs, "start"] = codepoint; + state[cs, "end"] = codepoint_end; + } + } +} + +# Printing and error handling +############################# + +END { + # Normally, an exit statement runs all the 'END' blocks before + # actually exiting. We use the 'exit_status' variable to short + # circuit the rest of the 'END' block by reissuing the exit + # statement. + if (exit_status != 0) { + exit exit_status; + } + + # Write a bit of a header. + print("/* srfi-14.i.c -- standard SRFI-14 character set data */"); + print(""); + print("/* This file is #include'd by srfi-14.c. 
*/"); + print(""); + print("/* This file was generated from"); + print(" https://unicode.org/Public/UNIDATA/UnicodeData.txt"); + print(" with the unidata_to_charset.awk script. */"); + print(""); + + for (i = 0; i < all_charsets_count; i++) { + cs = all_charsets[i]; + + # Extra logic to ensure that the last range is included. + if (state[cs, "start"] != -1) { + count = state[cs, "count"]; + state[cs, "count"]++; + state[cs, "ranges", count, 0] = state[cs, "start"]; + state[cs, "ranges", count, 1] = state[cs, "end"]; + } + + count = state[cs, "count"]; + + print("static const scm_t_char_range cs_" cs "_ranges[] = {"); + for (j = 0; j < count; j++) { + rstart = state[cs, "ranges", j, 0]; + rend = state[cs, "ranges", j, 1]; + if (j + 1 < count) { + printf(" {0x%04x, 0x%04x},\n", rstart, rend); + } else { + printf(" {0x%04x, 0x%04x}\n", rstart, rend); + } + } + print("};"); + print(""); + + count = state[cs, "count"]; + printf("static const size_t cs_%s_len = %d;\n", cs, count); + if (i + 1 < all_charsets_count) { + print(""); + } + } +} + +# And we're done. diff --git a/libguile/unidata_to_charset.pl b/libguile/unidata_to_charset.pl deleted file mode 100755 index 9cd7e6e71..000000000 --- a/libguile/unidata_to_charset.pl +++ /dev/null @@ -1,401 +0,0 @@ -#!/usr/bin/perl -# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt -# -# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc. -# -# This library is free software; you can redistribute it and/or -# modify it under the terms of the GNU Lesser General Public -# License as published by the Free Software Foundation; either -# version 3 of the License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. 
-# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -open(my $in, "<", "UnicodeData.txt") or die "Can't open UnicodeData.txt: $!"; -open(my $out, ">", "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!"; - -# For Unicode, we follow Java's specification: a character is -# lowercase if -# * it is not in the range [U+2000,U+2FFF], and -# * the Unicode attribute table does not give a lowercase mapping -# for it, and -# * at least one of the following is true: -# o the Unicode attribute table gives a mapping to uppercase -# for the character, or -# o the name for the character in the Unicode attribute table -# contains the words "SMALL LETTER" or "SMALL LIGATURE". - -sub lower_case { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (($codepoint < 0x2000 || $codepoint > 0x2FFF) - && (!defined($lowercase) || $lowercase eq "") - && ((defined($uppercase) && $uppercase ne "") - || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) { - return 1; - } else { - return 0; - } -} - -# For Unicode, we follow Java's specification: a character is -# uppercase if -# * it is not in the range [U+2000,U+2FFF], and -# * the Unicode attribute table does not give an uppercase mapping -# for it (this excludes titlecase characters), and -# * at least one of the following is true: -# o the Unicode attribute table gives a mapping to lowercase -# for the character, or -# o the name for the character in the Unicode attribute table -# contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE". 
- -sub upper_case { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (($codepoint < 0x2000 || $codepoint > 0x2FFF) - && (!defined($uppercase) || $uppercase eq "") - && ((defined($lowercase) && $lowercase ne "") - || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) { - return 1; - } else { - return 0; - } -} - -# A character is titlecase if it has the category Lt in the character -# attribute database. - -sub title_case { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (defined($category) && $category eq "Lt") { - return 1; - } else { - return 0; - } -} - -# A letter is any character with one of the letter categories (Lu, Ll, -# Lt, Lm, Lo) in the Unicode character database. - -sub letter { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (defined($category) && ($category eq "Lu" - || $category eq "Ll" - || $category eq "Lt" - || $category eq "Lm" - || $category eq "Lo")) { - return 1; - } else { - return 0; - } -} - -# A character is a digit if it has the category Nd in the character -# attribute database. In Latin-1 and ASCII, the only such characters -# are 0123456789. In Unicode, there are other digit characters in -# other code blocks, such as Gujarati digits and Tibetan digits. - -sub digit { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (defined($category) && $category eq "Nd") { - return 1; - } else { - return 0; - } -} - -# The only hex digits are 0123456789abcdefABCDEF. - -sub hex_digit { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (($codepoint >= 0x30 && $codepoint <= 0x39) - || ($codepoint >= 0x41 && $codepoint <= 0x46) - || ($codepoint >= 0x61 && $codepoint <= 0x66)) { - return 1; - } else { - return 0; - } -} - -# The union of char-set:letter and char-set:digit. 
- -sub letter_plus_digit { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (letter($codepoint, $name, $category, $uppercase, $lowercase) - || digit($codepoint, $name, $category, $uppercase, $lowercase)) { - return 1; - } else { - return 0; - } -} - -# Characters that would 'use ink' when printed -sub graphic { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/L|M|N|P|S/)) { - return 1; - } else { - return 0; - } -} - -# A whitespace character is either -# * a character with one of the space, line, or paragraph separator -# categories (Zs, Zl or Zp) of the Unicode character database. -# * U+0009 Horizontal tabulation (\t control-I) -# * U+000A Line feed (\n control-J) -# * U+000B Vertical tabulation (\v control-K) -# * U+000C Form feed (\f control-L) -# * U+000D Carriage return (\r control-M) - -sub whitespace { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/Zs|Zl|Zp/) - || $codepoint == 0x9 - || $codepoint == 0xA - || $codepoint == 0xB - || $codepoint == 0xC - || $codepoint == 0xD) { - return 1; - } else { - return 0; - } -} - -# A printing character is one that would occupy space when printed, -# i.e., a graphic character or a space character. char-set:printing is -# the union of char-set:whitespace and char-set:graphic. - -sub printing { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (whitespace($codepoint, $name, $category, $uppercase, $lowercase) - || graphic($codepoint, $name, $category, $uppercase, $lowercase)) { - return 1; - } else { - return 0; - } -} - -# The ISO control characters are the Unicode/Latin-1 characters in the -# ranges [U+0000,U+001F] and [U+007F,U+009F]. 
- -sub iso_control { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if (($codepoint >= 0x00 && $codepoint <= 0x1F) - || ($codepoint >= 0x7F && $codepoint <= 0x9F)) { - return 1; - } else { - return 0; - } -} - -# A punctuation character is any character that has one of the -# punctuation categories in the Unicode character database (Pc, Pd, -# Ps, Pe, Pi, Pf, or Po.) - -# Note that srfi-14 gives conflicting requirements!! It claims that -# only the Unicode punctuation is necessary, but, explicitly calls out -# the soft hyphen character (U+00AD) as punctution. Current versions -# of Unicode consider U+00AD to be a formatting character, not -# punctuation. - -sub punctuation { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/P/)) { - return 1; - } else { - return 0; - } -} - -# A symbol is any character that has one of the symbol categories in -# the Unicode character database (Sm, Sc, Sk, or So). - -sub symbol { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/S/)) { - return 1; - } else { - return 0; - } -} - -# Blank chars are horizontal whitespace. A blank character is either -# * a character with the space separator category (Zs) in the -# Unicode character database. 
-# * U+0009 Horizontal tabulation (\t control-I) -sub blank { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/Zs/) - || $codepoint == 0x9) { - return 1; - } else { - return 0; - } -} - -# ASCII -sub ascii { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($codepoint <= 0x7F) { - return 1; - } else { - return 0; - } -} - -# Empty -sub empty { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - return 0; -} - -# Designated -- All characters except for the surrogates -sub designated { - my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - if ($category =~ (/Cs/)) { - return 0; - } else { - return 1; - } -} - - -# The procedure generates the two C structures necessary to describe a -# given category. -sub compute { - my($f) = @_; - my $start = -1; - my $end = -1; - my $len = 0; - my @rstart = (-1); - my @rend = (-1); - - seek($in, 0, 0) or die "Can't seek to beginning of file: $!"; - - print "$f\n"; - - while (<$in>) { - # Parse the 14 column, semicolon-delimited UnicodeData.txt - # file - chomp; - my(@fields) = split(/;/); - - # The codepoint: an integer - my $codepoint = hex($fields[0]); - - # If this is a character range, the last character in this - # range - my $codepoint_end = $codepoint; - - # The name of the character - my $name = $fields[1]; - - # A two-character category code, such as Ll (lower-case - # letter) - my $category = $fields[2]; - - # The codepoint of the uppercase version of this char - my $uppercase = $fields[12]; - - # The codepoint of the lowercase version of this char - my $lowercase = $fields[13]; - - my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase); - if ($pass == 1) { - - # Some pairs of lines in UnicodeData.txt delimit ranges of - # characters. - if ($name =~ /First/) { - $line = <$in>; - die $! if $!; - $codepoint_end = hex( (split(/;/, $line))[0] ); - } - - # Compute ranges of characters [start:end] that meet the - # criteria. 
Store the ranges. - if ($start == -1) { - $start = $codepoint; - $end = $codepoint_end; - } elsif ($end + 1 == $codepoint) { - $end = $codepoint_end; - } else { - $rstart[$len] = $start; - $rend[$len] = $end; - $len++; - $start = $codepoint; - $end = $codepoint_end; - } - } - } - - # Extra logic to ensure that the last range is included - if ($start != -1) { - if ($len > 0 && $rstart[@rstart-1] != $start) { - $rstart[$len] = $start; - $rend[$len] = $end; - $len++; - } elsif ($len == 0) { - $rstart[0] = $start; - $rend[0] = $end; - $len++; - } - } - - # Print the C struct that contains the range list. - print $out "static const scm_t_char_range cs_" . $f . "_ranges[] = {\n"; - if ($rstart[0] != -1) { - for (my $i=0; $i<@rstart-1; $i++) { - printf $out " {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i]; - } - printf $out " {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1]; - } - print $out "};\n\n"; - - # Print the C struct that contains the range list length and - # pointer to the range list. - print $out "static const size_t cs_${f}_len = $len;\n\n"; -} - -# Write a bit of a header -print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n"; -print $out "/* This file is #include'd by srfi-14.c. */\n\n"; -print $out "/* This file was generated from\n"; -print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n"; -print $out " with the unidata_to_charset.pl script. */\n\n"; - -# Write the C structs for each SRFI-14 charset -compute "lower_case"; -compute "upper_case"; -compute "title_case"; -compute "letter"; -compute "digit"; -compute "hex_digit"; -compute "letter_plus_digit"; -compute "graphic"; -compute "whitespace"; -compute "printing"; -compute "iso_control"; -compute "punctuation"; -compute "symbol"; -compute "blank"; -compute "ascii"; -compute "empty"; -compute "designated"; - -close $in; -close $out; - -exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!"; - -# And we're done. - - - - - - -- 2.34.0