From: handa <handa@gnu.org>
To: ynyaaa@gmail.com
Cc: 23814@debbugs.gnu.org
Subject: bug#23814: 24.5; bug of hz coding-system
Date: Wed, 17 Aug 2016 23:43:13 +0900 [thread overview]
Message-ID: <87bn0rjw1q.fsf@gnu.org> (raw)
In-Reply-To: <87oa4rdhvq.fsf@gmail.com> (ynyaaa@gmail.com)
In article <87oa4rdhvq.fsf@gmail.com>, ynyaaa@gmail.com writes:
> Hi, I tried new china-util.el. It works very well.
Thank you for testing it.
> I prefer 7bit encoding to use only 7bit data, too.
> As for elisp, "\u12345" is treated as "\u1234\ 5".
Ah, OK, I changed it to encode characters outside the BMP as \UXXXXXXXX.
I've just committed the attached change.
---
K. Handa
handa@gnu.org
2016-08-17 handa <handa@gnu.org>
* lisp/language/china-util.el (decode-hz-region): Pay
attention to "~~}" sequence at the end of Chinese character
range.
(hz-category-table): New variable.
(encode-hz-region): Convert non-encodable characters to
\u... and \U... Preserve ESC on encoding. Put
`chinese-gb2312' `charset' text property in advance to force
iso-2022-encoding to select chinese-gb2312 designation.
diff --git a/lisp/language/china-util.el b/lisp/language/china-util.el
index e531640..6505fb8 100644
--- a/lisp/language/china-util.el
+++ b/lisp/language/china-util.el
@@ -88,43 +88,34 @@ decode-hz-region
(let (pos ch)
(narrow-to-region beg end)
- ;; We, at first, convert HZ/ZW to `euc-china',
+ ;; We, at first, convert HZ/ZW to `iso-2022-7bit',
;; then decode it.
- ;; "~\n" -> "\n", "~~" -> "~"
+ ;; "~\n" -> "", "~~" -> "~"
(goto-char (point-min))
(while (search-forward "~" nil t)
(setq ch (following-char))
- (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
+ (cond ((= ch ?{)
+ (delete-region (1- (point)) (1+ (point)))
+ (setq pos (point))
+ (insert iso2022-gb-designation)
+ (if (looking-at "\\([!-}][!-~]\\)*")
+ (goto-char (match-end 0)))
+ (if (looking-at hz-ascii-designation)
+ (delete-region (match-beginning 0) (match-end 0)))
+ (insert iso2022-ascii-designation)
+ (decode-coding-region pos (point) 'iso-2022-7bit))
+
+ ((= ch ?~)
+ (delete-char 1))
+
+ ((and (= ch ?\n)
+ decode-hz-line-continuation)
+ (delete-region (1- (point)) (1+ (point))))
+
+ (t
+ (forward-char 1)))))
- ;; "^zW...\n" -> Chinese GB2312
- ;; "~{...~}" -> Chinese GB2312
- (goto-char (point-min))
- (setq beg nil)
- (while (re-search-forward hz/zw-start-gb nil t)
- (setq pos (match-beginning 0)
- ch (char-after pos))
- ;; Record the first position to start conversion.
- (or beg (setq beg pos))
- (end-of-line)
- (setq end (point))
- (if (>= ch 128) ; 8bit GB2312
- nil
- (goto-char pos)
- (delete-char 2)
- (setq end (- end 2))
- (if (= ch ?z) ; ZW -> euc-china
- (progn
- (translate-region (point) end hz-set-msb-table)
- (goto-char end))
- (if (search-forward hz-ascii-designation
- (if decode-hz-line-continuation nil end)
- t)
- (delete-char -2))
- (setq end (point))
- (translate-region pos (point) hz-set-msb-table))))
- (if beg
- (decode-coding-region beg end 'euc-china)))
(- (point-max) (point-min)))))
;;;###autoload
@@ -133,33 +124,57 @@ decode-hz-buffer
(interactive)
(decode-hz-region (point-min) (point-max)))
+(defvar hz-category-table nil)
+
;;;###autoload
(defun encode-hz-region (beg end)
"Encode the text in the current region to HZ.
Return the length of resulting text."
(interactive "r")
+ (unless hz-category-table
+ (setq hz-category-table (make-category-table))
+ (with-category-table hz-category-table
+ (define-category ?c "hz encodable")
+ (map-charset-chars #'modify-category-entry 'ascii ?c)
+ (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c)))
(save-excursion
(save-restriction
(narrow-to-region beg end)
+ (with-category-table hz-category-table
+ ;; ~ -> ~~
+ (goto-char (point-min))
+ (while (search-forward "~" nil t) (insert ?~))
+
+ ;; ESC -> ESC ESC
+ (goto-char (point-min))
+ (while (search-forward "\e" nil t) (insert ?\e))
- ;; "~" -> "~~"
- (goto-char (point-min))
- (while (search-forward "~" nil t) (insert ?~))
-
- ;; Chinese GB2312 -> "~{...~}"
- (goto-char (point-min))
- (if (re-search-forward "\\cc" nil t)
- (let (pos)
- (goto-char (setq pos (match-beginning 0)))
- (encode-coding-region pos (point-max) 'iso-2022-7bit)
- (goto-char pos)
- (while (search-forward iso2022-gb-designation nil t)
- (delete-char -3)
- (insert hz-gb-designation))
- (goto-char pos)
- (while (search-forward iso2022-ascii-designation nil t)
- (delete-char -3)
- (insert hz-ascii-designation))))
+ ;; Non-ASCII-GB2312 -> \uXXXX
+ (goto-char (point-min))
+ (while (re-search-forward "\\Cc" nil t)
+ (let ((ch (preceding-char)))
+ (delete-char -1)
+ (insert (format (if (< ch #x10000) "\\u%04X" "\\U%08X") ch))))
+
+ ;; Prefer chinese-gb2312 for Chinese characters.
+ (put-text-property (point-min) (point-max) 'charset 'chinese-gb2312)
+ (encode-coding-region (point-min) (point-max) 'iso-2022-7bit)
+
+ ;; ESC $ A ... ESC ( B -> ~{ ... ~}
+ ;; ESC ESC -> ESC
+ (goto-char (point-min))
+ (while (search-forward "\e" nil t)
+ (if (= (following-char) ?\e)
+ ;; ESC ESC -> ESC
+ (delete-char 1)
+ (forward-char -1)
+ (if (looking-at iso2022-gb-designation)
+ (progn
+ (delete-region (match-beginning 0) (match-end 0))
+ (insert hz-gb-designation)
+ (search-forward iso2022-ascii-designation nil 'move)
+ (delete-region (match-beginning 0) (match-end 0))
+ (insert hz-ascii-designation))))))
(- (point-max) (point-min)))))
;;;###autoload
next prev parent reply other threads:[~2016-08-17 14:43 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-06-21 12:22 bug#23814: 24.5; bug of hz coding-system ynyaaa
2016-06-21 12:58 ` Eli Zaretskii
2016-06-22 13:47 ` ynyaaa
2016-06-22 15:28 ` Eli Zaretskii
2016-06-22 17:04 ` ynyaaa
2016-06-22 17:26 ` Eli Zaretskii
2016-07-09 11:20 ` Eli Zaretskii
2016-07-13 14:12 ` handa
2016-07-23 17:47 ` Eli Zaretskii
2016-07-24 8:21 ` ynyaaa
2016-07-26 15:09 ` handa
2016-07-29 1:05 ` ynyaaa
2016-08-14 11:22 ` handa
2016-08-17 6:33 ` ynyaaa
2016-08-17 14:43 ` handa [this message]
2016-08-17 15:28 ` Eli Zaretskii
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/emacs/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87bn0rjw1q.fsf@gnu.org \
--to=handa@gnu.org \
--cc=23814@debbugs.gnu.org \
--cc=ynyaaa@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).