unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
* struct.el -- a package to encode/decode binary data
@ 2002-03-18 23:12 Kim F. Storm
  2002-03-19  0:25 ` Miles Bader
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Kim F. Storm @ 2002-03-18 23:12 UTC (permalink / raw)



While writing a package that sends and receives datagrams using the
new make-network-process functionality, I quickly found that I needed
to be able to encode and decode binary data structures, so I came up
with the following package (struct.el).

I'd like to hear if something like this already exists, or if others
find it should be added to emacs (with more complete documentation of
course).  [Also, the struct-pack function doesn't work with nested
data, but I'll fix that if there is an interest in this package].

++kfs

------------------------- struct.el --------------------
;;; struct.el --- basic data structure packing and unpacking.

;; Copyright (C) 2002 Free Software Foundation, Inc.

;; This file is part of GNU Emacs.

;; GNU Emacs is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING.  If not, write to the
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
;; Boston, MA 02111-1307, USA.

;;; Commentary:

;;  Packing and unpacking of (binary) data structures.
;;
;;  The data formats used in binary files and network protocols are
;;  often structured data which can be described by a C-style structure
;;  such as the one shown below.  Using the struct package, decoding
;;  and encoding binary data formats like these is made simple using a
;;  structure specification which closely resembles the C style
;;  structure declarations.
;;  
;;  Encoded (binary) data is stored in a unibyte string or vector,
;;  while the decoded data is stored in an alist with (FIELD . VALUE) 
;;  pairs.
;;

;;; Example:
  
;;  Consider the following C structures:
;;  
;;  struct header {
;;	unsigned long	dest_ip;
;;	unsigned long	src_ip;
;;	unsigned short	dest_port;
;;	unsigned short	src_port;
;;  };
;;  
;;  struct data {
;;	unsigned char	type;
;;	unsigned char	opcode;
;;	unsigned short	length;  /* In little endian order */
;;	unsigned char	id[8];   /* nul-terminated string  */
;;	unsigned char	data[/* (length + 3) & ~3 */];
;;  };
;;  
;;  struct packet {
;;	struct header	header;
;;	unsigned char	items;
;;	unsigned char   filler[3];
;;	struct data	item[/* items */];
;;  };
;;  
;;  The corresponding Lisp struct specification looks like this:
;;  
;;  (setq header-spec
;;    '((dest-ip   ip)
;;	(src-ip    ip)
;;	(dest-port u16)
;;	(src-port  u16)))
;;  
;;  (setq data-spec
;;    '((type      u8)
;;	(opcode	   u8)
;;	(length	   u16r)  ;; little endian order
;;	(id	   strz 8)
;;	(data	   vec (length))
;;	(align     4)))
;;  
;;  (setq packet-spec
;;    '((header    struct header-spec)
;;	(items	   u8)
;;	(fill 3)
;;	(item	   repeat (items)
;;		   ((struct data-spec)))))
;;  
;;
;;  A binary representation may look like
;;   [ 192 168 1 100 192 168 1 101 01 28 21 32 2 0 0 0  
;;     2 3 5 0 ?A ?B ?C ?D ?E ?F 0 0 1 2 3 4 5 0 0 0
;;     1 4 7 0 ?B ?C ?D ?E ?F ?G 0 0 6 7 8 9 10 11 12 0 ]
;;  
;;  The corresponding decoded structure looks like
;;
;;      ((header
;;        (dest-ip   . [192 168 1 100])
;;        (src-ip    . [192 168 1 101])
;;        (dest-port . 284)
;;        (src-port  . 5408))
;;       (items . 2)
;;       (item ((data . [1 2 3 4 5])
;;      	(id . "ABCDEF")
;;      	(length . 5)
;;      	(opcode . 3)
;;      	(type . 2))
;;             ((data . [6 7 8 9 10 11 12])
;;      	(id . "BCDEFG")
;;      	(length . 7)
;;      	(opcode . 4)
;;      	(type . 1))))

;;; Code:

;; Helper functions for structure unpacking.
;; Relies on dynamic binding of RAW-DATA and POS

;; RAW-DATA (the string or vector being decoded) and POS (the index of
;; the next byte to read) are bound by `struct-unpack' and are read and
;; updated as free variables by the struct--unpack-* helpers below.
;; The declarations are only made at compile time -- presumably just to
;; keep the byte compiler from warning about those free references.
(eval-when-compile
  (defvar raw-data)
  (defvar pos))

(defun struct--unpack-u8 ()
  "Return the byte at index POS of RAW-DATA and advance POS by one.
RAW-DATA and POS are dynamically bound by the caller (see
`struct-unpack').  RAW-DATA may be a string or a vector."
  ;; `aref' accepts strings as well as vectors, so there is no need to
  ;; special-case strings and allocate a one-character substring for
  ;; every byte that is read.
  (prog1
      (aref raw-data pos)
    (setq pos (1+ pos))))
    
(defun struct--unpack-u16 ()
  "Read the next two bytes as an unsigned integer, high byte first.
Both bytes are fetched with `struct--unpack-u8', which advances POS."
  ;; Arguments are evaluated left to right, so the byte that gets
  ;; scaled by 256 is the first one read.
  (+ (* 256 (struct--unpack-u8))
     (struct--unpack-u8)))

(defun struct--unpack-u24 ()
  "Read the next three bytes as an unsigned integer, high byte first.
The leading two bytes come from `struct--unpack-u16', the last one
from `struct--unpack-u8'."
  ;; Left-to-right argument evaluation: the 16-bit prefix is read
  ;; first, then the trailing low byte.
  (+ (* 256 (struct--unpack-u16))
     (struct--unpack-u8)))

(defun struct--unpack-u32 ()
  "Read the next four bytes as an unsigned integer, high byte first.
Built from two consecutive `struct--unpack-u16' reads."
  ;; Left-to-right argument evaluation: the high half is read first.
  (+ (* 65536 (struct--unpack-u16))
     (struct--unpack-u16)))

(defun struct--unpack-u16r ()
  "Read the next two bytes as an unsigned integer, low byte first.
Both bytes are fetched with `struct--unpack-u8', which advances POS."
  ;; Arguments are evaluated left to right, so the unscaled (low) byte
  ;; is the first one read and the scaled (high) byte the second.
  (+ (struct--unpack-u8)
     (* 256 (struct--unpack-u8))))

(defun struct--unpack-u24r ()
  "Read the next three bytes as an unsigned integer, low byte first.
The leading two bytes come from `struct--unpack-u16r', the last (most
significant) one from `struct--unpack-u8'."
  ;; Left-to-right argument evaluation: the low 16 bits are read first.
  (+ (struct--unpack-u16r)
     (* 65536 (struct--unpack-u8))))

(defun struct--unpack-u32r ()
  "Read the next four bytes as an unsigned integer, low byte first.
Built from two consecutive `struct--unpack-u16r' reads."
  ;; Left-to-right argument evaluation: the low half is read first.
  (+ (struct--unpack-u16r)
     (* 65536 (struct--unpack-u16r))))

(defun struct--unpack-item (type len)
  "Decode one item of TYPE from RAW-DATA at POS and return its value.
POS is advanced past the bytes consumed.  LEN is the item length in
bytes and is only used by the `str', `strz' and `vec' types.

TYPE is one of:
  u8 byte, u16 word short, u24, u32 dword long
        -- unsigned integers, most significant byte first;
  u16r, u24r, u32r
        -- unsigned integers, least significant byte first;
  str   -- LEN bytes returned as a string;
  strz  -- a LEN byte field returned as a string cut at the first
           zero byte (POS still advances by LEN);
  vec   -- LEN bytes returned as a vector;
  ip    -- shorthand for a `vec' of length 4.
Any other TYPE yields nil and leaves POS untouched."
  (when (eq type 'ip)
    (setq type 'vec
          len 4))
  ;; Fixed-width integer types are dispatched through a lookup table.
  (let ((reader (cdr (assq type
                           '((u8    . struct--unpack-u8)
                             (byte  . struct--unpack-u8)
                             (u16   . struct--unpack-u16)
                             (word  . struct--unpack-u16)
                             (short . struct--unpack-u16)
                             (u24   . struct--unpack-u24)
                             (u32   . struct--unpack-u32)
                             (dword . struct--unpack-u32)
                             (long  . struct--unpack-u32)
                             (u16r  . struct--unpack-u16r)
                             (u24r  . struct--unpack-u24r)
                             (u32r  . struct--unpack-u32r))))))
    (cond
     (reader
      (funcall reader))
     ((eq type 'str)
      (let ((chunk (substring raw-data pos (+ pos len))))
        (setq pos (+ pos len))
        ;; A vector slice is converted to a unibyte string.
        (if (stringp chunk)
            chunk
          (string-make-unibyte (concat chunk)))))
     ((eq type 'strz)
      (let ((n 0)
            chunk)
        ;; Count the bytes that precede the first zero byte (at most LEN).
        (while (and (< n len)
                    (/= (aref raw-data (+ pos n)) 0))
          (setq n (1+ n)))
        (setq chunk (substring raw-data pos (+ pos n)))
        ;; The field always occupies LEN bytes, whatever the text length.
        (setq pos (+ pos len))
        (if (stringp chunk)
            chunk
          (string-make-unibyte (concat chunk)))))
     ((eq type 'vec)
      (let ((bytes (make-vector len 0)))
        (dotimes (k len)
          (aset bytes k (struct--unpack-u8)))
        bytes))
     (t nil))))

;; Decode the fields described by SPEC, in order, from the dynamically
;; bound RAW-DATA starting at POS, and return them as an alist of
;; (FIELD . VALUE) pairs.  POS is advanced past everything consumed.
;;
;; Each element of SPEC has one of these shapes:
;;   (fill N)                   -- skip N bytes.
;;   (align N)                  -- advance POS to the next multiple of N.
;;   (struct FORM)              -- evaluate FORM to get a spec and splice
;;                                 the fields it decodes into this alist.
;;   (FIELD struct FORM)        -- evaluate FORM to get a spec and store
;;                                 the nested alist under FIELD.
;;   (FIELD repeat LEN SUBSPEC) -- decode SUBSPEC LEN times; the value is
;;                                 a list of alists.
;;   (FIELD TYPE [LEN])         -- decode one `struct--unpack-item'.
;; LEN defaults to 1.  When LEN is a list, it is a `struct-field' path
;; that is looked up among the fields of this group decoded so far.
;;
;; NOTE: fields spliced in by (struct FORM) end up in reverse spec order
;; in the returned alist: the sub-result is already in spec order when
;; it is prepended to RESULT, and RESULT is reversed on return.
(defun struct--unpack-group (spec)
  (let (result)
    (while spec
      (let* ((item (car spec))
	     (field (car item))
	     (type (nth 1 item))
	     (len (nth 2 item))
	     data)
	(cond 
	 ;; For `fill' and `align' the count sits in the TYPE slot.
	 ((eq field 'fill)
	  (setq pos (+ pos type)))
	 ((eq field 'align)
	  (while (/= (% pos type) 0)
	    (setq pos (1+ pos))))
	 ;; Anonymous embedded struct: merge its fields into this group.
	 ((eq field 'struct)
	  (setq result (append (struct--unpack-group (eval type)) result)))
	 ;; Named nested struct: the spec form sits in the LEN slot.
	 ((eq type 'struct)
	  (setq data (struct--unpack-group (eval len)))
	  (setq result (cons (cons field data) result)))
	 (t
	  ;; RESULT is still in reverse order here, which does not matter
	  ;; because `struct-field' finds fields with `assq'.
	  (if (consp len)
	      (setq len (apply 'struct-field result len)))
	  (if (not len)
	      (setq len 1))
	  (if (eq type 'repeat)
	      (let ((i 0))
		(while (< i len)
		  (setq data (cons (struct--unpack-group (nth 3 item)) data))
		  (setq i (1+ i)))
		;; Elements were consed up back to front; restore their order.
		(setq data (reverse data)))
	    (setq data (struct--unpack-item type len)))
	  (setq result (cons (cons field data) result))))
	(setq spec (cdr spec))))
      (reverse result)))

(defun struct-unpack (raw-data spec)
  "Decode RAW-DATA as described by SPEC and return the resulting alist.
RAW-DATA is a string or vector of bytes and SPEC is a struct
specification; decoding starts at index 0 of RAW-DATA."
  ;; RAW-DATA and POS are consumed as free (dynamic) variables by
  ;; `struct--unpack-group' and the struct--unpack-* helpers.
  (let* ((pos 0)
         (decoded (struct--unpack-group spec)))
    decoded))

(defun struct-field (struct &rest field)
  "Return the value found in STRUCT by following the path FIELD.
Each element of FIELD selects one level: an integer N selects the
Nth element of a list, anything else is looked up with `assq' and
selects the cdr of the matching pair.  The walk stops early, with
nil, as soon as a step finds nothing.  With no FIELD arguments,
STRUCT itself is returned."
  (let ((node struct)
        (path field))
    (while (and node path)
      (let ((key (car path)))
        (setq node (cond
                    ((integerp key) (nth key node))
                    (t (cdr-safe (assq key node))))
              path (cdr path))))
    node))



(defun struct-ip-to-string (ip)
  "Format the first four elements of the array IP as a dotted quad.
For example, [192 168 1 100] becomes \"192.168.1.100\"."
  (apply 'format "%d.%d.%d.%d"
         (mapcar (lambda (idx) (aref ip idx))
                 '(0 1 2 3))))

(defun struct-vector-to-hex (v)
  "Return the elements of V as two-digit hex numbers joined by colons.
V is a vector (or string) of small integers; for example [1 255 16]
becomes \"01:ff:10\".  An empty V yields the empty string."
  ;; `mapconcat' only puts the separator between elements, so there is
  ;; no leading colon to strip -- stripping it is what used to signal
  ;; an error for an empty V.
  (mapconcat (lambda (byte) (format "%02x" byte)) v ":"))


;; Pack structured data into raw-data

(defun struct--pack-u8 (v)
  "Return V as a one-element vector holding its low byte.
A nil V is packed as a single zero byte."
  ;; Return a vector of bytes like every other struct--pack-* helper,
  ;; instead of a string for a value and a vector for nil.  Callers
  ;; pass the chunks to `concat', which accepts both.
  (if v
      (vector (% v 256))
    [0]))
    
(defun struct--pack-u16 (v)
  "Return V as a two-element vector of bytes, high byte first.
A nil V is packed as two zero bytes."
  (cond
   ((null v) [0 0])
   (t
    (let ((upper (/ v 256)))
      (vector (% upper 256)
              (% v 256))))))

(defun struct--pack-u24 (v)
  "Return V as a three-element vector of bytes, high byte first.
A nil V is packed as three zero bytes."
  (cond
   ((null v) [0 0 0])
   (t
    ;; One divisor per output byte, most significant first.
    (apply 'vector
           (mapcar (lambda (scale) (% (/ v scale) 256))
                   '(65536 256 1))))))

(defun struct--pack-u32 (v)
  "Return V as a four-element vector of bytes, high byte first.
A nil V is packed as four zero bytes."
  (cond
   ((null v) [0 0 0 0])
   (t
    ;; One divisor per output byte, most significant first.
    (apply 'vector
           (mapcar (lambda (scale) (% (/ v scale) 256))
                   '(16777216 65536 256 1))))))

(defun struct--pack-u16r (v)
  "Return V as a two-element vector of bytes, low byte first.
A nil V is packed as two zero bytes."
  (cond
   ((null v) [0 0])
   (t
    (let ((upper (/ v 256)))
      (vector (% v 256)
              (% upper 256))))))

(defun struct--pack-u24r (v)
  "Return V as a three-element vector of bytes, low byte first.
A nil V is packed as three zero bytes."
  (cond
   ((null v) [0 0 0])
   (t
    ;; One divisor per output byte, least significant first.
    (apply 'vector
           (mapcar (lambda (scale) (% (/ v scale) 256))
                   '(1 256 65536))))))

(defun struct--pack-u32r (v)
  "Return V as a four-element vector of bytes, low byte first.
A nil V is packed as four zero bytes."
  (cond
   ((null v) [0 0 0 0])
   (t
    ;; One divisor per output byte, least significant first.
    (apply 'vector
           (mapcar (lambda (scale) (% (/ v scale) 256))
                   '(1 256 65536 16777216))))))

(defun struct--pack-item (v type len)
  "Encode the value V as an item of TYPE and return the bytes.
The result is a string or a vector of bytes.  LEN is the field length
in bytes and is only used for the non-integer types.

The integer types (u8 byte, u16 word short, u24, u32 dword long, u16r,
u24r, u32r) are handed to the matching struct--pack-* function.  For
`str', `strz' and `vec', V is truncated to LEN elements or padded with
zero bytes up to LEN.  `ip' is shorthand for a `vec' of length 4.  Any
other TYPE produces LEN zero bytes."
  (when (eq type 'ip)
    (setq type 'vec
          len 4))
  ;; Fixed-width integer types are dispatched through a lookup table.
  (let ((packer (cdr (assq type
                           '((u8    . struct--pack-u8)
                             (byte  . struct--pack-u8)
                             (u16   . struct--pack-u16)
                             (word  . struct--pack-u16)
                             (short . struct--pack-u16)
                             (u24   . struct--pack-u24)
                             (u32   . struct--pack-u32)
                             (dword . struct--pack-u32)
                             (long  . struct--pack-u32)
                             (u16r  . struct--pack-u16r)
                             (u24r  . struct--pack-u24r)
                             (u32r  . struct--pack-u32r))))))
    (cond
     (packer
      (funcall packer v))
     ((memq type '(str strz vec))
      (let ((have (length v)))
        (if (< have len)
            ;; Too short: pad with zero bytes up to the field length.
            (concat v (make-vector (- len have) 0))
          ;; Long enough: keep only the first LEN elements.
          (substring v 0 len))))
     (t
      (make-vector len 0)))))

(defun struct--pack-size (chunks)
  "Return the total number of bytes in CHUNKS, a list of strings and vectors."
  (apply '+ (mapcar 'length chunks)))

(defun struct--pack-group (struct spec offset)
  "Encode the alist STRUCT according to SPEC.
Return a list of byte chunks (strings and vectors) which, concatenated
in order, form the encoded data.  OFFSET is the number of bytes that
precede this group in the complete output; it is only used to compute
the padding for `align' elements.

Each element of SPEC has one of these shapes:
  (fill N)                   -- emit N zero bytes.
  (align N)                  -- emit zero bytes up to the next multiple
                                of N (none if already aligned).
  (struct FORM)              -- evaluate FORM to get a spec and encode
                                STRUCT itself with it.
  (FIELD struct FORM)        -- evaluate FORM to get a spec and encode
                                the alist stored under FIELD with it.
  (FIELD repeat LEN SUBSPEC) -- encode the first LEN elements of the
                                list stored under FIELD with SUBSPEC.
  (FIELD TYPE [LEN])         -- encode the value of FIELD with
                                `struct--pack-item'.
LEN defaults to 1.  When LEN is a list, it is a `struct-field' path
that is looked up in STRUCT."
  (let (result)
    (while spec
      (let* ((item (car spec))
             (field (car item))
             (type (nth 1 item))
             (len (nth 2 item))
             ;; DATA collects the chunks produced by this one element.
             data)
        (cond
         ;; For `fill' and `align' the count sits in the TYPE slot.
         ((eq field 'fill)
          (setq data (list (make-vector type 0))))
         ((eq field 'align)
          ;; OFFSET is kept up to date as a byte count (see the end of
          ;; the loop).  The outer `%' makes the padding zero when the
          ;; position is already a multiple of the alignment.
          (let ((extra (% (- type (% offset type)) type)))
            (if (> extra 0)
                (setq data (list (make-vector extra 0))))))
         ((eq field 'struct)
          (setq data (struct--pack-group struct (eval type) offset)))
         ((eq type 'struct)
          (setq data (struct--pack-group (struct-field struct field)
                                         (eval len) offset)))
         (t
          ;; A computed length refers to another field of the data
          ;; being encoded, so look it up in STRUCT.
          (if (consp len)
              (setq len (apply 'struct-field struct len)))
          (if (not len)
              (setq len 1))
          (if (eq type 'repeat)
              (let ((i 0))
                (while (< i len)
                  ;; Encode the I-th element of the repeated field; it
                  ;; starts after the elements already encoded.
                  (setq data
                        (append data
                                (struct--pack-group
                                 (nth i (struct-field struct field))
                                 (nth 3 item)
                                 (+ offset (struct--pack-size data)))))
                  (setq i (1+ i))))
            (setq data (list (struct--pack-item (struct-field struct field)
                                                type len))))))
        ;; Append this element's chunks and advance the byte offset.
        (setq result (append result data)
              offset (+ offset (struct--pack-size data))))
      (setq spec (cdr spec)))
    result))

(defun struct-pack (struct spec)
  "Encode the alist STRUCT according to SPEC and return a unibyte string.
The chunks produced by `struct--pack-group' (starting at offset 0) are
concatenated and the result is converted with `string-make-unibyte'."
  (let ((chunks (struct--pack-group struct spec 0)))
    (string-make-unibyte (apply 'concat chunks))))


;; Announce the `struct' feature so that (require 'struct) succeeds.
(provide 'struct)


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-18 23:12 Kim F. Storm
@ 2002-03-19  0:25 ` Miles Bader
  2002-03-19  7:02 ` Eli Zaretskii
  2002-03-19 14:33 ` Luke Gorrie
  2 siblings, 0 replies; 24+ messages in thread
From: Miles Bader @ 2002-03-19  0:25 UTC (permalink / raw)
  Cc: emacs-devel

sotrm@cua.dk (Kim F. Storm) writes:
> While writing a package that sends and receives datagrams using the
> new make-network-process functionality, I quickly found that I needed
> to be able to encode and decode binary data structures, so I came up
> with the following package (struct.el).

This is not a comment on the contents, but you probably should call it
something other than `struct', since people generally use that term to
refer to lisp structures (made with `defstruct').

-Miles
-- 
We live, as we dream -- alone....

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-18 23:12 Kim F. Storm
  2002-03-19  0:25 ` Miles Bader
@ 2002-03-19  7:02 ` Eli Zaretskii
  2002-03-19 12:34   ` Stefan Monnier
  2002-03-19 14:33 ` Luke Gorrie
  2 siblings, 1 reply; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-19  7:02 UTC (permalink / raw)
  Cc: emacs-devel, Kenichi Handa


On 19 Mar 2002, Kim F. Storm wrote:

> While writing a package that sends and receives datagrams using the
> new make-network-process functionality, I quickly found that I needed
> to be able to encode and decode binary data structures, so I came up
> with the following package (struct.el).
> 
> I'd like to hear if something like this already exists, or if others
> find it should be added to emacs (with more complete documentation of
> course).  [Also, the struct-pack function doesn't work with nested
> data, but I'll fix that if there is an interest in this package].

I think it would be a very useful addition to Emacs, but I have one
comment about the implementation: I don't like (and that's an
understatement!) the idea of using unibyte strings.  I especially get 
shivers when I see string-make-unibyte and its ilk.

Unibyte strings and buffers are The Mother Of All Evil in Emacs--they are
the primary reason for those pesky \201 characters popping up in user
buffers.  Thanks to titanic effort of Handa-san and others, Emacs mostly
does TRT with unibyte text, but I think we shouldn't test that too much
for our own good, especially in a package bundled with Emacs. 

If I understand correctly what you want to do, there should be no reason 
for unibyte text in your implementation.  Emacs should be able to deal 
with binary data as multibyte, just be sure to decode it as raw-text-unix.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-19  7:02 ` Eli Zaretskii
@ 2002-03-19 12:34   ` Stefan Monnier
  2002-03-19 14:38     ` Eli Zaretskii
  0 siblings, 1 reply; 24+ messages in thread
From: Stefan Monnier @ 2002-03-19 12:34 UTC (permalink / raw)
  Cc: Kim F. Storm, emacs-devel, Kenichi Handa

> > While writing a package that sends and receives datagrams using the
> > new make-network-process functionality, I quickly found that I needed
> > to be able to encode and decode binary data structures, so I came up
> > with the following package (struct.el).
> > 
> > I'd like to hear if something like this already exists, or if others
> > find it should be added to emacs (with more complete documentation of
> > course).  [Also, the struct-pack function doesn't work with nested
> > data, but I'll fix that if there is an interest in this package].
> 
> I think it would be a very useful addition to Emacs, but I have one
> comment about the implementation: I don't like (and that's an
> understatement!) the idea of using unibyte strings.  I especially get 
> shivers when I see string-make-unibyte and its ilk.

While I agree that unibyte strings and buffers are sources of all
sorts of problems, I think that storing binary data in multibyte strings
and buffers is generally wrong.  It's perfectly fine to use the
eight-bit-control charset for the odd "unknown byte sequence",
but for raw binary data, it's just a waste of memory and CPU
resources.


	Stefan


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-18 23:12 Kim F. Storm
  2002-03-19  0:25 ` Miles Bader
  2002-03-19  7:02 ` Eli Zaretskii
@ 2002-03-19 14:33 ` Luke Gorrie
  2 siblings, 0 replies; 24+ messages in thread
From: Luke Gorrie @ 2002-03-19 14:33 UTC (permalink / raw)
  Cc: emacs-devel

no-spam@cua.dk (Kim F. Storm) writes:

> I'd like to hear if something like this already exists, or if others
> find it should be added to emacs (with more complete documentation of
> course).  [Also, the struct-pack function doesn't work with nested
> data, but I'll fix that if there is an interest in this package].

I often write code like this and would be very keen on a nice generic
library.

e.g. I have a decoding/encoding module for a sexp-like binary format
at http://www.bluetail.com/~luke/misc/erlext.el - it's not generalised
and takes a different approach, but it's the same sort of code.

On a similar note, I've recently written a small framework for network
state machines in order to support non-trivial protocols non-blockingly,
with trace/debug convenience and so on. A work in progress that uses
it is at http://www.bluetail.com/~luke/misc/distel.tar.gz if you're
interested. net-fsm.el is the state machine code, also downloadable by
itself at http://www.bluetail.com/~luke/misc/net-fsm.el (it includes
yet another small ad hoc binary encoding API).

Cheers,
Luke


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
       [not found] <Pine.SUN.3.91.1020319133840.17982G-100000@is>
@ 2002-03-19 14:34 ` Kim F. Storm
  2002-03-19 21:27   ` Thien-Thi Nguyen
  2002-03-21  9:04   ` Richard Stallman
  0 siblings, 2 replies; 24+ messages in thread
From: Kim F. Storm @ 2002-03-19 14:34 UTC (permalink / raw)
  Cc: emacs-devel

Eli Zaretskii <eliz@is.elta.co.il> writes:

> [I think we should conduct this discussion on emacs-devel: it might be 
> useful for others.]
I've CC:ed the list.

> 
> On 19 Mar 2002, Kim F. Storm wrote:
> 
> > I used string-make-unibyte because I was having a great many
> > difficulties with "struct-pack" generating what seems to be a bogus
> > string, i.e. if I used `length' on the result, it didn't even have the
> > expected length.  So it didn't DTRT for me in this case.
> 
> You would have to show fragments of Lisp for me to see what could be the 
> problem.
> 
> > I did manage to control that via the current buffer's coding system,
> > but that seemed to make pretty random results, depending on what
> > buffer I happened to eval the function in (e.g. my source buffer
> > vs. *scratch* vs. a process buffer).
> 
> This seems to indicate that you forced Emacs to convert from unibyte to 
> multibyte and back.  When you do that, it uses the current defaults to 
> guess what you mean as TRT.  That's why I said we shouldn't stress this 
> ad-hoc'ery too much--it's too fragile.

The problem probably was that I took some binary data from one buffer,
put it into a (multibyte) string, and then tried to unpack that string
in another buffer.  It didn't work - probably because those buffers
had different coding systems.

Since I really wanted this to work on binary, byte-oriented data,
I thought unibyte would be the correct mantra for me to use (and
it does work for this purpose).

> 
> > So I probably just have to warn users that this package will only work
> > on binary data if the current buffer's coding system is raw-text-unix.
> 
> I don't understand this--when you read binary data, what other decoding 
> can you possibly use in the buffer into which the data is read?  

True.  It just puzzles me why I always seem to run into problems when
emacs DTRT with this coding stuff.  It *was* easier in the old days,
and my mind probably just can't cope with this "new" coding stuff :-)


>                                                                    Why 
> should users bother about what kind of buffer are you using behind the 
> scenes to convert binary garbage into human-readable description?  What 
> am I missing?

In this scope, the `users' I was talking about are really `programmers'.
So `real users' definitely shouldn't bother -- and they will probably be
unaware of the existence of the struct.el package too.

> 
> > To me it would make sense to have a coding system named `binary' which
> > would work across all platforms.
> 
> `binary' _does_ work this way.  My problem was not with the coding system 
> you used, it was with the fact that you used unibyte strings.  That's a 
> different issue.
> 
> > Besides setting the buffer's coding system, is there some other way to
> > ensure that I'll always *use* a coding system which ensures that
> > `(substring ... 4 5)' really does only take one *byte* from the
> > string?
> 
> `substring' is a string operation.  Your data is not text, so you 
> should avoid string operations, I think.  Again, it's hard to give 
> specific advice without a specific example.  In general, functions like 
> char-after are much better.

My code is typically used in a network process filter, and in that
scope, the received (binary) data is delivered in a string, so yes, my
data isn't text, but it is stored in a string...

In any case, I'll follow your advice and rely on using the proper
coding systems for the buffers and processes involved (and keep my
fingers crossed :-).

-- 
Kim F. Storm <storm@cua.dk> http://www.cua.dk


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-19 12:34   ` Stefan Monnier
@ 2002-03-19 14:38     ` Eli Zaretskii
  0 siblings, 0 replies; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-19 14:38 UTC (permalink / raw)
  Cc: Kim F. Storm, emacs-devel, Kenichi Handa


On Tue, 19 Mar 2002, Stefan Monnier wrote:

> It's perfectly fine to use the
> eight-bit-control charset for the odd "unknown byte sequence",
> but for raw binary data, it's just a waste of memory and CPU
> resources.

I'd prefer some waste of cycles to the problems that unibyte text can 
beget.

But I'm not even sure there is a waste of cycles: it's quite possible 
that Emacs does the conversion, maybe even several times, behind our 
back.  If we really care about this, we should step through the code and 
see.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-19 14:34 ` Kim F. Storm
@ 2002-03-19 21:27   ` Thien-Thi Nguyen
  2002-03-21  9:04   ` Richard Stallman
  1 sibling, 0 replies; 24+ messages in thread
From: Thien-Thi Nguyen @ 2002-03-19 21:27 UTC (permalink / raw)
  Cc: Eli Zaretskii, emacs-devel

storm@cua.dk (Kim F. Storm) writes:

   In any case, I'll follow your advice and rely on using the proper
   coding systems for the buffers and processes involved (and keep my
   fingers crossed :-).

perhaps you're already aware of this: GNU serveez has similar provision (using
guile for hll instead of elisp) for "network binary" <-> sexp handling.  i
hope the related APIs for emacs and serveez can be kept w/in machine-shouting
distance of each other...

thi

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-19 14:34 ` Kim F. Storm
  2002-03-19 21:27   ` Thien-Thi Nguyen
@ 2002-03-21  9:04   ` Richard Stallman
  2002-03-21 11:15     ` Eli Zaretskii
  1 sibling, 1 reply; 24+ messages in thread
From: Richard Stallman @ 2002-03-21  9:04 UTC (permalink / raw)
  Cc: eliz, emacs-devel

There is nothing illegitimate about unibyte strings.  There is no
general reason to avoid them, and we do not have a general policy
of avoiding them.  Using them here seems natural to me.

Is there some specific reason not to use unibyte strings here?


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21  9:04   ` Richard Stallman
@ 2002-03-21 11:15     ` Eli Zaretskii
  2002-03-21 13:31       ` Kim F. Storm
  2002-03-23  2:36       ` Richard Stallman
  0 siblings, 2 replies; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-21 11:15 UTC (permalink / raw)
  Cc: storm, emacs-devel


On Thu, 21 Mar 2002, Richard Stallman wrote:

> Is there some specific reason not to use unibyte strings here?

They are simply not needed here (AFAICS).  IMO, unibyte strings
should not be used unless absolutely necessary, ideally never.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 11:15     ` Eli Zaretskii
@ 2002-03-21 13:31       ` Kim F. Storm
  2002-03-21 14:56         ` Eli Zaretskii
  2002-03-21 16:53         ` Stefan Monnier
  2002-03-23  2:36       ` Richard Stallman
  1 sibling, 2 replies; 24+ messages in thread
From: Kim F. Storm @ 2002-03-21 13:31 UTC (permalink / raw)
  Cc: Richard Stallman, emacs-devel

Eli Zaretskii <eliz@is.elta.co.il> writes:

> On Thu, 21 Mar 2002, Richard Stallman wrote:
> 
> > Is there some specific reason not to use unibyte strings here?
> 
> They are simply not needed here (AFAICS). 

On the surface, it looks "obvious" to use unibyte strings for
data which is known to be - and processed as - byte-oriented
data.

But I guess you are right that it isn't strictly necessary even for
byte-oriented data, so I will avoid the final conversion to unibyte --
if unibyte-ness is really needed, the caller can do the conversion.

I've now changed the struct.el code to operate on a vector instead of
a string, so in my case it really doesn't matter anymore.

>                                            IMO, unibyte strings
> should not be used unless absolutely necessary, ideally never.

Am I right in assuming that aref, aset, and substring are potentially
much slower on a multibyte string than on a unibyte string?  Probably
not noticeable if you just look at a single char from a string, but
what if you loop over a string (forwards or backwards)?

I can see there is some caching in string_char_to_byte for multiple
accesses to the same string, but it breaks as soon as you operate on more
than one string e.g. in a loop.

-- 
Kim F. Storm <storm@cua.dk> http://www.cua.dk


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 13:31       ` Kim F. Storm
@ 2002-03-21 14:56         ` Eli Zaretskii
  2002-03-21 15:36           ` Kim F. Storm
                             ` (2 more replies)
  2002-03-21 16:53         ` Stefan Monnier
  1 sibling, 3 replies; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-21 14:56 UTC (permalink / raw)
  Cc: Richard Stallman, emacs-devel


On 21 Mar 2002, Kim F. Storm wrote:

> > > Is there some specific reason not to use unibyte strings here?
> > 
> > They are simply not needed here (AFAICS). 
> 
> On the surface, it looks "obvious" to use unibyte strings for
> data which is known to be - and processed as - byte-oriented
> data.

That's the C programmer in you trying to get free ;-)

Seriously, though: I think we should stop thinking about unibyte vs 
multibyte strings, and instead think about them as, well, just strings.
The uni- vs multi-byteness is an implementation detail that should not 
bother a Lisp programmer.  (That it sometimes does is a sign of bugs 
that need to be fixed, IMHO.)

Emacs 21 is perfectly capable of holding binary data in a multibyte 
buffer, so there's no need to do this on the Lisp level.  If Emacs 
decides that a buffer needs to be switched to unibyte mode, it will do
so automatically.

> >                                            IMO, unibyte strings
> > should not be used unless absolutely necessary, ideally never.
> 
> Am I right in assuming that aref, aset, and substring are potentially
> much slower on a multibyte string than on a unibyte string?

If the string holds non-ASCII text, yes.  But not in your case, where the 
``string'' holds binary data, I think.

Also, I think the slow-down should be noticeable when you need to walk 
many characters, like when you invoke substring with a large argument.  
In your case, you take a small number of characters and then run some 
Lisp on them (e.g. to convert them into a number), so I think the 
slow-down will be negligible.  But that's speculation: I didn't really do 
any measurements, so some complication might be evading me.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 14:56         ` Eli Zaretskii
@ 2002-03-21 15:36           ` Kim F. Storm
  2002-03-21 16:58             ` Stefan Monnier
  2002-03-22  0:37           ` Miles Bader
  2002-03-23  2:35           ` Richard Stallman
  2 siblings, 1 reply; 24+ messages in thread
From: Kim F. Storm @ 2002-03-21 15:36 UTC (permalink / raw)
  Cc: emacs-devel

Eli Zaretskii <eliz@is.elta.co.il> writes:

> On 21 Mar 2002, Kim F. Storm wrote:
> 
> > > > Is there some specific reason not to use unibyte strings here?
> > > 
> > > They are simply not needed here (AFAICS). 
> > 
> > On the surface, it looks "obvious" to use unibyte strings for
> > data which is known to be - and processed as - byte-oriented
> > data.
> 
> That's the C programmer in you trying to get free ;-)

Right on the spot :-)

> > Am I right in assuming that aref, aset, and substring are potentially
> > much slower on a multibyte string than on a unibyte string?

I just tried to measure whether there is any difference, by copying
between two strings like this:
  (let ((i (length a)))
    (while (> i 0)
      (setq i (1- i))
      (aset a i (aref b i))))

a and b are either both multibyte or both unibyte and initialized
to all NULs.  The difference in time is very small if there is any...

> 
> If the string holds non-ASCII text, yes.  But not in your case, where the 
> ``string'' holds binary data, I think.

.. which confirms your belief.

-- 
Kim F. Storm <storm@cua.dk> http://www.cua.dk


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 13:31       ` Kim F. Storm
  2002-03-21 14:56         ` Eli Zaretskii
@ 2002-03-21 16:53         ` Stefan Monnier
  1 sibling, 0 replies; 24+ messages in thread
From: Stefan Monnier @ 2002-03-21 16:53 UTC (permalink / raw)
  Cc: Eli Zaretskii, Richard Stallman, emacs-devel

> On the surface, it looks "obvious" to use unibyte strings for
> data which is known to be - and processed as - byte-oriented
> data.

Agreed.  Unibyte strings are basically byte-arrays whereas
multibyte-strings are char arrays.  In your case you're dealing
with bytes rather than with chars so it makes more sense (to me)
to use unibyte strings.

> >                                            IMO, unibyte strings
> > should not be used unless absolutely necessary, ideally never.
> 
> Am I right in assuming that aref, aset, and substring are potentially
> much slower on a multibyte string than on a unibyte string?  Probably
> not noticeable if you just look at a single char from a string, but
> what if you loop over a string (forwards or backwards)?

I think it's worse than that.  `aset' on a multibyte string is very
difficult to support in the case where you replace a char with another
of a different byte-length: it ends up allocating a whole new string.
This is really bad.  I think `aset' on strings should simply be disallowed
(I have it disabled here).


	Stefan


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 15:36           ` Kim F. Storm
@ 2002-03-21 16:58             ` Stefan Monnier
  2002-03-21 19:45               ` Eli Zaretskii
  0 siblings, 1 reply; 24+ messages in thread
From: Stefan Monnier @ 2002-03-21 16:58 UTC (permalink / raw)
  Cc: Eli Zaretskii, emacs-devel

> I just tried to measure whether there is any difference, by copying
> between two strings like this:
>   (let ((i (length a)))
>     (while (> i 0)
>       (setq i (1- i))
>       (aset a i (aref b i))))
> 
> a and b are either both multibyte or both unibyte and initialized
> to all NULs.  The difference in time is very small if there is any...

Try it with a initialized to all NULs and b initialized to all
eight-bit-graphic chars.

> > If the string holds non-ASCII text, yes.  But not in your case, where the 
> > ``string'' holds binary data, I think.

This is only true if binary values between 128 and 256 are represented
as a single-byte, but that's not the case in multibyte strings.


	Stefan


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 16:58             ` Stefan Monnier
@ 2002-03-21 19:45               ` Eli Zaretskii
  2002-03-22  1:05                 ` Stefan Monnier
  0 siblings, 1 reply; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-21 19:45 UTC (permalink / raw)
  Cc: storm, emacs-devel

> From: "Stefan Monnier" <monnier+gnu/emacs@rum.cs.yale.edu>
> Date: Thu, 21 Mar 2002 11:58:34 -0500
> 
> > > If the string holds non-ASCII text, yes.  But not in your case, where the 
> > > ``string'' holds binary data, I think.
> 
> This is only true if binary values between 128 and 256 are represented
> as a single-byte, but that's not the case in multibyte strings.

Most of them, those between 160 and 255 _are_ represented as a single
byte, even in a multibyte string/buffer.  That's the eight-bit-graphic
character set.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 14:56         ` Eli Zaretskii
  2002-03-21 15:36           ` Kim F. Storm
@ 2002-03-22  0:37           ` Miles Bader
  2002-03-23  2:35           ` Richard Stallman
  2 siblings, 0 replies; 24+ messages in thread
From: Miles Bader @ 2002-03-22  0:37 UTC (permalink / raw)
  Cc: Kim F. Storm, Richard Stallman, emacs-devel

Eli Zaretskii <eliz@is.elta.co.il> writes:
> Emacs 21 is perfectly capable of holding binary data in a multibyte 
> buffer, so there's no need to do this on the Lisp level.  If Emacs 
> decides that a buffer needs to be switched to unibyte mode, it will do
> so automatically.

I agree strongly.  Unibyte strings as `an interface' should go away.

A unibyte string (or buffer?) could reference a single character set
that applies to every character in them; if you try to add a character
not in that character set, it would convert the whole thing to
multibyte, and then add the character.

-Miles
-- 
[|nurgle|]  ddt- demonic? so quake will have an evil kinda setting? one that 
            will  make every christian in the world foam at the mouth? 
[iddt]      nurg, that's the goal 

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 19:45               ` Eli Zaretskii
@ 2002-03-22  1:05                 ` Stefan Monnier
  2002-03-22 11:04                   ` Eli Zaretskii
  0 siblings, 1 reply; 24+ messages in thread
From: Stefan Monnier @ 2002-03-22  1:05 UTC (permalink / raw)
  Cc: monnier+gnu/emacs, storm, emacs-devel

> > From: "Stefan Monnier" <monnier+gnu/emacs@rum.cs.yale.edu>
> > Date: Thu, 21 Mar 2002 11:58:34 -0500
> > 
> > > > If the string holds non-ASCII text, yes.  But not in your case, where the 
> > > > ``string'' holds binary data, I think.
> > 
> > This is only true if binary values between 128 and 256 are represented
> > as a single-byte, but that's not the case in multibyte strings.
> 
> Most of them, those between 160 and 255 _are_ represented as a single
> byte, even in a multibyte string/buffer.  That's the eight-bit-graphic
> character set.

Interesting.  It still means that binary values between 128 and 160
(i.e. from eight-bit-control) are posing problems when calling `aset'.

But now that I think about it, if 160-255 can be an eight-bit-graphic
character, how does the code deal with "backward-char" ?
Looking at DEC_POS in charset.h I see that we do

	while (p > p_min && !CHAR_HEAD_P (*p)) p--;

so if the buffer is filled with eight-bit-graphic we might skip backward
over the whole buffer before discovering that the char we want
to skip was just an eight-bit-graphic.

Do I understand the code correctly ?
If so, shouldn't we set p_min to a value like

	max (p_min, p - MAX_MULTIBYTE_LENGTH)

to make sure we don't hit this pathological case ?


	Stefan


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
@ 2002-03-22  1:25 Kenichi Handa
  2002-03-22  1:27 ` Stefan Monnier
  0 siblings, 1 reply; 24+ messages in thread
From: Kenichi Handa @ 2002-03-22  1:25 UTC (permalink / raw)
  Cc: eliz, monnier+gnu/emacs, storm, emacs-devel

"Stefan Monnier" <monnier+gnu/emacs@RUM.cs.yale.edu> writes:
> But now that I think about it, if 160-255 can be an eight-bit-graphic
> character, how does the code deal with "backward-char" ?
> Looking at DEC_POS in charset.h I see that we do

> 	while (p > p_min && !CHAR_HEAD_P (*p)) p--;

Please don't skip the following four lines:

	len = pend + 1 - p;						\
	PARSE_MULTIBYTE_SEQ (p, len, bytes);				\
	if (bytes == len)						\
	  pos_byte -= len - 1;						\

which handles the above case.  When we at last reach a
char-head, PARSE_MULTIBYTE_SEQ checks how long the byte
sequence should be.  We update pos_byte only if the length
is the same as what we decreased.

---
Ken'ichi HANDA
handa@etl.go.jp

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-22  1:25 struct.el -- a package to encode/decode binary data Kenichi Handa
@ 2002-03-22  1:27 ` Stefan Monnier
  0 siblings, 0 replies; 24+ messages in thread
From: Stefan Monnier @ 2002-03-22  1:27 UTC (permalink / raw)
  Cc: monnier+gnu/emacs, eliz, storm, emacs-devel

> "Stefan Monnier" <monnier+gnu/emacs@RUM.cs.yale.edu> writes:
> > But now that I think about it, if 160-255 can be an eight-bit-graphic
> > character, how does the code deal with "backward-char" ?
> > Looking at DEC_POS in charset.h I see that we do
> 
> > 	while (p > p_min && !CHAR_HEAD_P (*p)) p--;
> 
> Please don't skip the following four lines:
> 
> 	len = pend + 1 - p;						\
> 	PARSE_MULTIBYTE_SEQ (p, len, bytes);				\
> 	if (bytes == len)						\
> 	  pos_byte -= len - 1;						\
> 
> which handles the above case.  When we at last reach a
> char-head, PARSE_MULTIBYTE_SEQ checks how long the byte
> sequence should be.  We update pos_byte only if the length
> is the same as what we decreased.

Yes, I saw that, so indeed the code is correct.  But it seems
that going back over the whole buffer (potentially) only
to discover that a single byte was needed is kind of silly.
Hence my suggestion to set p_min to max (p_min, p - MAX_MULTIBYTE_LENGTH)


	Stefan


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
@ 2002-03-22  1:30 Kenichi Handa
  0 siblings, 0 replies; 24+ messages in thread
From: Kenichi Handa @ 2002-03-22  1:30 UTC (permalink / raw)
  Cc: eliz, monnier+gnu/emacs, storm, emacs-devel

I wrote:
> which handles the above case.  When we at last reach a
> char-head, PARSE_MULTIBYTE_SEQ checks how long the byte
> sequence should be.  We update pos_byte only if the length
> is the same as what we decreased.

Oops, it seems that I misunderstood what Stefan meant.  If
what he meant is about efficiency, yes, he is right.  It is
better that p_min is set to:
	max (p_min, p - MAX_MULTIBYTE_LENGTH)

---
Ken'ichi HANDA
handa@etl.go.jp

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-22  1:05                 ` Stefan Monnier
@ 2002-03-22 11:04                   ` Eli Zaretskii
  0 siblings, 0 replies; 24+ messages in thread
From: Eli Zaretskii @ 2002-03-22 11:04 UTC (permalink / raw)
  Cc: storm, emacs-devel

> From: "Stefan Monnier" <monnier+gnu/emacs@rum.cs.yale.edu>
> Date: Thu, 21 Mar 2002 20:05:09 -0500
> 
> > Most of them, those between 160 and 255 _are_ represented as a single
> > byte, even in a multibyte string/buffer.  That's the eight-bit-graphic
> > character set.
> 
> Interesting.  It still means that binary values between 128 and 160
> (i.e. from eight-bit-control) are posing problems when calling `aset'.

Yes, that's true.  So moving through a multibyte buffer/string with
binary data is slightly less efficient than with unibyte buffer/string
(assuming a uniform distribution of possible 8-bit values in the range
[0..255]).  Moving *back* through such buffers/strings is especially
painful, since you cannot in advance find the head of the multibyte
sequence, which tells how many bytes to move to skip the character.

However, I think in code that constructs numbers from binary data you
don't normally need to move back, or use aset.  You normally need to
move/scan forward a small number of bytes, while you accumulate their
values into a number.  That should be reasonably efficient, and so
using unibyte strings doesn't seem to be justified.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 14:56         ` Eli Zaretskii
  2002-03-21 15:36           ` Kim F. Storm
  2002-03-22  0:37           ` Miles Bader
@ 2002-03-23  2:35           ` Richard Stallman
  2 siblings, 0 replies; 24+ messages in thread
From: Richard Stallman @ 2002-03-23  2:35 UTC (permalink / raw)
  Cc: storm, emacs-devel

    > Am I right in assuming that aref, aset, and substring are potentially
    > much slower on a multibyte string than on a unibyte string?

    If the string holds non-ASCII text, yes.  But not in your case, where the 
    ``string'' holds binary data, I think.

Multibyte strings are slower in this case.  The only case where they
are not slower is when the contents are all ASCII.

To see the slowdown best, try accessing the string nonlinearly.

_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: struct.el -- a package to encode/decode binary data
  2002-03-21 11:15     ` Eli Zaretskii
  2002-03-21 13:31       ` Kim F. Storm
@ 2002-03-23  2:36       ` Richard Stallman
  1 sibling, 0 replies; 24+ messages in thread
From: Richard Stallman @ 2002-03-23  2:36 UTC (permalink / raw)
  Cc: storm, emacs-devel

      IMO, unibyte strings
    should not be used unless absolutely necessary, ideally never.

There is nothing bad about unibyte strings.  As long as Emacs has
unibyte buffers, unibyte strings are natural and add very little extra
complexity.


_______________________________________________
Emacs-devel mailing list
Emacs-devel@gnu.org
http://mail.gnu.org/mailman/listinfo/emacs-devel


^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2002-03-23  2:36 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-03-22  1:25 struct.el -- a package to encode/decode binary data Kenichi Handa
2002-03-22  1:27 ` Stefan Monnier
  -- strict thread matches above, loose matches on Subject: below --
2002-03-22  1:30 Kenichi Handa
     [not found] <Pine.SUN.3.91.1020319133840.17982G-100000@is>
2002-03-19 14:34 ` Kim F. Storm
2002-03-19 21:27   ` Thien-Thi Nguyen
2002-03-21  9:04   ` Richard Stallman
2002-03-21 11:15     ` Eli Zaretskii
2002-03-21 13:31       ` Kim F. Storm
2002-03-21 14:56         ` Eli Zaretskii
2002-03-21 15:36           ` Kim F. Storm
2002-03-21 16:58             ` Stefan Monnier
2002-03-21 19:45               ` Eli Zaretskii
2002-03-22  1:05                 ` Stefan Monnier
2002-03-22 11:04                   ` Eli Zaretskii
2002-03-22  0:37           ` Miles Bader
2002-03-23  2:35           ` Richard Stallman
2002-03-21 16:53         ` Stefan Monnier
2002-03-23  2:36       ` Richard Stallman
2002-03-18 23:12 Kim F. Storm
2002-03-19  0:25 ` Miles Bader
2002-03-19  7:02 ` Eli Zaretskii
2002-03-19 12:34   ` Stefan Monnier
2002-03-19 14:38     ` Eli Zaretskii
2002-03-19 14:33 ` Luke Gorrie

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).