unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
* Huge file adventure (+patch)
@ 2013-10-07  9:47 Dmitry Antipov
  2013-10-07 15:15 ` Stefan Monnier
  2013-10-07 16:07 ` Eli Zaretskii
  0 siblings, 2 replies; 6+ messages in thread
From: Dmitry Antipov @ 2013-10-07  9:47 UTC (permalink / raw)
  To: Emacs development discussions

[-- Attachment #1: Type: text/plain, Size: 1930 bytes --]

Recently I have got installed 16GB RAM in my laptop and, of course,
have tried to open 8GB ASCII text file to see what happens (now it's
just 1/2 of RAM, so why not?). After opening, I did some editing,
then wait for auto-save and ... got "Memory exhausted" message. Short
investigation quickly shows that I need ... 24GB of RAM to handle
such a file :-(. Here is why:

1) Fdo_auto_save calls Fwrite_region for the whole file, which
    issues e_write (.., start=1, end=8G, ...).

2) CODING_REQUIRE_ENCODING decides that it's time to do some
    encoding, and encode_coding_object allocates 8G destination
    buffer (from coding.c):

    8335    else if (EQ (dst_object, Qt))
    8336      {
    8337        ptrdiff_t dst_bytes = max (1, coding->src_chars);
    8338        coding->dst_object = Qnil;
    8339        coding->destination = xmalloc (dst_bytes); /* HERE */
    8340        coding->dst_bytes = dst_bytes;
    8341        coding->dst_multibyte = 0;
    8342      }

3) Finally encode_coding_object tries to create 8G Lisp string
    to hold the result (from coding.c):

    8351    if (EQ (dst_object, Qt))
    8352      {
    8353        if (BUFFERP (coding->dst_object))
    8354          coding->dst_object = Fbuffer_string ();
    8355        else
    8356          {
    8357            coding->dst_object
    8358              = make_unibyte_string ((char *) coding->destination,
    8359                                     coding->produced); /* HERE */
    8360            xfree (coding->destination);
    8361          }
    8362      }

So 8G for buffer text + 8G for encoding buffer + 8G for the result.
Since `coding->destination' is freed immediately after creating Lisp
string, I need 16G for some period of time but with short 24G peak.

And, of course, there is a patch to address an issues described above.
Comments are very welcome because I'm not hooked too much in coding
machinery (yet).

Dmitry

[-- Attachment #2: e_write_and_encode.patch --]
[-- Type: text/x-patch, Size: 3913 bytes --]

=== modified file 'src/coding.c'
--- src/coding.c	2013-08-26 05:20:59 +0000
+++ src/coding.c	2013-10-07 09:16:14 +0000
@@ -5761,6 +5761,7 @@
   coding->safe_charsets = SDATA (val);
   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
   coding->carryover_bytes = 0;
+  coding->raw_destination = 0;
 
   coding_type = CODING_ATTR_TYPE (attrs);
   if (EQ (coding_type, Qundecided))
@@ -8354,10 +8355,18 @@
 	coding->dst_object = Fbuffer_string ();
       else
 	{
-	  coding->dst_object
-	    = make_unibyte_string ((char *) coding->destination,
-				   coding->produced);
-	  xfree (coding->destination);
+	  /* This is used to avoid creating huge Lisp string.
+	     NOTE: caller who set `raw_destination' is also
+	     responsible to free `destination' buffer.  */
+	  if (coding->raw_destination)
+	    coding->dst_object = Qnil;
+	  else
+	    {
+	      coding->dst_object
+		= make_unibyte_string ((char *) coding->destination,
+				       coding->produced);
+	      xfree (coding->destination);
+	    }
 	}
     }
 

=== modified file 'src/coding.h'
--- src/coding.h	2013-08-30 12:17:44 +0000
+++ src/coding.h	2013-10-07 08:21:30 +0000
@@ -512,6 +512,10 @@
      `charbuf', but at `src_object'.  */
   unsigned chars_at_source : 1;
 
+  /* Nonzero if the result of conversion is in `destination'
+     buffer rather than in `dst_object'.  */
+  unsigned raw_destination : 1;
+
   /* Set to 1 if charbuf contains an annotation.  */
   unsigned annotated : 1;
 

=== modified file 'src/fileio.c'
--- src/fileio.c	2013-09-11 05:03:23 +0000
+++ src/fileio.c	2013-10-07 09:17:51 +0000
@@ -5263,6 +5263,10 @@
   return 1;
 }
 
+/* Maximum number of characters that the next
+   function encodes per one loop iteration.  */
+
+enum { E_WRITE_MAX = 8 * 1024 * 1024 };
 
 /* Write text in the range START and END into descriptor DESC,
    encoding them with coding system CODING.  If STRING is nil, START
@@ -5289,9 +5293,16 @@
 	  coding->src_multibyte = SCHARS (string) < SBYTES (string);
 	  if (CODING_REQUIRE_ENCODING (coding))
 	    {
-	      encode_coding_object (coding, string,
-				    start, string_char_to_byte (string, start),
-				    end, string_char_to_byte (string, end), Qt);
+	      ptrdiff_t nchars = min (end - start, E_WRITE_MAX);
+
+	      /* Avoid creating huge Lisp string in encode_coding_object.  */
+	      if (nchars == E_WRITE_MAX)
+		coding->raw_destination = 1;
+
+	      encode_coding_object
+		(coding, string, start, string_char_to_byte (string, start),
+		 start + nchars, string_char_to_byte (string, start + nchars),
+		 Qt);
 	    }
 	  else
 	    {
@@ -5308,8 +5319,15 @@
 	  coding->src_multibyte = (end - start) < (end_byte - start_byte);
 	  if (CODING_REQUIRE_ENCODING (coding))
 	    {
-	      encode_coding_object (coding, Fcurrent_buffer (),
-				    start, start_byte, end, end_byte, Qt);
+	      ptrdiff_t nchars = min (end - start, E_WRITE_MAX);
+
+	      /* Likewise.  */
+	      if (nchars == E_WRITE_MAX)
+		coding->raw_destination = 1;
+
+	      encode_coding_object
+		(coding, Fcurrent_buffer (), start, start_byte,
+		 start + nchars, CHAR_TO_BYTE (start + nchars), Qt);
 	    }
 	  else
 	    {
@@ -5330,11 +5348,19 @@
 
       if (coding->produced > 0)
 	{
-	  char *buf = (STRINGP (coding->dst_object)
-		       ? SSDATA (coding->dst_object)
-		       : (char *) BYTE_POS_ADDR (coding->dst_pos_byte));
+	  char *buf = (coding->raw_destination ? (char *) coding->destination
+		       : (STRINGP (coding->dst_object)
+			  ? SSDATA (coding->dst_object)
+			  : (char *) BYTE_POS_ADDR (coding->dst_pos_byte)));
 	  coding->produced -= emacs_write_sig (desc, buf, coding->produced);
 
+	  if (coding->raw_destination)
+	    {
+	      /* We're responsible to free this, see
+		 encode_coding_object to check why.  */
+	      xfree (coding->destination);
+	      coding->raw_destination = 0;
+	    }
 	  if (coding->produced)
 	    return 0;
 	}


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Huge file adventure (+patch)
  2013-10-07  9:47 Huge file adventure (+patch) Dmitry Antipov
@ 2013-10-07 15:15 ` Stefan Monnier
  2013-10-07 16:03   ` Dmitry Antipov
  2013-10-07 16:07 ` Eli Zaretskii
  1 sibling, 1 reply; 6+ messages in thread
From: Stefan Monnier @ 2013-10-07 15:15 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Emacs development discussions

> And, of course, there is a patch to address an issues described above.
> Comments are very welcome because I'm not hooked too much in coding
> machinery (yet).

Sounds good, in general.

>        else
>  	{
> -	  coding->dst_object
> -	    = make_unibyte_string ((char *) coding->destination,
> -				   coding->produced);
> -	  xfree (coding->destination);
> +	  /* This is used to avoid creating huge Lisp string.
> +	     NOTE: caller who set `raw_destination' is also
> +	     responsible to free `destination' buffer.  */
> +	  if (coding->raw_destination)
> +	    coding->dst_object = Qnil;
> +	  else
> +	    {
> +	      coding->dst_object
> +		= make_unibyte_string ((char *) coding->destination,
> +				       coding->produced);
> +	      xfree (coding->destination);
> +	    }
>  	}
>      }
 
You can remove the { between "else" and "if" (with the side-benefit that
the unchanged code will stay at the same indentation level).

>  	  if (CODING_REQUIRE_ENCODING (coding))
>  	    {
> -	      encode_coding_object (coding, string,
> -				    start, string_char_to_byte (string, start),
> -				    end, string_char_to_byte (string, end), Qt);
> +	      ptrdiff_t nchars = min (end - start, E_WRITE_MAX);
> +
> +	      /* Avoid creating huge Lisp string in encode_coding_object.  */
> +	      if (nchars == E_WRITE_MAX)
> +		coding->raw_destination = 1;
> +
> +	      encode_coding_object
> +		(coding, string, start, string_char_to_byte (string, start),
> +		 start + nchars, string_char_to_byte (string, start + nchars),
> +		 Qt);
>  	    }

Where/how do you make sure we loop back here for the rest
(when end-start > E_WRITE_MAX) ?


        Stefan



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Huge file adventure (+patch)
  2013-10-07 15:15 ` Stefan Monnier
@ 2013-10-07 16:03   ` Dmitry Antipov
  2013-10-07 19:03     ` Stefan Monnier
  0 siblings, 1 reply; 6+ messages in thread
From: Dmitry Antipov @ 2013-10-07 16:03 UTC (permalink / raw)
  To: Stefan Monnier; +Cc: Emacs development discussions

On 10/07/2013 07:15 PM, Stefan Monnier wrote:

> You can remove the { between "else" and "if" (with the side-benefit that
> the unchanged code will stay at the same indentation level).

Argh, OK.

> Where/how do you make sure we loop back here for the rest
> (when end-start > E_WRITE_MAX) ?

Did you mean 'end - start < E_WRITE_MAX'? Or am I creating
an endless loop in e_write (BTW, I don't see how)?

Dmitry




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Huge file adventure (+patch)
  2013-10-07  9:47 Huge file adventure (+patch) Dmitry Antipov
  2013-10-07 15:15 ` Stefan Monnier
@ 2013-10-07 16:07 ` Eli Zaretskii
  2013-10-07 17:07   ` Dmitry Antipov
  1 sibling, 1 reply; 6+ messages in thread
From: Eli Zaretskii @ 2013-10-07 16:07 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: emacs-devel

> Date: Mon, 07 Oct 2013 13:47:26 +0400
> From: Dmitry Antipov <dmantipov@yandex.ru>
> 
> Recently I have got installed 16GB RAM in my laptop and, of course,
> have tried to open 8GB ASCII text file to see what happens (now it's
> just 1/2 of RAM, so why not?). After opening, I did some editing,
> then wait for auto-save and ... got "Memory exhausted" message. Short
> investigation quickly shows that I need ... 24GB of RAM to handle
> such a file :-(. Here is why:

Thanks for testing.  Just 2.5 years ago Emacs would crash and burn
with any file larger than 2GB, instead of erroring out due to
insufficient VM.



^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Huge file adventure (+patch)
  2013-10-07 16:07 ` Eli Zaretskii
@ 2013-10-07 17:07   ` Dmitry Antipov
  0 siblings, 0 replies; 6+ messages in thread
From: Dmitry Antipov @ 2013-10-07 17:07 UTC (permalink / raw)
  To: Eli Zaretskii; +Cc: emacs-devel

On 10/07/2013 08:07 PM, Eli Zaretskii wrote:

> Thanks for testing.  Just 2.5 years ago Emacs would crash and burn
> with any file larger than 2GB, instead of erroring out due to
> insufficient VM.

Today I also spent some time checking why an attempt to load 5.2G Arabic
script causes cryptic "Wrong type argument: inserted-chars, 2958692352"
error. Now this is fixed in r114558 :-). More surprises are coming, indeed.

Dmitry




^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: Huge file adventure (+patch)
  2013-10-07 16:03   ` Dmitry Antipov
@ 2013-10-07 19:03     ` Stefan Monnier
  0 siblings, 0 replies; 6+ messages in thread
From: Stefan Monnier @ 2013-10-07 19:03 UTC (permalink / raw)
  To: Dmitry Antipov; +Cc: Emacs development discussions

>> You can remove the { between "else" and "if" (with the side-benefit that
>> the unchanged code will stay at the same indentation level).
> Argh, OK.
>> Where/how do you make sure we loop back here for the rest
>> (when end-start > E_WRITE_MAX) ?
> Did you mean 'end - start < E_WRITE_MAX'?

No.

> Or am I creating an endless loop in e_write (BTW, I don't see how)?

Oh, sorry, I now see that the "start += coding->consumed_char;" makes
sure that if we don't encode the whole text at once, the loop will come
back to let us finish.

Please install.


        Stefan



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2013-10-07 19:03 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2013-10-07  9:47 Huge file adventure (+patch) Dmitry Antipov
2013-10-07 15:15 ` Stefan Monnier
2013-10-07 16:03   ` Dmitry Antipov
2013-10-07 19:03     ` Stefan Monnier
2013-10-07 16:07 ` Eli Zaretskii
2013-10-07 17:07   ` Dmitry Antipov

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).