From: Dmitry Antipov <dmantipov@yandex.ru>
To: Emacs development discussions <emacs-devel@gnu.org>
Subject: Huge file adventure (+patch)
Date: Mon, 07 Oct 2013 13:47:26 +0400 [thread overview]
Message-ID: <5252832E.5060804@yandex.ru> (raw)
[-- Attachment #1: Type: text/plain, Size: 1930 bytes --]
Recently I have got installed 16GB RAM in my laptop and, of course,
have tried to open 8GB ASCII text file to see what happens (now it's
just 1/2 of RAM, so why not?). After opening, I did some editing,
then wait for auto-save and ... got "Memory exhausted" message. Short
investigation quickly shows that I need ... 24GB of RAM to handle
such a file :-(. Here is why:
1) Fdo_auto_save calls Fwrite_region for the whole file, which
issues e_write (.., start=1, end=8G, ...).
2) CODING_REQUIRE_ENCODING decides that it's time to do some
encoding, and encode_coding_object allocates 8G destination
buffer (from coding.c):
8335 else if (EQ (dst_object, Qt))
8336 {
8337 ptrdiff_t dst_bytes = max (1, coding->src_chars);
8338 coding->dst_object = Qnil;
8339 coding->destination = xmalloc (dst_bytes); /* HERE */
8340 coding->dst_bytes = dst_bytes;
8341 coding->dst_multibyte = 0;
8342 }
3) Finally encode_coding_object tries to create 8G Lisp string
to hold the result (from coding.c):
8351 if (EQ (dst_object, Qt))
8352 {
8353 if (BUFFERP (coding->dst_object))
8354 coding->dst_object = Fbuffer_string ();
8355 else
8356 {
8357 coding->dst_object
8358 = make_unibyte_string ((char *) coding->destination,
8359 coding->produced); /* HERE */
8360 xfree (coding->destination);
8361 }
8362 }
So 8G for buffer text + 8G for encoding buffer + 8G for the result.
Since `coding->destination' is freed immediately after creating Lisp
string, I need 16G for some period of time but with short 24G peak.
And, of course, there is a patch to address an issues described above.
Comments are very welcome because I'm not hooked too much in coding
machinery (yet).
Dmitry
[-- Attachment #2: e_write_and_encode.patch --]
[-- Type: text/x-patch, Size: 3913 bytes --]
=== modified file 'src/coding.c'
--- src/coding.c 2013-08-26 05:20:59 +0000
+++ src/coding.c 2013-10-07 09:16:14 +0000
@@ -5761,6 +5761,7 @@
coding->safe_charsets = SDATA (val);
coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
coding->carryover_bytes = 0;
+ coding->raw_destination = 0;
coding_type = CODING_ATTR_TYPE (attrs);
if (EQ (coding_type, Qundecided))
@@ -8354,10 +8355,18 @@
coding->dst_object = Fbuffer_string ();
else
{
- coding->dst_object
- = make_unibyte_string ((char *) coding->destination,
- coding->produced);
- xfree (coding->destination);
+ /* This is used to avoid creating huge Lisp string.
+ NOTE: caller who set `raw_destination' is also
+ responsible to free `destination' buffer. */
+ if (coding->raw_destination)
+ coding->dst_object = Qnil;
+ else
+ {
+ coding->dst_object
+ = make_unibyte_string ((char *) coding->destination,
+ coding->produced);
+ xfree (coding->destination);
+ }
}
}
=== modified file 'src/coding.h'
--- src/coding.h 2013-08-30 12:17:44 +0000
+++ src/coding.h 2013-10-07 08:21:30 +0000
@@ -512,6 +512,10 @@
`charbuf', but at `src_object'. */
unsigned chars_at_source : 1;
+ /* Nonzero if the result of conversion is in `destination'
+ buffer rather than in `dst_object'. */
+ unsigned raw_destination : 1;
+
/* Set to 1 if charbuf contains an annotation. */
unsigned annotated : 1;
=== modified file 'src/fileio.c'
--- src/fileio.c 2013-09-11 05:03:23 +0000
+++ src/fileio.c 2013-10-07 09:17:51 +0000
@@ -5263,6 +5263,10 @@
return 1;
}
+/* Maximum number of characters that the next
+ function encodes per one loop iteration. */
+
+enum { E_WRITE_MAX = 8 * 1024 * 1024 };
/* Write text in the range START and END into descriptor DESC,
encoding them with coding system CODING. If STRING is nil, START
@@ -5289,9 +5293,16 @@
coding->src_multibyte = SCHARS (string) < SBYTES (string);
if (CODING_REQUIRE_ENCODING (coding))
{
- encode_coding_object (coding, string,
- start, string_char_to_byte (string, start),
- end, string_char_to_byte (string, end), Qt);
+ ptrdiff_t nchars = min (end - start, E_WRITE_MAX);
+
+ /* Avoid creating huge Lisp string in encode_coding_object. */
+ if (nchars == E_WRITE_MAX)
+ coding->raw_destination = 1;
+
+ encode_coding_object
+ (coding, string, start, string_char_to_byte (string, start),
+ start + nchars, string_char_to_byte (string, start + nchars),
+ Qt);
}
else
{
@@ -5308,8 +5319,15 @@
coding->src_multibyte = (end - start) < (end_byte - start_byte);
if (CODING_REQUIRE_ENCODING (coding))
{
- encode_coding_object (coding, Fcurrent_buffer (),
- start, start_byte, end, end_byte, Qt);
+ ptrdiff_t nchars = min (end - start, E_WRITE_MAX);
+
+ /* Likewise. */
+ if (nchars == E_WRITE_MAX)
+ coding->raw_destination = 1;
+
+ encode_coding_object
+ (coding, Fcurrent_buffer (), start, start_byte,
+ start + nchars, CHAR_TO_BYTE (start + nchars), Qt);
}
else
{
@@ -5330,11 +5348,19 @@
if (coding->produced > 0)
{
- char *buf = (STRINGP (coding->dst_object)
- ? SSDATA (coding->dst_object)
- : (char *) BYTE_POS_ADDR (coding->dst_pos_byte));
+ char *buf = (coding->raw_destination ? (char *) coding->destination
+ : (STRINGP (coding->dst_object)
+ ? SSDATA (coding->dst_object)
+ : (char *) BYTE_POS_ADDR (coding->dst_pos_byte)));
coding->produced -= emacs_write_sig (desc, buf, coding->produced);
+ if (coding->raw_destination)
+ {
+ /* We're responsible to free this, see
+ encode_coding_object to check why. */
+ xfree (coding->destination);
+ coding->raw_destination = 0;
+ }
if (coding->produced)
return 0;
}
next reply other threads:[~2013-10-07 9:47 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-10-07 9:47 Dmitry Antipov [this message]
2013-10-07 15:15 ` Huge file adventure (+patch) Stefan Monnier
2013-10-07 16:03 ` Dmitry Antipov
2013-10-07 19:03 ` Stefan Monnier
2013-10-07 16:07 ` Eli Zaretskii
2013-10-07 17:07 ` Dmitry Antipov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/emacs/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5252832E.5060804@yandex.ru \
--to=dmantipov@yandex.ru \
--cc=emacs-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).