From 5c2f94a182a1154766154040eb5b4b39275fd3b6 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 14 Apr 2019 16:31:24 -0700 Subject: [PATCH] =?UTF-8?q?Replace=20executable=E2=80=99s=20fingerprint=20?= =?UTF-8?q?in=20place?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * admin/merge-gnulib (GNULIB_MODULES): Add memmem-simple. (AVOIDED_MODULES): Add memchr. * configure.ac (HAVE_PDUMPER): AC_SUBST it, too, for use in makefiles. * lib/Makefile.in (libgnu_a_OBJECTS): Add fingerprint.o. * lib/fingerprint.c: New file. * lib/memmem.c, lib/str-two-way.h, m4/memmem.m4: New files, copied from Gnulib. * lib/fingerprint.h: Rename from src/fingerprint.h. * lib-src/make-fingerprint.c: Include limits.h, sys/stat.h, fingerprint.h, intprops.h, min-max.h. (SSIZE_MAX): New macro, if not already defined. (main): Without -r, Replace the fingerprint in the input file instead of generating a fingerprint.c. * lib/Makefile.in (libgnu_a_OBJECTS): Add fingerprint.o. * lib/gnulib.mk.in, m4/gnulib-comp.m4: Regenerate. * src/Makefile.in (HAVE_PDUMPER, MAKE_PDUMPER_FINGERPRINT): New macros. (temacs$(EXEEXT)): Use them to replace the fingerprint instead of precalculating it. (mostlyclean, ctagsfiles1): Do not worry about fingerprint.c. --- admin/merge-gnulib | 4 +- configure.ac | 6 +- lib-src/make-fingerprint.c | 119 +++++++--- lib/Makefile.in | 2 +- lib/fingerprint.c | 66 ++++++ {src => lib}/fingerprint.h | 0 lib/gnulib.mk.in | 14 +- lib/memmem.c | 71 ++++++ lib/str-two-way.h | 452 +++++++++++++++++++++++++++++++++++++ m4/gnulib-comp.m4 | 22 +- m4/memmem.m4 | 154 +++++++++++++ src/Makefile.in | 30 ++- 12 files changed, 880 insertions(+), 60 deletions(-) create mode 100644 lib/fingerprint.c rename {src => lib}/fingerprint.h (100%) create mode 100644 lib/memmem.c create mode 100644 lib/str-two-way.h create mode 100644 m4/memmem.m4 diff --git a/admin/merge-gnulib b/admin/merge-gnulib index 055e791d62..4a69310d83 100755 --- a/admin/merge-gnulib +++ b/admin/merge-gnulib @@ -36,7 +36,7 @@ GNULIB_MODULES= filemode filevercmp flexmember fpieee fstatat fsusage fsync getloadavg getopt-gnu gettime gettimeofday gitlog-to-changelog ieee754-h ignore-value intprops largefile lstat - manywarnings memrchr minmax mkostemp mktime nstrftime + manywarnings memmem-simple memrchr minmax mkostemp mktime nstrftime pipe2 pselect pthread_sigmask putenv qcopy-acl readlink readlinkat regex sig2str socklen stat-time std-gnu11 stdalign stddef stdio stpcpy strtoimax symlink sys_stat sys_time @@ -47,7 +47,7 @@ GNULIB_MODULES= AVOIDED_MODULES=' btowc close dup fchdir fstat langinfo lock - malloc-posix mbrtowc mbsinit mkdir msvc-inval msvc-nothrow nl_langinfo + malloc-posix mbrtowc mbsinit memchr mkdir msvc-inval msvc-nothrow nl_langinfo openat-die opendir raise save-cwd select setenv sigprocmask stat stdarg stdbool threadlib tzset unsetenv utime utime-h diff --git a/configure.ac b/configure.ac index 1814a30cbc..cff9a27422 100644 --- a/configure.ac +++ b/configure.ac @@ -379,8 +379,12 @@ AC_DEFUN fi if test "$with_pdumper" = "yes"; then - AC_DEFINE(HAVE_PDUMPER, 1, [Define to build with portable dumper support]) + AC_DEFINE([HAVE_PDUMPER], 1, [Define to build with portable dumper support]) + HAVE_PDUMPER=yes +else + HAVE_PDUMPER=no fi +AC_SUBST([HAVE_PDUMPER]) DUMPING=$with_dumping AC_SUBST(DUMPING) diff --git a/lib-src/make-fingerprint.c b/lib-src/make-fingerprint.c index 35bb8b98a0..dc21fc2aa9 100644 --- a/lib-src/make-fingerprint.c +++ b/lib-src/make-fingerprint.c @@ -25,14 +25,25 @@ along with GNU Emacs. If not, see . */ #include +#include #include #include #include #include #include +#include + #include -#include + +#include #include +#include +#include +#include + +#ifndef SSIZE_MAX +# define SSIZE_MAX TYPE_MAXIMUM (ssize_t) +#endif #ifdef WINDOWSNT /* Defined to be sys_fopen in ms-w32.h, but only #ifdef emacs, so this @@ -54,41 +65,62 @@ main (int argc, char **argv) raw = true; break; case 'h': - printf ("make-fingerprint [-r] FILES...: compute a hash\n"); - return 0; + printf ("make-fingerprint [-r] FILE: replace or compute a hash\n"); + return EXIT_SUCCESS; default: - return 1; + return EXIT_FAILURE; } } struct sha256_ctx ctx; sha256_init_ctx (&ctx); - for (int i = optind; i < argc; ++i) + if (argc - optind != 1) { - FILE *f = fopen (argv[i], "r" FOPEN_BINARY); - if (!f) - { - fprintf (stderr, "%s: Error: could not open %s\n", - argv[0], argv[i]); - return 1; - } + fprintf (stderr, "%s: missing or extra file operand\n", argv[0]); + return EXIT_FAILURE; + } - char buf[128*1024]; - do - { - size_t chunksz = fread (buf, 1, sizeof (buf), f); - if (ferror (f)) - { - fprintf (stderr, "%s: Error: could not read %s\n", - argv[0], argv[i]); - return 1; - } - sha256_process_bytes (buf, chunksz, &ctx); - } while (!feof (f)); - fclose (f); + FILE *f = fopen (argv[1], raw ? "r" FOPEN_BINARY : "r+" FOPEN_BINARY); + struct stat st; + if (!f || fstat (fileno (f), &st) != 0) + { + perror (argv[1]); + return EXIT_FAILURE; } + if (!S_ISREG (st.st_mode)) + { + fprintf (stderr, "%s: Error: %s is not a regular file\n", + argv[0], argv[1]); + return EXIT_FAILURE; + } + + ptrdiff_t maxlen = min (min (TYPE_MAXIMUM (off_t), PTRDIFF_MAX), + min (SIZE_MAX, SSIZE_MAX)); + if (maxlen <= st.st_size) + { + fprintf (stderr, "%s: %s: file too big\n", argv[0], argv[1]); + return EXIT_FAILURE; + } + + char *buf = malloc (st.st_size + 1); + if (!buf) + { + perror ("malloc"); + return EXIT_FAILURE; + } + + size_t chunksz = fread (buf, 1, st.st_size + 1, f); + if (ferror (f) || chunksz != st.st_size) + { + fprintf (stderr, "%s: Error: could not read %s\n", + argv[0], argv[1]); + return EXIT_FAILURE; + } + + sha256_process_bytes (buf, chunksz, &ctx); + unsigned char digest[32]; sha256_finish_ctx (&ctx, digest); @@ -99,12 +131,37 @@ main (int argc, char **argv) } else { - puts ("#include \"fingerprint.h\"\n" - "unsigned char const fingerprint[] =\n" - "{"); - for (int i = 0; i < 32; ++i) - printf ("\t0x%02X,\n", digest[i]); - puts ("};"); + char *finger = memmem (buf, chunksz, fingerprint, sizeof fingerprint); + if (!finger) + { + fprintf (stderr, "%s: %s: missing fingerprint\n", argv[0], argv[1]); + return EXIT_FAILURE; + } + else if (memmem (finger + 1, buf + chunksz - (finger + 1), + fingerprint, sizeof fingerprint)) + { + fprintf (stderr, "%s: %s: two occurrences of fingerprint\n", + argv[0], argv[1]); + return EXIT_FAILURE; + } + + if (fseeko (f, finger - buf, SEEK_SET) != 0) + { + perror (argv[1]); + return EXIT_FAILURE; + } + + if (fwrite (digest, 1, sizeof digest, f) != sizeof digest) + { + perror (argv[1]); + return EXIT_FAILURE; + } + } + + if (fclose (f) != 0) + { + perror (argv[1]); + return EXIT_FAILURE; } return EXIT_SUCCESS; diff --git a/lib/Makefile.in b/lib/Makefile.in index f2d203564a..ac32c7070f 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -84,7 +84,7 @@ Makefile: # and building it would just waste time. not_emacs_OBJECTS = regex.o -libgnu_a_OBJECTS = $(gl_LIBOBJS) \ +libgnu_a_OBJECTS = fingerprint.o $(gl_LIBOBJS) \ $(patsubst %.c,%.o,$(filter %.c,$(libgnu_a_SOURCES))) for_emacs_OBJECTS = $(filter-out $(not_emacs_OBJECTS),$(libgnu_a_OBJECTS)) libegnu_a_OBJECTS = $(patsubst %.o,e-%.o,$(for_emacs_OBJECTS)) diff --git a/lib/fingerprint.c b/lib/fingerprint.c new file mode 100644 index 0000000000..e55de9c6da --- /dev/null +++ b/lib/fingerprint.c @@ -0,0 +1,66 @@ +/* Placeholder fingerprint for Emacs + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of GNU Emacs. + +GNU Emacs is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or (at +your option) any later version. + +GNU Emacs is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Emacs. If not, see . */ + +#include + +#include "fingerprint.h" + +/* This random fingerprint was generated by the shell command: + + shuf -i 0-255 -n 32 -r | awk '{printf " 0x%.02X,\n", $0}' + + In the final Emacs executable, this random fingerprint is replaced + by a fingerprint of the temporary Emacs executable that was built + along the way. */ + +unsigned char const fingerprint[] = + { + 0xDE, + 0x86, + 0xBB, + 0x99, + 0xFF, + 0xF5, + 0x46, + 0x9A, + 0x9E, + 0x3F, + 0x9F, + 0x5D, + 0x9A, + 0xDF, + 0xF0, + 0x91, + 0xBD, + 0xCD, + 0xC1, + 0xE8, + 0x0C, + 0x16, + 0x1E, + 0xAF, + 0xB8, + 0x6C, + 0xE2, + 0x2B, + 0xB1, + 0x24, + 0xCE, + 0xB0, + }; diff --git a/src/fingerprint.h b/lib/fingerprint.h similarity index 100% rename from src/fingerprint.h rename to lib/fingerprint.h diff --git a/lib/gnulib.mk.in b/lib/gnulib.mk.in index 1450df9f63..03160340c8 100644 --- a/lib/gnulib.mk.in +++ b/lib/gnulib.mk.in @@ -44,6 +44,7 @@ # --avoid=malloc-posix \ # --avoid=mbrtowc \ # --avoid=mbsinit \ +# --avoid=memchr \ # --avoid=mkdir \ # --avoid=msvc-inval \ # --avoid=msvc-nothrow \ @@ -111,6 +112,7 @@ # largefile \ # lstat \ # manywarnings \ +# memmem-simple \ # memrchr \ # minmax \ # mkostemp \ @@ -1053,7 +1055,6 @@ gl_GNULIB_ENABLED_03e0aaad4cb89ca757653bd367a6ccb7 = @gl_GNULIB_ENABLED_03e0aaad gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9 = @gl_GNULIB_ENABLED_2049e887c7e5308faad27b3f894bb8c9@ gl_GNULIB_ENABLED_21ee726a3540c09237a8e70c0baf7467 = @gl_GNULIB_ENABLED_21ee726a3540c09237a8e70c0baf7467@ gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b = @gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b@ -gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547 = @gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547@ gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31 = @gl_GNULIB_ENABLED_5264294aa0a5557541b53c8c741f7f31@ gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c = @gl_GNULIB_ENABLED_6099e9737f757db36c47fa9d9f02e88c@ gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec = @gl_GNULIB_ENABLED_682e609604ccaac6be382e4ee3a4eaec@ @@ -1963,6 +1964,17 @@ EXTRA_libgnu_a_SOURCES += lstat.c endif ## end gnulib module lstat +## begin gnulib module memmem-simple +ifeq (,$(OMIT_GNULIB_MODULE_memmem-simple)) + + +EXTRA_DIST += memmem.c str-two-way.h + +EXTRA_libgnu_a_SOURCES += memmem.c + +endif +## end gnulib module memmem-simple + ## begin gnulib module memrchr ifeq (,$(OMIT_GNULIB_MODULE_memrchr)) diff --git a/lib/memmem.c b/lib/memmem.c new file mode 100644 index 0000000000..12ae24f41b --- /dev/null +++ b/lib/memmem.c @@ -0,0 +1,71 @@ +/* Copyright (C) 1991-1994, 1996-1998, 2000, 2004, 2007-2019 Free Software + Foundation, Inc. + This file is part of the GNU C Library. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, see . */ + +/* This particular implementation was written by Eric Blake, 2008. */ + +#ifndef _LIBC +# include +#endif + +/* Specification of memmem. */ +#include + +#define RETURN_TYPE void * +#define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) +#include "str-two-way.h" + +/* Return the first occurrence of NEEDLE in HAYSTACK. Return HAYSTACK + if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in + HAYSTACK. */ +void * +memmem (const void *haystack_start, size_t haystack_len, + const void *needle_start, size_t needle_len) +{ + /* Abstract memory is considered to be an array of 'unsigned char' values, + not an array of 'char' values. See ISO C 99 section 6.2.6.1. */ + const unsigned char *haystack = (const unsigned char *) haystack_start; + const unsigned char *needle = (const unsigned char *) needle_start; + + if (needle_len == 0) + /* The first occurrence of the empty string is deemed to occur at + the beginning of the string. */ + return (void *) haystack; + + /* Sanity check, otherwise the loop might search through the whole + memory. */ + if (__builtin_expect (haystack_len < needle_len, 0)) + return NULL; + + /* Use optimizations in memchr when possible, to reduce the search + size of haystack using a linear algorithm with a smaller + coefficient. However, avoid memchr for long needles, since we + can often achieve sublinear performance. */ + if (needle_len < LONG_NEEDLE_THRESHOLD) + { + haystack = memchr (haystack, *needle, haystack_len); + if (!haystack || __builtin_expect (needle_len == 1, 0)) + return (void *) haystack; + haystack_len -= haystack - (const unsigned char *) haystack_start; + if (haystack_len < needle_len) + return NULL; + return two_way_short_needle (haystack, haystack_len, needle, needle_len); + } + else + return two_way_long_needle (haystack, haystack_len, needle, needle_len); +} + +#undef LONG_NEEDLE_THRESHOLD diff --git a/lib/str-two-way.h b/lib/str-two-way.h new file mode 100644 index 0000000000..9155e6b560 --- /dev/null +++ b/lib/str-two-way.h @@ -0,0 +1,452 @@ +/* Byte-wise substring search, using the Two-Way algorithm. + Copyright (C) 2008-2019 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Written by Eric Blake , 2008. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, see . */ + +/* Before including this file, you need to include and + , and define: + RESULT_TYPE A macro that expands to the return type. + AVAILABLE(h, h_l, j, n_l) + A macro that returns nonzero if there are + at least N_L bytes left starting at H[J]. + H is 'unsigned char *', H_L, J, and N_L + are 'size_t'; H_L is an lvalue. For + NUL-terminated searches, H_L can be + modified each iteration to avoid having + to compute the end of H up front. + + For case-insensitivity, you may optionally define: + CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L + characters of P1 and P2 are equal. + CANON_ELEMENT(c) A macro that canonicalizes an element right after + it has been fetched from one of the two strings. + The argument is an 'unsigned char'; the result + must be an 'unsigned char' as well. + + This file undefines the macros documented above, and defines + LONG_NEEDLE_THRESHOLD. +*/ + +#include +#include + +/* We use the Two-Way string matching algorithm (also known as + Chrochemore-Perrin), which guarantees linear complexity with + constant space. Additionally, for long needles, we also use a bad + character shift table similar to the Boyer-Moore algorithm to + achieve improved (potentially sub-linear) performance. + + See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260, + https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm, + https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf +*/ + +/* Point at which computing a bad-byte shift table is likely to be + worthwhile. Small needles should not compute a table, since it + adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a + speedup no greater than a factor of NEEDLE_LEN. The larger the + needle, the better the potential performance gain. On the other + hand, on non-POSIX systems with CHAR_BIT larger than eight, the + memory required for the table is prohibitive. */ +#if CHAR_BIT < 10 +# define LONG_NEEDLE_THRESHOLD 32U +#else +# define LONG_NEEDLE_THRESHOLD SIZE_MAX +#endif + +#ifndef MAX +# define MAX(a, b) ((a < b) ? (b) : (a)) +#endif + +#ifndef CANON_ELEMENT +# define CANON_ELEMENT(c) c +#endif +#ifndef CMP_FUNC +# define CMP_FUNC memcmp +#endif + +/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. + Return the index of the first byte in the right half, and set + *PERIOD to the global period of the right half. + + The global period of a string is the smallest index (possibly its + length) at which all remaining bytes in the string are repetitions + of the prefix (the last repetition may be a subset of the prefix). + + When NEEDLE is factored into two halves, a local period is the + length of the smallest word that shares a suffix with the left half + and shares a prefix with the right half. All factorizations of a + non-empty NEEDLE have a local period of at least 1 and no greater + than NEEDLE_LEN. + + A critical factorization has the property that the local period + equals the global period. All strings have at least one critical + factorization with the left half smaller than the global period. + And while some strings have more than one critical factorization, + it is provable that with an ordered alphabet, at least one of the + critical factorizations corresponds to a maximal suffix. + + Given an ordered alphabet, a critical factorization can be computed + in linear time, with 2 * NEEDLE_LEN comparisons, by computing the + shorter of two ordered maximal suffixes. The ordered maximal + suffixes are determined by lexicographic comparison while tracking + periodicity. */ +static size_t +critical_factorization (const unsigned char *needle, size_t needle_len, + size_t *period) +{ + /* Index of last byte of left half, or SIZE_MAX. */ + size_t max_suffix, max_suffix_rev; + size_t j; /* Index into NEEDLE for current candidate suffix. */ + size_t k; /* Offset into current period. */ + size_t p; /* Intermediate period. */ + unsigned char a, b; /* Current comparison bytes. */ + + /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered + out 0-length needles. */ + if (needle_len < 3) + { + *period = 1; + return needle_len - 1; + } + + /* Invariants: + 0 <= j < NEEDLE_LEN - 1 + -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) + min(max_suffix, max_suffix_rev) < global period of NEEDLE + 1 <= p <= global period of NEEDLE + p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] + 1 <= k <= p + */ + + /* Perform lexicographic search. */ + max_suffix = SIZE_MAX; + j = 0; + k = p = 1; + while (j + k < needle_len) + { + a = CANON_ELEMENT (needle[j + k]); + b = CANON_ELEMENT (needle[max_suffix + k]); + if (a < b) + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix; + } + else if (a == b) + { + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else + { + j += p; + k = 1; + } + } + else /* b < a */ + { + /* Suffix is larger, start over from current location. */ + max_suffix = j++; + k = p = 1; + } + } + *period = p; + + /* Perform reverse lexicographic search. */ + max_suffix_rev = SIZE_MAX; + j = 0; + k = p = 1; + while (j + k < needle_len) + { + a = CANON_ELEMENT (needle[j + k]); + b = CANON_ELEMENT (needle[max_suffix_rev + k]); + if (b < a) + { + /* Suffix is smaller, period is entire prefix so far. */ + j += k; + k = 1; + p = j - max_suffix_rev; + } + else if (a == b) + { + /* Advance through repetition of the current period. */ + if (k != p) + ++k; + else + { + j += p; + k = 1; + } + } + else /* a < b */ + { + /* Suffix is larger, start over from current location. */ + max_suffix_rev = j++; + k = p = 1; + } + } + + /* Choose the shorter suffix. Return the index of the first byte of + the right half, rather than the last byte of the left half. + + For some examples, 'banana' has two critical factorizations, both + exposed by the two lexicographic extreme suffixes of 'anana' and + 'nana', where both suffixes have a period of 2. On the other + hand, with 'aab' and 'bba', both strings have a single critical + factorization of the last byte, with the suffix having a period + of 1. While the maximal lexicographic suffix of 'aab' is 'b', + the maximal lexicographic suffix of 'bba' is 'ba', which is not a + critical factorization. Conversely, the maximal reverse + lexicographic suffix of 'a' works for 'bba', but not 'ab' for + 'aab'. The shorter suffix of the two will always be a critical + factorization. */ + if (max_suffix_rev + 1 < max_suffix + 1) + return max_suffix + 1; + *period = p; + return max_suffix_rev + 1; +} + +/* Return the first location of non-empty NEEDLE within HAYSTACK, or + NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This + method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. + Performance is guaranteed to be linear, with an initialization cost + of 2 * NEEDLE_LEN comparisons. + + If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at + most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. + If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * + HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ +static RETURN_TYPE +two_way_short_needle (const unsigned char *haystack, size_t haystack_len, + const unsigned char *needle, size_t needle_len) +{ + size_t i; /* Index into current byte of NEEDLE. */ + size_t j; /* Index into current window of HAYSTACK. */ + size_t period; /* The period of the right half of needle. */ + size_t suffix; /* The index of the right half of needle. */ + + /* Factor the needle into two halves, such that the left half is + smaller than the global period, and the right half is + periodic (with a period as large as NEEDLE_LEN - suffix). */ + suffix = critical_factorization (needle, needle_len, &period); + + /* Perform the search. Each iteration compares the right half + first. */ + if (CMP_FUNC (needle, needle + period, suffix) == 0) + { + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ + size_t memory = 0; + j = 0; + while (AVAILABLE (haystack, haystack_len, j, needle_len)) + { + /* Scan for matches in right half. */ + i = MAX (suffix, memory); + while (i < needle_len && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i + 1 < memory + 1) + return (RETURN_TYPE) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } + } + else + { + /* The two halves of needle are distinct; no extra memory is + required, and any mismatch results in a maximal shift. */ + period = MAX (suffix, needle_len - suffix) + 1; + j = 0; + while (AVAILABLE (haystack, haystack_len, j, needle_len)) + { + /* Scan for matches in right half. */ + i = suffix; + while (i < needle_len && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i == SIZE_MAX) + return (RETURN_TYPE) (haystack + j); + j += period; + } + else + j += i - suffix + 1; + } + } + return NULL; +} + +/* Return the first location of non-empty NEEDLE within HAYSTACK, or + NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This + method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. + Performance is guaranteed to be linear, with an initialization cost + of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. + + If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at + most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, + and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. + If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * + HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and + sublinear performance is not possible. */ +static RETURN_TYPE +two_way_long_needle (const unsigned char *haystack, size_t haystack_len, + const unsigned char *needle, size_t needle_len) +{ + size_t i; /* Index into current byte of NEEDLE. */ + size_t j; /* Index into current window of HAYSTACK. */ + size_t period; /* The period of the right half of needle. */ + size_t suffix; /* The index of the right half of needle. */ + size_t shift_table[1U << CHAR_BIT]; /* See below. */ + + /* Factor the needle into two halves, such that the left half is + smaller than the global period, and the right half is + periodic (with a period as large as NEEDLE_LEN - suffix). */ + suffix = critical_factorization (needle, needle_len, &period); + + /* Populate shift_table. For each possible byte value c, + shift_table[c] is the distance from the last occurrence of c to + the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. + shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ + for (i = 0; i < 1U << CHAR_BIT; i++) + shift_table[i] = needle_len; + for (i = 0; i < needle_len; i++) + shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; + + /* Perform the search. Each iteration compares the right half + first. */ + if (CMP_FUNC (needle, needle + period, suffix) == 0) + { + /* Entire needle is periodic; a mismatch in the left half can + only advance by the period, so use memory to avoid rescanning + known occurrences of the period in the right half. */ + size_t memory = 0; + size_t shift; + j = 0; + while (AVAILABLE (haystack, haystack_len, j, needle_len)) + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; + if (0 < shift) + { + if (memory && shift < period) + { + /* Since needle is periodic, but the last period has + a byte out of place, there can be no match until + after the mismatch. */ + shift = needle_len - period; + } + memory = 0; + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = MAX (suffix, memory); + while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (memory < i + 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i + 1 < memory + 1) + return (RETURN_TYPE) (haystack + j); + /* No match, so remember how many repetitions of period + on the right half were scanned. */ + j += period; + memory = needle_len - period; + } + else + { + j += i - suffix + 1; + memory = 0; + } + } + } + else + { + /* The two halves of needle are distinct; no extra memory is + required, and any mismatch results in a maximal shift. */ + size_t shift; + period = MAX (suffix, needle_len - suffix) + 1; + j = 0; + while (AVAILABLE (haystack, haystack_len, j, needle_len)) + { + /* Check the last byte first; if it does not match, then + shift to the next possible match location. */ + shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; + if (0 < shift) + { + j += shift; + continue; + } + /* Scan for matches in right half. The last byte has + already been matched, by virtue of the shift table. */ + i = suffix; + while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + ++i; + if (needle_len - 1 <= i) + { + /* Scan for matches in left half. */ + i = suffix - 1; + while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) + == CANON_ELEMENT (haystack[i + j]))) + --i; + if (i == SIZE_MAX) + return (RETURN_TYPE) (haystack + j); + j += period; + } + else + j += i - suffix + 1; + } + } + return NULL; +} + +#undef AVAILABLE +#undef CANON_ELEMENT +#undef CMP_FUNC +#undef MAX +#undef RETURN_TYPE diff --git a/m4/gnulib-comp.m4 b/m4/gnulib-comp.m4 index f25a0e4081..f648b7a495 100644 --- a/m4/gnulib-comp.m4 +++ b/m4/gnulib-comp.m4 @@ -113,6 +113,7 @@ AC_DEFUN # Code from module localtime-buffer: # Code from module lstat: # Code from module manywarnings: + # Code from module memmem-simple: # Code from module memrchr: # Code from module minmax: # Code from module mkostemp: @@ -195,6 +196,7 @@ AC_DEFUN gl_source_base='lib' gl_FUNC_ACL gl_FUNC_ALLOCA + gl___BUILTIN_EXPECT gl_BYTESWAP AC_CHECK_FUNCS_ONCE([readlinkat]) gl_CLOCK_TIME @@ -302,6 +304,11 @@ AC_DEFUN gl_PREREQ_LSTAT fi gl_SYS_STAT_MODULE_INDICATOR([lstat]) + gl_FUNC_MEMMEM_SIMPLE + if test $HAVE_MEMMEM = 0 || test $REPLACE_MEMMEM = 1; then + AC_LIBOBJ([memmem]) + fi + gl_STRING_MODULE_INDICATOR([memmem]) gl_FUNC_MEMRCHR if test $ac_cv_func_memrchr = no; then AC_LIBOBJ([memrchr]) @@ -426,7 +433,6 @@ AC_DEFUN gl_UTIMENS AC_C_VARARRAYS gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b=false - gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547=false gl_gnulib_enabled_cloexec=false gl_gnulib_enabled_dirfd=false gl_gnulib_enabled_euidaccess=false @@ -450,13 +456,6 @@ AC_DEFUN func_gl_gnulib_m4code_open fi } - func_gl_gnulib_m4code_37f71b604aa9c54446783d80f42fe547 () - { - if ! $gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547; then - gl___BUILTIN_EXPECT - gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547=true - fi - } func_gl_gnulib_m4code_cloexec () { if ! $gl_gnulib_enabled_cloexec; then @@ -652,9 +651,6 @@ AC_DEFUN if test $HAVE_READLINKAT = 0; then func_gl_gnulib_m4code_03e0aaad4cb89ca757653bd367a6ccb7 fi - if test $ac_use_included_regex = yes; then - func_gl_gnulib_m4code_37f71b604aa9c54446783d80f42fe547 - fi if test $ac_use_included_regex = yes; then func_gl_gnulib_m4code_21ee726a3540c09237a8e70c0baf7467 fi @@ -666,7 +662,6 @@ AC_DEFUN fi m4_pattern_allow([^gl_GNULIB_ENABLED_]) AM_CONDITIONAL([gl_GNULIB_ENABLED_260941c0e5dc67ec9e87d1fb321c300b], [$gl_gnulib_enabled_260941c0e5dc67ec9e87d1fb321c300b]) - AM_CONDITIONAL([gl_GNULIB_ENABLED_37f71b604aa9c54446783d80f42fe547], [$gl_gnulib_enabled_37f71b604aa9c54446783d80f42fe547]) AM_CONDITIONAL([gl_GNULIB_ENABLED_cloexec], [$gl_gnulib_enabled_cloexec]) AM_CONDITIONAL([gl_GNULIB_ENABLED_dirfd], [$gl_gnulib_enabled_dirfd]) AM_CONDITIONAL([gl_GNULIB_ENABLED_euidaccess], [$gl_gnulib_enabled_euidaccess]) @@ -916,6 +911,7 @@ AC_DEFUN lib/lstat.c lib/md5.c lib/md5.h + lib/memmem.c lib/memrchr.c lib/minmax.h lib/mkostemp.c @@ -959,6 +955,7 @@ AC_DEFUN lib/stdio.in.h lib/stdlib.in.h lib/stpcpy.c + lib/str-two-way.h lib/strftime.h lib/string.in.h lib/strtoimax.c @@ -1049,6 +1046,7 @@ AC_DEFUN m4/manywarnings.m4 m4/mbstate_t.m4 m4/md5.m4 + m4/memmem.m4 m4/memrchr.m4 m4/minmax.m4 m4/mkostemp.m4 diff --git a/m4/memmem.m4 b/m4/memmem.m4 new file mode 100644 index 0000000000..af2d5bbcf5 --- /dev/null +++ b/m4/memmem.m4 @@ -0,0 +1,154 @@ +# memmem.m4 serial 25 +dnl Copyright (C) 2002-2004, 2007-2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl Check that memmem is present and functional. +AC_DEFUN([gl_FUNC_MEMMEM_SIMPLE], +[ + dnl Persuade glibc to declare memmem(). + AC_REQUIRE([AC_USE_SYSTEM_EXTENSIONS]) + + AC_REQUIRE([gl_HEADER_STRING_H_DEFAULTS]) + AC_CHECK_FUNCS([memmem]) + if test $ac_cv_func_memmem = yes; then + HAVE_MEMMEM=1 + else + HAVE_MEMMEM=0 + fi + AC_CHECK_DECLS_ONCE([memmem]) + if test $ac_cv_have_decl_memmem = no; then + HAVE_DECL_MEMMEM=0 + else + dnl Detect https://sourceware.org/bugzilla/show_bug.cgi?id=12092. + dnl Also check that we handle empty needles correctly. + AC_CACHE_CHECK([whether memmem works], + [gl_cv_func_memmem_works_always], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([[ +#include /* for memmem */ +#define P "_EF_BF_BD" +#define HAYSTACK "F_BD_CE_BD" P P P P "_C3_88_20" P P P "_C3_A7_20" P +#define NEEDLE P P P P P +]], [[ + int result = 0; + if (memmem (HAYSTACK, strlen (HAYSTACK), NEEDLE, strlen (NEEDLE))) + result |= 1; + /* Check for empty needle behavior. */ + { + const char *haystack = "AAA"; + if (memmem (haystack, 3, NULL, 0) != haystack) + result |= 2; + } + return result; + ]])], + [gl_cv_func_memmem_works_always=yes], + [gl_cv_func_memmem_works_always=no], + [dnl glibc 2.9..2.12 and cygwin 1.7.7 have issue #12092 above. + dnl Also empty needles work on glibc >= 2.1 and cygwin >= 1.7.0. + dnl uClibc is not affected, since it uses different source code. + dnl Assume that it works on all other platforms (even if not linear). + AC_EGREP_CPP([Lucky user], + [ +#ifdef __GNU_LIBRARY__ + #include + #if ((__GLIBC__ == 2 && ((__GLIBC_MINOR > 0 && __GLIBC_MINOR__ < 9) \ + || __GLIBC_MINOR__ > 12)) \ + || (__GLIBC__ > 2)) \ + || defined __UCLIBC__ + Lucky user + #endif +#elif defined __CYGWIN__ + #include + #if CYGWIN_VERSION_DLL_COMBINED > CYGWIN_VERSION_DLL_MAKE_COMBINED (1007, 7) + Lucky user + #endif +#else + Lucky user +#endif + ], + [gl_cv_func_memmem_works_always="guessing yes"], + [gl_cv_func_memmem_works_always="guessing no"]) + ]) + ]) + case "$gl_cv_func_memmem_works_always" in + *yes) ;; + *) + REPLACE_MEMMEM=1 + ;; + esac + fi + gl_PREREQ_MEMMEM +]) # gl_FUNC_MEMMEM_SIMPLE + +dnl Additionally, check that memmem has linear performance characteristics +AC_DEFUN([gl_FUNC_MEMMEM], +[ + AC_REQUIRE([gl_FUNC_MEMMEM_SIMPLE]) + if test $HAVE_DECL_MEMMEM = 1 && test $REPLACE_MEMMEM = 0; then + AC_CACHE_CHECK([whether memmem works in linear time], + [gl_cv_func_memmem_works_fast], + [AC_RUN_IFELSE([AC_LANG_PROGRAM([[ +#include /* for signal */ +#include /* for memmem */ +#include /* for malloc */ +#include /* for alarm */ +static void quit (int sig) { _exit (sig + 128); } +]], [[ + int result = 0; + size_t m = 1000000; + char *haystack = (char *) malloc (2 * m + 1); + char *needle = (char *) malloc (m + 1); + /* Failure to compile this test due to missing alarm is okay, + since all such platforms (mingw) also lack memmem. */ + signal (SIGALRM, quit); + alarm (5); + /* Check for quadratic performance. */ + if (haystack && needle) + { + memset (haystack, 'A', 2 * m); + haystack[2 * m] = 'B'; + memset (needle, 'A', m); + needle[m] = 'B'; + if (!memmem (haystack, 2 * m + 1, needle, m + 1)) + result |= 1; + } + /* Free allocated memory, in case some sanitizer is watching. */ + free (haystack); + free (needle); + return result; + ]])], + [gl_cv_func_memmem_works_fast=yes], [gl_cv_func_memmem_works_fast=no], + [dnl Only glibc >= 2.9 and cygwin > 1.7.0 are known to have a + dnl memmem that works in linear time. + AC_EGREP_CPP([Lucky user], + [ +#include +#ifdef __GNU_LIBRARY__ + #if ((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 9) || (__GLIBC__ > 2)) \ + && !defined __UCLIBC__ + Lucky user + #endif +#endif +#ifdef __CYGWIN__ + #include + #if CYGWIN_VERSION_DLL_COMBINED > CYGWIN_VERSION_DLL_MAKE_COMBINED (1007, 0) + Lucky user + #endif +#endif + ], + [gl_cv_func_memmem_works_fast="guessing yes"], + [gl_cv_func_memmem_works_fast="guessing no"]) + ]) + ]) + case "$gl_cv_func_memmem_works_fast" in + *yes) ;; + *) + REPLACE_MEMMEM=1 + ;; + esac + fi +]) # gl_FUNC_MEMMEM + +# Prerequisites of lib/memmem.c. +AC_DEFUN([gl_PREREQ_MEMMEM], [:]) diff --git a/src/Makefile.in b/src/Makefile.in index 2348c8dae4..3aab5270a4 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -332,6 +332,7 @@ UNEXEC_OBJ = DUMPING=@DUMPING@ CHECK_STRUCTS = @CHECK_STRUCTS@ +HAVE_PDUMPER = @HAVE_PDUMPER@ # 'make' verbosity. AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ @@ -627,19 +628,25 @@ LIBEGNU_ARCHIVE = $(LIBEGNU_ARCHIVE): $(config_h) $(MAKE) -C $(dir $@) all -FINGERPRINTED = $(LIBXMENU) $(ALLOBJS) $(LIBEGNU_ARCHIVE) $(EMACSRES) -fingerprint.c: $(FINGERPRINTED) $(libsrc)/make-fingerprint$(EXEEXT) - $(AM_V_GEN)$(libsrc)/make-fingerprint$(EXEEXT) $(FINGERPRINTED) >$@.tmp - $(AM_V_at)mv $@.tmp $@ +ifeq ($(HAVE_PDUMPER),yes) + MAKE_PDUMPER_FINGERPRINT = $(libsrc)/make-fingerprint$(EXEEXT) +else + MAKE_PDUMPER_FINGERPRINT = +endif ## We have to create $(etc) here because init_cmdargs tests its ## existence when setting Vinstallation_directory (FIXME?). ## This goes on to affect various things, and the emacs binary fails ## to start if Vinstallation_directory has the wrong value. -temacs$(EXEEXT): fingerprint.o $(charsets) $(charscript) - $(AM_V_CCLD)$(CC) -o $@ $(ALL_CFLAGS) $(TEMACS_LDFLAGS) $(LDFLAGS) \ - $(ALLOBJS) fingerprint.o \ - $(LIBEGNU_ARCHIVE) $(W32_RES_LINK) $(LIBES) +temacs$(EXEEXT): $(LIBXMENU) $(ALLOBJS) $(LIBEGNU_ARCHIVE) $(EMACSRES) \ + $(charsets) $(charscript) $(MAKE_PDUMPER_FINGERPRINT) + $(AM_V_CCLD)$(CC) -o $@.tmp \ + $(ALL_CFLAGS) $(TEMACS_LDFLAGS) $(LDFLAGS) \ + $(ALLOBJS) $(LIBEGNU_ARCHIVE) $(W32_RES_LINK) $(LIBES) +ifeq ($(HAVE_PDUMPER),yes) + $(AM_V_at)$(MAKE_PDUMPER_FINGERPRINT) $@.tmp +endif + $(AM_V_at)mv $@.tmp $@ $(MKDIR_P) $(etc) ifeq ($(DUMPING),unexec) ifneq ($(PAXCTL_notdumped),) @@ -676,7 +683,7 @@ .PHONY: mostlyclean: rm -f temacs$(EXEEXT) core ./*.core \#* ./*.o - rm -f dmpstruct.h fingerprint.c + rm -f dmpstruct.h rm -f emacs.pdmp rm -f ../etc/DOC rm -f bootstrap-emacs$(EXEEXT) $(bootstrap_pdmp) @@ -716,10 +723,9 @@ ETAGS = ${ETAGS}: FORCE $(MAKE) -C $(dir $@) $(notdir $@) -# Remove macuvs.h and fingerprint.c since they'd cause `src/emacs` +# Remove macuvs.h since it'd cause `src/emacs` # to be built before we can get TAGS. -ctagsfiles1 = $(filter-out ${srcdir}/macuvs.h ${srcdir}/fingerprint.c, \ - $(wildcard ${srcdir}/*.[hc])) +ctagsfiles1 = $(filter-out ${srcdir}/macuvs.h, $(wildcard ${srcdir}/*.[hc])) ctagsfiles2 = $(wildcard ${srcdir}/*.m) ## In out-of-tree builds, TAGS are generated in the build dir, like -- 2.17.1