all messages for Guix-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
* [PATCH] git-download: Speed up 'git-predicate'.
@ 2017-06-02  7:08 Christopher Baines
  2017-06-02  7:34 ` Christopher Baines
  2017-06-07 12:52 ` Ludovic Courtès
  0 siblings, 2 replies; 13+ messages in thread
From: Christopher Baines @ 2017-06-02  7:08 UTC (permalink / raw)
  To: guix-devel

Adjust 'git-predicate' to use data structures that perform better when used
with git repositories with a large number of files.

Previously when matching either a regular file or directory, 'git-predicate'
would search a list with a length equal to the number of files in the
repository. As a search operation happens for roughly every file in the
repository, this meant that the time taken to use 'git-predicate' to traverse
all the files in a repository was roughly exponential with respect to the
number of files in the repository.

Now, for matching regular files or symlinks, 'git-predicate' uses a vhash
using the inode value as the key. This should perform roughly in constant
amount of time, instead of linear with respect to the number of files in the
repository.

For matching directories, 'git-predicate' now uses a tree structure stored in
association lists. To check if a directory is in the tree, the tree is
traversed from the root. The time complexity of this depends on the shape of
the tree, but it should be an improvement on searching through the list of all
files.

* guix/git-download.scm (git-predicate): Use different data structures to
  speed up 'git-predicate' with a large number of files.
---
 guix/git-download.scm | 98 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 72 insertions(+), 26 deletions(-)

diff --git a/guix/git-download.scm b/guix/git-download.scm
index 316835502..e26e5e91f 100644
--- a/guix/git-download.scm
+++ b/guix/git-download.scm
@@ -28,6 +28,7 @@
   #:use-module (ice-9 match)
   #:use-module (ice-9 popen)
   #:use-module (ice-9 rdelim)
+  #:use-module (ice-9 vlist)
   #:use-module (srfi srfi-1)
   #:export (git-reference
             git-reference?
@@ -131,39 +132,84 @@ living at DIRECTORY.  Upon Git failure, return #f instead of a predicate.
 
 The returned predicate takes two arguments FILE and STAT where FILE is an
 absolute file name and STAT is the result of 'lstat'."
-  (define (parent-directory? thing directory)
-    ;; Return #t if DIRECTORY is the parent of THING.
-    (or (string-suffix? thing directory)
-        (and (string-index thing #\/)
-             (parent-directory? (dirname thing) directory))))
-
-  (let* ((pipe        (with-directory-excursion directory
-                        (open-pipe* OPEN_READ "git" "ls-files")))
-         (files       (let loop ((lines '()))
-                        (match (read-line pipe)
-                          ((? eof-object?)
-                           (reverse lines))
-                          (line
-                           (loop (cons line lines))))))
-         (inodes      (map (lambda (file)
-                             (let ((stat (lstat
-                                          (string-append directory "/" file))))
-                               (cons (stat:dev stat) (stat:ino stat))))
-                           files))
-         (status      (close-pipe pipe)))
+  (define (create-directory-tree files)
+    (define (directory-lists->tree directory-lists)
+      (map (lambda (top-level-dir)
+             (cons top-level-dir
+                   (directory-lists->tree
+                    (filter-map
+                     (lambda (directory-list)
+                       (if (eq? (length directory-list) 1)
+                           #f
+                           (cdr directory-list)))
+                     ;; Find all the directory lists under this top-level-dir
+                     (filter
+                      (lambda (directory-list)
+                        (equal? (car directory-list)
+                                top-level-dir))
+                      directory-lists)))))
+           (delete-duplicates
+            (map car directory-lists))))
+
+    (directory-lists->tree
+     (filter-map (lambda (path)
+                   (let ((split-path (string-split path #\/)))
+                     ;; If this is a file in the top of the repository?
+                     (if (eq? (length split-path) 1)
+                         #f
+                         ;; drop-right to remove the filename, as it's
+                         ;; just the directory tree that's important
+                         (drop-right (string-split path #\/) 1))))
+                 files)))
+
+  (define (directory-in-tree? directory tree)
+    (define (directory-list-in-tree? directory-list tree)
+      (if (eq? (length directory-list) 1)
+          (list? (member (car directory-list)
+                         (map car tree)))
+          (and=> (find (match-lambda
+                         ((top-level-dir . subtree)
+                          (equal? top-level-dir
+                                  (car directory-list))))
+                       tree)
+                 (match-lambda
+                   ((top-level-dir . subtree)
+                    (directory-list-in-tree? (cdr directory-list)
+                                             subtree))))))
+
+    (directory-list-in-tree? (string-split directory #\/)
+                             tree))
+
+  (let* ((pipe           (with-directory-excursion directory
+                           (open-pipe* OPEN_READ "git" "ls-files")))
+         (files          (let loop ((lines '()))
+                           (match (read-line pipe)
+                             ((? eof-object?)
+                              (reverse lines))
+                             (line
+                              (loop (cons line lines))))))
+         (directory-tree (create-directory-tree files))
+         (inodes-vhash   (alist->vhash
+                          (map
+                           (lambda (file)
+                             (let ((stat
+                                    (lstat (string-append directory "/" file))))
+                               (cons (stat:ino stat) (stat:dev stat))))
+                           files)))
+         (status         (close-pipe pipe)))
     (and (zero? status)
          (lambda (file stat)
            (match (stat:type stat)
              ('directory
-              ;; 'git ls-files' does not list directories, only regular files,
-              ;; so we need this special trick.
-              (any (lambda (f) (parent-directory? f file))
-                   files))
+              (directory-in-tree?
+               (string-drop file (+ 1 (string-length directory)))
+               directory-tree))
              ((or 'regular 'symlink)
               ;; Comparing file names is always tricky business so we rely on
               ;; inode numbers instead
-              (member (cons (stat:dev stat) (stat:ino stat))
-                      inodes))
+              (and=> (vhash-assq (stat:ino stat) inodes-vhash)
+                     (lambda (ino-dev)
+                       (eq? (cdr ino-dev) (stat:dev stat)))))
              (_
               #f))))))
 
-- 
2.13.0

^ permalink raw reply related	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2017-07-26  9:58 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-02  7:08 [PATCH] git-download: Speed up 'git-predicate' Christopher Baines
2017-06-02  7:34 ` Christopher Baines
2017-06-07 12:40   ` Ludovic Courtès
2017-06-07 18:12     ` Christopher Baines
2017-06-07 12:52 ` Ludovic Courtès
2017-06-08 20:43   ` Christopher Baines
2017-06-19  7:14   ` Christopher Baines
2017-06-21 21:44     ` Ludovic Courtès
2017-07-16 10:42       ` Christopher Baines
2017-07-25 21:26         ` Ludovic Courtès
2017-07-26  9:58           ` Christopher Baines
2017-06-19  7:24   ` Christopher Baines
2017-06-21 21:17     ` Ludovic Courtès

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/guix.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.