From mboxrd@z Thu Jan 1 00:00:00 1970 From: Christopher Baines Subject: [PATCH] git-download: Speed up 'git-predicate'. Date: Mon, 19 Jun 2017 08:14:43 +0100 Message-ID: <20170619071443.23122-1-mail@cbaines.net> References: <87zidk2c4g.fsf@gnu.org> Return-path: Received: from eggs.gnu.org ([2001:4830:134:3::10]:53254) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1dMqte-0006qi-Qn for guix-devel@gnu.org; Mon, 19 Jun 2017 03:14:52 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1dMqtZ-0008Mp-Rn for guix-devel@gnu.org; Mon, 19 Jun 2017 03:14:50 -0400 Received: from li622-129.members.linode.com ([212.71.249.129]:42107 helo=mira.cbaines.net) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1dMqtZ-0008Kf-ID for guix-devel@gnu.org; Mon, 19 Jun 2017 03:14:45 -0400 Received: from localhost (cpc102582-walt20-2-0-cust14.13-2.cable.virginm.net [86.27.34.15]) by mira.cbaines.net (Postfix) with ESMTPSA id 955D213D066 for ; Mon, 19 Jun 2017 08:14:43 +0100 (BST) Received: from localhost.localdomain (localhost [127.0.0.1]) by localhost (OpenSMTPD) with ESMTP id 5fb384b2 for ; Mon, 19 Jun 2017 07:14:43 +0000 (UTC) In-Reply-To: <87zidk2c4g.fsf@gnu.org> List-Id: "Development of GNU Guix and the GNU System distribution." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: guix-devel-bounces+gcggd-guix-devel=m.gmane.org@gnu.org Sender: "Guix-devel" To: guix-devel@gnu.org Adjust 'git-predicate' to use data structures that perform better when used with git repositories with a large number of files. Previously when matching either a regular file or directory, 'git-predicate' would search a list with a length equal to the number of files in the repository. As a search operation happens for roughly every file in the repository, this meant that the time taken to use 'git-predicate' to traverse all the files in a repository was roughly exponential with respect to the number of files in the repository. Now, for matching regular files or symlinks, 'git-predicate' uses a vhash using the inode value as the key. This should perform roughly in constant amount of time, instead of linear with respect to the number of files in the repository. For matching directories, 'git-predicate' now uses a tree structure stored in association lists. To check if a directory is in the tree, the tree is traversed from the root. The time complexity of this depends on the shape of the tree, but it should be an improvement on searching through the list of all files. * guix/git-download.scm (git-predicate): Use different data structures to speed up 'git-predicate' with a large number of files. --- guix/git-download.scm | 99 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 26 deletions(-) diff --git a/guix/git-download.scm b/guix/git-download.scm index 316835502..f9c144a6e 100644 --- a/guix/git-download.scm +++ b/guix/git-download.scm @@ -28,6 +28,7 @@ #:use-module (ice-9 match) #:use-module (ice-9 popen) #:use-module (ice-9 rdelim) + #:use-module (ice-9 vlist) #:use-module (srfi srfi-1) #:export (git-reference git-reference? @@ -125,45 +126,91 @@ HASH-ALGO (a symbol). Use NAME as the file name, or a generic name if #f." "Return the file-name for packages using git-download." (string-append name "-" version "-checkout")) +(define (create-directory-tree files) + (define (directory-lists->tree directory-lists) + (map (lambda (top-level-dir) + (cons top-level-dir + (directory-lists->tree + (filter-map + (lambda (directory-list) + (if (eq? (length directory-list) 1) + #f + (cdr directory-list))) + ;; Find all the directory lists under this top-level-dir + (filter + (lambda (directory-list) + (equal? (car directory-list) + top-level-dir)) + directory-lists))))) + (delete-duplicates + (map car directory-lists)))) + + (directory-lists->tree + (filter-map (lambda (path) + (let ((split-path (string-split path #\/))) + ;; If this is a file in the top of the repository? + (if (eq? (length split-path) 1) + #f + ;; drop-right to remove the filename, as it's + ;; just the directory tree that's important + (drop-right (string-split path #\/) 1)))) + files))) + +(define (directory-in-tree? directory whole-tree) + (define directory-list-in-tree? + (match-lambda* + (((top-directory . rest) tree) + (if (null? rest) + (list? (member top-directory (map car tree))) + (and=> (find (match-lambda + ((subtree-top-directory . subtree) + (equal? subtree-top-directory + top-directory))) + tree) + (match-lambda + ((subtree-top-directory . subtree) + (directory-list-in-tree? rest subtree)))))))) + + (directory-list-in-tree? (string-split directory #\/) + whole-tree)) + (define (git-predicate directory) "Return a predicate that returns true if a file is part of the Git checkout living at DIRECTORY. Upon Git failure, return #f instead of a predicate. The returned predicate takes two arguments FILE and STAT where FILE is an absolute file name and STAT is the result of 'lstat'." - (define (parent-directory? thing directory) - ;; Return #t if DIRECTORY is the parent of THING. - (or (string-suffix? thing directory) - (and (string-index thing #\/) - (parent-directory? (dirname thing) directory)))) - - (let* ((pipe (with-directory-excursion directory - (open-pipe* OPEN_READ "git" "ls-files"))) - (files (let loop ((lines '())) - (match (read-line pipe) - ((? eof-object?) - (reverse lines)) - (line - (loop (cons line lines)))))) - (inodes (map (lambda (file) - (let ((stat (lstat - (string-append directory "/" file)))) - (cons (stat:dev stat) (stat:ino stat)))) - files)) - (status (close-pipe pipe))) + (let* ((pipe (with-directory-excursion directory + (open-pipe* OPEN_READ "git" "ls-files"))) + (files (let loop ((lines '())) + (match (read-line pipe) + ((? eof-object?) + (reverse lines)) + (line + (loop (cons line lines)))))) + (directory-tree (create-directory-tree files)) + (inodes (alist->vhash + (map + (lambda (file) + (let ((stat + (lstat (string-append directory "/" file)))) + (cons (stat:ino stat) (stat:dev stat)))) + files))) + (directory-prefix-length (+ 1 (string-length + (canonicalize-path directory)))) + (status (close-pipe pipe))) (and (zero? status) (lambda (file stat) (match (stat:type stat) ('directory - ;; 'git ls-files' does not list directories, only regular files, - ;; so we need this special trick. - (any (lambda (f) (parent-directory? f file)) - files)) + (directory-in-tree? (string-drop file directory-prefix-length) + directory-tree)) ((or 'regular 'symlink) ;; Comparing file names is always tricky business so we rely on ;; inode numbers instead - (member (cons (stat:dev stat) (stat:ino stat)) - inodes)) + (and=> (vhash-assq (stat:ino stat) inodes) + (lambda (ino-dev) + (eq? (cdr ino-dev) (stat:dev stat))))) (_ #f)))))) -- 2.13.1