all messages for Guix-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
From: Maxim Cournoyer <maxim.cournoyer@gmail.com>
To: 47310@debbugs.gnu.org
Cc: Maxim Cournoyer <maxim.cournoyer@gmail.com>
Subject: [bug#47310] [PATCH 2/4] import: go: Improve synopsis and description parsing.
Date: Mon, 22 Mar 2021 00:20:20 -0400	[thread overview]
Message-ID: <20210322042022.17385-2-maxim.cournoyer@gmail.com> (raw)
In-Reply-To: <20210322042022.17385-1-maxim.cournoyer@gmail.com>

* guix/import/go.scm (%strict-tokenizer?): Set parameter to #t.
(go-path-escape): Redefine to prevent inlining.
(http-get*): Replace by ...
(http-fetch*): this ...
(json-fetch*): New procedure.
(go.pkg.dev-info): Use http-fetch*.
(go-package-licenses): Rewrite in terms of go.pkg.dev-info.
(go-package-description): Likewise.
(go-package-synopsis): Likewise.
(fetch-go.mod): Use the memoized http-fetch*.
(parse-go.mod): Adjust to receive content as a string.
(fetch-module-meta-data): Adjust to use http-fetch*.
(go-module->guix-package): Adjust to the modified fetch-go.mod return value.
[inputs]: Use propagated inputs, which is the most common situations for Go
libraries.
[description]: Beautify description.
[licenses]: Do no check for #f.  The result of the license parsing is always a
list.
* tests/go.scm: Adjust following above changes.
---
 guix/import/go.scm    | 213 ++++++++++++++++++++++++++----------------
 guix/import/utils.scm |   4 +-
 tests/go.scm          |  75 +++++++--------
 3 files changed, 170 insertions(+), 122 deletions(-)

diff --git a/guix/import/go.scm b/guix/import/go.scm
index 7452b4c903..2ef2d872e8 100644
--- a/guix/import/go.scm
+++ b/guix/import/go.scm
@@ -32,7 +32,7 @@
   #:use-module (guix http-client)
   #:use-module ((guix licenses) #:prefix license:)
   #:use-module (guix memoization)
-  #:autoload   (htmlprag) (html->sxml)            ;from Guile-Lib
+  #:use-module (htmlprag)               ;from Guile-Lib
   #:autoload   (guix git) (update-cached-checkout)
   #:autoload   (gcrypt hash) (open-hash-port hash-algorithm sha256)
   #:autoload   (guix serialization) (write-file)
@@ -42,20 +42,28 @@
   #:use-module (ice-9 rdelim)
   #:use-module (ice-9 receive)
   #:use-module (ice-9 regex)
+  #:use-module (ice-9 textual-ports)
   #:use-module ((rnrs io ports) #:select (call-with-port))
   #:use-module (srfi srfi-1)
+  #:use-module (srfi srfi-2)
   #:use-module (srfi srfi-9)
   #:use-module (srfi srfi-11)
   #:use-module (srfi srfi-26)
-  #:use-module (sxml xpath)
+  #:use-module (sxml match)
+  #:use-module ((sxml xpath) #:renamer (lambda (s)
+                                         (if (eq? 'filter s)
+                                             'xfilter
+                                             s)))
   #:use-module (web client)
   #:use-module (web response)
   #:use-module (web uri)
 
-  #:export (go-path-escape
-            go-module->guix-package
+  #:export (go-module->guix-package
             go-module-recursive-import))
 
+;;; Parameterize htmlprag to parse valid HTML more reliably.
+(%strict-tokenizer? #t)
+
 ;;; Commentary:
 ;;;
 ;;; (guix import go) attempts to make it easier to create Guix package
@@ -89,6 +97,14 @@
 
 ;;; Code:
 
+(define http-fetch*
+  ;; Like http-fetch, but memoized and returning the body as a string.
+  (memoize (lambda args
+             (call-with-port (apply http-fetch args) get-string-all))))
+
+(define json-fetch*
+  (memoize json-fetch))
+
 (define (go-path-escape path)
   "Escape a module path by replacing every uppercase letter with an
 exclamation mark followed with its lowercase equivalent, as per the module
@@ -98,54 +114,73 @@ https://godoc.org/golang.org/x/mod/module#hdr-Escaped_Paths)."
     (string-append "!" (string-downcase (match:substring occurrence))))
   (regexp-substitute/global #f "[A-Z]" path 'pre escape 'post))
 
+;; Prevent inlining of this procedure, which is accessed by unit tests.
+(set! go-path-escape go-path-escape)
+
+(define (go.pkg.dev-info name)
+  (http-fetch* (string-append "https://pkg.go.dev/" name)))
+
 (define (go-module-latest-version goproxy-url module-path)
   "Fetch the version number of the latest version for MODULE-PATH from the
 given GOPROXY-URL server."
-  (assoc-ref (json-fetch (format #f "~a/~a/@latest" goproxy-url
-                                 (go-path-escape module-path)))
+  (assoc-ref (json-fetch* (format #f "~a/~a/@latest" goproxy-url
+                                  (go-path-escape module-path)))
              "Version"))
 
-
 (define (go-package-licenses name)
   "Retrieve the list of licenses that apply to NAME, a Go package or module
-name (e.g. \"github.com/golang/protobuf/proto\").  The data is scraped from
-the https://pkg.go.dev/ web site."
-  (let*-values (((url) (string-append "https://pkg.go.dev/" name
-                                      "?tab=licenses"))
-                ((response body) (http-get url))
-                ;; Extract the text contained in a h2 child node of any
-                ;; element marked with a "License" class attribute.
-                ((select) (sxpath `(// (* (@ (equal? (class "License"))))
-                                       h2 // *text*))))
-    (and (eq? (response-code response) 200)
-         (match (select (html->sxml body))
-           (() #f)                      ;nothing selected
-           (licenses licenses)))))
-
-(define (go.pkg.dev-info name)
-  (http-get (string-append "https://pkg.go.dev/" name)))
-(define go.pkg.dev-info*
-  (memoize go.pkg.dev-info))
+name (e.g. \"github.com/golang/protobuf/proto\")."
+  (let* ((body (go.pkg.dev-info (string-append name "?tab=licenses")))
+         ;; Extract the text contained in a h2 child node of any
+         ;; element marked with a "License" class attribute.
+         (select (sxpath `(// (* (@ (equal? (class "License"))))
+                              h2 // *text*))))
+    (select (html->sxml body))))
+
+(define (sxml->texi sxml-node)
+  "A very basic SXML to Texinfo converter which attempts to preserve HTML
+formatting and links as text."
+  (sxml-match sxml-node
+    ((strong ,text)
+     (format #f "@strong{~a}" text))
+    ((a (@ (href ,url)) ,text)
+     (format #f "@url{~a,~a}" url text))
+    ((code ,text)
+     (format #f "@code{~a}" text))
+    (,something-else something-else)))
 
 (define (go-package-description name)
   "Retrieve a short description for NAME, a Go package name,
-e.g. \"google.golang.org/protobuf/proto\".  The data is scraped from the
-https://pkg.go.dev/ web site."
-  (let*-values (((response body) (go.pkg.dev-info* name))
-                ;; Extract the text contained in a h2 child node of any
-                ;; element marked with a "License" class attribute.
-                ((select) (sxpath
-                           `(// (section
-                                 (@ (equal? (class "Documentation-overview"))))
-                                (p 1)))))
-    (and (eq? (response-code response) 200)
-         (match (select (html->sxml body))
-           (() #f)                      ;nothing selected
-           (((p . strings))
-            ;; The paragraph text is returned as a list of strings embedding
-            ;; newline characters.  Join them and strip the newline
-            ;; characters.
-            (string-delete #\newline (string-join strings)))))))
+e.g. \"google.golang.org/protobuf/proto\"."
+  (let* ((body (go.pkg.dev-info name))
+         (sxml (html->sxml body))
+         (overview ((sxpath
+                     `(//
+                       (* (@ (equal? (class "Documentation-overview"))))
+                       (p 1))) sxml))
+         ;; Sometimes, the first paragraph just contains images/links that
+         ;; has only "\n" for text.  The following filter is designed to
+         ;; omit it.
+         (contains-text? (lambda (node)
+                           (remove string-null?
+                                   (map string-trim-both
+                                        (filter (node-typeof? '*text*)
+                                                (cdr node))))))
+         (select-content (sxpath
+                          `(//
+                            (* (@ (equal? (class "UnitReadme-content"))))
+                            div // p ,(xfilter contains-text?))))
+         ;; Fall-back to use content; this is less desirable as it is more
+         ;; verbose, but not every page has an overview.
+         (description (if (not (null? overview))
+                          overview
+                          (select-content sxml)))
+         (description* (and (not (null? description))
+                            (first description))))
+    (match description*
+      (() #f)                           ;nothing selected
+      ((p elements ...)
+       (apply string-append (filter string? (map sxml->texi elements)))))))
 
 (define (go-package-synopsis module-name)
   "Retrieve a short synopsis for a Go module named MODULE-NAME,
@@ -153,17 +188,17 @@ e.g. \"google.golang.org/protobuf\".  The data is scraped from
 the https://pkg.go.dev/ web site."
   ;; Note: Only the *module* (rather than package) page has the README title
   ;; used as a synopsis on the https://pkg.go.dev web site.
-  (let*-values (((response body) (go.pkg.dev-info* module-name))
-                ;; Extract the text contained in a h2 child node of any
-                ;; element marked with a "License" class attribute.
-                ((select) (sxpath
-                           `(// (div (@ (equal? (class "UnitReadme-content"))))
-                                // h3 *text*))))
-    (and (eq? (response-code response) 200)
-         (match (select (html->sxml body))
-           (() #f)                      ;nothing selected
-           ((title more ...)            ;title is the first string of the list
-            (string-trim-both title))))))
+  (let* ((url (string-append "https://pkg.go.dev/" module-name))
+         (body (http-fetch* url))
+         ;; Extract the text contained in a h2 child node of any
+         ;; element marked with a "License" class attribute.
+         (select-title (sxpath
+                        `(// (div (@ (equal? (class "UnitReadme-content"))))
+                             // h3 *text*))))
+    (match (select-title (html->sxml body))
+      (() #f)                           ;nothing selected
+      ((title more ...)                 ;title is the first string of the list
+       (string-trim-both title)))))
 
 (define (list->licenses licenses)
   "Given a list of LICENSES mostly following the SPDX conventions, return the
@@ -188,13 +223,13 @@ corresponding Guix license or 'unknown-license!"
                          'unknown-license!)))
               licenses))
 
-(define (fetch-go.mod goproxy-url module-path version)
-  "Fetches go.mod from the given GOPROXY-URL server for the given MODULE-PATH
-and VERSION."
-  (let ((url (format #f "~a/~a/@v/~a.mod" goproxy-url
+(define (fetch-go.mod goproxy module-path version)
+  "Fetch go.mod from the given GOPROXY server for the given MODULE-PATH
+and VERSION and return an input port."
+  (let ((url (format #f "~a/~a/@v/~a.mod" goproxy
                      (go-path-escape module-path)
                      (go-path-escape version))))
-    (http-fetch url)))
+    (http-fetch* url)))
 
 (define %go.mod-require-directive-rx
   ;; A line in a require directive is composed of a module path and
@@ -215,9 +250,8 @@ and VERSION."
     "[[:blank:]]+" "=>" "[[:blank:]]+"
     "([^[:blank:]]+)([[:blank:]]+([^[:blank:]]+))?")))
 
-(define (parse-go.mod port)
-  "Parse the go.mod file accessible via the input PORT, returning a list of
-requirements."
+(define (parse-go.mod content)
+  "Parse the go.mod file CONTENT, returning a list of requirements."
   (define-record-type <results>
     (make-results requirements replacements)
     results?
@@ -228,7 +262,7 @@ requirements."
   (define (toplevel results)
     "Main parser, RESULTS is a pair of alist serving as accumulator for
      all encountered requirements and replacements."
-    (let ((line (read-line port)))
+    (let ((line (read-line)))
       (cond
        ((eof-object? line)
         ;; parsing ended, give back the result
@@ -254,7 +288,7 @@ requirements."
         (toplevel results)))))
 
   (define (in-require results)
-    (let ((line (read-line port)))
+    (let ((line (read-line)))
       (cond
        ((eof-object? line)
         ;; this should never happen here but we ignore silently
@@ -266,7 +300,7 @@ requirements."
         (in-require (require-directive results line))))))
 
   (define (in-replace results)
-    (let ((line (read-line port)))
+    (let ((line (read-line)))
       (cond
        ((eof-object? line)
         ;; this should never happen here but we ignore silently
@@ -305,7 +339,9 @@ requirements."
         (($ <results> requirements replaced)
          (make-results (alist-cons module-path version requirements) replaced)))))
 
-  (let ((results (toplevel (make-results '() '()))))
+  (let ((results (with-input-from-string content
+                   (lambda _
+                     (toplevel (make-results '() '()))))))
     (match results
       (($ <results> requirements replaced)
        ;; At last we remove replaced modules from the requirements list
@@ -324,8 +360,10 @@ requirements."
   (url-prefix vcs-url-prefix)
   (root-regex vcs-root-regex)
   (type vcs-type))
+
 (define (make-vcs prefix regexp type)
-    (%make-vcs prefix (make-regexp regexp) type))
+  (%make-vcs prefix (make-regexp regexp) type))
+
 (define known-vcs
   ;; See the following URL for the official Go equivalent:
   ;; https://github.com/golang/go/blob/846dce9d05f19a1f53465e62a304dea21b99f910/src/cmd/go/internal/vcs/vcs.go#L1026-L1087
@@ -384,6 +422,14 @@ hence the need to derive this information."
                                           "." "-")
                                          "/" "-"))))
 
+(define (strip-.git-suffix/maybe repo-url)
+  "Strip a repository URL '.git' suffix from REPO-URL if hosted at GitHub."
+  (match repo-url
+    ((and (? (cut string-prefix? "https://github.com" <>))
+          (? (cut string-suffix? ".git" <>)))
+     (string-drop-right repo-url 4))
+    (_ repo-url)))
+
 (define-record-type <module-meta>
   (make-module-meta import-prefix vcs repo-root)
   module-meta?
@@ -396,21 +442,22 @@ hence the need to derive this information."
 because goproxy servers don't currently provide all the information needed to
 build a package."
   ;; <meta name="go-import" content="import-prefix vcs repo-root">
-  (let* ((port (http-fetch (format #f "https://~a?go-get=1" module-path)))
+  (let* ((meta-data (http-fetch* (format #f "https://~a?go-get=1" module-path)))
          (select (sxpath `(// head (meta (@ (equal? (name "go-import"))))
                               // content))))
-    (match (select (call-with-port port html->sxml))
-      (() #f)                         ;nothing selected
+    (match (select (html->sxml meta-data))
+      (() #f)                           ;nothing selected
       (((content content-text))
        (match (string-split content-text #\space)
          ((root-path vcs repo-url)
-          (make-module-meta root-path (string->symbol vcs) repo-url)))))))
+          (make-module-meta root-path (string->symbol vcs)
+                            (strip-.git-suffix/maybe repo-url))))))))
 
-(define (module-meta-data-repo-url meta-data goproxy-url)
+(define (module-meta-data-repo-url meta-data goproxy)
   "Return the URL where the fetcher which will be used can download the
 source."
   (if (member (module-meta-vcs meta-data) '(fossil mod))
-      goproxy-url
+      goproxy
       (module-meta-repo-root meta-data)))
 
 ;; XXX: Copied from (guix scripts hash).
@@ -463,6 +510,9 @@ control system is being used."
           (method git-fetch)
           (uri (git-reference
                 (url ,vcs-repo-url)
+                ;; This is done because the version field of the package,
+                ;; which the generated quoted expression refers to, has been
+                ;; stripped of any 'v' prefixed.
                 (commit ,(if (and plain-version? v-prefixed?)
                              '(string-append "v" version)
                              '(go-version->git-ref version)))))
@@ -502,8 +552,8 @@ control system is being used."
 (define* (go-module->guix-package module-path #:key
                                   (goproxy-url "https://proxy.golang.org"))
   (let* ((latest-version (go-module-latest-version goproxy-url module-path))
-         (port (fetch-go.mod goproxy-url module-path latest-version))
-         (dependencies (map car (call-with-port port parse-go.mod)))
+         (content (fetch-go.mod goproxy-url module-path latest-version))
+         (dependencies (map car (parse-go.mod content)))
          (guix-name (go-module->guix-package-name module-path))
          (root-module-path (module-path->repository-root module-path))
          ;; The VCS type and URL are not included in goproxy information. For
@@ -524,14 +574,17 @@ control system is being used."
         (build-system go-build-system)
         (arguments
          '(#:import-path ,root-module-path))
-        ,@(maybe-inputs (map go-module->guix-package-name dependencies))
+        ,@(maybe-propagated-inputs
+           (map go-module->guix-package-name dependencies))
         (home-page ,(format #f "https://~a" root-module-path))
         (synopsis ,synopsis)
-        (description ,description)
-        (license ,(match (and=> licenses list->licenses)
-                    ((license) license)
-                    ((licenses ...) `(list ,@licenses))
-                    (x x))))
+        (description ,(and=> description beautify-description))
+        (license ,(match (list->licenses licenses)
+                    (() #f)             ;unknown license
+                    ((license)          ;a single license
+                     license)
+                    ((license ...)      ;a list of licenses
+                     `(list ,@license)))))
      dependencies)))
 
 (define go-module->guix-package* (memoize go-module->guix-package))
diff --git a/guix/import/utils.scm b/guix/import/utils.scm
index e55258e306..21fafad388 100644
--- a/guix/import/utils.scm
+++ b/guix/import/utils.scm
@@ -445,8 +445,8 @@ obtain a node's uniquely identifying \"key\"."
   "Return a list of package expressions for PACKAGE-NAME and all its
 dependencies, sorted in topological order.  For each package,
 call (REPO->GUIX-PACKAGE NAME :KEYS version repo), which should return a
-package expression and a list of dependencies; call (GUIX-NAME NAME) to
-obtain the Guix package name corresponding to the upstream name."
+package expression and a list of dependencies; call (GUIX-NAME PACKAGE-NAME)
+to obtain the Guix package name corresponding to the upstream name."
   (define-record-type <node>
     (make-node name version package dependencies)
     node?
diff --git a/tests/go.scm b/tests/go.scm
index 6ab99f508a..fa8fa7a2a6 100644
--- a/tests/go.scm
+++ b/tests/go.scm
@@ -180,13 +180,9 @@ require github.com/kr/pretty v0.2.1
 (define (testing-parse-mod name expected input)
   (define (inf? p1 p2)
     (string<? (car p1) (car p2)))
-  (let ((input-port (open-input-string input)))
-    (test-equal name
-      (sort expected inf?)
-      (sort
-       ( (@@ (guix import go) parse-go.mod)
-         input-port)
-       inf?))))
+  (test-equal name
+    (sort expected inf?)
+    (sort ((@@ (guix import go) parse-go.mod) input) inf?)))
 
 (testing-parse-mod "parse-go.mod-simple"
                    '(("good/thing" . "v1.4.5")
@@ -249,44 +245,43 @@ require github.com/kr/pretty v0.2.1
 
 (test-equal "go-module->guix-package"
   '(package
-    (name "go-github-com-go-check-check")
-    (version "0.0.0-20201130134442-10cb98267c6c")
-    (source
-     (origin
-       (method git-fetch)
-       (uri (git-reference
-             (url "https://github.com/go-check/check.git")
-             (commit (go-version->git-ref version))))
-       (file-name (git-file-name name version))
-       (sha256
-        (base32
-         "0sjjj9z1dhilhpc8pq4154czrb79z9cm044jvn75kxcjv6v5l2m5"))))
-    (build-system go-build-system)
-    (arguments
-     (quote (#:import-path "github.com/go-check/check")))
-    (inputs
-     (quasiquote (("go-github-com-kr-pretty"
-                   (unquote go-github-com-kr-pretty)))))
-    (home-page "https://github.com/go-check/check")
-    (synopsis "Instructions")
-    (description #f)
-    (license license:bsd-2))
+     (name "go-github-com-go-check-check")
+     (version "0.0.0-20201130134442-10cb98267c6c")
+     (source
+      (origin
+        (method git-fetch)
+        (uri (git-reference
+              (url "https://github.com/go-check/check")
+              (commit (go-version->git-ref version))))
+        (file-name (git-file-name name version))
+        (sha256
+         (base32
+          "0sjjj9z1dhilhpc8pq4154czrb79z9cm044jvn75kxcjv6v5l2m5"))))
+     (build-system go-build-system)
+     (arguments
+      '(#:import-path "github.com/go-check/check"))
+     (propagated-inputs
+      `(("go-github-com-kr-pretty" ,go-github-com-kr-pretty)))
+     (home-page "https://github.com/go-check/check")
+     (synopsis "Instructions")
+     (description "Package check is a rich testing extension for Go's testing \
+package.")
+     (license license:bsd-2))
 
   ;; Replace network resources with sample data.
   (call-with-temporary-directory
    (lambda (checkout)
      (mock ((web client) http-get
             (mock-http-get fixtures-go-check-test))
-           (mock ((guix http-client) http-fetch
-                  (mock-http-fetch fixtures-go-check-test))
-                 (mock ((guix git) update-cached-checkout
-                        (lambda* (url #:key ref)
-                          ;; Return an empty directory and its hash.
-                          (values checkout
-                                  (nix-base32-string->bytevector
-                                   "0sjjj9z1dhilhpc8pq4154czrb79z9cm044jvn75kxcjv6v5l2m5")
-                                  #f)))
-                       (go-module->guix-package "github.com/go-check/check")))))))
+         (mock ((guix http-client) http-fetch
+                (mock-http-fetch fixtures-go-check-test))
+             (mock ((guix git) update-cached-checkout
+                    (lambda* (url #:key ref)
+                      ;; Return an empty directory and its hash.
+                      (values checkout
+                              (nix-base32-string->bytevector
+                               "0sjjj9z1dhilhpc8pq4154czrb79z9cm044jvn75kxcjv6v5l2m5")
+                              #f)))
+                 (go-module->guix-package "github.com/go-check/check")))))))
 
 (test-end "go")
-
-- 
2.30.1





  reply	other threads:[~2021-03-22  4:31 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-22  4:10 [bug#47310] [PATCH 0/4] Improve parsing of synopsis and description, add pinned versions mode Maxim Cournoyer
2021-03-22  4:20 ` [bug#47310] [PATCH 1/4] import: utils: Refactor maybe-inputs and add maybe-propagated-inputs Maxim Cournoyer
2021-03-22  4:20   ` Maxim Cournoyer [this message]
2021-03-22  4:20   ` [bug#47310] [PATCH 3/4] import: go: Add an option to use pinned versions Maxim Cournoyer
2021-04-10  3:13     ` bug#47310: " Maxim Cournoyer
2021-03-22  4:20   ` [bug#47310] [PATCH 4/4] import: go: Append version to symbol name in the pinned version mode Maxim Cournoyer
2021-04-10 20:54 ` [bug#47310] [PATCH 0/4] go importer: Better synopsis and descriptions, pinned versions mode Ludovic Courtès
2021-04-11  0:57   ` Maxim Cournoyer
     [not found] ` <20210414120939.mmvprjaz6636kz4h@noop.avalenn.eu>
     [not found]   ` <87czusjtpc.fsf@gmail.com>
2021-04-18 10:31     ` go-build-system possible improvments François
2021-04-21  2:00       ` Maxim Cournoyer
2021-04-22 16:36         ` François
2021-04-24  3:30           ` Maxim Cournoyer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210322042022.17385-2-maxim.cournoyer@gmail.com \
    --to=maxim.cournoyer@gmail.com \
    --cc=47310@debbugs.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/guix.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.