all messages for Guix-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
From: swedebugia <swedebugia@riseup.net>
To: Pierre Neidhardt <mail@ambrevar.xyz>, guix-devel <guix-devel@gnu.org>
Subject: Help with sxml simple parser for the quicklisp importer
Date: Wed, 23 Jan 2019 13:35:03 +0100	[thread overview]
Message-ID: <1b161633-c285-1401-d771-c965dae58149@riseup.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 937 bytes --]

Hi

I am trying to extract information from the quickdocs website to fill 
into the description.

I got this code:
;; Guess the homepage of the package
(define (homepage name)
   (string-append "http://quickdocs.org/" name "/"))

(define (get-homepage name)
   "Get the latest meta release file. From the links in this we extract all
other information we need."
   (call-with-temporary-output-file
    (lambda (temp port)
      (and (url-fetch (homepage name) temp)
           (xml->sxml (get-string-all port))))))
(display (get-homepage "1am"))


But it errors out with:

  sxml/simple.scm:143:4: In procedure loop:
Throw to key `parser-error' with args `(#<input: string 23c45b0> 
"[GIMatch] broken for " (END . head) " while expecting " END link)'.

The document has no link problems in the html according to 
https://validator.w3.org/nu/?doc=http%3A%2F%2Fquickdocs.org%2F1am%2F

Any ideas how to make it parse?
-- 
Cheers Swedebugia

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: quicklisp.scm --]
[-- Type: text/x-scheme; name="quicklisp.scm", Size: 15494 bytes --]

;;; GNU Guix --- Functional package management for GNU
;;; Copyright © 2018 Julien Lepiller <julien@lepiller.eu>
;;;
;;; This file is part of GNU Guix.
;;;
;;; GNU Guix is free software; you can redistribute it and/or modify it
;;; under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or (at
;;; your option) any later version.
;;;
;;; GNU Guix is distributed in the hope that it will be useful, but
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with GNU Guix.  If not, see <http://www.gnu.org/licenses/>.

(define-module (guix import quicklisp)
 ; #:use-module (ice-9 ftw)
  #:use-module (ice-9 match)
  #:use-module (ice-9 peg)
 ; #:use-module (ice-9 receive)
  #:use-module ((ice-9 rdelim) #:select (read-line))
  #:use-module (ice-9 textual-ports)
  #:use-module (sxml simple)
  #:use-module (sxml xpath)
  #:use-module (srfi srfi-1)
  #:use-module (srfi srfi-2)
  #:use-module (web uri)
;  #:use-module (guix http-client)
  #:use-module ((guix build download) #:prefix build:)
;  #:use-module (guix git)
;  #:use-module (guix ui)
  #:use-module ((guix utils) #:select (call-with-temporary-output-file))
 ; #:use-module (guix import utils)
;  #:use-module ((guix licenses) #:prefix license:)
  #:export (test))

;; Define a PEG parser for the quicklisp format
(define-peg-pattern SP none (or " " "\n"))
(define-peg-pattern NL none "\n")
(define-peg-pattern COLON none ":")
(define-peg-pattern RELEASE none "release/")
(define-peg-pattern HASH none "#")
(define-peg-pattern IGNORE none (peg "(! SP .)*"))
(define-peg-pattern IGNORE-UNTIL-NL none (peg "(! NL .)*"))
(define-peg-pattern text all (+ (or (range #\a #\z) "-")))
(define-peg-pattern text-until-sp all (peg "(! SP .)*"))
(define-peg-pattern text-until-colon all (peg "(! COLON .)*"))
(define-peg-pattern text-until-nl all (peg "(! NL .)*"))

;; Meta and distinfo files use COLON as separator:
(define-peg-pattern test all
  text-until-colon)
(define-peg-pattern record all
  (and text COLON (* SP) text-until-nl))
(define-peg-pattern records body (* (and record (* SP))))

;; Release index has no colons between the values:
(define-peg-pattern record-index all (and text-until-sp (* SP) text-until-nl))
;; Field no. 2 is tarball url
(define-peg-pattern field-2 all
  (and text-until-sp SP text-until-sp IGNORE-UNTIL-NL))
(define-peg-pattern 2s body
  (* (and (or header-hash field-2) (* SP))))
;; Field no. 6 is version
(define-peg-pattern version all
  (and text-until-sp SP IGNORE SP IGNORE SP IGNORE SP
       IGNORE SP text-until-sp IGNORE-UNTIL-NL))
(define-peg-pattern versions body
  (* (and (or header-hash version) (* SP))))
;; Field no. 7-N is system files which we need.
(define-peg-pattern system-files all
  (and text-until-sp SP IGNORE SP IGNORE SP IGNORE SP
       IGNORE SP IGNORE SP text-until-nl))
(define-peg-pattern system-files-list body
  (* (and (or header-hash system-files) (* SP))))

;; Release hashfile has no colons between the values:
;; Names contain numbers, get everything exept space.
(define-peg-pattern record-hash all
  (and
   ;; This matches the prefix "release/"
   RELEASE
   text-until-sp (* SP) text-until-nl))
(define-peg-pattern header-hash all
  (and
   (and HASH (* SP))
   text-until-sp (* SP) text-until-nl))
(define-peg-pattern records-hash body (* (and (or header-hash record-hash) (* SP))))

;; Systems.txt with dependencies
;; This parsing results in a lot of duplicates, but we ignore that because our
;; match just picks the first and returns happy and the overhead is neglible.
;; Field no. 3 is system-name
(define-peg-pattern field-3 all
  (and text-until-sp SP IGNORE SP text-until-sp IGNORE-UNTIL-NL))
(define-peg-pattern 3s body
  (* (and (or header-hash field-3) (* SP))))

(define-peg-pattern record-sys all
  (and
   ;; We ignore the second and third field for now.
   text-until-sp (* SP) IGNORE SP IGNORE SP text-until-nl))
(define-peg-pattern records-sys body (* (and (or header-hash record-sys) (* SP))))

;;; QL= QuickLisp
;;; cl= Common Lisp

(define (url-fetch url file-name)
  "Save the contents of URL to FILE-NAME.  Return #f on failure."
  (parameterize ((current-output-port (current-error-port)))
    (build:url-fetch url file-name)))

(define (ql-meta-file)
  "Get the latest meta release file. From the links in this we extract all
other information we need."
  (call-with-temporary-output-file
   (lambda (temp port)
     (and (url-fetch "https://beta.quicklisp.org/dist/quicklisp.txt" temp)
          (peg:tree (match-pattern test (get-string-all port)))))))
;;(display (metadata-ref (ql-meta-file) "release-index-url"))

(define (ql-latest-index-file)
  "Get the latest release index file content. This contains: name tarball-url
file-md5 content-sha1-hash version etc. Space separated."
  (let ((latest (metadata-ref (ql-meta-file) "release-index-url")))
    (call-with-temporary-output-file
     (lambda (temp port)
       (and (url-fetch latest temp)
            (get-string-all port))))))

(define (ql-distinfo-file)
  "Get the latest distinfo file. Colon separated. From this we only get the link to system-index-url."
  (let ((latest (metadata-ref (ql-meta-file) "canonical-distinfo-url")))
    (call-with-temporary-output-file
     (lambda (temp port)
       (and (url-fetch latest temp)
            (peg:tree (match-pattern records (get-string-all port))))))))

(define (ql-hash-file)
  "Get the latest hashfile."
  (let* ((uri (string->uri (metadata-ref (ql-meta-file) "release-index-url")))
         (host (uri-host uri))
        (path (string-drop-right (uri-path uri) 12))
        (url (string-append "https://" host path "digests.txt")))
    (pk 'url url)
    (call-with-temporary-output-file
     (lambda (temp port)
       (and (url-fetch url temp)
            (peg:tree (match-pattern records-hash (get-string-all port))))))))

(define (ql-systems-file)
  "Get the latest file with dependency information for each package. Spaceseparated list of dependencies."
  (let ((latest (metadata-ref (ql-distinfo-file) "system-index-url")))
    (call-with-temporary-output-file
     (lambda (temp port)
       (and (url-fetch latest temp)
            (peg:tree (match-pattern records-sys (get-string-all port))))))))
;;(display (ql-systems-file)) works.

(define (metadata-ref file lookup)
  "Lookup metadata for FILE and LOOKUP."
;;   (pk 'file file 'lookup lookup)
   (fold (lambda (record acc)
           (match record
             ;; Output from parser looks like this:
             ((record (_ key) (_ val))
              ;; Find our key
              (if (equal? key lookup)
                  ;; return val
                  val
                  ;; else return acc
                  acc))))
         #f file))

;;(metadata-ref (ql-latest-index-file) "1am")
                                        ; returns url of the tarball -
                                        ; works!
;;(metadata-ref (ql-hash-file) "1am")
                                        ; returns sha256 hash of the tarball -
                                        ; works!

;;(metadata-ref (ql-systems-file) "able")
                                        ; returns string with dependencies
                                        ; space separated - works!

(define (ql-extract field name)
  "Helper to read the right field from (ql-latest-index-file). Field is one of
url, system-file, system-name, version. Name is the package name."
  (let* ((release (ql-latest-index-file))
         (systems (ql-systems-file))
         (system-files (peg:tree (match-pattern system-files-list release)))
         (system-name (peg:tree (match-pattern 3s systems)))
         (url (peg:tree (match-pattern 2s release)))
         (version (peg:tree (match-pattern versions release))))
    (cond
     ((equal? field 'version)
      (let ((str (metadata-ref version name)))
        (if (string-prefix? name str)
            ;; Drop "name-" from version-string.
            (string-drop str (+ 1 (string-length name)))
            str)))
     ;; FIXME these ONLY extract the first match
     ((equal? field 'system-files)
      (metadata-ref system-files name))
     ((equal? field 'system-name)
      (metadata-ref system-name name)))))
;;(display (peg:tree (match-pattern versions (ql-latest-index-file))))
#;
(display
 (peg:tree (match-pattern versions (ql-latest-index-file))))
;;(display (ql-extract 'version "1am"))
;;(display (ql-extract 'system-files "1am"))
;;(display (ql-latest-index-file))

;; Guess the homepage of the package
(define (homepage name)
  (string-append "http://quickdocs.org/" name "/"))

(define (get-homepage name)
  "Get the latest meta release file. From the links in this we extract all
other information we need."
  (call-with-temporary-output-file
   (lambda (temp port)
     (and (url-fetch (homepage name) temp)
          (xml->sxml (get-string-all port))))))
(display (get-homepage "1am"))

#;
(define (synopsis name)
  ;;extract from homepage: section class="readme"
  )


;; fetcher from texlive importer:
#;
(define (fetch-sxml name)
  "Return an sxml representation of the package information contained in the
XML description of the CTAN package or #f in case of failure."
  ;; This API always returns the latest release of the module.
  (let ((url (string-append "http://www.ctan.org/xml/1.2/pkg/" name)))
    (guard (c ((http-get-error? c)
               (format (current-error-port)
                       "error: failed to retrieve package information \
from ~s: ~a (~s)~%"
                       (uri->string (http-get-error-uri c))
                       (http-get-error-code c)
                       (http-get-error-reason c))
               #f))
      (xml->sxml (http-fetch url)
                 #:trim-whitespace? #t))))

(define (cl-name->guix-name name)
  (cond
   ;; TODO: Any special cases?
    ((string-prefix? "cl-" name) name)
    (else (string-append "cl-" name))))

;; Native dependency information is not available in QL.
;; (define (dependency->input dependency)
;;   (match dependency
;; ))

(define (dependency-list->inputs lst)
  (map
    (lambda (dependency)
      (list dependency (list 'unquote (string->symbol dependency))))
    (cl-names->guix-names lst)))

(define (sxml->guix-package name)
  (define (sxml-value path)
    (match ((sxpath path) sxml)
      (() #f)
      ((val) val)))
  (and-let* (
             (sxml (get-homepage name))
             (version (find-latest-version name))
             (file '())
             ;;(home-page '())
             (hash (metadata-ref (ql-hash-file) name))
             (source-url (metadata-ref url-dict "src"))
             (inputs
              (dependency-list->inputs
               (string-split (metadata-ref (ql-systems-file) name) " "))))
        (call-with-temporary-output-file
          (lambda (temp port)
            (and (url-fetch source-url temp)
                 `(package
                    (name ,(cl-name->guix-name name))
                    (version ,(metadata-ref opam-content "version"))
                    (source
                      (origin
                        (method url-fetch)
                        (uri ,source-url)
                        ;; TODO chech hash
                        (sha256 (base32 ,(guix-hash-url temp)))))
                    (build-system ocaml-build-system)
                    ,@(if (null? inputs)
                        '()
                        `((inputs ,(list 'quasiquote inputs))))
                    (home-page ,(metadata-ref opam-content "homepage"))
                    (synopsis ,(metadata-ref opam-content "synopsis"))
                    (description ,(metadata-ref opam-content "description"))
                    (license #f)))))))


;; opam stuff:
;; (define (find-latest-version package repository)
;;   "Get the latest version of a package as described in the given repository."
;;   (let* ((dir (string-append repository "/packages/" package))
;;          (versions (scandir dir (lambda (name) (not (string-prefix? "." name))))))
;;     (if versions
;;       (let ((versions (map
;;                         (lambda (dir)
;;                           (string-join (cdr (string-split dir #\.)) "."))
;;                         versions)))
;;         (latest-version versions))
;;       (begin
;;         (format #t (G_ "Package not found in opam repository: ~a~%") package)
;;         #f))))

;; (define (get-metadata opam-file)
;;   (with-input-from-file opam-file
;;     (lambda _
;;       (peg:tree (match-pattern records (get-string-all (current-input-port)))))))

;; (define (ocaml-name->guix-name name)
;;   (cond
;;     ((equal? name "ocamlfind") "ocaml-findlib")
;;     ((string-prefix? "ocaml" name) name)
;;     ((string-prefix? "conf-" name) (substring name 5))
;;     (else (string-append "ocaml-" name))))

;; (define (metadata-ref file lookup)
;;   (pk 'file file 'lookup lookup)
;;   (fold (lambda (record acc)
;;           (match record
;;             ((record key val)
;;              (if (equal? key lookup)
;;                (match val
;;                  (('list-pat . stuff) stuff)
;;                  (('string-pat stuff) stuff)
;;                  (('multiline-string stuff) stuff)
;;                  (('dict records ...) records))
;;                acc))))
;;         #f file))

;; (define (native? condition)
;;   (match condition
;;     (('condition-var var)
;;      (match var
;;        ("with-test" #t)
;;        ("test" #t)
;;        ("build" #t)
;;        (_ #f)))
;;     ((or ('condition-or cond-left cond-right) ('condition-and cond-left cond-right))
;;      (or (native? cond-left)
;;          (native? cond-right)))
;;     (_ #f)))

;; (define (dependency->input dependency)
;;   (match dependency
;;     (('string-pat str) str)
;;     (('conditional-value val condition)
;;      (if (native? condition) "" (dependency->input val)))))

;; (define (dependency->native-input dependency)
;;   (match dependency
;;     (('string-pat str) "")
;;     (('conditional-value val condition)
;;      (if (native? condition) (dependency->input val) ""))))

;; (define (ocaml-names->guix-names names)
;;   (map ocaml-name->guix-name
;;        (remove (lambda (name)
;;                  (or (equal? "" name))
;;                      (equal? "ocaml" name))
;;                names)))

;; (define (depends->inputs depends)
;;   (filter (lambda (name)
;;             (and (not (equal? "" name))
;;                  (not (equal? "ocaml" name))
;;                  (not (equal? "ocamlfind" name))))
;;     (map dependency->input depends)))

;; (define (depends->native-inputs depends)
;;   (filter (lambda (name) (not (equal? "" name)))
;;     (map dependency->native-input depends)))

;; (define (dependency-list->inputs lst)
;;   (map
;;     (lambda (dependency)
;;       (list dependency (list 'unquote (string->symbol dependency))))
;;     (ocaml-names->guix-names lst)))

             reply	other threads:[~2019-01-23 12:28 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-23 12:35 swedebugia [this message]
2019-01-23 14:22 ` Help with sxml simple parser for the quicklisp importer Ricardo Wurmus
2019-01-23 16:03   ` swedebugia
2019-01-23 15:58     ` Ricardo Wurmus
2019-01-23 16:32       ` swedebugia
2019-01-23 16:41         ` Ricardo Wurmus
2019-01-23 18:58           ` Pierre Neidhardt
2019-01-23 21:21             ` swedebugia
2019-01-23 16:55       ` Katherine Cox-Buday

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1b161633-c285-1401-d771-c965dae58149@riseup.net \
    --to=swedebugia@riseup.net \
    --cc=guix-devel@gnu.org \
    --cc=mail@ambrevar.xyz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/guix.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.