all messages for Guix-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
* [bug#73266] [PATCH 0/9] Add python-spacy-curated-transformers
@ 2024-09-15  8:11 Nicolas Graves via Guix-patches via
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
  0 siblings, 1 reply; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:11 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

This patch series builds upon issues #73094, #73106, #73109 and #73115 to
add all the packages needed to create packages from spaCy models, which I
will probably propose in a dedicated channel.

Nicolas Graves (9):
  gnu: Add python-azure-storage-file-datalake.
  gnu: Add python-cloudpathlib.
  gnu: Add python-weasel.
  gnu: python-thinc: Update to 8.2.2.
  gnu: python-spacy: Update to 3.7.5.
  gnu: Add python-cutlery.
  gnu: Add python-curated-transformers.
  gnu: Add python-curated-tokenizers.
  gnu: Add python-spacy-curated-transformers.

 gnu/packages/machine-learning.scm | 186 ++++++++++++++++++++++++++++--
 gnu/packages/python-web.scm       |  46 ++++++++
 gnu/packages/python-xyz.scm       |  47 ++++++++
 3 files changed, 267 insertions(+), 12 deletions(-)

-- 
2.46.0





^ permalink raw reply	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake.
  2024-09-15  8:11 [bug#73266] [PATCH 0/9] Add python-spacy-curated-transformers Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57 ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 2/9] gnu: Add python-cloudpathlib Nicolas Graves via Guix-patches via
                     ` (7 more replies)
  0 siblings, 8 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/python-web.scm (python-azure-storage-file-datalake): New variable.

Change-Id: Iba59fc31822b95361558f1ef62a2a40be0865080
---
 gnu/packages/python-web.scm | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index 8b29f1cd93..a8510dbcc1 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -8072,6 +8072,28 @@ (define-public python-azure-core
 Python.")
     (license license:expat)))
 
+(define-public python-azure-storage-file-datalake
+  (package
+    (name "python-azure-storage-file-datalake")
+    (version "12.16.0")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "azure-storage-file-datalake" version))
+       (sha256
+        (base32 "0cq302vpnhb2fwlm6zxhpypqs97gn93cp27vhkpn50a39q75i19i"))))
+    (build-system pyproject-build-system)
+    (propagated-inputs (list python-azure-core
+                             python-azure-storage-blob
+                             python-isodate
+                             python-typing-extensions))
+    (home-page "https://github.com/Azure/azure-sdk-for-python")
+    (synopsis "Microsoft Azure File DataLake Storage Client Library for Python")
+    (description
+     "This package provides the Microsoft Azure File @code{DataLake} Storage
+Client Library for Python.")
+    (license license:expat)))
+
 (define-public python-azure-storage-blob
   (package
     (name "python-azure-storage-blob")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 2/9] gnu: Add python-cloudpathlib.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 3/9] gnu: Add python-weasel Nicolas Graves via Guix-patches via
                     ` (6 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/python-web.scm (python-cloudpathlib): New variable.

Change-Id: I492abd6bea422faee1b5054edcf8f9e46c286fcf
---
 gnu/packages/python-web.scm | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/gnu/packages/python-web.scm b/gnu/packages/python-web.scm
index a8510dbcc1..0d0e76e0bd 100644
--- a/gnu/packages/python-web.scm
+++ b/gnu/packages/python-web.scm
@@ -7569,6 +7569,30 @@ (define-public python-cloud-init
     ;; Either license can be chosen
     (license (list license:asl2.0 license:gpl3))))
 
+(define-public python-cloudpathlib
+  (package
+    (name "python-cloudpathlib")
+    (version "0.19.0")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "cloudpathlib" version))
+       (sha256
+        (base32 "1s2gcv89ybpsvyh6d0rpdpnb177q7la0n8fs4cj5v4sfkbyxp7li"))))
+    (build-system pyproject-build-system)
+    (arguments (list #:tests? #f))  ; No tests bundled.
+    (propagated-inputs (list python-azure-storage-blob
+                             python-azure-storage-file-datalake
+                             python-boto3
+                             python-google-cloud-storage
+                             python-typing-extensions))
+    (native-inputs (list python-flit-core))
+    (home-page "https://cloudpathlib.drivendata.org/stable")
+    (synopsis "Python classes for cloud storage services")
+    (description "This package provides @code{pathlib.Path}-like
+classes for different cloud storage services.")
+    (license license:expat)))
+
 (define-public python-cloudscraper
   (package
     (name "python-cloudscraper")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 3/9] gnu: Add python-weasel.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 2/9] gnu: Add python-cloudpathlib Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 4/9] gnu: python-thinc: Update to 8.2.2 Nicolas Graves via Guix-patches via
                     ` (5 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/python-xyz.scm (python-weasel): New variable.

Change-Id: If9be639d55503ec5dd1c0867708ea63746a5887b
---
 gnu/packages/python-xyz.scm | 47 +++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/gnu/packages/python-xyz.scm b/gnu/packages/python-xyz.scm
index 42a678ceba..38c877d481 100644
--- a/gnu/packages/python-xyz.scm
+++ b/gnu/packages/python-xyz.scm
@@ -30718,6 +30718,53 @@ (define-public python-watchgod
 operating systems and an elegant approach to concurrency using threading.")
     (license license:expat)))
 
+(define-public python-weasel
+  (package
+    (name "python-weasel")
+    (version "0.4.1")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "weasel" version))
+       (sha256
+        (base32 "1aas113r29y6yxrmdlsw80rj8w4kgw1jhfjw9rsgc4rf0w7j3g5a"))))
+    (build-system pyproject-build-system)
+    (arguments  ; These tests require network.
+     (list #:test-flags
+           `(list "-k" ,(string-append
+                         "not test_project_git_dir_asset"
+                         " and not test_project_git_file_asset"
+                         " and not test_project_assets"
+                         " and not test_project_clone"
+                         " and not test_remote"))))
+    (propagated-inputs (list python-cloudpathlib
+                             python-confection
+                             python-packaging
+                             python-pydantic
+                             python-requests
+                             python-smart-open
+                             python-srsly
+                             python-typer
+                             python-wasabi))
+    (native-inputs (list git-minimal
+                         python-pytest))
+    (home-page "https://github.com/explosion/weasel")
+    (synopsis "Small workflow system in Python")
+    (description "This package helps manage and share end-to-end
+workflows for different use cases and domains, and orchestrate training,
+packaging and serving custom pipelines.  It provides the following
+functionality:
+@itemize
+@item clone a pre-defined project template,
+@item adjust it to fit particular needs,
+@item load in data,
+@item train a pipeline,
+@item export it as a Python package,
+@item upload outputs to a remote storage,
+@item share results with a team.
+@end itemize")
+    (license license:expat)))
+
 (define-public python-wget
   (package
     (name "python-wget")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 4/9] gnu: python-thinc: Update to 8.2.2.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 2/9] gnu: Add python-cloudpathlib Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 3/9] gnu: Add python-weasel Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 5/9] gnu: python-spacy: Update to 3.7.5 Nicolas Graves via Guix-patches via
                     ` (4 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-thinc): Update to 8.2.2.
[propagated-inputs]: Remove python-contextvars, python-dataclasses,
python-typing-extensions.

Change-Id: Ic9176f75b7a7fe075a72c7b6607bc1a1b39294f4
---
 gnu/packages/machine-learning.scm | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 107954577b..4b834a847f 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -2093,13 +2093,13 @@ (define-public python-mord
 (define-public python-thinc
   (package
     (name "python-thinc")
-    (version "8.1.10")
+    (version "8.2.2")
     (source (origin
               (method url-fetch)
               (uri (pypi-uri "thinc" version))
               (sha256
                (base32
-                "14drmwa2sh8fqszv1fm2jl4lky1j5yrbkjv89bl49q07vbblhjkc"))))
+                "11qmdpw4r7qm1l5p8iws8jf33az1hac7zxki38j9a3rccx2bk1bf"))))
     (build-system pyproject-build-system)
     (arguments
      '(#:phases
@@ -2111,16 +2111,13 @@ (define-public python-thinc
     (propagated-inputs (list python-blis-for-thinc
                              python-catalogue
                              python-confection
-                             python-contextvars
                              python-cymem
-                             python-dataclasses
                              python-murmurhash
                              python-numpy
                              python-packaging
                              python-preshed
                              python-pydantic
                              python-srsly
-                             python-typing-extensions
                              python-wasabi))
     (native-inputs (list python-cython python-mock python-pytest))
     (home-page "https://github.com/explosion/thinc")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 5/9] gnu: python-spacy: Update to 3.7.5.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
                     ` (2 preceding siblings ...)
  2024-09-15  8:57   ` [bug#73266] [PATCH 4/9] gnu: python-thinc: Update to 8.2.2 Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 6/9] gnu: Add python-cutlery Nicolas Graves via Guix-patches via
                     ` (3 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-spacy): Update to 3.7.5.
[arguments]<#:test-flags>: Ignore test_pass_doc_to_pipeline.
[propagated-inputs]: Remove python-pathy, python-smart-open,
python-typing-extensions. Add python-weasel.

Change-Id: Ieae58f004d06323990e80859e3c4dce2166c447c
---
 gnu/packages/machine-learning.scm | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 4b834a847f..008bf2060a 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -1336,13 +1336,13 @@ (define-public python-spacy-loggers
 (define-public python-spacy
   (package
     (name "python-spacy")
-    (version "3.5.3")
+    (version "3.7.5")
     (source (origin
               (method url-fetch)
               (uri (pypi-uri "spacy" version))
               (sha256
                (base32
-                "13141hc966d8nxbnlwj01vhndgq0rq4nmii3qkb3hrap45kiv5rm"))))
+                "1lrd7k7hizygqldpln4aham6kprbyaj7z7pfd5dabixcyb5wcj56"))))
     (build-system pyproject-build-system)
     (arguments
      (list
@@ -1356,7 +1356,9 @@ (define-public python-spacy
               ;; This tries to run the application with typer, which fails
               ;; with an unspecified error, possibly because the build
               ;; container doesn't have /bin/sh.
-              " and not test_project_assets"))
+              " and not test_project_assets"
+              ;; Fails with DeprecationWarning
+              " and not test_pass_doc_to_pipeline"))
       #:phases
       '(modify-phases %standard-phases
          (add-after 'build 'build-ext
@@ -1370,20 +1372,18 @@ (define-public python-spacy
                              python-murmurhash
                              python-numpy
                              python-packaging
-                             python-pathy
                              python-preshed
                              python-pydantic
                              python-requests
                              python-setuptools
-                             python-smart-open
                              python-spacy-legacy
                              python-spacy-loggers
                              python-srsly
                              python-thinc
                              python-tqdm
                              python-typer
-                             python-typing-extensions
-                             python-wasabi))
+                             python-wasabi
+                             python-weasel))
     (native-inputs
      (list python-cython python-pytest python-mock))
     (home-page "https://spacy.io")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 6/9] gnu: Add python-cutlery.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
                     ` (3 preceding siblings ...)
  2024-09-15  8:57   ` [bug#73266] [PATCH 5/9] gnu: python-spacy: Update to 3.7.5 Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 7/9] gnu: Add python-curated-transformers Nicolas Graves via Guix-patches via
                     ` (2 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-cutlery): New variable.

Change-Id: I5304205737330850ce84a49df814b96a4d605699
---
 gnu/packages/machine-learning.scm | 38 +++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 008bf2060a..89fcd3c1b7 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -2442,6 +2442,44 @@ (define-public python-cmaes
 Covariance Matrix Adaptation Evolution Strategy (CMA-ES) for Python.")
     (license license:expat)))
 
+(define-public python-cutlery
+  (package
+    (name "python-cutlery")
+    (version "0.0.6")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "cutlery" version))
+       (sha256
+        (base32 "1l5jv0mkmvzlmglz61py6f4inil2iwgh1ap8881cyb6k7hnnccc9"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; For some reason when both local and installed exist,
+          ;; local is chosen and is missing shared libraries.
+          ;; Use installed version to run tests instead.
+          (add-before 'check 'pre-check
+            (lambda* (#:key tests? inputs outputs #:allow-other-keys)
+              (when tests?
+                (copy-recursively "cutlery/tests" "tests")
+                (delete-file-recursively "cutlery")
+                (add-installed-pythonpath inputs outputs)))))))
+    (propagated-inputs (list python-regex))
+    (native-inputs (list python-cython python-pytest))
+    (home-page "https://github.com/explosion/curated-tokenizers")
+    (synopsis "Lightweight piece tokenization library")
+    (description "This package provides a lightweight wordpiece and
+sentencepiece tokenization library.  It supports multiple tokenizers:
+@itemize
+@item BPE
+@item Byte BPE
+@item Unigram
+@item Wordpiece
+@end itemize")
+    (license license:expat)))
+
 (define-public python-autograd
   (let* ((commit "c6d81ce7eede6db801d4e9a92b27ec5d409d0eab")
          (revision "0")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 7/9] gnu: Add python-curated-transformers.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
                     ` (4 preceding siblings ...)
  2024-09-15  8:57   ` [bug#73266] [PATCH 6/9] gnu: Add python-cutlery Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 8/9] gnu: Add python-curated-tokenizers Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 9/9] gnu: Add python-spacy-curated-transformers Nicolas Graves via Guix-patches via
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-curated-transformers): New variable.

Change-Id: I42cf780097456f5a8a9a9efc2a56e2c082d2a938
---
 gnu/packages/machine-learning.scm | 55 +++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index 89fcd3c1b7..d1b282fea8 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -2480,6 +2480,61 @@ (define-public python-cutlery
 @end itemize")
     (license license:expat)))
 
+(define-public python-curated-transformers
+  (package
+    (name "python-curated-transformers")
+    (version "0.1.0")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "curated-transformers" version))
+       (sha256
+        (base32 "04k54r5cxjl3l7xs4kx4cfnqsjr7gdlr577sp7sl7qgrk3kfqjbm"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:test-flags
+      '(list  ; Most ignored tests require network.
+        "--ignore=curated_transformers/tests/tokenizers/test_auto_tokenizer.py"
+        "-k" (string-append "not test_special_pieces"
+                            " and not test_auto_encoder"
+                            " and not test_auto_decoder"
+                            " and not test_auto_causal_lm"
+                            " and not test_from_hf_hub_to_cache"
+                            " and not test_from_hf_hub_to_cache_legacy"
+                            " and not test_checkpoint_type_without_safetensors"
+                            " and not test_hf_hub_failures"
+                            ;; These have been added when downgrading curated_tokenizers.
+                            " and not test_camembert_tokenizer_toy_tokenizer"
+                            " and not test_roberta_tokenizer"
+                            " and not test_xlmr_toy_tokenizer"))))
+    (propagated-inputs (list python-catalogue
+                             python-cutlery
+                             python-huggingface-hub
+                             python-pytorch
+                             python-tokenizers))
+    (native-inputs (list python-pytest))
+    (home-page "https://github.com/explosion/curated-transformers")
+    (synopsis "PyTorch library of transformer models and components")
+    (description
+     "This package provides a @code{PyTorch} library of transformer models and
+components.  It helps to download state-of-the-art models that are composed
+from a set of reusable components.  The stand-out features of Curated
+Transformers are:
+
+@itemize
+@item Supports state-of-the-art transformer models, including LLMs such as
+Falcon, Llama, and Dolly v2.
+@item Each model is composed from a set of reusable building blocks, providing
+many benefits: implementing a feature or bugfix benefits all models; adding
+new models to the library is low-effort.
+@item Consistent type annotations of all public APIs, hence a great coding
+support from IDEs.  Integrates well with your existing type-checked code.
+@item Great for education, because the building blocks are easy to study.
+@item Minimal dependencies.
+@end itemize")
+    (license license:expat)))
+
 (define-public python-autograd
   (let* ((commit "c6d81ce7eede6db801d4e9a92b27ec5d409d0eab")
          (revision "0")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 8/9] gnu: Add python-curated-tokenizers.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
                     ` (5 preceding siblings ...)
  2024-09-15  8:57   ` [bug#73266] [PATCH 7/9] gnu: Add python-curated-transformers Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  2024-09-15  8:57   ` [bug#73266] [PATCH 9/9] gnu: Add python-spacy-curated-transformers Nicolas Graves via Guix-patches via
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-curated-tokenizers): New variable.

Change-Id: I719d2ffd499c86e6bb2f9215ed979e47c0e32484
---
 gnu/packages/machine-learning.scm | 41 +++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index d1b282fea8..e80412ed41 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -2480,6 +2480,47 @@ (define-public python-cutlery
 @end itemize")
     (license license:expat)))
 
+(define-public python-curated-tokenizers
+  (package
+    (name "python-curated-tokenizers")
+    (version "0.0.9")
+    ;; This source includes third_party protobuf, but a version that
+    ;; is not currently packaged in guix (3.6 < version <= 3.19.5).
+    ;; Try using guix's protobuf when updating.
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "curated-tokenizers" version))
+       (sha256
+        (base32 "09ffs2qjlli35wnf8wf64s14xm75vi5ynvkrn9nqllmk9bjlfgf9"))))
+    (build-system pyproject-build-system)
+    (arguments
+     (list
+      #:phases
+      #~(modify-phases %standard-phases
+          ;; For some reason when both local and installed exist,
+          ;; local is chosen and is missing shared libraries.
+          ;; Use installed version to run tests instead.
+          (add-before 'check 'pre-check
+            (lambda* (#:key tests? inputs outputs #:allow-other-keys)
+              (when tests?
+                (copy-recursively "curated_tokenizers/tests" "tests")
+                (delete-file-recursively "curated_tokenizers")
+                (add-installed-pythonpath inputs outputs)))))))
+    (propagated-inputs (list python-regex))
+    (native-inputs (list python-cython python-pytest))
+    (home-page "https://github.com/explosion/curated-tokenizers")
+    (synopsis "Lightweight piece tokenization library")
+    (description "This package provides a lightweight wordpiece and
+sentencepiece tokenization library.  It supports multiple tokenizers:
+@itemize
+@item BPE
+@item Byte BPE
+@item Unigram
+@item Wordpiece
+@end itemize")
+    (license license:expat)))
+
 (define-public python-curated-transformers
   (package
     (name "python-curated-transformers")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [bug#73266] [PATCH 9/9] gnu: Add python-spacy-curated-transformers.
  2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
                     ` (6 preceding siblings ...)
  2024-09-15  8:57   ` [bug#73266] [PATCH 8/9] gnu: Add python-curated-tokenizers Nicolas Graves via Guix-patches via
@ 2024-09-15  8:57   ` Nicolas Graves via Guix-patches via
  7 siblings, 0 replies; 10+ messages in thread
From: Nicolas Graves via Guix-patches via @ 2024-09-15  8:57 UTC (permalink / raw)
  To: 73266; +Cc: ngraves

* gnu/packages/machine-learning.scm (python-spacy-curated-transformers): New variable.

Change-Id: Id4b67b2ea2de4745831c3536124304860e9764d8
---
 gnu/packages/machine-learning.scm | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/gnu/packages/machine-learning.scm b/gnu/packages/machine-learning.scm
index e80412ed41..3afc224e7c 100644
--- a/gnu/packages/machine-learning.scm
+++ b/gnu/packages/machine-learning.scm
@@ -1292,6 +1292,37 @@ (define-public python-sentence-transformers
 models, to achieve maximal performance on your specific task.")
     (license license:asl2.0)))
 
+(define-public python-spacy-curated-transformers
+  (package
+    (name "python-spacy-curated-transformers")
+    (version "0.2.2")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "spacy-curated-transformers" version))
+       (sha256
+        (base32 "1hsqaai666yy9xzj14azli0hgipdkkc5x7xwszh58ndvxsij3dq3"))))
+    (build-system pyproject-build-system)
+    (arguments (list #:tests? #f))  ; Missing python-cupy dependency
+    (propagated-inputs (list python-curated-tokenizers
+                             python-curated-transformers
+                             python-cutlery
+                             python-fsspec
+                             python-pytorch
+                             python-spacy
+                             python-thinc))
+    (home-page "https://github.com/explosion/spacy-curated-transformers")
+    (synopsis "Curated transformer models for spaCy pipelines")
+    (description "This package provides transformer models for @code{spaCy}
+pipelines.  It allows you to use pretrained models based on one of the
+following architectures to power your spaCy pipeline: ALBERT, BERT, CamemBERT,
+RoBERTa, XLM-RoBERTa.  It provides all the features supported by
+spacy-transformers such as support for Hugging Face Hub, multi-task learning,
+the extensible config system and out-of-the-box serialization, as well as deep
+integration into spaCy, which lays the groundwork for deployment-focused
+features such as distillation and quantization.")
+    (license license:expat)))
+
 (define-public python-spacy-legacy
   (package
     (name "python-spacy-legacy")
-- 
2.46.0





^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-09-15  9:35 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-09-15  8:11 [bug#73266] [PATCH 0/9] Add python-spacy-curated-transformers Nicolas Graves via Guix-patches via
2024-09-15  8:57 ` [bug#73266] [PATCH 1/9] gnu: Add python-azure-storage-file-datalake Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 2/9] gnu: Add python-cloudpathlib Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 3/9] gnu: Add python-weasel Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 4/9] gnu: python-thinc: Update to 8.2.2 Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 5/9] gnu: python-spacy: Update to 3.7.5 Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 6/9] gnu: Add python-cutlery Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 7/9] gnu: Add python-curated-transformers Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 8/9] gnu: Add python-curated-tokenizers Nicolas Graves via Guix-patches via
2024-09-15  8:57   ` [bug#73266] [PATCH 9/9] gnu: Add python-spacy-curated-transformers Nicolas Graves via Guix-patches via

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/guix.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.