Unverified Commit a330c9ce authored by Gaétan Lepage's avatar Gaétan Lepage Committed by GitHub
Browse files

python3Packages.pytorch-tokenizers: init at 1.0.1 (#460929)

parents cc53790c ce042502
Loading
Loading
Loading
Loading
+94 −0
Original line number Diff line number Diff line
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  replaceVars,

  # build-system
  cmake,
  pybind11,
  setuptools,

  # dependencies
  sentencepiece,
  tiktoken,
  tokenizers,

  # tests
  pytestCheckHook,
  transformers,
}:

let
  # Pre-fetched pybind11 source tree. It is substituted for the `@pybind11@`
  # placeholder in ./dont-fetch-pybind11.patch (see `patches` below), so that
  # CMake's FetchContent takes pybind11 from this path instead of cloning the
  # git repository. The tag mirrors the GIT_TAG that the patch removes from
  # upstream's CMakeLists.txt; re-check it against upstream when bumping the
  # package version.
  # https://github.com/meta-pytorch/tokenizers/blob/v1.0.1/CMakeLists.txt#L174-L175
  pybind11-src = fetchFromGitHub {
    owner = "pybind";
    repo = "pybind11";
    tag = "v2.13.6";
    hash = "sha256-SNLdtrOjaC3lGHN9MAqTf51U9EzNKQLyTMNPe0GcdrU=";
  };
in
# Python package `pytorch-tokenizers`, built from the meta-pytorch/tokenizers
# repository (including its git submodules) with a pyproject-based build.
buildPythonPackage rec {
  pname = "pytorch-tokenizers";
  version = "1.0.1";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "meta-pytorch";
    repo = "tokenizers";
    tag = "v${version}";
    # Submodules are fetched together with the main repository.
    fetchSubmodules = true;
    hash = "sha256-1BGazimbauNBN/VfLiuhk21VEhbP07GEpPc+GAfKTQY=";
  };

  patches = [
    # Replace the FetchContent git download of pybind11 in CMakeLists.txt with
    # a `URL` pointing at the pre-fetched `pybind11-src`; replaceVars fills in
    # the `@pybind11@` placeholder inside the patch.
    (replaceVars ./dont-fetch-pybind11.patch {
      pybind11 = pybind11-src;
    })
  ];

  # Drop the "pip>=23" and "pytest" entries from pyproject.toml.
  # `--replace-fail` aborts the build if either string is no longer present,
  # so this needs revisiting when upstream changes the file.
  # NOTE(review): presumably these are listed as build requirements upstream
  # and are not needed to build the package here; confirm against upstream's
  # pyproject.toml.
  postPatch = ''
    substituteInPlace pyproject.toml \
      --replace-fail '"pip>=23",' "" \
      --replace-fail '"pytest",' ""
  '';

  build-system = [
    cmake
    pybind11
    setuptools
  ];
  # cmake is only needed as a tool here; disable the configure phase that the
  # cmake setup hook would otherwise run.
  # NOTE(review): presumably the setuptools build invokes cmake itself; confirm
  # against upstream's setup.py.
  dontUseCmakeConfigure = true;

  dependencies = [
    sentencepiece
    tiktoken
    tokenizers
  ];

  # Check that both the top-level package and its `pytorch_tokenizers_cpp`
  # submodule (presumably the compiled pybind11 extension) can be imported.
  pythonImportsCheck = [
    "pytorch_tokenizers"
    "pytorch_tokenizers.pytorch_tokenizers_cpp"
  ];

  # Remove the in-tree `pytorch_tokenizers` directory before running the tests.
  # NOTE(review): presumably so that pytest imports the installed package
  # (which contains the built extension) rather than the source checkout.
  preCheck = ''
    rm -rf pytorch_tokenizers
  '';

  nativeCheckInputs = [
    pytestCheckHook
    transformers
  ];

  disabledTestPaths = [
    # Require downloading models from huggingface
    "test/test_hf_tokenizer.py"
  ];

  meta = {
    description = "C++ implementations for various tokenizers (sentencepiece, tiktoken, etc.)";
    homepage = "https://github.com/meta-pytorch/tokenizers";
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ GaetanLepage ];
  };
}
+14 −0
Original line number Diff line number Diff line
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97f0fe6..8c78f85 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,8 +171,7 @@ if(TOKENIZERS_BUILD_PYTHON)
   include(FetchContent)
   FetchContent_Declare(
     pybind11
-    GIT_REPOSITORY https://github.com/pybind/pybind11.git
-    GIT_TAG v2.13.6
+    URL @pybind11@
   )
   FetchContent_MakeAvailable(pybind11)
 
+2 −0
Original line number Diff line number Diff line
@@ -15436,6 +15436,8 @@ self: super: with self; {
  pytorch-tabnet = callPackage ../development/python-modules/pytorch-tabnet { };
  pytorch-tokenizers = callPackage ../development/python-modules/pytorch-tokenizers { };
  pytorch3d = callPackage ../development/python-modules/pytorch3d { };
  pytorchviz = callPackage ../development/python-modules/pytorchviz { };