Unverified Commit 82eeda2a authored by Gaétan Lepage's avatar Gaétan Lepage Committed by GitHub
Browse files

python3Packages.tokenizers: 0.21.1 -> 0.21.2 (#419584)

parents 5d6b848b 00facaee
Loading
Loading
Loading
Loading
+1334 −0

File added.

Preview size limit exceeded, changes collapsed.

+27 −16
Original line number Diff line number Diff line
@@ -47,6 +47,10 @@ let
      url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
      hash = "sha256-B+ztN1zsFE0nyQAkHz4zlHjeyVj5L928VR8pXJkgOKM=";
    };
    "tokenizer-llama3.json" = fetchurl {
      url = "https://huggingface.co/Narsil/llama-tokenizer/resolve/main/tokenizer.json";
      hash = "sha256-eePlImNfMXEwCRO7QhRkqH3mIiGCoFcLmyzLoqlksrQ=";
    };
    "big.txt" = fetchurl {
      url = "https://norvig.com/big.txt";
      hash = "sha256-+gZsfUDw8gGsQUTmUqpiQw5YprOAXscGUPZ42lgE6Hs=";
@@ -71,33 +75,40 @@ let
in
buildPythonPackage rec {
  pname = "tokenizers";
  version = "0.21.1";
  version = "0.21.2";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "tokenizers";
    tag = "v${version}";
    hash = "sha256-3S7ZCaZnnwyNjoZ4Y/q3ngQE2MIm2iyCCjYAkdMVG2A=";
    hash = "sha256-HO7Zg/yLY6yxjOo5Jf6Iu2zCreCyv7IaNrWtBKrspqQ=";
  };

  # TestUnigram.test_continuing_prefix_trainer_mismatch fails with:
  # Exception: No such file or directory (os error 2)
  # Fix submitted upstream: https://github.com/huggingface/tokenizers/pull/1747
  # The Cargo.lock shipped with 0.21.2 is invalid; the build fails with:
  # error: no matching package found
  # searched package name: `ahash`
  # perhaps you meant:      wasi
  # location searched: directory source `/build/tokenizers-0.21.2-vendor` (which is replacing registry `crates-io`)
  # required by package `tokenizers-python v0.21.2 (/build/source/bindings/python)`
  #
  # Hence, I (@GaetanLepage) regenerated the lockfile and embedded it here (./Cargo.lock) for now.
  # TODO: Try to switch back to `rustPlatform.fetchCargoVendor` at the next release.
  postPatch = ''
    substituteInPlace tests/bindings/test_trainers.py \
      --replace-fail '"data/' '"tests/data/'
    ln -sf '${./Cargo.lock}' Cargo.lock
  '';

  cargoDeps = rustPlatform.fetchCargoVendor {
    inherit
      pname
      version
      src
      sourceRoot
      ;
    hash = "sha256-I7LlBmeVY2rWI0ta6x311iAurQKuutsClrbUgkt9xWk=";
  cargoDeps = rustPlatform.importCargoLock {
    lockFile = ./Cargo.lock;
  };
  # cargoDeps = rustPlatform.fetchCargoVendor {
  #   inherit
  #     pname
  #     version
  #     src
  #     sourceRoot
  #     ;
  #   hash = "sha256-EKiHjcXUjU8CWe2CB2EgAQlRcZebwe4EpD7P8lWbCjw=";
  # };

  sourceRoot = "${src.name}/bindings/python";