Unverified Commit cbf63579 authored by luftmensch-luftmensch's avatar luftmensch-luftmensch
Browse files

datatrove: init at 0.2.0

parent a01ee9db
Loading
Loading
Loading
Loading
+65 −0
Original line number Diff line number Diff line
{
  lib,
  fetchFromGitHub,
  python3Packages,
}:
let
  version = "0.2.0";
in
python3Packages.buildPythonPackage {
  pname = "datatrove";
  inherit version;
  pyproject = true;

  src = fetchFromGitHub {
    owner = "huggingface";
    repo = "datatrove";
    rev = "refs/tags/v${version}";
    hash = "sha256-2NJja2yWeHOgo1pCuwHN6SgYnsimuZdK0jE8ucTH4r8=";
  };

  nativeBuildInputs = with python3Packages; [ setuptools ];

  propagatedBuildInputs = with python3Packages; [
    dill
    fsspec
    huggingface-hub
    tokenizers
    humanize
    loguru
    multiprocess
    numpy
    rich
  ];

  nativeCheckInputs = with python3Packages; [ pytestCheckHook ];
  dependencies = with python3Packages; [
    boto3
    fasteners
    huggingface-hub
    moto
    nltk
    s3fs
    xxhash
  ];

  disabledTestPaths = [
    "tests/executor/test_local.py"
    "tests/pipeline/test_filters.py"
    "tests/pipeline/test_bloom_filter.py"
    "tests/pipeline/test_minhash.py"
    "tests/pipeline/test_sentence_deduplication.py"
    "tests/pipeline/test_tokenization.py"
    "tests/pipeline/test_exact_substrings.py"
  ];

  pythonImportsCheck = [ "datatrove" ];
  meta = {
    description = "Set of platform-agnostic customizable pipeline processing blocks for data processing";
    homepage = "https://github.com/huggingface/datatrove";
    changelog = "https://github.com/huggingface/datatrove/releases/tag/v${version}";
    license = lib.licenses.asl20;
    maintainers = with lib.maintainers; [ luftmensch-luftmensch ];
    platforms = lib.platforms.all;
  };
}