Unverified commit 9e18dc36, authored by Sandro Jäckel, committed by GitHub
Browse files

python3Packages.dedupe: init at 3.0.3 (#453792)

parents e999589a b77e8364
Loading
Loading
Loading
Loading
+50 −0
Original line number Diff line number Diff line
# Package expression for affinegap, built from the dedupeio GitHub repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build backend and extension compiler
  cython,
  setuptools,

  # check phase
  pytestCheckHook,
}:

let
  # Bound once here; the source tag below is derived from it.
  version = "1.12";
in
buildPythonPackage {
  pname = "affinegap";
  inherit version;
  pyproject = true;

  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "affinegap";
    # The fetched tag is the version with a "v" prefix.
    tag = "v${version}";
    hash = "sha256-9eX41eoME5Vdtq+c04eQbMYnViy6QKOhKkafrkeMylI=";
  };

  build-system = [
    setuptools
    cython
  ];

  nativeCheckInputs = [ pytestCheckHook ];

  # The compiled extension only exists in $out, so drop the in-tree package
  # directory before the tests run; otherwise test collection would import
  # the source copy instead.
  preCheck = ''
    rm -rf affinegap
  '';

  pythonImportsCheck = [ "affinegap" ];

  meta = {
    homepage = "https://github.com/dedupeio/affinegap";
    description = "Cython implementation of the affine gap string distance";
    license = lib.licenses.mit;
    maintainers = [ lib.maintainers.daniel-fahey ];
  };
}
+53 −0
Original line number Diff line number Diff line
# Package expression for categorical-distance, pinned to a specific upstream
# commit (see `rev` below) rather than a release tag.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  unstableGitUpdater,

  # build backend
  setuptools,

  # runtime requirements
  numpy,

  # check phase
  pytestCheckHook,
}:

buildPythonPackage {
  pname = "categorical-distance";
  # "unstable" version string paired with the pinned commit fetched below.
  version = "1.9-unstable-2020-03-31";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "categorical-distance";
    rev = "07d079fd412ccf06cdb200b3cd2cfa4b67f78722";
    hash = "sha256-zSjSrlFiRus/T2XZdakLQpF1u/LV0VNWwrc8lhss6kU=";
  };

  build-system = [ setuptools ];

  dependencies = [ numpy ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Note: the importable module is named "categorical", not the pname.
  pythonImportsCheck = [ "categorical" ];

  passthru = {
    # Update script matching the commit-pinned (unstable) source.
    updateScript = unstableGitUpdater { };
  };

  meta = {
    homepage = "https://github.com/dedupeio/categorical-distance";
    description = "Compare similarity of categorical variables using Jaccard index";
    license = lib.licenses.mit;
    maintainers = [ lib.maintainers.daniel-fahey ];
  };
}
+45 −0
Original line number Diff line number Diff line
# Package expression for dedupe-levenshtein-search, built from the dedupeio
# fork of Levenshtein_search.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build backend
  setuptools,

  # check phase
  pytestCheckHook,
}:

let
  # Bound once here; the source tag below is derived from it.
  version = "1.4.5";
in
buildPythonPackage {
  pname = "dedupe-levenshtein-search";
  inherit version;
  pyproject = true;

  # NOTE: dedupeio/Levenshtein_search is a fork of
  # mattandahalfew/Levenshtein_search that was created for MIT licensing.
  # TODO: Evaluate whether the original upstream could be packaged instead.
  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "Levenshtein_search";
    tag = "v${version}";
    hash = "sha256-YhsZA28H4OUkQEBtJ+9OXJld4Z/PJbOPqAQQ9qaXSjk=";
  };

  build-system = [ setuptools ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Note: the importable module keeps the upstream name, not the pname.
  pythonImportsCheck = [ "Levenshtein_search" ];

  meta = {
    homepage = "https://github.com/dedupeio/Levenshtein_search";
    description = "Search through documents for approximately matching strings using Levenshtein distance";
    license = lib.licenses.mit;
    maintainers = [ lib.maintainers.daniel-fahey ];
  };
}
+59 −0
Original line number Diff line number Diff line
# Package expression for dedupe-pylbfgs, built from the dedupeio fork of
# pylbfgs. The importable module is named "lbfgs".
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  numpy,
  setuptools,

  # tests
  pytest-cov-stub,
  pytestCheckHook,
}:

buildPythonPackage rec {
  pname = "dedupe-pylbfgs";
  version = "0.2.0.16";
  pyproject = true;

  # NOTE: This is a fork of larsmans/pylbfgs maintained by dedupeio
  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "pylbfgs";
    # Tags in this repository are the bare version (no "v" prefix), so the
    # version string is used directly instead of being re-interpolated.
    tag = version;
    hash = "sha256-H416dgZQxyqsnhmlK5keW8cJWY6gea4mebVuP0IEVOU=";
  };

  # numpy appears both here and in `dependencies`: it is required while
  # building as well as at runtime.
  build-system = [
    cython
    numpy
    setuptools
  ];

  dependencies = [
    numpy
  ];

  nativeCheckInputs = [
    pytest-cov-stub
    pytestCheckHook
  ];

  # Prevent importing from source during test collection (only $out has compiled extensions)
  preCheck = ''
    rm -rf lbfgs
  '';

  pythonImportsCheck = [
    "lbfgs"
  ];

  meta = {
    description = "Python wrapper for L-BFGS and OWL-QN optimization algorithms";
    homepage = "https://github.com/dedupeio/pylbfgs";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ daniel-fahey ];
  };
}
+106 −0
Original line number Diff line number Diff line
# Package expression for dedupe, plus a passthru test that runs the
# benchmarks shipped in the upstream source tree against the built package.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  cython,
  setuptools,

  # dependencies
  affinegap,
  btrees,
  categorical-distance,
  dedupe-levenshtein-search,
  doublemetaphone,
  haversine,
  highered,
  numpy,
  scikit-learn,
  simplecosine,
  zope-index,
  # This package itself, as provided by the caller; only used by
  # passthru.tests.benchmarks to build a Python environment containing it.
  dedupe,

  # tests
  pytest-cov-stub,
  pytestCheckHook,
  python,
  runCommand,
}:

buildPythonPackage rec {
  pname = "dedupe";
  version = "3.0.3";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "dedupeio";
    repo = "dedupe";
    tag = "v${version}";
    hash = "sha256-tfBJeaeZw5w5OwM+AOfy9H6P2zbShjN/kuzEbpxATHI=";
  };

  build-system = [
    cython
    setuptools
  ];

  dependencies = [
    affinegap
    btrees
    categorical-distance
    dedupe-levenshtein-search
    doublemetaphone
    haversine
    highered
    numpy
    scikit-learn
    simplecosine
    zope-index
  ];

  nativeCheckInputs = [
    pytest-cov-stub
    pytestCheckHook
  ];

  # Remove source directory so pytest imports compiled extension from $out
  preCheck = ''
    rm -rf dedupe
  '';

  pythonImportsCheck = [
    "dedupe"
  ];

  # Separate derivation that runs the upstream benchmark scripts with a
  # Python environment containing the built dedupe package.
  passthru.tests.benchmarks =
    runCommand "dedupe-benchmarks-test"
      {
        nativeBuildInputs = [ (python.withPackages (ps: [ dedupe ])) ];
      }
      ''
        # Copy benchmarks to writable location
        cp -r ${src}/benchmarks benchmarks
        chmod -R +w benchmarks
        cd benchmarks

        # Scratch file that holds the stderr of the benchmark currently running
        stderr_log=$(mktemp)

        # Run all three canonical benchmarks
        for benchmark in canonical canonical_gazetteer canonical_matching; do
          echo "Running $benchmark benchmark..."
          # stderr is captured instead of printed to keep Python 3.13
          # multiprocessing resource tracker warnings from scikit-learn/joblib
          # subprocesses out of the log on successful runs. If the benchmark
          # fails, the captured output is replayed so the traceback is not
          # lost, and the test aborts immediately (fail-fast).
          if ! PYTHONPATH=$PWD python -m benchmarks.$benchmark 2>"$stderr_log"; then
            echo "$benchmark benchmark failed; captured stderr follows:" >&2
            cat "$stderr_log" >&2
            exit 1
          fi
        done

        touch $out
      '';

  meta = {
    description = "Library for accurate and scalable fuzzy matching, deduplication and entity resolution";
    homepage = "https://github.com/dedupeio/dedupe";
    changelog = "https://github.com/dedupeio/dedupe/blob/${src.tag}/CHANGELOG.md";
    license = lib.licenses.mit;
    maintainers = with lib.maintainers; [ daniel-fahey ];
  };
}
Loading