Commit bc48aabf (unverified), authored by Sandro Jäckel, committed by GitHub
Browse files

Merge pull request #244656 from jokatzke/trafilatura

python3Packages.trafilatura: init at 1.6.3
parents aeddcf46 8200b0b5
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -9162,6 +9162,12 @@
      fingerprint = "7249 70E6 A661 D84E 8B47  678A 0590 93B1 A278 BCD0";
    }];
  };
  jokatzke = {
    email = "jokatzke@fastmail.com";
    github = "jokatzke";
    githubId = 46931073;
    name = "Jonas Katzke";
  };
  joko = {
    email = "ioannis.koutras@gmail.com";
    github = "jokogr";
+54 −0
Original line number Diff line number Diff line
# Package expression for courlan ("Clean, filter and sample URLs to optimize
# data collection"), fetched from PyPI and built with setuptools.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  langcodes,
  tld,
  urllib3,
}:

buildPythonPackage rec {
  pname = "courlan";
  version = "0.9.5";
  format = "setuptools";

  # Not supported on Python older than 3.6.
  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-ONw1suO/H11RbQDVGsEuveVD40F8a+b2oic8D8W1s1M=";
  };

  # The unit tests invoke the `courlan` command by bare name; rewrite both
  # occurrences so they point at the executable under $out/bin instead.
  postPatch = ''
    substituteInPlace tests/unit_tests.py \
      --replace "\"courlan --help\"" "\"$out/bin/courlan --help\"" \
      --replace "courlan_bin = \"courlan\"" "courlan_bin = \"$out/bin/courlan\""
  '';

  # Runtime dependencies, propagated to anything that depends on courlan.
  propagatedBuildInputs = [ langcodes tld urllib3 ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Skipped because it requires an internet connection.
  disabledTests = [ "test_urlcheck" ];

  # Smoke test: the top-level module must be importable.
  pythonImportsCheck = [ "courlan" ];

  meta = {
    description = "Clean, filter and sample URLs to optimize data collection";
    homepage = "https://github.com/adbar/courlan";
    changelog = "https://github.com/adbar/courlan/blob/v${version}/HISTORY.md";
    license = lib.licenses.gpl3Plus;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
+56 −0
Original line number Diff line number Diff line
# Package expression for htmldate (publication-date extraction from URLs and
# web pages), fetched from PyPI and built with setuptools.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  backports-datetime-fromisoformat,
  charset-normalizer,
  dateparser,
  lxml,
  python-dateutil,
  urllib3,
}:

buildPythonPackage rec {
  pname = "htmldate";
  version = "1.6.0";
  format = "setuptools";

  # Not supported on Python older than 3.6.
  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-WCfI9iahaACinlfoGIo9MtCwjKTHvWYlN7c7u/IsRaY=";
  };

  # Runtime dependencies; the fromisoformat backport is only added when the
  # interpreter is older than Python 3.7.
  propagatedBuildInputs =
    [
      charset-normalizer
      dateparser
      lxml
      python-dateutil
      urllib3
    ]
    ++ lib.optionals (pythonOlder "3.7") [ backports-datetime-fromisoformat ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Skipped because they require an internet connection.
  disabledTests = [
    "test_input"
    "test_cli"
    "test_download"
  ];

  # Smoke test: the top-level module must be importable.
  pythonImportsCheck = [ "htmldate" ];

  meta = {
    description = "Fast and robust extraction of original and updated publication dates from URLs and web pages";
    homepage = "https://htmldate.readthedocs.io";
    changelog = "https://github.com/adbar/htmldate/blob/v${version}/CHANGELOG.md";
    license = lib.licenses.gpl3Plus;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
+43 −0
Original line number Diff line number Diff line
# Package expression for jusText ("Heuristic based boilerplate removal tool"),
# built with setuptools from the upstream GitHub repository.
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  pytestCheckHook,
  lxml,
}:

buildPythonPackage rec {
  pname = "justext";
  version = "3.0.0";
  format = "setuptools";

  # Source is the upstream git tag v<version>.
  src = fetchFromGitHub {
    owner = "miso-belica";
    repo = "jusText";
    rev = "refs/tags/v${version}";
    hash = "sha256-WNxDoM5666tEHS9pMl5dOoig4S7dSYaCLZq71tehWqw=";
  };

  # Remove the coverage-report flags from setup.cfg
  # (presumably because pytest-cov is not among the check inputs).
  postPatch = ''
    substituteInPlace setup.cfg \
      --replace " --cov=justext --cov-report=term-missing --no-cov-on-fail" ""
  '';

  # Runtime dependency, propagated to anything that depends on justext.
  propagatedBuildInputs = [ lxml ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Smoke test: the top-level module must be importable.
  pythonImportsCheck = [ "justext" ];

  meta = {
    description = "Heuristic based boilerplate removal tool";
    homepage = "https://github.com/miso-belica/jusText";
    changelog = "https://github.com/miso-belica/jusText/blob/v${version}/CHANGELOG.rst";
    license = lib.licenses.bsd2;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
+43 −0
Original line number Diff line number Diff line
# Package expression for py3langid (a fork of the langid.py language
# identification tool), fetched from PyPI and built with setuptools.
{
  lib,
  buildPythonPackage,
  fetchPypi,
  pythonOlder,
  pytestCheckHook,
  numpy,
}:

buildPythonPackage rec {
  pname = "py3langid";
  version = "0.2.2";
  format = "setuptools";

  # Not supported on Python older than 3.6.
  disabled = pythonOlder "3.6";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-tN4B2tfnAfKdIWoJNeheCWzIZ1kD0j6oRFsrtfCQuW8=";
  };

  # The test suite refers to the `langid` executable by bare name; rewrite it
  # so it points at the executable under $out/bin instead.
  postPatch = ''
    substituteInPlace tests/test_langid.py --replace "'langid'" "'$out/bin/langid'"
  '';

  # Runtime dependency, propagated to anything that depends on py3langid.
  propagatedBuildInputs = [ numpy ];

  nativeCheckInputs = [ pytestCheckHook ];

  # Smoke test: the top-level module must be importable.
  pythonImportsCheck = [ "py3langid" ];

  meta = {
    description = "Fork of the language identification tool langid.py, featuring a modernized codebase and faster execution times";
    homepage = "https://github.com/adbar/py3langid";
    changelog = "https://github.com/adbar/py3langid/blob/v${version}/HISTORY.rst";
    license = lib.licenses.bsd3;
    maintainers = [ lib.maintainers.jokatzke ];
  };
}
Loading