Unverified Commit 68fa71cc authored by Gaétan Lepage's avatar Gaétan Lepage Committed by GitHub
Browse files

python3Packages.lxml-html-clean: skip failing tests;...

python3Packages.lxml-html-clean: skip failing tests; python3Packages.trafilatura: skip failing tests, cleanup; python3Packages.html-sanitizer: skip failing test (#419713)
parents 4bafd946 9789685d
Loading
Loading
Loading
Loading
+15 −8
Original line number Diff line number Diff line
@@ -2,12 +2,17 @@
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  hatchling,

  # dependencies
  lxml,
  lxml-html-clean,
  beautifulsoup4,

  # tests
  pytestCheckHook,
  pythonOlder,
}:

buildPythonPackage rec {
@@ -15,8 +20,6 @@ buildPythonPackage rec {
  version = "2.4.4";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "matthiask";
    repo = "html-sanitizer";
@@ -24,9 +27,9 @@ buildPythonPackage rec {
    hash = "sha256-6OWFLsuefeDzQ1uHnLmboKDgrbY/xJCwqsSQlDaJlRs=";
  };

  nativeBuildInputs = [ hatchling ];
  build-system = [ hatchling ];

  propagatedBuildInputs = [
  dependencies = [
    lxml
    lxml-html-clean
    beautifulsoup4
@@ -40,15 +43,19 @@ buildPythonPackage rec {
    # Tests are sensitive to output
    "test_billion_laughs"
    "test_10_broken_html"

    # Snapshot mismatch (AssertionError)
    # https://github.com/matthiask/html-sanitizer/issues/53
    "test_keep_typographic_whitespace"
  ];

  pythonImportsCheck = [ "html_sanitizer" ];

  meta = with lib; {
  meta = {
    description = "Allowlist-based and very opinionated HTML sanitizer";
    homepage = "https://github.com/matthiask/html-sanitizer";
    changelog = "https://github.com/matthiask/html-sanitizer/blob/${version}/CHANGELOG.rst";
    license = with licenses; [ bsd3 ];
    maintainers = with maintainers; [ fab ];
    license = with lib.licenses; [ bsd3 ];
    maintainers = with lib.maintainers; [ fab ];
  };
}
+20 −7
Original line number Diff line number Diff line
@@ -2,10 +2,9 @@
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  lxml,
  unittestCheckHook,
  pythonOlder,
  setuptools,
}:

buildPythonPackage rec {
@@ -13,8 +12,6 @@ buildPythonPackage rec {
  version = "0.4.2";
  pyproject = true;

  disabled = pythonOlder "3.7";

  src = fetchFromGitHub {
    owner = "fedora-python";
    repo = "lxml_html_clean";
@@ -22,6 +19,22 @@ buildPythonPackage rec {
    hash = "sha256-KGUFRbcaeDcX2jyoyyZMZsVTbN+h8uy+ugcritkZe38=";
  };

  # Disable failing snapshot tests (AssertionError)
  # https://github.com/fedora-python/lxml_html_clean/issues/24
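  # This derivation must use unittestCheckHook, so disabledTests (which is only
  # honored by pytestCheckHook) is unavailable; instead, rename the failing
  # tests so that unittest no longer collects them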
  postPatch = ''
    substituteInPlace tests/test_clean.py \
      --replace-fail \
        "test_host_whitelist_valid" \
        "DISABLED_test_host_whitelist_valid" \
      --replace-fail \
        "test_host_whitelist_invalid" \
        "DISABLED_test_host_whitelist_invalid" \
      --replace-fail \
        "test_host_whitelist_sneaky_userinfo" \
        "DISABLED_test_host_whitelist_sneaky_userinfo"
  '';

  build-system = [ setuptools ];

  dependencies = [ lxml ];
@@ -30,11 +43,11 @@ buildPythonPackage rec {

  pythonImportsCheck = [ "lxml_html_clean" ];

  meta = with lib; {
  meta = {
    description = "Separate project for HTML cleaning functionalities copied from lxml.html.clean";
    homepage = "https://github.com/fedora-python/lxml_html_clean/";
    changelog = "https://github.com/fedora-python/lxml_html_clean/blob/${version}/CHANGES.rst";
    license = licenses.bsd3;
    maintainers = with maintainers; [ fab ];
    license = lib.licenses.bsd3;
    maintainers = with lib.maintainers; [ fab ];
  };
}
+23 −9
Original line number Diff line number Diff line
{
  lib,
  buildPythonPackage,
  fetchFromGitHub,

  # build-system
  setuptools,

  # dependencies
  certifi,
  charset-normalizer,
  courlan,
  fetchPypi,
  htmldate,
  justext,
  lxml,
  pytestCheckHook,
  pythonOlder,
  setuptools,
  urllib3,

  # tests
  pytestCheckHook,
}:

buildPythonPackage rec {
@@ -19,11 +24,11 @@ buildPythonPackage rec {
  version = "2.0.0";
  pyproject = true;

  disabled = pythonOlder "3.9";

  src = fetchPypi {
    inherit pname version;
    hash = "sha256-zrcJSm7Ml+cv6nPH26NnFMXFtXe2Rw5FINyok3BtYkc=";
  src = fetchFromGitHub {
    owner = "adbar";
    repo = "trafilatura";
    tag = "v${version}";
    hash = "sha256-Cf1W3JEGSMkVmRZVTXYsXzZK/Nt/aDG890Sf0/0OZAA=";
  };

  postPatch = ''
@@ -48,6 +53,15 @@ buildPythonPackage rec {
  nativeCheckInputs = [ pytestCheckHook ];

  disabledTests = [
    # TypeError: argument of type 'NoneType' is not iterable
    # https://github.com/adbar/trafilatura/issues/805
    "test_external"
    "test_extract"

    # AttributeError: 'NoneType' object has no attribute 'find'
    # https://github.com/adbar/trafilatura/issues/805
    "test_table_processing"

    # Disable tests that require an internet connection
    "test_cli_pipeline"
    "test_crawl_page"