Commit 498b311b authored by happysalada's avatar happysalada
Browse files

python3Packages.unstructured: 0.16.10 -> 0.16.11; fix deps

parent 2b58e775
Loading
Loading
Loading
Loading
+194 −74
Original line number Diff line number Diff line
@@ -2,47 +2,103 @@
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  # propagated build inputs

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  lxml,
  msg-parser,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  olefile,
  orderly-set,
  python-dateutil,
  # python-iso639,
  python-magic,
  # python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pillow,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  # unstructured-client,
  # unstructured-pytesseract,
  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf
  opencv-python,
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlsxwriter,
  # xlsx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # optional-dependencies
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # discord-py,
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # gcsfs,
  elasticsearch8,
  jq,
  # dropboxdrivefs,
  atlassian-python-api,
  # test dependencies
  pytestCheckHook,
  black,
@@ -58,38 +114,6 @@
}:
let
  version = "0.16.11";
  optional-dependencies = {
    huggingflace = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
buildPythonPackage {
  pname = "unstructured";
@@ -99,30 +123,128 @@ buildPythonPackage {
  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    tag = version;
    rev = "refs/tags/${version}";
    hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=";
  };

  propagatedBuildInputs = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    msg-parser
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    openpyxl
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    # python-iso639
    python-magic
    # python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    # unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  optional-dependencies = {
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    paddleocr = [
      opencv-python
      paddlepaddle
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
    pypandoc
    python-docx
      python-pptx
    python-magic
    markdown
    requests
    tabulate
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

@@ -143,8 +265,6 @@ buildPythonPackage {
    grpcio
  ];

  optional-dependencies = optional-dependencies;

  meta = with lib; {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";