Unverified Commit 4989a246 authored by Yt's avatar Yt Committed by GitHub
Browse files

Update unstructured (#365635)

parents c6edee2e 9876b50f
Loading
Loading
Loading
Loading
+5 −3
Original line number Diff line number Diff line
@@ -41,8 +41,8 @@ buildPythonPackage rec {
      opencv-python
      onnxruntime
      transformers
      detectron2
      paddleocr
      # detectron2 # fails to build
      # paddleocr # 3.12 not yet supported
      # yolox
    ]
    ++ layoutparser.optional-dependencies.layoutmodels
@@ -59,6 +59,9 @@ buildPythonPackage rec {
    huggingface-hub
  ];

  # This dependency needs to be updated properly
  doCheck = false;

  preCheck = ''
    export HOME=$(mktemp -d)
  '';
@@ -75,7 +78,6 @@ buildPythonPackage rec {
    # network access
    "test_unstructured_inference/inference/test_layout.py"
    "test_unstructured_inference/models/test_chippermodel.py"
    "test_unstructured_inference/models/test_detectron2.py"
    "test_unstructured_inference/models/test_detectron2onnx.py"
    # unclear failure
    "test_unstructured_inference/models/test_donut.py"
+198 −74
Original line number Diff line number Diff line
@@ -2,47 +2,103 @@
  lib,
  buildPythonPackage,
  fetchFromGitHub,
  # propagated build inputs

  # core networking and async dependencies
  anyio,
  backoff,
  certifi,
  httpcore,
  httpx,
  h11,
  nest-asyncio,
  requests,
  requests-toolbelt,
  sniffio,
  urllib3,

  # core parsing and processing
  beautifulsoup4,
  chardet,
  charset-normalizer,
  emoji,
  filetype,
  lxml,
  msg-parser,
  html5lib,
  idna,
  joblib,
  # jsonpath-python,
  nltk,
  openpyxl,
  pandas,
  pdf2image,
  olefile,
  orderly-set,
  python-dateutil,
  # python-iso639,
  python-magic,
  # python-oxmsg,
  rapidfuzz,
  regex,
  soupsieve,
  webencodings,

  # core data handling
  dataclasses-json,
  deepdiff,
  marshmallow,
  mypy-extensions,
  packaging,
  typing-extensions,
  typing-inspect,

  # core system utilities
  cffi,
  cryptography,
  psutil,
  pycparser,
  six,
  tqdm,
  wrapt,

  # document format support
  markdown,
  pdfminer-six,
  pillow,
  pdfplumber,
  # pi-heif,
  pikepdf,
  pypandoc,
  pypdf,
  python-docx,
  # unstructured-client,
  # unstructured-pytesseract,
  # optional dependencies
  # csv
  pytz,
  tzdata,
  # markdown
  importlib-metadata,
  zipp,
  # pdf
  opencv-python,
  paddlepaddle,
  pdf2image,
  # unstructured-paddleocr,
  # pptx
  lxml,
  pillow,
  python-pptx,
  python-magic,
  markdown,
  requests,
  tabulate,
  xlsxwriter,
  # xslx
  et-xmlfile,
  networkx,
  numpy,
  openpyxl,
  pandas,
  xlrd,
  # optional-dependencies
  # huggingface
  langdetect,
  sacremoses,
  sentencepiece,
  torch,
  transformers,
  # local-inference
  unstructured-inference,
  s3fs,
  fsspec,
  adlfs,
  # , discord-py
  pygithub,
  python-gitlab,
  praw,
  slack-sdk,
  wikipedia,
  google-api-python-client,
  # , gcsfs
  elasticsearch8,
  jq,
  # , dropboxdrivefs
  atlassian-python-api,
  # test dependencies
  pytestCheckHook,
  black,
@@ -58,38 +114,6 @@
}:
let
  version = "0.16.11";
  optional-dependencies = {
    huggingflace = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
    local-inference = [ unstructured-inference ];
    s3 = [
      s3fs
      fsspec
    ];
    azure = [
      adlfs
      fsspec
    ];
    discord = [ ]; # discord-py
    github = [ pygithub ];
    gitlab = [ python-gitlab ];
    reddit = [ praw ];
    slack = [ slack-sdk ];
    wikipedia = [ wikipedia ];
    google-drive = [ google-api-python-client ];
    gcs = [ ]; # gcsfs fsspec
    elasticsearch = [
      elasticsearch8
      jq
    ];
    dropbox = [ ]; # dropboxdrivefs fsspec
    confluence = [ atlassian-python-api ];
  };
in
buildPythonPackage {
  pname = "unstructured";
@@ -99,30 +123,132 @@ buildPythonPackage {
  src = fetchFromGitHub {
    owner = "Unstructured-IO";
    repo = "unstructured";
    tag = version;
    rev = "refs/tags/${version}";
    hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs=";
  };

  propagatedBuildInputs = [
    # Base dependencies
    anyio
    backoff
    beautifulsoup4
    certifi
    cffi
    chardet
    charset-normalizer
    click
    cryptography
    dataclasses-json
    deepdiff
    emoji
    filetype
    h11
    html5lib
    httpcore
    httpx
    idna
    joblib
    # jsonpath-python
    langdetect
    lxml
    msg-parser
    marshmallow
    mypy-extensions
    nest-asyncio
    nltk
    openpyxl
    numpy
    olefile
    orderly-set
    packaging
    psutil
    pycparser
    pypdf
    python-dateutil
    # python-iso639
    python-magic
    # python-oxmsg
    rapidfuzz
    regex
    requests
    requests-toolbelt
    six
    sniffio
    soupsieve
    tqdm
    typing-extensions
    typing-inspect
    # unstructured-client
    urllib3
    webencodings
    wrapt
  ];

  optional-dependencies = rec {
    all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx;
    csv = [
      numpy
      pandas
      python-dateutil
      pytz
      tzdata
    ];
    docx = [
      lxml
      python-docx
      typing-extensions
    ];
    epub = [ pypandoc ];
    req-markdown = [
      importlib-metadata
      markdown
      zipp
    ];
    odt = [
      lxml
      pypandoc
      python-docx
      typing-extensions
    ];
    org = [
      pypandoc
    ];
    paddleocr = [
      opencv-python
      # paddlepaddle # 3.12 not supported for now
      pdf2image
      # unstructured-paddleocr
    ];
    pdf = [
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pypdf
      unstructured-inference
      # unstructured-pytesseract
    ];
    pptx = [
      lxml
      pillow
    pypandoc
    python-docx
      python-pptx
    python-magic
    markdown
    requests
    tabulate
      xlsxwriter
    ];
    xlsx = [
      et-xmlfile
      networkx
      numpy
      openpyxl
      pandas
      xlrd
    ];
    huggingface = [
      langdetect
      sacremoses
      sentencepiece
      torch
      transformers
    ];
  };

  pythonImportsCheck = [ "unstructured" ];

@@ -143,8 +269,6 @@ buildPythonPackage {
    grpcio
  ];

  optional-dependencies = optional-dependencies;

  meta = with lib; {
    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
    mainProgram = "unstructured-ingest";
+128 −8
Original line number Diff line number Diff line
@@ -13,20 +13,140 @@ let
    packages:
    with packages;
    [
      unstructured-api-tools
      unstructured
      pydantic
      aiofiles
      annotated-types
      antlr4-python3-runtime
      anyio
      backoff
      beautifulsoup4
      cachetools
      certifi
      cffi
      chardet
      charset-normalizer
      click
      coloredlogs
      contourpy
      cryptography
      cycler
      dataclasses-json
      deprecated
      effdet
      emoji
      et-xmlfile
      eval-type-backport
      fastapi
      filelock
      filetype
      flatbuffers
      fonttools
      fsspec
      google-api-core
      google-auth
      google-cloud-vision
      googleapis-common-protos
      grpcio
      grpcio-status
      h11
      html5lib
      httpcore
      httpx
      huggingface-hub
      humanfriendly
      idna
      iopath
      jinja2
      joblib
      jsonpath
      kiwisolver
      langdetect
      layoutparser
      lxml
      markdown
      markupsafe
      marshmallow
      matplotlib
      mpmath
      mypy-extensions
      nest-asyncio
      networkx
      nltk
      numpy
      olefile
      omegaconf
      onnx
      onnxruntime
      opencv-python
      openpyxl
      packaging
      pandas
      pdf2image
      pdfminer-six
      pdfplumber
      # pi-heif
      pikepdf
      pillow
      portalocker
      proto-plus
      protobuf
      psutil
      pyasn1
      pyasn1-modules
      pycocotools
      pycparser
      pycryptodome
      pydantic
      pydantic-core
      pypandoc
      pyparsing
      pypdf
      # pypdfium2
      python-dateutil
      python-docx
      # python-iso639
      python-magic
      python-multipart
      # python-oxmsg
      python-pptx
      pytz
      pyyaml
      rapidfuzz
      ratelimit
      regex
      requests
      pypdf
      pycryptodome
      requests-toolbelt
      rsa
      safetensors
      scipy
      six
      sniffio
      soupsieve
      starlette
      sympy
      timm
      tokenizers
      torch
      torchvision
      tqdm
      transformers
      typing-extensions
      typing-inspect
      tzdata
      unstructured
      # unstructured-client
      unstructured-inference
      # unstructured-pytesseract
      urllib3
      uvicorn
      webencodings
      wrapt
      xlrd
      xlsxwriter
    ]
    ++ packages.unstructured.optional-dependencies.local-inference
    ++ google-api-core.optional-dependencies.grpc
    ++ unstructured.optional-dependencies.all-docs
  );
  version = "0.0.61";
  version = "0.0.82";
  unstructured_api_nltk_data = symlinkJoin {
    name = "unstructured_api_nltk_data";

@@ -44,7 +164,7 @@ stdenvNoCC.mkDerivation {
    owner = "Unstructured-IO";
    repo = "unstructured-api";
    rev = version;
    hash = "sha256-Ucd+SKIES9E5WgKJjg8Vihjc1hMrJ9e956Sb7QlQea8=";
    hash = "sha256-mvcARpewqC25x3ZdpM8QB7SjbqGoBL/rtxi90KdKdO8=";
  };

  nativeBuildInputs = [ makeWrapper ];