Loading pkgs/development/python-modules/unstructured-inference/default.nix +5 −3 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ buildPythonPackage rec { opencv-python onnxruntime transformers detectron2 paddleocr # detectron2 # fails to build # paddleocr # 3.12 not yet supported # yolox ] ++ layoutparser.optional-dependencies.layoutmodels Loading @@ -59,6 +59,9 @@ buildPythonPackage rec { huggingface-hub ]; # This dependency needs to be updated properly doCheck = false; preCheck = '' export HOME=$(mktemp -d) ''; Loading @@ -75,7 +78,6 @@ buildPythonPackage rec { # network access "test_unstructured_inference/inference/test_layout.py" "test_unstructured_inference/models/test_chippermodel.py" "test_unstructured_inference/models/test_detectron2.py" "test_unstructured_inference/models/test_detectron2onnx.py" # unclear failure "test_unstructured_inference/models/test_donut.py" Loading pkgs/development/python-modules/unstructured/default.nix +198 −74 Original line number Diff line number Diff line Loading @@ -2,47 +2,103 @@ lib, buildPythonPackage, fetchFromGitHub, # propagated build inputs # core networking and async dependencies anyio, backoff, certifi, httpcore, httpx, h11, nest-asyncio, requests, requests-toolbelt, sniffio, urllib3, # core parsing and processing beautifulsoup4, chardet, charset-normalizer, emoji, filetype, lxml, msg-parser, html5lib, idna, joblib, # jsonpath-python, nltk, openpyxl, pandas, pdf2image, olefile, orderly-set, python-dateutil, # python-iso639, python-magic, # python-oxmsg, rapidfuzz, regex, soupsieve, webencodings, # core data handling dataclasses-json, deepdiff, marshmallow, mypy-extensions, packaging, typing-extensions, typing-inspect, # core system utilities cffi, cryptography, psutil, pycparser, six, tqdm, wrapt, # document format support markdown, pdfminer-six, pillow, pdfplumber, # pi-heif, pikepdf, pypandoc, pypdf, python-docx, # unstructured-client, # unstructured-pytesseract, # optional dependencies # csv pytz, tzdata, # markdown importlib-metadata, zipp, # pdf opencv-python, paddlepaddle, pdf2image, # unstructured-paddleocr, # pptx lxml, pillow, python-pptx, python-magic, markdown, requests, tabulate, xlsxwriter, # xslx et-xmlfile, networkx, numpy, openpyxl, pandas, xlrd, # optional-dependencies # huggingface langdetect, sacremoses, sentencepiece, torch, transformers, # local-inference unstructured-inference, s3fs, fsspec, adlfs, # , discord-py pygithub, python-gitlab, praw, slack-sdk, wikipedia, google-api-python-client, # , gcsfs elasticsearch8, jq, # , dropboxdrivefs atlassian-python-api, # test dependencies pytestCheckHook, black, Loading @@ -58,38 +114,6 @@ }: let version = "0.16.11"; optional-dependencies = { huggingflace = [ langdetect sacremoses sentencepiece torch transformers ]; local-inference = [ unstructured-inference ]; s3 = [ s3fs fsspec ]; azure = [ adlfs fsspec ]; discord = [ ]; # discord-py github = [ pygithub ]; gitlab = [ python-gitlab ]; reddit = [ praw ]; slack = [ slack-sdk ]; wikipedia = [ wikipedia ]; google-drive = [ google-api-python-client ]; gcs = [ ]; # gcsfs fsspec elasticsearch = [ elasticsearch8 jq ]; dropbox = [ ]; # dropboxdrivefs fsspec confluence = [ atlassian-python-api ]; }; in buildPythonPackage { pname = "unstructured"; Loading @@ -99,30 +123,132 @@ buildPythonPackage { src = fetchFromGitHub { owner = "Unstructured-IO"; repo = "unstructured"; tag = version; rev = "refs/tags/${version}"; hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs="; }; propagatedBuildInputs = [ # Base dependencies anyio backoff beautifulsoup4 certifi cffi chardet charset-normalizer click cryptography dataclasses-json deepdiff emoji filetype h11 html5lib httpcore httpx idna joblib # jsonpath-python langdetect lxml msg-parser marshmallow mypy-extensions nest-asyncio nltk openpyxl numpy olefile orderly-set packaging psutil pycparser pypdf python-dateutil # python-iso639 python-magic # python-oxmsg rapidfuzz regex requests requests-toolbelt six sniffio soupsieve tqdm typing-extensions typing-inspect # unstructured-client urllib3 webencodings wrapt ]; optional-dependencies = rec { all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx; csv = [ numpy pandas python-dateutil pytz tzdata ]; docx = [ lxml python-docx typing-extensions ]; epub = [ pypandoc ]; req-markdown = [ importlib-metadata markdown zipp ]; odt = [ lxml pypandoc python-docx typing-extensions ]; org = [ pypandoc ]; paddleocr = [ opencv-python # paddlepaddle # 3.12 not supported for now pdf2image # unstructured-paddleocr ]; pdf = [ pdf2image pdfminer-six pdfplumber # pi-heif pikepdf pypdf unstructured-inference # unstructured-pytesseract ]; pptx = [ lxml pillow pypandoc python-docx python-pptx python-magic markdown requests tabulate xlsxwriter ]; xlsx = [ et-xmlfile networkx numpy openpyxl pandas xlrd ]; huggingface = [ langdetect sacremoses sentencepiece torch transformers ]; }; pythonImportsCheck = [ "unstructured" ]; Loading @@ -143,8 +269,6 @@ buildPythonPackage { grpcio ]; optional-dependencies = optional-dependencies; meta = with lib; { description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines"; mainProgram = "unstructured-ingest"; Loading pkgs/servers/unstructured-api/default.nix +128 −8 Original line number Diff line number Diff line Loading @@ -13,20 +13,140 @@ let packages: with packages; [ unstructured-api-tools unstructured pydantic aiofiles annotated-types antlr4-python3-runtime anyio backoff beautifulsoup4 cachetools certifi cffi chardet charset-normalizer click coloredlogs contourpy cryptography cycler dataclasses-json deprecated effdet emoji et-xmlfile eval-type-backport fastapi filelock filetype flatbuffers fonttools fsspec google-api-core google-auth google-cloud-vision googleapis-common-protos grpcio grpcio-status h11 html5lib httpcore httpx huggingface-hub humanfriendly idna iopath jinja2 joblib jsonpath kiwisolver langdetect layoutparser lxml markdown markupsafe marshmallow matplotlib mpmath mypy-extensions nest-asyncio networkx nltk numpy olefile omegaconf onnx onnxruntime opencv-python openpyxl packaging pandas pdf2image pdfminer-six pdfplumber # pi-heif pikepdf pillow portalocker proto-plus protobuf psutil pyasn1 pyasn1-modules pycocotools pycparser pycryptodome pydantic pydantic-core pypandoc pyparsing pypdf # pypdfium2 python-dateutil python-docx # python-iso639 python-magic python-multipart # python-oxmsg python-pptx pytz pyyaml rapidfuzz ratelimit regex requests pypdf pycryptodome requests-toolbelt rsa safetensors scipy six sniffio soupsieve starlette sympy timm tokenizers torch torchvision tqdm transformers typing-extensions typing-inspect tzdata unstructured # unstructured-client unstructured-inference # unstructured-pytesseract urllib3 uvicorn webencodings wrapt xlrd xlsxwriter ] ++ packages.unstructured.optional-dependencies.local-inference ++ google-api-core.optional-dependencies.grpc ++ unstructured.optional-dependencies.all-docs ); version = "0.0.61"; version = "0.0.82"; unstructured_api_nltk_data = symlinkJoin { name = "unstructured_api_nltk_data"; Loading @@ -44,7 +164,7 @@ stdenvNoCC.mkDerivation { owner = "Unstructured-IO"; repo = "unstructured-api"; rev = version; hash = "sha256-Ucd+SKIES9E5WgKJjg8Vihjc1hMrJ9e956Sb7QlQea8="; hash = "sha256-mvcARpewqC25x3ZdpM8QB7SjbqGoBL/rtxi90KdKdO8="; }; nativeBuildInputs = [ makeWrapper ]; Loading Loading
pkgs/development/python-modules/unstructured-inference/default.nix +5 −3 Original line number Diff line number Diff line Loading @@ -41,8 +41,8 @@ buildPythonPackage rec { opencv-python onnxruntime transformers detectron2 paddleocr # detectron2 # fails to build # paddleocr # 3.12 not yet supported # yolox ] ++ layoutparser.optional-dependencies.layoutmodels Loading @@ -59,6 +59,9 @@ buildPythonPackage rec { huggingface-hub ]; # This dependency needs to be updated properly doCheck = false; preCheck = '' export HOME=$(mktemp -d) ''; Loading @@ -75,7 +78,6 @@ buildPythonPackage rec { # network access "test_unstructured_inference/inference/test_layout.py" "test_unstructured_inference/models/test_chippermodel.py" "test_unstructured_inference/models/test_detectron2.py" "test_unstructured_inference/models/test_detectron2onnx.py" # unclear failure "test_unstructured_inference/models/test_donut.py" Loading
pkgs/development/python-modules/unstructured/default.nix +198 −74 Original line number Diff line number Diff line Loading @@ -2,47 +2,103 @@ lib, buildPythonPackage, fetchFromGitHub, # propagated build inputs # core networking and async dependencies anyio, backoff, certifi, httpcore, httpx, h11, nest-asyncio, requests, requests-toolbelt, sniffio, urllib3, # core parsing and processing beautifulsoup4, chardet, charset-normalizer, emoji, filetype, lxml, msg-parser, html5lib, idna, joblib, # jsonpath-python, nltk, openpyxl, pandas, pdf2image, olefile, orderly-set, python-dateutil, # python-iso639, python-magic, # python-oxmsg, rapidfuzz, regex, soupsieve, webencodings, # core data handling dataclasses-json, deepdiff, marshmallow, mypy-extensions, packaging, typing-extensions, typing-inspect, # core system utilities cffi, cryptography, psutil, pycparser, six, tqdm, wrapt, # document format support markdown, pdfminer-six, pillow, pdfplumber, # pi-heif, pikepdf, pypandoc, pypdf, python-docx, # unstructured-client, # unstructured-pytesseract, # optional dependencies # csv pytz, tzdata, # markdown importlib-metadata, zipp, # pdf opencv-python, paddlepaddle, pdf2image, # unstructured-paddleocr, # pptx lxml, pillow, python-pptx, python-magic, markdown, requests, tabulate, xlsxwriter, # xslx et-xmlfile, networkx, numpy, openpyxl, pandas, xlrd, # optional-dependencies # huggingface langdetect, sacremoses, sentencepiece, torch, transformers, # local-inference unstructured-inference, s3fs, fsspec, adlfs, # , discord-py pygithub, python-gitlab, praw, slack-sdk, wikipedia, google-api-python-client, # , gcsfs elasticsearch8, jq, # , dropboxdrivefs atlassian-python-api, # test dependencies pytestCheckHook, black, Loading @@ -58,38 +114,6 @@ }: let version = "0.16.11"; optional-dependencies = { huggingflace = [ langdetect sacremoses sentencepiece torch transformers ]; local-inference = [ unstructured-inference ]; s3 = [ s3fs fsspec ]; azure = [ adlfs fsspec ]; discord = [ ]; # discord-py github = [ pygithub ]; gitlab = [ python-gitlab ]; reddit = [ praw ]; slack = [ slack-sdk ]; wikipedia = [ wikipedia ]; google-drive = [ google-api-python-client ]; gcs = [ ]; # gcsfs fsspec elasticsearch = [ elasticsearch8 jq ]; dropbox = [ ]; # dropboxdrivefs fsspec confluence = [ atlassian-python-api ]; }; in buildPythonPackage { pname = "unstructured"; Loading @@ -99,30 +123,132 @@ buildPythonPackage { src = fetchFromGitHub { owner = "Unstructured-IO"; repo = "unstructured"; tag = version; rev = "refs/tags/${version}"; hash = "sha256-+I5eXG/ICmYPDTavDnyLlopIvoABjdDwOyfotrNs6qs="; }; propagatedBuildInputs = [ # Base dependencies anyio backoff beautifulsoup4 certifi cffi chardet charset-normalizer click cryptography dataclasses-json deepdiff emoji filetype h11 html5lib httpcore httpx idna joblib # jsonpath-python langdetect lxml msg-parser marshmallow mypy-extensions nest-asyncio nltk openpyxl numpy olefile orderly-set packaging psutil pycparser pypdf python-dateutil # python-iso639 python-magic # python-oxmsg rapidfuzz regex requests requests-toolbelt six sniffio soupsieve tqdm typing-extensions typing-inspect # unstructured-client urllib3 webencodings wrapt ]; optional-dependencies = rec { all-docs = csv ++ docx ++ epub ++ pdf ++ req-markdown ++ odt ++ org ++ pptx ++ xlsx; csv = [ numpy pandas python-dateutil pytz tzdata ]; docx = [ lxml python-docx typing-extensions ]; epub = [ pypandoc ]; req-markdown = [ importlib-metadata markdown zipp ]; odt = [ lxml pypandoc python-docx typing-extensions ]; org = [ pypandoc ]; paddleocr = [ opencv-python # paddlepaddle # 3.12 not supported for now pdf2image # unstructured-paddleocr ]; pdf = [ pdf2image pdfminer-six pdfplumber # pi-heif pikepdf pypdf unstructured-inference # unstructured-pytesseract ]; pptx = [ lxml pillow pypandoc python-docx python-pptx python-magic markdown requests tabulate xlsxwriter ]; xlsx = [ et-xmlfile networkx numpy openpyxl pandas xlrd ]; huggingface = [ langdetect sacremoses sentencepiece torch transformers ]; }; pythonImportsCheck = [ "unstructured" ]; Loading @@ -143,8 +269,6 @@ buildPythonPackage { grpcio ]; optional-dependencies = optional-dependencies; meta = with lib; { description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines"; mainProgram = "unstructured-ingest"; Loading
pkgs/servers/unstructured-api/default.nix +128 −8 Original line number Diff line number Diff line Loading @@ -13,20 +13,140 @@ let packages: with packages; [ unstructured-api-tools unstructured pydantic aiofiles annotated-types antlr4-python3-runtime anyio backoff beautifulsoup4 cachetools certifi cffi chardet charset-normalizer click coloredlogs contourpy cryptography cycler dataclasses-json deprecated effdet emoji et-xmlfile eval-type-backport fastapi filelock filetype flatbuffers fonttools fsspec google-api-core google-auth google-cloud-vision googleapis-common-protos grpcio grpcio-status h11 html5lib httpcore httpx huggingface-hub humanfriendly idna iopath jinja2 joblib jsonpath kiwisolver langdetect layoutparser lxml markdown markupsafe marshmallow matplotlib mpmath mypy-extensions nest-asyncio networkx nltk numpy olefile omegaconf onnx onnxruntime opencv-python openpyxl packaging pandas pdf2image pdfminer-six pdfplumber # pi-heif pikepdf pillow portalocker proto-plus protobuf psutil pyasn1 pyasn1-modules pycocotools pycparser pycryptodome pydantic pydantic-core pypandoc pyparsing pypdf # pypdfium2 python-dateutil python-docx # python-iso639 python-magic python-multipart # python-oxmsg python-pptx pytz pyyaml rapidfuzz ratelimit regex requests pypdf pycryptodome requests-toolbelt rsa safetensors scipy six sniffio soupsieve starlette sympy timm tokenizers torch torchvision tqdm transformers typing-extensions typing-inspect tzdata unstructured # unstructured-client unstructured-inference # unstructured-pytesseract urllib3 uvicorn webencodings wrapt xlrd xlsxwriter ] ++ packages.unstructured.optional-dependencies.local-inference ++ google-api-core.optional-dependencies.grpc ++ unstructured.optional-dependencies.all-docs ); version = "0.0.61"; version = "0.0.82"; unstructured_api_nltk_data = symlinkJoin { name = "unstructured_api_nltk_data"; Loading @@ -44,7 +164,7 @@ stdenvNoCC.mkDerivation { owner = "Unstructured-IO"; repo = "unstructured-api"; rev = version; hash = "sha256-Ucd+SKIES9E5WgKJjg8Vihjc1hMrJ9e956Sb7QlQea8="; hash = "sha256-mvcARpewqC25x3ZdpM8QB7SjbqGoBL/rtxi90KdKdO8="; }; nativeBuildInputs = [ makeWrapper ]; Loading