Commit a7d3ac57 authored by Benjamin Sparks's avatar Benjamin Sparks
Browse files

nltk-data: add all downloadables

parent 4375d986
Loading
Loading
Loading
Loading
+206 −35
Original line number Diff line number Diff line
@@ -54,41 +54,212 @@ let
        '';
      }
    );
in
lib.makeScope newScope (self: {
  punkt = makeNltkDataPackage {
    pname = "punkt";
    location = "tokenizers";
    hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";

  makeChunker =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "chunkers";
      hash = "sha256-kemjqaCM9hlKAdMw8oVJnp62EAC9rMQ50dKg7wlAwEc=";
    };
  punkt-tab = makeNltkDataPackage {
    pname = "punkt_tab";
    location = "tokenizers";
    hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";

  makeCorpus =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "corpora";
      hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
    };
  averaged-perceptron-tagger = makeNltkDataPackage {
    pname = "averaged_perceptron_tagger";
    location = "taggers";
    hash = "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";

  makeGrammar =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "grammars";
      hash = "sha256-pyLEcX3Azv8j1kCGvVYonuiNgVJxtWt7veU0S/yNbIM=";
    };

  makeHelp =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "help";
      hash = "sha256-97mYLNES5WujLF5gD8Ul4cJ6LqSzz+jDzclUsdBeHNE=";
    };

  makeMisc =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "misc";
      hash = "sha256-XtizfEsc8TYWqvvC/eSFdha2ClC5/ZiJM8nue0vXLb4=";
    };

  makeModel =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "models";
      hash = "sha256-iq3weEgCci6rgLW2j28F2eRLprJtInGXKe/awJPSVG4=";
    };
  averaged-perceptron-tagger-eng = makeNltkDataPackage {
    pname = "averaged_perceptron_tagger_eng";

  makeTagger =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "taggers";
      hash = "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";
    };
  snowball-data = makeNltkDataPackage {
    pname = "snowball_data";

  makeTokenizer =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "tokenizers";
      hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";
    };

  makeStemmer =
    pname:
    makeNltkDataPackage {
      inherit pname;
      location = "stemmers";
      hash = "sha256-mNefwOPVJGz9kXV3LV4DuV7FJpNir/Nwg4ujd0CogEk=";
    };
  stopwords = makeNltkDataPackage {
    pname = "stopwords";
    location = "corpora";
    hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
  };
  wordnet = makeNltkDataPackage {
    pname = "wordnet";
    location = "corpora";
    hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
  };
in
lib.makeScope newScope (self: {
  ## Chunkers
  maxent-ne-chunker = makeChunker "maxent_ne_chunker";
  maxent-ne-chunker-tab = makeChunker "maxent_ne_chunker_tab";

  ## Corpora
  abc = makeCorpus "abc";
  alpino = makeCorpus "alpino";
  bcp47 = makeCorpus "bcp47";
  biocreative-ppi = makeCorpus "biocreative_ppi";
  brown = makeCorpus "brown";
  brown-tei = makeCorpus "brown_tei";
  cess-cat = makeCorpus "cess_cat";
  cess-esp = makeCorpus "cess_esp";
  chat80 = makeCorpus "chat80";
  city-database = makeCorpus "city_database";
  cmudict = makeCorpus "cmudict";
  comparative-sentences = makeCorpus "comparative_sentences";
  comtrans = makeCorpus "comtrans";
  conll2000 = makeCorpus "conll2000";
  conll2002 = makeCorpus "conll2002";
  conll2007 = makeCorpus "conll2007";
  crubadan = makeCorpus "crubadan";
  dependency-treebank = makeCorpus "dependency_treebank";
  dolch = makeCorpus "dolch";
  europarl-raw = makeCorpus "europarl_raw";
  extended-omw = makeCorpus "extended_omw";
  floresta = makeCorpus "floresta";
  framenet-v15 = makeCorpus "framenet_v15";
  framenet-v17 = makeCorpus "framenet_v17";
  gazetteers = makeCorpus "gazetteers";
  genesis = makeCorpus "genesis";
  gutenberg = makeCorpus "gutenberg";
  ieer = makeCorpus "ieer";
  inaugural = makeCorpus "inaugural";
  indian = makeCorpus "indian";
  jeita = makeCorpus "jeita";
  kimmo = makeCorpus "kimmo";
  knbc = makeCorpus "knbc";
  lin-thesaurus = makeCorpus "lin_thesaurus";
  mac-morpho = makeCorpus "mac_morpho";
  machado = makeCorpus "machado";
  masc-tagged = makeCorpus "masc_tagged";
  movie-reviews = makeCorpus "movie_reviews";
  mte-teip5 = makeCorpus "mte_teip5";
  names = makeCorpus "names";
  nombank-1-0 = makeCorpus "nombank.1.0";
  nonbreaking-prefixes = makeCorpus "nonbreaking_prefixes";
  nps-chat = makeCorpus "nps_chat";
  omw = makeCorpus "omw";
  omw-1-4 = makeCorpus "omw-1.4";
  opinion-lexicon = makeCorpus "opinion_lexicon";
  panlex-swadesh = makeCorpus "panlex_swadesh";
  paradigms = makeCorpus "paradigms";
  pe08 = makeCorpus "pe08";
  pil = makeCorpus "pil";
  pl196x = makeCorpus "pl196x";
  ppattach = makeCorpus "ppattach";
  problem-reports = makeCorpus "problem_reports";
  product-reviews-1 = makeCorpus "product_reviews_1";
  product-reviews-2 = makeCorpus "product_reviews_2";
  propbank = makeCorpus "propbank";
  pros-cons = makeCorpus "pros_cons";
  ptb = makeCorpus "ptb";
  qc = makeCorpus "qc";
  reuters = makeCorpus "reuters";
  rte = makeCorpus "rte";
  semcor = makeCorpus "semcor";
  senseval = makeCorpus "senseval";
  sentence-polarity = makeCorpus "sentence_polarity";
  sentiwordnet = makeCorpus "sentiwordnet";
  shakespeare = makeCorpus "shakespeare";
  sinica-treebank = makeCorpus "sinica_treebank";
  smultron = makeCorpus "smultron";
  state-union = makeCorpus "state_union";
  stopwords = makeCorpus "stopwords";
  subjectivity = makeCorpus "subjectivity";
  swadesh = makeCorpus "swadesh";
  switchboard = makeCorpus "switchboard";
  timit = makeCorpus "timit";
  toolbox = makeCorpus "toolbox";
  treebank = makeCorpus "treebank";
  twitter-samples = makeCorpus "twitter_samples";
  udhr = makeCorpus "udhr";
  udhr2 = makeCorpus "udhr2";
  unicode-samples = makeCorpus "unicode_samples";
  universal-treebanks-v20 = makeCorpus "universal_treebanks_v20";
  verbnet = makeCorpus "verbnet";
  verbnet3 = makeCorpus "verbnet3";
  webtext = makeCorpus "webtext";
  wordnet = makeCorpus "wordnet";
  wordnet-ic = makeCorpus "wordnet_ic";
  wordnet2021 = makeCorpus "wordnet2021";
  wordnet2022 = makeCorpus "wordnet2022";
  wordnet31 = makeCorpus "wordnet31";
  words = makeCorpus "words";
  ycoe = makeCorpus "ycoe";

  ## Grammars
  basque-grammars = makeGrammar "basque_grammars";
  book-grammars = makeGrammar "book_grammars";
  large-grammars = makeGrammar "large_grammars";
  sample-grammars = makeGrammar "sample_grammars";
  spanish-grammars = makeGrammar "spanish_grammars";

  ## Help
  tagsets-json = makeHelp "tagsets_json";

  ## Misc
  mwa-ppdb = makeMisc "mwa_ppdb";
  perluniprops = makeMisc "perluniprops";

  ## Models
  bllip-wsj-no-aux = makeModel "bllip_wsj_no_aux";
  moses-sample = makeModel "moses_sample";
  wmt15-eval = makeModel "wmt15_eval";
  word2vec-sample = makeModel "word2vec_sample";

  ## Taggers
  averaged-perceptron-tagger = makeTagger "averaged_perceptron_tagger";
  averaged-perceptron-tagger-eng = makeTagger "averaged_perceptron_tagger_eng";
  averaged-perceptron-tagger-ru = makeTagger "averaged_perceptron_tagger_ru";
  averaged-perceptron-tagger-rus = makeTagger "averaged_perceptron_tagger_rus";
  maxent-treebank-pos-tagger = makeTagger "maxent_treebank_pos_tagger";
  maxent-treebank-pos-tagger-tab = makeTagger "maxent_treebank_pos_tagger_tab";
  universal-tagset = makeTagger "universal_tagset";

  ## Tokenizers
  punkt = makeTokenizer "punkt";
  punkt-tab = makeTokenizer "punkt_tab";

  ## Stemmers
  porter-test = makeStemmer "porter_test";
  rslp = makeStemmer "rslp";
  snowball-data = makeStemmer "snowball_data";
})