Unverified Commit d0878333 authored by Raul Puri's avatar Raul Puri Committed by GitHub
Browse files

added missing presplit_sentences_json.py

parent 66719e97
Loading
Loading
Loading
Loading
+27 −0
Original line number Diff line number Diff line
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""

import sys
import json

import nltk

nltk.download('punkt')

input_file = sys.argv[1]
output_file = sys.argv[2]

line_seperator = "\n"

with open(input_file, 'r') as ifile:
  with open(output_file, "w") as ofile:
    for doc in ifile.readlines():
      parsed = json.loads(doc)
      sent_list = []
      for line in parsed['text'].split('\n'):
          if line != '\n':
              sent_list.extend(nltk.tokenize.sent_tokenize(line))
      parsed['text'] = line_seperator.join(sent_list)
      ofile.write(json.dumps(parsed)+'\n')