Commit 3f4bc91b authored by Jared Casper
Browse files

Skip any empty sentences during preprocessing.

parent 61697cab
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -44,6 +44,7 @@ class Encoder(object):
        for sentence in Encoder.splitter.tokenize(text):
            tokens = Encoder.tokenizer.tokenize(sentence)
            ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
            if len(ids) > 0:
                doc_ids.append(ids)
        return doc_ids, len(json_line)

+6 −4
Original line number Diff line number Diff line
@@ -18,16 +18,18 @@ def test_indexed_dataset(args):
    if ds.supports_prefetch:
        # just prefetch the whole thing in test (so assume it is small)
        ds.prefetch(range(len(ds)))
    for i in range(2):
    for i in range(len(ds.doc_idx)-1):
        start = ds.doc_idx[i]
        end = ds.doc_idx[i+1]
        ids = ds[start:end]
        for s in ids:
            assert len(s) > 0
            l = s.data.tolist()
            print(l)
            tokens = tokenizer.convert_ids_to_tokens(l)
            print(tokens)
        print("******** END DOCUMENT **********")
            for t in tokens:
                if '\n' in t:
                    print("Newline in string!")
        print(i)

def main():
    parser = argparse.ArgumentParser()