Loading megatron/data/preprocess_data.py +2 −1 Original line number Diff line number Diff line Loading @@ -44,6 +44,7 @@ class Encoder(object): for sentence in Encoder.splitter.tokenize(text): tokens = Encoder.tokenizer.tokenize(sentence) ids = Encoder.tokenizer.convert_tokens_to_ids(tokens) if len(ids) > 0: doc_ids.append(ids) return doc_ids, len(json_line) Loading megatron/data/test/test_indexed_dataset.py +6 −4 Original line number Diff line number Diff line Loading @@ -18,16 +18,18 @@ def test_indexed_dataset(args): if ds.supports_prefetch: # just prefetch the whole thing in test (so assume it is small) ds.prefetch(range(len(ds))) for i in range(2): for i in range(len(ds.doc_idx)-1): start = ds.doc_idx[i] end = ds.doc_idx[i+1] ids = ds[start:end] for s in ids: assert len(s) > 0 l = s.data.tolist() print(l) tokens = tokenizer.convert_ids_to_tokens(l) print(tokens) print("******** END DOCUMENT **********") for t in tokens: if '\n' in t: print("Newline in string!") print(i) def main(): parser = argparse.ArgumentParser() Loading Loading
megatron/data/preprocess_data.py +2 −1 Original line number Diff line number Diff line Loading @@ -44,6 +44,7 @@ class Encoder(object): for sentence in Encoder.splitter.tokenize(text): tokens = Encoder.tokenizer.tokenize(sentence) ids = Encoder.tokenizer.convert_tokens_to_ids(tokens) if len(ids) > 0: doc_ids.append(ids) return doc_ids, len(json_line) Loading
megatron/data/test/test_indexed_dataset.py +6 −4 Original line number Diff line number Diff line Loading @@ -18,16 +18,18 @@ def test_indexed_dataset(args): if ds.supports_prefetch: # just prefetch the whole thing in test (so assume it is small) ds.prefetch(range(len(ds))) for i in range(2): for i in range(len(ds.doc_idx)-1): start = ds.doc_idx[i] end = ds.doc_idx[i+1] ids = ds[start:end] for s in ids: assert len(s) > 0 l = s.data.tolist() print(l) tokens = tokenizer.convert_ids_to_tokens(l) print(tokens) print("******** END DOCUMENT **********") for t in tokens: if '\n' in t: print("Newline in string!") print(i) def main(): parser = argparse.ArgumentParser() Loading