Commit 5655f076 authored by Mohammad

cleaned up old gpt2 dataset stuff from openwebtext

parent 20764e12

openwebtext/make_gpt2_dataset.py

deleted 100644 → 0
+0 −77
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import numpy as np
import time
import os
import sys

from tokenizer import Tokenizer


def tokenize_corpus(filename, np_filename, print_interval=10000):
    """Tokenize each JSON-lines document in `filename` and save the
    per-document token arrays to `np_filename` as a numpy object array."""

    print(' > tokenizing {}'.format(filename))

    tokenizer = Tokenizer(cache_dir='./cache')

    tokenized_docs = []
    num_docs = 0
    num_tokens = 0
    start_time = time.time()
    with open(filename, 'r') as f:
        for line in f:
            try:
                myjson = json.loads(line)
                url = myjson['url']
                sample = myjson['text']
                tokens = tokenizer.tokenize_document(sample)
                tokenized_docs.append(np.array(tokens, dtype=np.uint16))
                num_docs += 1
                num_tokens += len(tokens)
                if num_docs % print_interval == 0:
                    print('    processed {:9d} documents in {:.2f} (s) so far'.
                          format(num_docs, time.time() - start_time),
                          flush=True)
            except Exception as e:
                print('    skipping ', line, e)

    print('  >> processed {} documents with a total of {} tokens ...'.format(
        num_docs, num_tokens))

    tokenized_docs = np.array(tokenized_docs, dtype=object)
    np.save(np_filename, tokenized_docs, allow_pickle=True)
    print('  >> saved the tokenized documents to {} ...'.format(np_filename))


if __name__ == '__main__':

    print('building gpt2 dataset ...')

    path = sys.argv[1]
    shard = sys.argv[2]

    input_filename = os.path.join(path,
                                  'shards/shard_{:04d}'.format(int(shard)))
    output_filename = os.path.join(path,
                                   'npys/shard_{:04d}.npy'.format(int(shard)))
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    tokenize_corpus(input_filename, output_filename)
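
For reference (not part of the commit): each shard this script writes is a ragged numpy object array holding one uint16 token array per document. A minimal sketch of reading one back, with a hypothetical path:

import numpy as np

# Hypothetical example: load a shard written by tokenize_corpus above.
docs = np.load('npys/shard_0000.npy', allow_pickle=True)  # object array of uint16 arrays
flat = np.hstack(docs)  # all tokens in the shard, end to end
print('{} documents, {} tokens'.format(len(docs), flat.size))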

openwebtext/make_gpt2_sizes.py

deleted 100644 → 0
+0 −38

import glob
import json
import os
import time
import sys

import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))

    npy_files = glob.glob(path + '/*.npy')
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        data = np.load(filename, allow_pickle=True)
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('   processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
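
For reference (not part of the commit): sizes.txt is plain JSON mapping each shard's .npy filename to its token count. A minimal sketch of consuming it, with a hypothetical path:

import json

# Hypothetical example: read the sizes file written by make_gpt2_sizes.py.
with open('npys/sizes.txt', 'r') as f:
    size_dict = json.load(f)
print('{} shards, {} tokens total'.format(len(size_dict), sum(size_dict.values())))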
+0 −8
#!/bin/bash

echo "processing gpt2 data ..."
DIR="/raid/mpatwary/redownload_v0/0-21"

for thread in {0..3}; do
    echo " launching thread "$thread && python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 &
done

openwebtext/tokenizer.py

deleted 100644 → 0
+0 −36
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.append('..')

from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


class Tokenizer:

    def __init__(self, cache_dir=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                       cache_dir=cache_dir)
        self.tokenizer.max_len = int(1e12)
        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
        assert self.eod_token < 65535, 'vocab size will not fit in uint16'
        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
            len(self.tokenizer.encoder), self.eod_token))

    def tokenize_document(self, document):
        tokens = self.tokenizer.encode(document)
        tokens.append(self.eod_token)
        return tokens
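
For reference (not part of the commit): a minimal usage sketch of the Tokenizer wrapper above; the sample text is illustrative and assumes the megatron package is importable from the parent directory:

# Hypothetical example: tokenize a single document with the wrapper above.
tokenizer = Tokenizer(cache_dir='./cache')
tokens = tokenizer.tokenize_document('Hello world.')
assert tokens[-1] == tokenizer.eod_token  # every document ends with <|endoftext|>
print('{} tokens (including the end-of-document token)'.format(len(tokens)))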