Commit 5655f076 authored by Mohammad

cleaned up old gpt2 dataset stuff from openwebtext

parent 20764e12

openwebtext/make_gpt2_dataset.py

deleted 100644 → 0
+0 −77
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import json
import numpy as np
import time
import os
import sys

from tokenizer import Tokenizer


def tokenize_corpus(filename, np_filename, print_interval=10000):
    """Tokenize each JSON-lines document in `filename` and save the
    per-document token arrays to `np_filename` as a numpy object array."""

    print(' > tokenizing {}'.format(filename))

    tokenizer = Tokenizer(cache_dir='./cache')

    tokenized_docs = []
    num_docs = 0
    num_tokens = 0
    start_time = time.time()
    with open(filename, 'r') as f:
        for line in f:
            try:
                myjson = json.loads(line)
                url = myjson['url']
                sample = myjson['text']
                tokens = tokenizer.tokenize_document(sample)
                tokenized_docs.append(np.array(tokens, dtype=np.uint16))
                num_docs += 1
                num_tokens += len(tokens)
                if num_docs % print_interval == 0:
                    print('    processed {:9d} documents in {:.2f} (s) so far'.
                          format(num_docs, time.time() - start_time),
                          flush=True)
            except Exception as e:
                print('    skipping ', line, e)

    print('  >> processed {} documents with a total of {} tokens ...'.format(
        num_docs, num_tokens))

    tokenized_docs = np.array(tokenized_docs, dtype=object)
    np.save(np_filename, tokenized_docs, allow_pickle=True)
    print('  >> saved the tokenized documents to {} ...'.format(np_filename))


if __name__ == '__main__':

    print('building gpt2 dataset ...')

    path = sys.argv[1]
    shard = sys.argv[2]

    input_filename = os.path.join(path,
                                  'shards/shard_{:04d}'.format(int(shard)))
    output_filename = os.path.join(path,
                                   'npys/shard_{:04d}.npy'.format(int(shard)))
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    tokenize_corpus(input_filename, output_filename)
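
For reference (not part of the commit): each shard this script writes is a ragged numpy object array holding one uint16 token array per document. A minimal sketch of reading one back, with a hypothetical path:

import numpy as np

# Hypothetical example: load a shard written by tokenize_corpus above.
docs = np.load('npys/shard_0000.npy', allow_pickle=True)  # object array of uint16 arrays
flat = np.hstack(docs)  # all tokens in the shard, end to end
print('{} documents, {} tokens'.format(len(docs), flat.size))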

openwebtext/make_gpt2_sizes.py

deleted 100644 → 0
+0 −38

import glob
import json
import os
import time
import sys

import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))

    npy_files = glob.glob(path + '/*.npy')
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        data = np.load(filename, allow_pickle=True)
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('   processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
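
For reference (not part of the commit): sizes.txt is plain JSON mapping each shard's .npy filename to its token count. A minimal sketch of consuming it, with a hypothetical path:

import json

# Hypothetical example: read the sizes file written by make_gpt2_sizes.py.
with open('npys/sizes.txt', 'r') as f:
    size_dict = json.load(f)
print('{} shards, {} tokens total'.format(len(size_dict), sum(size_dict.values())))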
+0 −8
#!/bin/bash

echo "processing gpt2 data ..."
DIR="/raid/mpatwary/redownload_v0/0-21"

for thread in {0..3}; do
    echo " launching thread "$thread && python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 &
done

openwebtext/tokenizer.py

deleted 100644 → 0
+0 −36
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.append('..')

from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


class Tokenizer:

    def __init__(self, cache_dir=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                       cache_dir=cache_dir)
        self.tokenizer.max_len = int(1e12)
        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
        assert self.eod_token < 65535, 'vocab size will not fit in uint16'
        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
            len(self.tokenizer.encoder), self.eod_token))

    def tokenize_document(self, document):
        tokens = self.tokenizer.encode(document)
        tokens.append(self.eod_token)
        return tokens
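
For reference (not part of the commit): a minimal usage sketch of the Tokenizer wrapper above; the sample text is illustrative and assumes the megatron package is importable from the parent directory:

# Hypothetical example: tokenize a single document with the wrapper above.
tokenizer = Tokenizer(cache_dir='./cache')
tokens = tokenizer.tokenize_document('Hello world.')
assert tokens[-1] == tokenizer.eod_token  # every document ends with <|endoftext|>
print('{} tokens (including the end-of-document token)'.format(len(tokens)))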