addressed the comments (ebc4a408) · Commits · candle / Megatron-LM

tools/openwebtext/find_duplicates.py

+29 −29

Original line number	Diff line number	Diff line
		@@ -39,9 +39,11 @@ def jaccard(set_a, set_b):

		if __name__ == '__main__':

		-print('parsing the inputs ...')
		+print('parsing the arguments ...')

		parser = argparse.ArgumentParser()
		+parser.add_argument('--seed', type=int, default=1234,
		+help='Random seed used for python, numpy')
		parser.add_argument('--inputs', nargs = '*', default=None, help = \
		'Pairwise list of the input files and keys, '
		'e.g. --inputs cc.json cc_id news.json news_id')
		@@ -58,7 +60,7 @@ if __name__ == '__main__':
		print('finding possible duplicate content ...')

		# set seed and get an array of seeds of 100 integers
		-np.random.seed(1234)
		+np.random.seed(args.seed)
		seeds = np.random.randint(0, 1e6, size=100)

		# initialize minhash and lsh cache
		@@ -69,10 +71,7 @@ if __name__ == '__main__':

		# load fingerprints from pickle file if needed
		if args.load_fingerprints is not None:
		-count_fingerprints = len(args.load_fingerprints)
		-
		-for count_fp in range(count_fingerprints):
		-fp_file_name = args.load_fingerprints[count_fp]
		+for count_fp, fp_file_name in enumerate(args.load_fingerprints):
		print("Loading fingerprints from pickle file {}".format(
		fp_file_name), flush=True)
		fp = open(fp_file_name, "rb")
		@@ -87,6 +86,7 @@ if __name__ == '__main__':
		for url in local_lshcache.fingerprints.keys():
		url_doc[url] = local_url_doc[url]
		lshcache.add_fingerprint(local_lshcache.fingerprints[url], url)
		+fp.close()

		counter = 0
		start_time = time.time()
		@@ -94,11 +94,10 @@ if __name__ == '__main__':
		print("Computing fingerprints", flush=True)

		# compute finger prints of the inputs if any
		-input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2)
		-for input_pair in range(input_pairs):
		# input file and the key to use as id
		-input_file = args.inputs[2 * input_pair]
		-key = args.inputs[2 * input_pair + 1]
		+if args.inputs is not None:
		+assert len(args.inputs) % 2 == 0
		+for input_file, key in zip(args.inputs[::2], args.inputs[1::2]):
		print(' document processing {} with key {}'.format(input_file, key),
		flush=True)
		# traverse all the texts and add fingerprints
		@@ -160,5 +159,6 @@ if __name__ == '__main__':
		ensure_ascii=False)
		f_out.write(myjson.encode('utf-8'))
		f_out.write('\n'.encode('utf-8'))
		+f_out.close()

		print('done :-)')