Loading tools/openwebtext/find_duplicates.py +29 −29 Original line number Diff line number Diff line Loading @@ -39,9 +39,11 @@ def jaccard(set_a, set_b): if __name__ == '__main__': print('parsing the inputs ...') print('parsing the arguments ...') parser = argparse.ArgumentParser() parser.add_argument('--seed', type=int, default=1234, help='Random seed used for python, numpy') parser.add_argument('--inputs', nargs = '*', default=None, help = \ 'Pairwise list of the input files and keys, ' 'e.g. --inputs cc.json cc_id news.json news_id') Loading @@ -58,7 +60,7 @@ if __name__ == '__main__': print('finding possible duplicate content ...') # set seed and get an array of seeds of 100 integers np.random.seed(1234) np.random.seed(args.seed) seeds = np.random.randint(0, 1e6, size=100) # initialize minhash and lsh cache Loading @@ -69,10 +71,7 @@ if __name__ == '__main__': # load fingerprints from pickle file if needed if args.load_fingerprints is not None: count_fingerprints = len(args.load_fingerprints) for count_fp in range(count_fingerprints): fp_file_name = args.load_fingerprints[count_fp] for count_fp, fp_file_name in enumerate(args.load_fingerprints): print("Loading fingerprints from pickle file {}".format( fp_file_name), flush=True) fp = open(fp_file_name, "rb") Loading @@ -87,6 +86,7 @@ if __name__ == '__main__': for url in local_lshcache.fingerprints.keys(): url_doc[url] = local_url_doc[url] lshcache.add_fingerprint(local_lshcache.fingerprints[url], url) fp.close() counter = 0 start_time = time.time() Loading @@ -94,11 +94,10 @@ if __name__ == '__main__': print("Computing fingerprints", flush=True) # compute finger prints of the inputs if any input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2) for input_pair in range(input_pairs): # input file and the key to use as id input_file = args.inputs[2 * input_pair] key = args.inputs[2 * input_pair + 1] if args.inputs is not None: assert len(args.inputs) % 2 == 0 for input_file, key in zip(args.inputs[::2], args.inputs[1::2]): print(' document processing {} with key {}'.format(input_file, key), flush=True) # traverse all the texts and add fingerprints Loading Loading @@ -160,5 +159,6 @@ if __name__ == '__main__': ensure_ascii=False) f_out.write(myjson.encode('utf-8')) f_out.write('\n'.encode('utf-8')) f_out.close() print('done :-)') Loading
tools/openwebtext/find_duplicates.py +29 −29 Original line number Diff line number Diff line Loading @@ -39,9 +39,11 @@ def jaccard(set_a, set_b): if __name__ == '__main__': print('parsing the inputs ...') print('parsing the arguments ...') parser = argparse.ArgumentParser() parser.add_argument('--seed', type=int, default=1234, help='Random seed used for python, numpy') parser.add_argument('--inputs', nargs = '*', default=None, help = \ 'Pairwise list of the input files and keys, ' 'e.g. --inputs cc.json cc_id news.json news_id') Loading @@ -58,7 +60,7 @@ if __name__ == '__main__': print('finding possible duplicate content ...') # set seed and get an array of seeds of 100 integers np.random.seed(1234) np.random.seed(args.seed) seeds = np.random.randint(0, 1e6, size=100) # initialize minhash and lsh cache Loading @@ -69,10 +71,7 @@ if __name__ == '__main__': # load fingerprints from pickle file if needed if args.load_fingerprints is not None: count_fingerprints = len(args.load_fingerprints) for count_fp in range(count_fingerprints): fp_file_name = args.load_fingerprints[count_fp] for count_fp, fp_file_name in enumerate(args.load_fingerprints): print("Loading fingerprints from pickle file {}".format( fp_file_name), flush=True) fp = open(fp_file_name, "rb") Loading @@ -87,6 +86,7 @@ if __name__ == '__main__': for url in local_lshcache.fingerprints.keys(): url_doc[url] = local_url_doc[url] lshcache.add_fingerprint(local_lshcache.fingerprints[url], url) fp.close() counter = 0 start_time = time.time() Loading @@ -94,11 +94,10 @@ if __name__ == '__main__': print("Computing fingerprints", flush=True) # compute finger prints of the inputs if any input_pairs = 0 if args.inputs is None else int(len(args.inputs)/2) for input_pair in range(input_pairs): # input file and the key to use as id input_file = args.inputs[2 * input_pair] key = args.inputs[2 * input_pair + 1] if args.inputs is not None: assert len(args.inputs) % 2 == 0 for input_file, key in zip(args.inputs[::2], args.inputs[1::2]): print(' document processing {} with key {}'.format(input_file, key), flush=True) # traverse all the texts and add fingerprints Loading Loading @@ -160,5 +159,6 @@ if __name__ == '__main__': ensure_ascii=False) f_out.write(myjson.encode('utf-8')) f_out.write('\n'.encode('utf-8')) f_out.close() print('done :-)')