Source code for cerebras.modelzoo.data_preparation.nlp.slimpajama.dedup.generate_duplicate_pairs

# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

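"""Generate candidate duplicate document pairs for SlimPajama deduplication.

Pickled MinHash signatures (produced by an earlier preprocessing step) are read
from each dataset's ``minhash_nfc`` directory, sliced into LSH bands, and
hashed; documents whose band keys collide are written out as candidate
duplicate pairs, one output file per band.
"""
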
import argparse
import pickle
import queue
import time
from collections import defaultdict
from glob import glob
from multiprocessing import Process, Queue

from datasketch.lean_minhash import LeanMinHash
from more_itertools import divide


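# Pack one band of MinHash values into a bytes object so it can be used as a
# dictionary key in the per-band LSH tables.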
def _H(hs):
    return bytes(hs.byteswap().data)


def split_files(input_dir, n_proc):
    # Collect the per-dataset MinHash signature files and split them evenly
    # across the hash-reader processes.
    files = []
    for dataset in [
        "arxiv",
        "stackexchange",
        "book",
        "wikipedia",
        "github",
        "c4",
        "common_crawl",
    ]:
        if dataset == "common_crawl":
            files.extend(glob(f"{input_dir}/{dataset}/*/minhash_nfc/*"))
        else:
            files.extend(glob(f"{input_dir}/{dataset}/minhash_nfc/*"))
    files = sorted(files)
    parts = divide(n_proc, files)
    return [list(p) for p in parts]


def get_hashes(files, doc_queues, r):
    # Slice each document's MinHash signature into bands of `r` values and
    # route every band to the queue of the corresponding LSH worker.
    for fp in files:
        with open(fp, "rb") as fin:
            for item in pickle.load(fin):
                key = f"{item['file_name']}@{item['doc_id']}"
                minhash = LeanMinHash(item["hash"])
                for i, doc_queue in enumerate(doc_queues):
                    H = _H(minhash.hashvalues[i * r : (i + 1) * r])
                    doc_queue.put((key, H))


def lsh(out_file, doc_queue, idx):
    # Per-band LSH worker: a document whose band hash collides with an earlier
    # document is written out as a candidate duplicate pair, one file per band.
    lsh_dict = defaultdict(str)
    i = 0
    start_time = time.time()
    f = open(out_file.replace(".txt", f"-{idx}.txt"), "w")
    while True:
        try:
            key, H = doc_queue.get(timeout=30)
            cand = lsh_dict.get(H, "None")
            if cand != "None":
                f.write(f"{key} :: {cand}\n")
            else:
                lsh_dict[H] = key
            if i % 100000 == 0:
                # Progress is reported against a hardcoded expected document count.
                print(
                    f"{idx}: Processed {i / 931361530 * 100}%.",
                    time.time() - start_time,
                )
            i += 1
        except queue.Empty:
            break
    print(f"Total number of documents: {i}")
    f.close()


def generate_pairs(args):
    # size of the queue was tuned for optimal perf and memory constraints.
    doc_queues = [Queue(1000000) for _ in range(args.bands)]
    files = split_files(args.input_dir, args.processes)

    processes = []
    # Readers: load pickled MinHash signatures and feed the per-band queues.
    for process_id in range(args.processes):
        p = Process(
            target=get_hashes,
            args=(
                files[process_id],
                doc_queues,
                args.range,
            ),
        )
        processes.append(p)
        p.start()

    # Writers: one LSH worker per band consumes its queue and emits pairs.
    for process_id in range(args.bands):
        p = Process(
            target=lsh,
            args=(
                args.out_file,
                doc_queues[process_id],
                process_id,
            ),
        )
        processes.append(p)
        p.start()

    for p in processes:
        p.join()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir")
    parser.add_argument("--out_file")
    parser.add_argument(
        "--range",
        type=int,
    )
    parser.add_argument(
        "--bands",
        type=int,
    )
    parser.add_argument(
        "--processes",
        type=int,
    )
    args = parser.parse_args()
    generate_pairs(args)
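
# Example invocation (illustrative values only; `--bands` * `--range` must not
# exceed the number of permutations used when the MinHash signatures were
# generated, and paths depend on your preprocessing layout):
#
#   python generate_duplicate_pairs.py \
#       --input_dir ./slimpajama_minhash \
#       --out_file ./dup_pairs/duplicate_pairs.txt \
#       --range 13 \
#       --bands 9 \
#       --processes 45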