Commit 8b73fe99 authored by Nouamane Laanait's avatar Nouamane Laanait
Browse files

adding more scripts

parent 105782f1
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ def simulate(filehandle, cif_path, idx= None, gpu_id=0, clean_up=False):
    sim_params['space_group']= spgroup_num
    sim_params['material'] = matname
    energies = [100e3, 125e3, 150e3, 175e3, 200e3]
    for (sample_idx, y_dir, (z_idx, z_dir), energy) in enumerate(product(y_dirs, enumerate(z_dirs), energies)):
    for (sample_idx, (y_dir, (z_idx, z_dir), energy)) in enumerate(product(y_dirs, enumerate(z_dirs), energies)):
        try:
            t = time()
            # build supercell
+1 −1
Original line number Diff line number Diff line
@@ -48,6 +48,6 @@ def main(lmdb_dir, delete=False):

if __name__ == "__main__":
    if len(sys.argv) == 3:
        main(sys.argv[-2], delete=bool(sys.argv[-1]))
        main(sys.argv[-2], delete=bool(int(sys.argv[-1])))
    else:
        main(sys.argv[-1])
+4 −4
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ def swap_out(lmdb_path):
def simulate(filehandle, cif_path, idx= None, gpu_id=0, clean_up=False):
    # load cif and get sim params
    spgroup_num, matname = parse_cif_path(cif_path)
    index = 1 
    index = 0 
    sp = SupercellBuilder(cif_path, verbose=False, debug=False)
    sim_params = get_sim_params(sp)
    z_dir = sim_params['z_dirs'][index]
@@ -307,9 +307,9 @@ def generate_training_data(cifpaths, outdir_path, save_mode="h5", runtime=1800*0
def main(cifdir_path, outdir_path, save_mode, runtime=1800):
    global t_elaps
    t_elaps = time()
    cifpaths_train, cifpaths_eval= get_cif_paths(cifdir_path, ratio=0.2)
    generate_training_data(cifpaths_train, outdir_path, save_mode=save_mode, runtime=runtime*0.7)
    generate_eval_data(cifpaths_eval, outdir_path, save_mode=save_mode, runtime=runtime*0.9)
    cifpaths_train, cifpaths_eval= get_cif_paths(cifdir_path, ratio=0.9)
    generate_training_data(cifpaths_train, outdir_path, save_mode=save_mode, runtime=runtime*0.9)
    generate_eval_data(cifpaths_eval, outdir_path, save_mode=save_mode, runtime=runtime)
    return

if __name__ == "__main__":
+33 −18
Original line number Diff line number Diff line
@@ -25,24 +25,24 @@ def swap_out(lmdb_path):
    except subprocess.SubprocessError as e:
        print("rank %d: %s" % (comm_rank, format(e)))

    # replace with lmdb from repo
    user = os.environ.get('USER')
    lmdb_repo = "/gpfs/alpine/lrn001/proj-shared/nl/sims/data/lmdb_bank_0405_3096"
    lmdb_repo_list = os.listdir(lmdb_repo)
    index = np.random.randint(0, len(lmdb_repo_list))
    lmdb_path_src = os.path.join(lmdb_repo, lmdb_repo_list[index])
    if not os.path.exists(lmdb_path_src):
        print('replacement file %s not found' % lmdb_path_src)
        return
    src = lmdb_path_src 
    trg = lmdb_path 
    cp_args = "cp -r %s %s" %(src, trg)
    cp_args = shlex.split(cp_args)
    if not os.path.exists(trg):
        try:
            subprocess.run(cp_args, check=True)
        except subprocess.SubprocessError as e:
            print("rank %d: %s" % (comm_rank, format(e)))
    ## replace with lmdb from repo
    #user = os.environ.get('USER')
    #lmdb_repo = "/gpfs/alpine/lrn001/proj-shared/nl/sims/data/lmdb_bank_0405_3096"
    #lmdb_repo_list = os.listdir(lmdb_repo)
    #index = np.random.randint(0, len(lmdb_repo_list))
    #lmdb_path_src = os.path.join(lmdb_repo, lmdb_repo_list[index])
    #if not os.path.exists(lmdb_path_src):
    #    print('replacement file %s not found' % lmdb_path_src)
    #    return
    #src = lmdb_path_src 
    #trg = lmdb_path 
    #cp_args = "cp -r %s %s" %(src, trg)
    #cp_args = shlex.split(cp_args)
    #if not os.path.exists(trg):
    #    try:
    #        subprocess.run(cp_args, check=True)
    #    except subprocess.SubprocessError as e:
    #        print("rank %d: %s" % (comm_rank, format(e)))


def simulate(filehandle, h5g, idx= None, gpu_id=0, clean_up=False):
@@ -63,6 +63,21 @@ def simulate(filehandle, h5g, idx= None, gpu_id=0, clean_up=False):

        # build supercell
        sp = SupercellBuilder(cif_path, verbose=False, debug=False)
        # filter out 
        angles = np.array(sp.structure.lattice.angles)
        angles = np.round(angles).astype(np.int)
        cutoff = np.array([90,90,90])
        tol = 2
        cubic_cond = np.logical_not(np.logical_and(angles > cutoff - tol, angles < cutoff + tol)).any()
        hexag_cond_1 = np.logical_and(angles[:2] > cutoff[:2] - tol, angles[:2] < cutoff[:2] + tol).any()
        hexag_cond_2 = np.logical_and(angles[-1] > 120 - tol, angles[-1] < 120 + tol)
        hexag_cond = np.logical_not(hexag_cond_1 and hexag_cond_2)
        if cubic_cond:
            if hexag_cond:
                return False
            else:
                pass

        sp.build_unit_cell()
        sp.make_orthogonal_supercell(supercell_size=np.array([cell_dim,cell_dim,slab_t]),
                             projec_1=y_dir, projec_2=z_dir)
+108 −0
Original line number Diff line number Diff line
import os, sys
import subprocess, shlex
import lmdb
import multiprocessing as mp
import numpy as np

def read_lmdb(args):
    """Count the number of data samples stored in a single LMDB file.

    Args:
        args: 2-tuple ``(lmdb_path, delete)``. ``delete`` is accepted only so
            the signature matches the task tuples built in ``main()``; it is
            not used here.

    Returns:
        Tuple ``(num_samples, lmdb_path)``.
    """
    lmdb_path, _delete = args[:]
    env = lmdb.open(lmdb_path, readahead=False, readonly=True, writemap=False,
                    lock=False)
    try:
        # 4 header entries are stored alongside the data, and each sample
        # occupies 2 entries (input + target, presumably -- TODO confirm).
        # TODO: remove hard-coded # of headers by storing #_headers key
        num_samples = (env.stat()['entries'] - 4) // 2
    finally:
        # Close the environment explicitly; the original leaked the handle.
        env.close()
    return (num_samples, lmdb_path)

def replace_lmdb(args):
    """Replace an undersized LMDB directory with a copy of a random donor.

    Args:
        args: 2-tuple ``(src, trg)`` where ``src`` is a numpy array of
            candidate donor filepaths and ``trg`` is the path to replace.

    Side effects:
        Removes ``trg`` and copies the chosen donor over it via ``rm -r`` /
        ``cp -r`` subprocesses. Errors are printed, not raised (best-effort).
    """
    src, trg = args[:]
    # Pick a random donor whenever more than one candidate is available.
    # (The original tested `src.size > 2`, which never sampled the second
    # of exactly two donors -- off-by-one.)
    if src.size > 1:
        src = src[np.random.randint(0, src.size)]
    else:
        src = src[0]
    rm_args = shlex.split("rm -r %s" % trg)
    cp_args = shlex.split("cp -r %s %s" % (src, trg))
    try:
        subprocess.run(rm_args, check=True)
    except subprocess.SubprocessError as e:
        print("subprocess error: %s" % format(e))
    try:
        subprocess.run(cp_args, check=True)
        print("replaced %s" % trg)
    except subprocess.SubprocessError as e:
        print("subprocess error: %s" % format(e))


def _print_stats(prefix, counts):
    # Print total/min/max/mean of a per-file sample-count array, using the
    # caller-supplied message prefix verbatim.
    print((prefix + "%d, %2.2f, %2.2f, %2.2f") % (counts.sum(),
        counts.min(), counts.max(), counts.mean()))


def _replace_undersized(files, down, cutoff, processes, label):
    # Best-effort: replace every file holding fewer than `cutoff` samples
    # with a copy of a random donor holding more than `down` samples.
    rep = files[files['num_samples'] > down]
    if rep.size > 1:
        files_to_repl = files[files['num_samples'] < cutoff]
        if files_to_repl.size > 0:
            print('Replacing %s Data' % label)
            pool = mp.Pool(processes=processes)
            tasks = [(rep['filepath'], itm['filepath']) for itm in files_to_repl]
            chunk = max(files_to_repl.size // processes, 1)
            jobs = pool.imap(replace_lmdb, tasks, chunksize=chunk)
            _ = [j for j in jobs]
            pool.close()


def main(lmdb_dir, delete=False):
    """Tally per-file sample counts for every LMDB under ``lmdb_dir``.

    Saves the tally as ``tally_<dirname>.npy``, prints summary statistics for
    the train and eval splits, and -- when ``delete`` is truthy -- replaces
    undersized files with copies of well-populated donors from the same split.

    Args:
        lmdb_dir: directory containing the LMDB files.
        delete: when truthy, enable the replacement pass.
    """
    lmdb_files = os.listdir(lmdb_dir)
    if not lmdb_files:
        # Guard: mp.Pool(processes=0) raises, as would the chunk division.
        print('no lmdb files found in %s' % lmdb_dir)
        return
    lmdb_paths = [os.path.join(lmdb_dir, path) for path in lmdb_files]
    processes = min(mp.cpu_count(), len(lmdb_files))
    pool = mp.Pool(processes=processes)
    tasks = [(lmdb_path, delete) for lmdb_path in lmdb_paths]
    chunk = max(len(lmdb_files) // processes, 1)
    jobs = pool.imap(read_lmdb, tasks, chunksize=chunk)
    tally = list(jobs)
    pool.close()
    tally = np.array(tally, dtype=[('num_samples', 'i4'), ('filepath', np.dtype('U100'))])
    np.save('tally_%s.npy' % lmdb_dir.split('/')[-1], tally)
    # Files whose name contains '_train_' belong to the training split; all
    # others are treated as eval. (np.int/np.bool aliases were removed in
    # NumPy >= 1.24, so plain Python bool is used here.)
    mask = np.array([itm.find('_train_') >= 0 for itm in tally['filepath']],
                    dtype=bool)
    train_files = tally[mask]
    eval_files = tally[np.logical_not(mask)]
    if train_files['num_samples'].size != 0:
        _print_stats("stats of train samples (total, min, max, mean): ",
                     train_files['num_samples'])
    if delete and train_files['num_samples'].size != 0:
        # Thresholds are hand-tuned for this dataset -- TODO: parameterize.
        _replace_undersized(train_files, down=10, cutoff=4,
                            processes=processes, label='Training')
    if eval_files['num_samples'].size != 0:
        _print_stats("stats of eval samples (total, min, max, mean):",
                     eval_files['num_samples'])
    if delete and eval_files['num_samples'].size != 0:
        _replace_undersized(eval_files, down=2, cutoff=2,
                            processes=processes, label='Eval')
    return

if __name__ == "__main__":
    # Usage: script.py LMDB_DIR [DELETE_FLAG]
    # DELETE_FLAG is an integer (0/1); bool(int(...)) so that "0" on the
    # command line means False (bool("0") would be True).
    if len(sys.argv) == 3:
        main(sys.argv[-2], delete=bool(int(sys.argv[-1])))
    else:
        main(sys.argv[-1])
    # Print on both paths; the original only reported DONE for the
    # single-argument invocation.
    print('DONE')
Loading