Commit 8d8eed8a authored by Laanait, Nouamane

adding code to search through saved pandas dataframes from distributed search

parent edb09351
Pipeline #79481 failed with stage in 2 minutes and 33 seconds
import pandas as pd
import sys, os, subprocess, shlex


def get_file_paths(wdir, tag):
    # collect the per-group pandas dataframe pickles ('pdf' files) written by the distributed search
    f_list = [itm for itm in os.listdir(wdir) if tag in itm and 'pdf' in itm]
    f_list = [os.path.join(wdir, itm) for itm in f_list]
    print(f_list)
    return f_list


def combine_df(wdir, tag, delete=False):
    f_path = get_file_paths(wdir, tag)
    #group_id = [int(path.split('_')[3]) for path in f_path if 'params' in path]
    # seed the master params/results dataframes from any matching pickle
    for path in f_path:
        print(path)
        if 'params' in path:
            master_params = pd.read_pickle(path)
        if 'results' in path:
            master_results = pd.read_pickle(path)
    # merge each group's column into the corresponding master dataframe
    for path in f_path:
        g_id = int(path.split('_')[3])
        print('group_id:{}'.format(g_id))
        pdf_group = pd.read_pickle(path)
        print('read: {}'.format(path))
        if 'params' in path:
            master_params['group_%d' % g_id] = pdf_group['group_%d' % g_id]
        elif 'results' in path:
            master_results['group_%d' % g_id] = pdf_group['group_%d' % g_id]
        if delete:
            args = shlex.split('rm %s' % path)
            try:
                subprocess.run(args, check=True, timeout=10)
            except subprocess.SubprocessError:
                print('could not delete file: {}'.format(path))
    master_results.to_csv(os.path.join(wdir, 'results_{}.csv'.format(tag)))
    master_params.to_csv(os.path.join(wdir, 'params_{}.csv'.format(tag)))
    master_results.to_pickle(os.path.join(wdir, 'results_{}.pkl'.format(tag)))
    master_params.to_pickle(os.path.join(wdir, 'params_{}.pkl'.format(tag)))
    return master_results, master_params


def extract_vals_params(df_results, df_params):
    pass


def main(wdir, tag):
    master_results, master_params = combine_df(wdir, tag)
    return master_results, master_params


if __name__ == "__main__":
    wdir, job_id = sys.argv[1:]
    main(wdir, job_id)
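A quick usage sketch of the new script (the filename combine_search_dfs.py, the work directory, and the tag value below are assumptions, not fixed by this commit; the code only requires that each per-group pickle name contain the tag, the substring 'pdf', either 'params' or 'results', and a numeric group id at a fixed '_'-separated position in the path):

    # from the command line (tag = job id, per the __main__ block):
    #   python combine_search_dfs.py /tmp/search 79481
    # or directly from Python:
    results_df, params_df = combine_df('/tmp/search', '79481')
    print(results_df.head())
    print(params_df.head())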
@@ -13,9 +13,10 @@ from tensorflow.python.ops import data_flow_ops
import horovod.tensorflow as hvd
import lmdb
import time
#from nvidia.dali.pipeline import Pipeline
#import nvidia.dali.ops as dali_ops
#import nvidia.dali.plugin.tf as dali_tf
from mpi4py import MPI
global world_rank
world_rank = MPI.COMM_WORLD.Get_rank()
tf.logging.set_verbosity(tf.logging.ERROR)
@@ -442,7 +443,8 @@ class DatasetLMDB(DatasetTFRecords):
super(DatasetLMDB, self).__init__(*args, **kwargs)
self.mode = self.params['mode']
lmdb_dir = self.params['data_dir']
lmdb_path = os.path.join(lmdb_dir, 'batch_%s_%d.db' % (self.mode, int(hvd.rank())))
#lmdb_path = os.path.join(lmdb_dir, 'batch_%s_%d.db' % (self.mode, int(hvd.rank())))
lmdb_path = os.path.join(lmdb_dir, 'batch_%s_%d.db' % (self.mode, world_rank))
self.env = lmdb.open(lmdb_path, create=False, readahead=False, readonly=True, writemap=False, lock=False)
self.num_samples = (self.env.stat()['entries'] - 6)//2 ## TODO: remove hard-coded # of headers by storing #samples key, val
self.first_record = 0
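The hunk above swaps hvd.rank() for the MPI world rank when each worker picks its LMDB shard. A minimal, self-contained sketch of that per-rank lookup (the /data/lmdb directory and 'train' mode are illustrative placeholders; the 6-header, 2-entries-per-sample layout mirrors the code above):

    import os
    import lmdb
    from mpi4py import MPI

    world_rank = MPI.COMM_WORLD.Get_rank()
    # each rank opens its own pre-sharded database: batch_train_0.db, batch_train_1.db, ...
    lmdb_path = os.path.join('/data/lmdb', 'batch_%s_%d.db' % ('train', world_rank))
    env = lmdb.open(lmdb_path, create=False, readahead=False, readonly=True, writemap=False, lock=False)
    num_samples = (env.stat()['entries'] - 6) // 2  # same header convention as DatasetLMDB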
@@ -515,13 +517,9 @@ class DatasetLMDB(DatasetTFRecords):
for _ in range(self.params['batch_size']):
image, label = iterator.get_next()
image = tf.reshape(image, self.data_specs['image_shape'])
# if self.params[self.mode + '_distort']:
# with tf.device('/gpu:%i' % hvd.local_rank()):
# image = self.add_noise_image(image)
images.append(tf.reshape(image, self.data_specs['image_shape']))
labels.append(tf.reshape(label, self.data_specs['label_shape']))
elif self.mode == 'eval':
# self.params['batch_size'] = 1
ds = ds.batch(self.params['batch_size'], drop_remainder=True)
ds = ds.map(self.wrapped_decode)
iterator = ds.make_one_shot_iterator()
@@ -531,8 +529,6 @@ class DatasetLMDB(DatasetTFRecords):
for _ in range(self.params['batch_size']):
image, label = iterator.get_next()
image = tf.reshape(image, self.data_specs['image_shape'])
# if self.params[self.mode + '_distort']:
# image = self.add_noise_image(image)
images.append(tf.reshape(image, self.data_specs['image_shape']))
labels.append(tf.reshape(label, self.data_specs['label_shape']))
if tf.executing_eagerly():
@@ -546,28 +542,8 @@ class DatasetLMDB(DatasetTFRecords):
images_newshape = [self.params['batch_size']] + self.data_specs['image_shape']
labels = tf.reshape(labels, labels_newshape)
images = tf.reshape(images, images_newshape)
# if self.params[self.mode + '_distort']:
# class DaliPipeline(Pipeline):
# def __init__(self, batch_size, num_threads, gpu_id, images=None):
# super(DaliPipeline, self).__init__(batch_size, num_threads, gpu_id)
# self.input = dali_ops.Cast(device='gpu', dtype=tf.float16)(images)
# self.rotate = dali_ops.Rotate(angle=10.0)
# def define_graph(self):
# images = self.rotate(self.input)
# return images
# pipe = DaliPipeline(self.params['batch_size'], 10, hvd.local_rank(), images=images)
# daliop = dali_tf.DALIIterator()
# with tf.device('/gpu:%i' % hvd.local_rank()):
# images = daliop(pipeline=pipe, shapes = images.shape.as_list(), dtypes=[images.dtype])
# images = pipe.run()
labels = self.image_scaling(labels)
# labels -= tf.reduce_min(labels, keepdims=True)
# abels= self.label_minmaxscaling(labels, 0.0, 1.0, scale_range=[0., 10.0])
# images = self.image_scaling(images)
# Display the training images in the Tensorboard visualizer.
#if self.debug:
# tf.summary.image("potential", tf.transpose(labels, perm=[0,2,3,1]), max_outputs=4)
# tf.summary.image("images", tf.transpose(tf.reduce_mean(images, axis=1, keepdims=True), perm=[0,2,3,1]), max_outputs=4)
# data augmentation
if self.params[self.mode + '_distort']:
with tf.device('/gpu:%i' % hvd.local_rank()):
images = tf.transpose(images, perm=[0,2,3,1])
......
@@ -77,16 +77,17 @@ def calc_loss(n_net, scope, hyper_params, params, labels, step=None, images=None
n_net.model_output = tf.reduce_mean(n_net.model_output, axis=[1], keepdims=True)
_ = calculate_loss_regressor(n_net.model_output, labels, params, hyper_params)
if hyper_params['network_type'] == 'YNet':
weight=None
probe_im = n_net.model_output['decoder_IM']
probe_re = n_net.model_output['decoder_RE']
pot = n_net.model_output['inverter']
pot_labels, probe_labels_re, probe_labels_im = [tf.expand_dims(itm, axis=1) for itm in tf.unstack(labels, axis=1)]
weight= np.prod(pot_labels.shape.as_list()[-2:])
weight=None
inverter_loss = calculate_loss_regressor(pot, pot_labels, params, hyper_params, weight=weight)
decoder_loss_im = calculate_loss_regressor(probe_im, probe_labels_im, params, hyper_params, weight=weight)
decoder_loss_re = calculate_loss_regressor(probe_re, probe_labels_re, params, hyper_params, weight=weight)
psi_out_mod = thin_object(probe_re, probe_im, pot)
reg_loss = 10 * calculate_loss_regressor(psi_out_mod, tf.reduce_mean(images, axis=[1], keepdims=True),
reg_loss = calculate_loss_regressor(psi_out_mod, tf.reduce_mean(images, axis=[1], keepdims=True),
params, hyper_params, weight=weight)
tf.summary.scalar('reg_loss ', reg_loss)
tf.summary.scalar('Inverter loss ', inverter_loss)
@@ -168,7 +169,6 @@ def calculate_loss_regressor(net_output, labels, params, hyper_params, weight=No
:param params: dictionary, specifies the objective to use
:return: cost
"""
# weight = 1./ hyper_params.get('scaling', 1)
if weight is None:
weight = 1.0
if global_step is None:
@@ -194,14 +194,10 @@ def calculate_loss_regressor(net_output, labels, params, hyper_params, weight=No
reduction=tf.losses.Reduction.MEAN)
if loss_params['type'] == 'MSE':
cost = tf.losses.mean_squared_error(labels, weights=weight, predictions=net_output,
reduction=tf.losses.Reduction.MEAN)
reduction=tf.losses.Reduction.SUM)
if loss_params['type'] == 'ABS_DIFF':
cost = tf.losses.absolute_difference(labels, weights=weight, predictions=net_output,
reduction=tf.losses.Reduction.MEAN)
#if loss_params['type'] == 'ABS_DIFF_SCALED':
# weight= 1./512.
# cost = tf.losses.absolute_difference(labels, weights=weight, predictions=net_output,
#reduction=tf.losses.Reduction.SUM)
if loss_params['type'] == 'MSE_PAIR':
cost = tf.losses.mean_pairwise_squared_error(labels, net_output, weights=weight)
if loss_params['type'] == 'rMSE':
@@ -272,4 +268,4 @@ def thin_object(psi_k_re, psi_k_im, potential):
psi_out_mod = tf.reduce_mean(psi_out_mod, axis=1, keep_dims=True)
tf.summary.image('Psi_k_out', tf.transpose(tf.abs(psi_out_mod)**0.25, perm=[0,2,3,1]), max_outputs=1)
tf.summary.image('Psi_x_in', tf.transpose(tf.abs(psi_x)**0.25, perm=[0,2,3,1]), max_outputs=1)
return psi_out_mod
\ No newline at end of file
return psi_out_mod
@@ -116,7 +116,7 @@ class TrainHelper(object):
flops = self.net_ops * examples_per_sec
avg_flops = self.net_ops * self.params['batch_size'] * hvd.size() / self.cumm_time
format_str = (
'time= %.1f, step= %d, epoch= %2.2e, loss= %.2f, lr= %.2e, step_time= %2.2f sec, ranks= %d, examples/sec= %.1f, flops = %3.2e, average_time= %2.2f, average_flops= %3.3e')
'time= %.1f, step= %d, epoch= %2.2e, loss= %.3e, lr= %.2e, step_time= %2.2f sec, ranks= %d, examples/sec= %.1f, flops = %3.2e, average_time= %2.2f, average_flops= %3.3e')
print_rank(format_str % ( t - self.params[ 'start_time' ], self.last_step, self.elapsed_epochs,
loss_value, learning_rate, duration, hvd.size(), examples_per_sec, flops, self.cumm_time, avg_flops) )
self.cumm_time = time.time()
@@ -363,6 +363,7 @@ def train(network_config, hyper_params, params, gpu_id=None):
val_results = []
loss_results = []
loss_value = 1e10
val = 1e10
while train_elf.last_step < maxSteps :
train_elf.before_run()
doLog = bool(train_elf.last_step % logFreq == 0)
@@ -408,15 +409,15 @@ def train(network_config, hyper_params, params, gpu_id=None):
if doValidate:
val = validate(network_config, hyper_params, params, sess, dset)
val_results.append((train_elf.last_step,val))
if doFinish or np.isnan(loss_value):
if doFinish:
val = validate(network_config, hyper_params, params, sess, dset)
val_results.append((train_elf.last_step, val))
tf.reset_default_graph()
tf.keras.backend.clear_session()
sess.close()
return val_results, loss_results
# Do a validation before exiting
val = validate(network_config, hyper_params, params, sess, dset)
if np.isnan(loss_value):
break
val_results.append((train_elf.last_step,val))
tf.reset_default_graph()
tf.keras.backend.clear_session()
@@ -521,9 +522,12 @@ def validate(network_config, hyper_params, params, sess, dset, num_batches=10):
if loss_params['type'] == 'MSE_PAIR':
errors = tf.losses.mean_pairwise_squared_error(tf.cast(labels, tf.float32), tf.cast(model_output, tf.float32))
loss_label= loss_params['type']
else:
elif loss_params['type'] == 'ABS_DIFF':
loss_label= 'ABS_DIFF'
errors = tf.losses.absolute_difference(tf.cast(labels, tf.float32), tf.cast(model_output, tf.float32), reduction=tf.losses.Reduction.MEAN)
errors = tf.losses.absolute_difference(tf.cast(labels, tf.float32), tf.cast(model_output, tf.float32), reduction=tf.losses.Reduction.SUM)
elif loss_params['type'] == 'MSE':
errors = tf.losses.mean_squared_error(tf.cast(labels, tf.float32), tf.cast(model_output, tf.float32), reduction=tf.losses.Reduction.SUM)
loss_label= loss_params['type']
errors = tf.expand_dims(errors,axis=0)
error_averaging = hvd.allreduce(errors)
if num_batches is not None:
......