Skip to content
Snippets Groups Projects
Commit c26bd007 authored by Doak, Peter W.'s avatar Doak, Peter W.
Browse files

two new scripts checking the normality and doing bootstrap analysis

parent a5ee9925
No related branches found
No related tags found
No related merge requests found
# // Copyright (C) 2022 UT-Battelle, LLC
# // All rights reserved.
# //
# // See LICENSE for terms of usage.
# // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
# //
# // Author: Peter Doak (doakpw@ornl.gov)
# //
# This script reads a single-measurement G DCA++ dataset and performs a bootstrap
# analysis (see do_bootstrap for the resample count and confidence level used) to look at the
# confidence interval of the reduced G_k_w. What I see is that this results in what is, to me, quite a low level of error,
# so at least for single-band Hubbard single-site CTINT runs, nothing like 1 million samples is needed to
# get a good result with a converged g0.
# The standard deviation for a particular sample is quite wide. I don't think "error" is the correct interpretation of the
# spread of the standard deviation: the range of values for random vertexes is not "error" but the correct evaluation of
# particular diagram solutions.
import CTINT
import numpy as np
import adios2
from mpi4py import MPI
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from Adios2DCA import readDCARun, readSampleG_k_w
import statsmodels.api as sm
import os.path
from scipy.stats import bootstrap
from multiprocessing import Pool
import pickle
comm = MPI.COMM_WORLD
def do_bootstrap(freq_data, conf_level=0.9999, n_resamples=1000, rng=None):
    """Bootstrap a confidence interval for the mean of one frequency's samples.

    Parameters
    ----------
    freq_data : 1-D array-like
        Sample values for a single frequency.
    conf_level : float, optional
        Confidence level passed to scipy.stats.bootstrap (default 0.9999,
        matching the original hard-coded value).
    n_resamples : int, optional
        Number of bootstrap resamples (default 1000).
    rng : int, numpy.random.Generator, or None, optional
        Seed or generator for reproducible resampling. Default None keeps the
        original nondeterministic behavior.

    Returns
    -------
    tuple
        (ci_low, sample_mean, ci_high)
    """
    freq_mean = np.mean(freq_data)
    # scipy.stats.bootstrap expects a sequence of sample arrays, hence the 1-tuple.
    res = bootstrap((freq_data,), np.mean, n_resamples=n_resamples,
                    method="basic", confidence_level=conf_level,
                    random_state=rng)
    return res.confidence_interval.low, freq_mean, res.confidence_interval.high
def read_and_std(file_name, n_ranks=10, n_samples=1000, n_freq=1024,
                 freq_window=(451, 712)):
    """Read every per-sample G_k_w from an adios2 archive and return the
    per-frequency standard deviation over all samples.

    Parameters
    ----------
    file_name : str
        Path of the adios2 BP archive to read.
    n_ranks : int, optional
        Number of MPI ranks that wrote samples (default 10).
    n_samples : int, optional
        Number of samples per rank (default 1000).
    n_freq : int, optional
        Number of frequency points per sample (default 1024).
    freq_window : tuple of int, optional
        Half-open (start, stop) frequency-index window to reduce over
        (default (451, 712), the window used throughout this script).

    Returns
    -------
    numpy.ndarray
        Standard deviation across samples for each frequency in the window.
    """
    with adios2.open(file_name, "r", comm) as adios_fh:
        # One column per measurement: the value at each frequency index.
        frequency_histogram_data = np.zeros([n_freq, n_ranks * n_samples])
        meas_index = 0
        for rank in range(n_ranks):
            for samp in range(n_samples):
                # Samples are 1-indexed in the archive, hence samp + 1.
                G_k_w = readSampleG_k_w(adios_fh, rank, 0, samp + 1)
                frequency_histogram_data[:, meas_index] = G_k_w[:, 0, 0, 0, 0, 0]
                meas_index += 1
    # Restrict to the frequency window of interest before reducing.
    freq_actual = frequency_histogram_data[freq_window[0]:freq_window[1], :]
    return np.std(freq_actual, axis=1)
def read_and_bootstrap(file_name, pickle_name='bootstrap_results.pickle'):
    """Read all per-sample G_k_w data from an adios2 archive, bootstrap a
    confidence interval of the mean at each frequency, cache the result as a
    pickle, and return it.

    Parameters
    ----------
    file_name : str
        Path of the adios2 BP archive to read.
    pickle_name : str, optional
        Where to cache the bootstrap results (default
        'bootstrap_results.pickle', matching the cache checked in __main__).

    Returns
    -------
    list of tuple
        One (ci_low, mean, ci_high) triple per frequency in the 451:712 window.
    """
    with adios2.open(file_name, "r", comm) as adios_fh:
        dca_globals = readDCARun(adios_fh)
        # Echo every parameter entry for the record.
        parameters = [(key, item) for key, item in dca_globals.items()
                      if re.search("parameters", key)]
        for param in parameters:
            print("param", param)
        # One column per measurement: the value at each frequency index.
        frequency_histogram_data = np.zeros([1024, 10000])
        meas_index = 0
        for rank in range(10):
            for samp in range(1000):
                # Samples are 1-indexed in the archive, hence samp + 1.
                G_k_w = readSampleG_k_w(adios_fh, rank, 0, samp + 1)
                frequency_histogram_data[:, meas_index] = G_k_w[:, 0, 0, 0, 0, 0]
                meas_index += 1
    freq_actual = frequency_histogram_data[451:712, :]
    # Bootstrap each frequency row independently; this is the expensive step,
    # so fan it out over a process pool.
    with Pool(8) as pool:
        interval_mean = pool.map(do_bootstrap, freq_actual, chunksize=4)
    # Cache the result so __main__ can skip this whole pass next time.
    with open(pickle_name, 'wb') as the_pick:
        pickle.dump(interval_mean, the_pick, pickle.HIGHEST_PROTOCOL)
    return interval_mean
if __name__ == '__main__':
    file_name = "single_meas_G_100B_10k.bp"
    pickle_name = 'bootstrap_results.pickle'
    # Reuse cached bootstrap results when present; they are expensive to compute.
    if os.path.isfile(pickle_name):
        with open(pickle_name, 'rb') as the_pick:
            interval_mean = pickle.load(the_pick)
    else:
        interval_mean = read_and_bootstrap(file_name)
    interval_mean = np.array(interval_mean)
    fig = plt.figure(figsize=(10, 10), constrained_layout=True)
    axs = fig.subplots(1, 1)
    with adios2.open(file_name, "r", comm) as adios_fh:
        dca_globals = readDCARun(adios_fh)
        cluster_g_k_w = dca_globals['cluster_greens_function_G_k_w']
        wn = dca_globals['frequency_domain_elements']
    freq_std = read_and_std(file_name)
    # Plot only the 451:712 frequency window, the same window reduced above.
    window = slice(451, 712)
    g_window = cluster_g_k_w[window, 0, 0, 0, 0, 0]
    up_std = g_window + freq_std
    dn_std = g_window - freq_std
    axs.plot(wn[window], g_window, label="accumulated cluster G_k_w")
    axs.plot(wn[window], up_std, label="up std")
    axs.plot(wn[window], dn_std, label="dwn_std")
    # Column 0 is the CI low bound, column 1 the mean, column 2 the high bound
    # (see do_bootstrap's return order). The low-bound label previously said
    # "high", duplicating the high-bound label.
    axs.scatter(wn[window], interval_mean[:, 0], s=3, marker='.',
                label="99.99% bootstrap low")
    #axs.scatter(wn[window], interval_mean[:, 1], s=1.5, marker=".", label="mean samples")
    axs.scatter(wn[window], interval_mean[:, 2], s=3, marker=".",
                label="99.99% bootstrap high")
    axs.legend()
    plt.show()
# // Copyright (C) 2022 UT-Battelle, LLC
# // All rights reserved.
# //
# // See LICENSE for terms of usage.
# // See CITATION.md for citation guidelines, if DCA++ is used for scientific publications.
# //
# // Author: Peter Doak (doakpw@ornl.gov)
# //
# This script allows quick examination of many samples and comparison between the
# python miniapp single configuration G_k_w and that written from DCA++
# it is set up to read from an adios2 BP4 archive set as filename
import CTINT
import numpy as np
import adios2
from mpi4py import MPI
import re
import matplotlib as mpl
import matplotlib.pyplot as plt
from Adios2DCA import readDCARun, readSampleG_k_w
import statsmodels.api as sm
from scipy.stats import norm
comm = MPI.COMM_WORLD
if __name__ == '__main__':
    file_name = "single_meas_G_100B_10k.bp"
    with adios2.open(file_name, "r", comm) as adios_fh:
        dca_globals = readDCARun(adios_fh)
        # Echo every parameter entry for the record.
        parameters = [(key, item) for key, item in dca_globals.items()
                      if re.search("parameters", key)]
        for param in parameters:
            print("param", param)
        # One row per measurement, one column per frequency index.
        frequency_histogram_data = np.zeros([10000, 1024])
        meas_index = 0
        for rank in range(10):
            for samp in range(1000):
                # Samples are 1-indexed in the archive, hence samp + 1.
                G_k_w = readSampleG_k_w(adios_fh, rank, 0, samp + 1)
                frequency_histogram_data[meas_index, :] = G_k_w[:, 0, 0, 0, 0, 0]
                meas_index += 1
    fig, axs = plt.subplots(3, 1)
    fig.suptitle("QQ plot for normality of values for G_k_w frequencies")
    # QQ-plot three representative frequencies (centered on their means) to
    # eyeball whether the per-frequency sample distributions look normal.
    for ip, freq in enumerate([512, 550, 600]):
        freq_data = frequency_histogram_data[:, freq]
        freq_data = freq_data - np.mean(freq_data)
        sm.qqplot(freq_data, line='s', fit=True, ax=axs[ip])
        axs[ip].set_title("Freq {}".format(freq))
    plt.show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment