Commit 0f5212a6 authored by Gao, Shang's avatar Gao, Shang
Browse files

updated naive bayes experiment to save results to csv

parent 17a5383d
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -31,5 +31,8 @@ class crossbowOlcf(object):
                self.ssh.exec_command("qsub %s" % (olcf_path+pbs_script))
        ssh_stdout.readlines()
        
    def scp(self):
        """Placeholder for SCP file transfer to/from the OLCF host.

        Not implemented yet (TODO) — presumably intended to mirror the
        SSH/qsub workflow above with a paramiko SFTP/SCP transfer; verify
        against the rest of the class before relying on it.
        """
        pass
        
if __name__ == "__main__":
    # Script entry point: construct the OLCF helper for the 'rhea' system.
    # NOTE(review): crossbowOlcf.__init__ is outside this view — presumably
    # it establishes the SSH session used by qsub(); confirm.
    olcf = crossbowOlcf('rhea')
+1 −0
Original line number Diff line number Diff line
@@ -24,4 +24,5 @@ dl_id = cbow.download_resource('yelp','yelp_academic_dataset_review.json',
                               wait_for_download=True,timeout=600)
                               
#run Naive Bayes
print 'submitting pbs script to start experiment'
olcf.qsub('yelp.pbs')
+7 −2
Original line number Diff line number Diff line
@@ -23,6 +23,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
#store records
labels = []
tokens = []
ids = []
maxsentlen = 0
maxdoclen = 0

@@ -82,6 +83,9 @@ with open(json_path,'r') as f:
        #add label 
        labels.append(dic['stars'])
        
        #add review id
        ids.append(dic['review_id'])
        
print '\nsaved %i records' % len(tokens)
        
#generate Word2Vec embeddings
@@ -128,7 +132,7 @@ for key,val in model.wv.vocab.iteritems():
    
#normalize embeddings
vocab -= vocab.mean()
vocab /= (vocab.std()*2)
vocab /= vocab.std()

#reset first row to 0
vocab[0,:] = np.zeros((embedding_size))
@@ -152,7 +156,8 @@ for idx,doc in enumerate(tokens):
    for sent in doc:
        indicies.append([word2id[word] if word in word2id else unk for word in sent])
    dic['idx'] = indicies
    data[idx] = dic
    review_id = ids[idx]
    data[review_id] = dic

#save preprocessed data and embeddings to disk
print "\nsaving data to disk"
+11 −35
Original line number Diff line number Diff line
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
@@ -18,9 +19,12 @@ with open('data.pkl', 'rb') as f:
print "creating features and labels"
docs = []
labels = []
ids = []
for key,value in data.iteritems():
    docs.append(value['text'])
    labels.append(value['label'])
    ids.append(key)
ids = np.array(ids)
    
docstrings = []
for doc in docs:
@@ -40,40 +44,12 @@ y = le.fit_transform(labels)
splits = 10
kf = StratifiedKFold(n_splits=splits,shuffle=True,random_state=1234)

#function for plotting confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as a color-mapped image with per-cell labels.

    Parameters
    ----------
    cm : array of shape (n_classes, n_classes)
        Confusion matrix, e.g. from sklearn.metrics.confusion_matrix.
    classes : sequence
        Class labels used for the x/y tick marks.
    normalize : bool, optional
        If True, each row is divided by its sum so cells show
        per-true-class rates instead of raw counts.
    title : str, optional
        Title drawn above the image.
    cmap : matplotlib colormap, optional
        Colormap for the image.
    """
    #fix: normalize BEFORE drawing — the old code called imshow on the raw
    #counts and only then normalized, so the colors and the cell text
    #disagreed (and thresh was computed on different data than was drawn)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    #show two-decimal rates when normalized, integer counts otherwise
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#classify using Naive Bayes
print "training naive bayes"
scores = []
y_preds = []
y_tests = []
review_ids = []
i = 0
for train_index, test_index in kf.split(X,y):
    i += 1
@@ -91,6 +67,7 @@ for train_index, test_index in kf.split(X,y):
    #for calculating f-score
    y_preds.extend(clf.predict(X_test))
    y_tests.extend(y_test)
    review_ids.extend(ids[test_index]) 
    
print "Naive Bayes - overall accuracy: %.4f" % (np.mean(scores)*100)

@@ -100,9 +77,8 @@ macro = f1_score(y_tests,y_preds,average='macro')
print "Naive Bayes - overall f-score: %.4f" % (micro)
print "Naive Bayes - overall f-score: %.4f" % (macro)

#plot confusion matrix
confusion = confusion_matrix(y_tests, y_preds)
plt.figure()
plot_confusion_matrix(confusion,classes=le.classes_,title='Naive Bayes')
plt.savefig('naive_bayes.png')
plt.show()
#save results to file
results = {'id':review_ids,'pred':y_preds,'true':y_tests}
results = pd.DataFrame(results)
results.to_csv('naive_bayes.csv')
+1 −1
Original line number Diff line number Diff line
@@ -383,7 +383,7 @@ if __name__ == "__main__":
    #train nn
    print "building text cnn"
    nn = text_cnn(vocab,classes)
    nn.train(X_train,y_train,epochs=25,validation_data=(X_test,y_test),
    nn.train(X_train,y_train,epochs=3,validation_data=(X_test,y_test),
             savebest=True,filepath='cnn.p')
    
    #load best nn
Loading