Loading crossbow/crossbowOlcf.py +3 −0 Original line number Diff line number Diff line Loading @@ -31,5 +31,8 @@ class crossbowOlcf(object): self.ssh.exec_command("qsub %s" % (olcf_path+pbs_script)) ssh_stdout.readlines() def scp(self): pass if __name__ == "__main__": olcf = crossbowOlcf('rhea') yelp_example/run_experiment.py +1 −0 Original line number Diff line number Diff line Loading @@ -24,4 +24,5 @@ dl_id = cbow.download_resource('yelp','yelp_academic_dataset_review.json', wait_for_download=True,timeout=600) #run Naive Bayes print 'submitting pbs script to start experiment' olcf.qsub('yelp.pbs') yelp_example/scripts/feature_extraction.py +7 −2 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', #store records labels = [] tokens = [] ids = [] maxsentlen = 0 maxdoclen = 0 Loading Loading @@ -82,6 +83,9 @@ with open(json_path,'r') as f: #add label labels.append(dic['stars']) #add review id ids.append(dic['review_id']) print '\nsaved %i records' % len(tokens) #generate Word2Vec embeddings Loading Loading @@ -128,7 +132,7 @@ for key,val in model.wv.vocab.iteritems(): #normalize embeddings vocab -= vocab.mean() vocab /= (vocab.std()*2) vocab /= vocab.std() #reset first row to 0 vocab[0,:] = np.zeros((embedding_size)) Loading @@ -152,7 +156,8 @@ for idx,doc in enumerate(tokens): for sent in doc: indicies.append([word2id[word] if word in word2id else unk for word in sent]) dic['idx'] = indicies data[idx] = dic review_id = ids[idx] data[review_id] = dic #save preprocessed data and embeddings to disk print "\nsaving data to disk" Loading yelp_example/scripts/naive_bayes.py +11 −35 Original line number Diff line number Diff line import pickle import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import StratifiedKFold Loading @@ -18,9 +19,12 @@ with open('data.pkl', 'rb') as f: print "creating features and labels" docs = [] labels = [] ids = [] for key,value in data.iteritems(): docs.append(value['text']) labels.append(value['label']) ids.append(key) ids = np.array(ids) docstrings = [] for doc in docs: Loading @@ -40,40 +44,12 @@ y = le.fit_transform(labels) splits = 10 kf = StratifiedKFold(n_splits=splits,shuffle=True,random_state=1234) #function for plotting confusion matrix def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues): """ This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. """ plt.imshow(cm, interpolation='nearest', cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #classify using Naive Bayes print "training naive bayes" scores = [] y_preds = [] y_tests = [] review_ids = [] i = 0 for train_index, test_index in kf.split(X,y): i += 1 Loading @@ -91,6 +67,7 @@ for train_index, test_index in kf.split(X,y): #for calculating f-score y_preds.extend(clf.predict(X_test)) y_tests.extend(y_test) review_ids.extend(ids[test_index]) print "Naive Bayes - overall accuracy: %.4f" % (np.mean(scores)*100) Loading @@ -100,9 +77,8 @@ macro = f1_score(y_tests,y_preds,average='macro') print "Naive Bayes - overall f-score: %.4f" % (micro) print "Naive Bayes - overall f-score: %.4f" % (macro) #plot confusion matrix confusion = confusion_matrix(y_tests, y_preds) plt.figure() plot_confusion_matrix(confusion,classes=le.classes_,title='Naive Bayes') plt.savefig('naive_bayes.png') plt.show() #save results to file results = {'id':review_ids,'pred':y_preds,'true':y_tests} results = pd.DataFrame(results) results.to_csv('naive_bayes.csv') yelp_example/scripts/th_cnn.py +1 −1 Original line number Diff line number Diff line Loading @@ -383,7 +383,7 @@ if __name__ == "__main__": #train nn print "building text cnn" nn = text_cnn(vocab,classes) nn.train(X_train,y_train,epochs=25,validation_data=(X_test,y_test), nn.train(X_train,y_train,epochs=3,validation_data=(X_test,y_test), savebest=True,filepath='cnn.p') #load best nn Loading Loading
crossbow/crossbowOlcf.py +3 −0 Original line number Diff line number Diff line Loading @@ -31,5 +31,8 @@ class crossbowOlcf(object): self.ssh.exec_command("qsub %s" % (olcf_path+pbs_script)) ssh_stdout.readlines() def scp(self): pass if __name__ == "__main__": olcf = crossbowOlcf('rhea')
yelp_example/run_experiment.py +1 −0 Original line number Diff line number Diff line Loading @@ -24,4 +24,5 @@ dl_id = cbow.download_resource('yelp','yelp_academic_dataset_review.json', wait_for_download=True,timeout=600) #run Naive Bayes print 'submitting pbs script to start experiment' olcf.qsub('yelp.pbs')
yelp_example/scripts/feature_extraction.py +7 −2 Original line number Diff line number Diff line Loading @@ -23,6 +23,7 @@ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', #store records labels = [] tokens = [] ids = [] maxsentlen = 0 maxdoclen = 0 Loading Loading @@ -82,6 +83,9 @@ with open(json_path,'r') as f: #add label labels.append(dic['stars']) #add review id ids.append(dic['review_id']) print '\nsaved %i records' % len(tokens) #generate Word2Vec embeddings Loading Loading @@ -128,7 +132,7 @@ for key,val in model.wv.vocab.iteritems(): #normalize embeddings vocab -= vocab.mean() vocab /= (vocab.std()*2) vocab /= vocab.std() #reset first row to 0 vocab[0,:] = np.zeros((embedding_size)) Loading @@ -152,7 +156,8 @@ for idx,doc in enumerate(tokens): for sent in doc: indicies.append([word2id[word] if word in word2id else unk for word in sent]) dic['idx'] = indicies data[idx] = dic review_id = ids[idx] data[review_id] = dic #save preprocessed data and embeddings to disk print "\nsaving data to disk" Loading
yelp_example/scripts/naive_bayes.py +11 −35 Original line number Diff line number Diff line import pickle import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.model_selection import StratifiedKFold Loading @@ -18,9 +19,12 @@ with open('data.pkl', 'rb') as f: print "creating features and labels" docs = [] labels = [] ids = [] for key,value in data.iteritems(): docs.append(value['text']) labels.append(value['label']) ids.append(key) ids = np.array(ids) docstrings = [] for doc in docs: Loading @@ -40,40 +44,12 @@ y = le.fit_transform(labels) splits = 10 kf = StratifiedKFold(n_splits=splits,shuffle=True,random_state=1234) #function for plotting confusion matrix def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues): """ This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. """ plt.imshow(cm, interpolation='nearest', cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') #classify using Naive Bayes print "training naive bayes" scores = [] y_preds = [] y_tests = [] review_ids = [] i = 0 for train_index, test_index in kf.split(X,y): i += 1 Loading @@ -91,6 +67,7 @@ for train_index, test_index in kf.split(X,y): #for calculating f-score y_preds.extend(clf.predict(X_test)) y_tests.extend(y_test) review_ids.extend(ids[test_index]) print "Naive Bayes - overall accuracy: %.4f" % (np.mean(scores)*100) Loading @@ -100,9 +77,8 @@ macro = f1_score(y_tests,y_preds,average='macro') print "Naive Bayes - overall f-score: %.4f" % (micro) print "Naive Bayes - overall f-score: %.4f" % (macro) #plot confusion matrix confusion = confusion_matrix(y_tests, y_preds) plt.figure() plot_confusion_matrix(confusion,classes=le.classes_,title='Naive Bayes') plt.savefig('naive_bayes.png') plt.show() #save results to file results = {'id':review_ids,'pred':y_preds,'true':y_tests} results = pd.DataFrame(results) results.to_csv('naive_bayes.csv')
yelp_example/scripts/th_cnn.py +1 −1 Original line number Diff line number Diff line Loading @@ -383,7 +383,7 @@ if __name__ == "__main__": #train nn print "building text cnn" nn = text_cnn(vocab,classes) nn.train(X_train,y_train,epochs=25,validation_data=(X_test,y_test), nn.train(X_train,y_train,epochs=3,validation_data=(X_test,y_test), savebest=True,filepath='cnn.p') #load best nn Loading