Commit 09619ca4 authored by Gao, Shang

added initial files for yelp example

parent 860e4ecc
crossbowGlobus.py
+7 −11
@@ -186,7 +186,7 @@ class crossbowGlobus(crossbowBase):
                                    label="Crossbow Transfer", sync_level="checksum")
        tdata.add_item(resource_path,'/data/cades-crossbow/'+package+'/'+filename)
        transfer_result = self.transfer.submit_transfer(tdata)
        print("task_id =", transfer_result["task_id"])
        print "task_id =", transfer_result["task_id"]
        
        #add file to CKAN
        self.ckan.action.resource_create(
@@ -226,7 +226,7 @@ class crossbowGlobus(crossbowBase):
                                    label="Crossbow Transfer", sync_level="checksum")
        tdata.add_item(resource_path,dest_path+filename)
        transfer_result = self.transfer.submit_transfer(tdata)
        print("task_id =", transfer_result["task_id"])
        print "task_id =", transfer_result["task_id"]
        
        return transfer_result["task_id"]

@@ -257,7 +257,7 @@ class crossbowGlobus(crossbowBase):
        ddata = globus_sdk.DeleteData(self.transfer, self.cadesdtn)
        ddata.add_item(resource_path)
        delete_result = self.transfer.submit_delete(ddata)
        print("task_id =", delete_result["task_id"])
        print "task_id =", delete_result["task_id"]
        
        return delete_result["task_id"]
        
@@ -268,16 +268,12 @@ class crossbowGlobus(crossbowBase):
        parameters:
          - task_id: string
            id of task to check
            
        outputs:
            string indicating status of task
        '''
        r = self.transfer.get_task(task_id)
        print "Label:", r["label"]
        print "Status:", r["status"]
        print "Transfer: %s -> %s" % (r["source_endpoint_display_name"],
                                      r["destination_endpoint_display_name"])
        if r.data["status"] == "SUCCEEDED":
            print "Bytes transferred:", r["bytes_transferred"]
            print "Files transferred:", r["files_transferred"]
            print "Transfer rate:", r["effective_bytes_per_second"], "bps"
        return r["status"]

#add model api later
#add scheduling api later (for both filters and models)
yelp_example/feature_extraction.py

0 → 100644
+161 −0
import sys
import json
import re
import numpy as np
import collections
from gensim.models import Word2Vec
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
import logging
import pickle

#get json filepath
args = sys.argv
if len(args) != 2:
    raise Exception("Usage: python feature_extraction.py <path to Yelp json file>")
json_path = args[1]

#logging setup
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

#store records
labels = []
tokens = []
maxsentlen = 0
maxdoclen = 0
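#maxsentlen/maxdoclen record the longest sentence and document seen; they are
#not saved here, but are handy when sizing padded model inputs downstream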

#process json one line at a time
with open(json_path,'r') as f:
    lineno = 0
    for line in f:
    
        lineno += 1
        sys.stdout.write("processing line %i of aprox 4.15 million     \r" \
                         % lineno)
        sys.stdout.flush()
        #each line of the dump is a standalone json record
        dic = json.loads(line)
        
        #only keep records from 2013 (to reduce dataset size)
        if dic['date'][:4]!='2013':
            continue
        
        text = dic['text']
                
        #process text: lowercase, strip apostrophes, collapse repeated periods,
        #replace runs of any other non-word characters (keeping . ? !) with a
        #space, then pad sentence terminators so they become their own tokens
        text = text.lower()
        text = re.sub("'", '', text)
        text = re.sub(r'\.{2,}', '.', text)
        text = re.sub(r'[^\w.?!]+', ' ', text)
        text = re.sub(r'\.', ' . ', text)
        text = re.sub(r'\?', ' ? ', text)
        text = re.sub('!', ' ! ', text)

        #tokenize
        text = text.split()
        
        #drop empty reviews
        if len(text) == 0:
            continue

        #split into sentences
        sentences = []
        sentence = []
        for t in text:
            if t not in ['.','!','?']:
                sentence.append(t)
            else:
                sentence.append(t)
                sentences.append(sentence)
                if len(sentence) > maxsentlen:
                    maxsentlen = len(sentence)
                sentence = []
        if len(sentence) > 0:
            sentences.append(sentence)
        
        #add split sentences to tokens
        tokens.append(sentences)
        if len(sentences) > maxdoclen:
            maxdoclen = len(sentences)
        
        #add label 
        labels.append(dic['stars'])
        
print '\nsaved %i records' % len(tokens)
        
#generate Word2Vec embeddings
print "generating word2vec embeddings"

#used all processed raw text to train word2vec
allsents = [sent for doc in tokens for sent in doc]

embedding_size = 200
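#min_count=5 ignores words seen fewer than 5 times; iter=5 training epochs;
#init_sims(replace=True) then L2-normalizes the vectors in place to save memory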
model = Word2Vec(allsents, min_count=5, size=embedding_size, workers=4, iter=5)
model.init_sims(replace=True)
'''
#get most common words
print "getting common words"
allwords = [word for sent in allsents for word in sent]
counts = collections.Counter(allwords).most_common(500)

#reduce embeddings to 2d using tsne
print "reducing embeddings to 2D"
embeddings = np.empty((500,embedding_size))
for i in range(500):
    embeddings[i,:] = model[counts[i][0]]
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=7500)
embeddings = tsne.fit_transform(embeddings)

#plot embeddings
print "plotting most common words"
fig, ax = plt.subplots(figsize=(30, 30))
for i in range(500):
    ax.scatter(embeddings[i,0],embeddings[i,1])
    ax.annotate(counts[i][0], (embeddings[i,0],embeddings[i,1]))
plt.show()
'''
#save all word embeddings to matrix
print "saving word vectors to matrix"
vocab = np.zeros((len(model.vocab)+1,embedding_size))
word2id = {}

#first row of embedding matrix isn't used so that 0 can be masked
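#model.vocab maps each word to a gensim Vocab entry whose .index gives its
#row in the trained embedding matrix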
for key,val in model.vocab.iteritems():
    idx = val.index + 1
    vocab[idx,:] = model[key]
    word2id[key] = idx
    
#normalize embeddings
vocab -= vocab.mean()
vocab /= (vocab.std()*2)

#reset first row to 0
vocab[0,:] = np.zeros((embedding_size))

#add additional word embedding for unknown words
vocab = np.concatenate((vocab, np.random.rand(1,embedding_size)))
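#note: the unknown-word row is uniform on [0,1), unlike the normalized trained rows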

#index for unknown words
unk = len(vocab)-1

#convert words to word indices
print "converting words to indices"
data = {}
for idx,doc in enumerate(tokens):
    sys.stdout.write('processing %i of %i records       \r' % (idx+1,len(tokens)))
    sys.stdout.flush()
    dic = {}
    dic['label'] = labels[idx]
    dic['text'] = doc
    indices = []
    for sent in doc:
        indices.append([word2id[word] if word in word2id else unk for word in sent])
    dic['idx'] = indices
    data[idx] = dic

#save preprocessed data and embeddings to disk
print "\nsaving data to disk"
np.save('embeddings',vocab)
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
+28 −0
from crossbowGlobus import crossbowGlobus
import time

#connect to CKAN, make sure yelp dataset exists
cbow = crossbowGlobus(api_key="eaabd7d9-3cb4-4014-85fe-73736e658472")
packages = cbow.list_packages()
if 'yelp' not in packages:
    raise Exception("yelp package is missing from CKAN")
idx = packages.index("yelp")
resources = cbow.list_resources(packages[idx])
if 'yelp_academic_dataset_review.json' not in resources:
    raise Exception("yelp_academic_dataset_review.json is missing from yelp package")

#download yelp dataset
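#endpoint is the destination Globus endpoint UUID; a path under '/~/' is
#resolved relative to that endpoint's home directory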
endpoint = '6a3ced3a-0d9a-11e7-bb38-22000b9a448b'
path = '/~/Desktop/crossbow/yelp_example'
dl_id = cbow.download_resource('yelp','yelp_academic_dataset_review.json',
                               dest_endpoint=endpoint,dest_path=path)
                               
#wait for transfer to complete
start = time.time()
timeout = 600
print "initializing file transfer"
while cbow.check_task_status(dl_id) != "SUCCEEDED":
    time.sleep(10)
    if (time.time() - start) > timeout:
        raise Exception("file transfer timed out")
print "transfer complete"
+108 −0
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix
import matplotlib.pyplot as plt
import itertools

#load saved files
print "loading data"
vocab = np.load('embeddings.npy')
with open('data.pkl', 'rb') as f:
    data = pickle.load(f)
    
#convert each doc into a string
print "creating features and labels"
docs = []
labels = []
for key,value in data.iteritems():
    docs.append(value['text'])
    labels.append(value['label'])
    
docstrings = []
for doc in docs:
    flattened = [word for line in doc for word in line]
    docstring = " ".join(flattened)
    docstrings.append(docstring)

#tfidf vectorization over unigrams and bigrams, dropping terms that appear in
#fewer than 3 documents
vectorizer = TfidfVectorizer(min_df=3, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(docstrings)

#label encoder
le = LabelEncoder()
y = le.fit_transform(labels)

#kfold cross validation
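#StratifiedKFold keeps the star-rating class proportions similar in each fold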
splits = 10
kf = StratifiedKFold(n_splits=splits,shuffle=True,random_state=1234)

#function for plotting confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    #normalize before plotting so the image, colorbar, and cell text agree
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

#classify using Naive Bayes
print "training naive bayes"
scores = []
y_preds = []
y_tests = []
i = 0
for train_index, test_index in kf.split(X,y):
    i += 1

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    scores.append(score)

    print "Naive Bayes - kfold %i of %i accuracy: %.4f%%" % (i,splits,score*100)
    
    #for calculating f-score
    y_preds.extend(clf.predict(X_test))
    y_tests.extend(y_test)
    
print "Naive Bayes - overall accuracy: %.4f" % (np.mean(scores)*100)

#get naive bayes f-score
micro = f1_score(y_tests,y_preds,average='micro')
macro = f1_score(y_tests,y_preds,average='macro')
print "Naive Bayes - overall micro f-score: %.4f" % (micro)
print "Naive Bayes - overall macro f-score: %.4f" % (macro)

#plot confusion matrix
confusion = confusion_matrix(y_tests, y_preds)
plt.figure()
plot_confusion_matrix(confusion,classes=le.classes_,title='Naive Bayes')
plt.savefig('naive_bayes.png')
plt.show()

yelp_example/th_cnn.py

0 → 100644
+399 −0

File added.

Preview size limit exceeded, changes collapsed.
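th_cnn.py itself is collapsed by the preview, so nothing below comes from the committed file. As a hypothetical sketch only, a training script built on the artifacts written by feature_extraction.py would begin by reloading them, roughly:

#hypothetical sketch, not the committed th_cnn.py: reload the artifacts
#saved by feature_extraction.py (embeddings.npy and data.pkl)
import pickle
import numpy as np

vocab = np.load('embeddings.npy')   #row 0 is the mask row, last row is unk
with open('data.pkl', 'rb') as f:
    data = pickle.load(f)

#each record carries 'label', raw 'text', and per-sentence word index lists
labels = [data[k]['label'] for k in data]
docs = [data[k]['idx'] for k in data]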
