Using sklearn to predict which MeSH terms should be assigned to PubMed articles
I am working on a class project where we have 10 gzip files of PubMed data, each of which contains 1000 PMIDs, and each PMID has its own features such as Title, Abstract, Authors, and assigned MeSH terms.
I am a novice at Python. I have written the code below to find, for every PMID, the Title and Abstract words, the unigrams for both, and the tf-idf of both, and then to use those features to train a linear SVC that predicts which MeSH terms should be assigned to an article.
import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import pandas as pd
import time
import util_5353
# Problem A [0 points]
def read_data(filenames):
    data = None
    # Begin CODE
    data = {}
    contents = []
    for filename in filenames:
        with gzip.open(filename, 'rt') as f:
            contents.append(f.read())
    tween = []
    pmid_list = []
    for d in contents:
        # Each record runs from its PMID line to its SO (source) field;
        # MEDLINE pads field tags to four characters, e.g. 'SO  - '.
        tween.extend(re.findall('^PMID- (.*?)SO  - ', d, re.DOTALL | re.MULTILINE))
        pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
    for i in range(len(tween)):
        mh = re.findall('^MH  - (.*)$', tween[i], re.MULTILINE)
        # MEDLINE wraps long fields; continuation lines start with six spaces.
        content = tween[i].replace('\n      ', ' ')
        ti = re.findall('^TI  - (.*)$', content, re.MULTILINE)
        ab = re.findall('^AB  - (.*)$', content, re.MULTILINE)
        data.update({pmid_list[i]: {'Ti': ti, 'Ab': ab, 'Mh': mh}})
    return data
# Problem B [0 points]
tokenizer = re.compile(r'\w+|[^\s\w]+')
def tokenize(text):
    return tokenizer.findall(text.lower())
# Problem C [0 points]
def pmids(data):
    pmids = []
    # Begin CODE
    for key in data:
        pmids.append(key)
    # End CODE
    return pmids
# Problem 1 [10 points]
def unigrams(data, pmid):
    unigrams = {}
    # Begin CODE
    article = data[pmid]
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    unique_words = list(set(title + abstract))
    unigrams = dict(zip(unique_words, [1.0] * len(unique_words)))
    # End CODE
    return unigrams
# Problem 2 [10 points]
def tfidf(data, pmid):
    tfidf = {}
    # Begin CODE
    article = data[pmid]
    N = len(data)
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    pmid_words = title + abstract
    pmid_counts = {}
    for i in pmid_words:
        pmid_counts[i] = pmid_counts.get(i, 0) + 1
    # Corpus-wide counts: this re-tokenizes every article in the
    # collection on every call.
    doc_words = []
    for key in data:
        doc_words.extend(tokenize(data[key]['Ti'][0]))
        doc_words.extend(tokenize(data[key]['Ab'][0]))
    doc_counts = dict()
    for i in doc_words:
        doc_counts[i] = doc_counts.get(i, 0) + 1
    for val in pmid_words:
        tfidf.update({val: pmid_counts[val] * math.log(N / doc_counts[val])})
    # End CODE
    return tfidf
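# A possible speedup, sketched here rather than wired in: doc_counts above
# depends only on `data`, so it could be computed once and reused across all
# 10,000 tfidf() calls instead of being rebuilt each time.
# `build_doc_counts` is a hypothetical helper name, not part of the assignment.
def build_doc_counts(data):
    # Single pass over the corpus; returns the same counts tfidf() rebuilds.
    doc_counts = {}
    for key in data:
        for word in tokenize(data[key]['Ti'][0]) + tokenize(data[key]['Ab'][0]):
            doc_counts[word] = doc_counts.get(word, 0) + 1
    return doc_counts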
# Problem 3 [10 points]
def mesh(data, pmid):
    mesh = []
    # Begin CODE
    work = []
    article = data[pmid]
    for term in article['Mh']:
        work.extend(tokenize(term))  # (note: work is never used below)
    doc_words = []
    i = 0
    while i < len(article['Mh']):
        if '/' in article['Mh'][i]:
            x = article['Mh'][i]
            x = x.split('/')
            doc_words.append(x[0])  # keep the heading, drop the qualifiers
            i += 1
        else:
            doc_words.append(article['Mh'][i])
            i += 1
    mesh = [s.replace('*', '') for s in doc_words]
    # End CODE
    return mesh
def outcomes(data, train):
    bin_list = []
    n = len(train)
    for val in train:
        bin_list.append(mesh(data, val))
    i = 0
    k = 0
    outcomes = []
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    for val in mesh_list:
        while i < len(bin_list):
            if val in bin_list[i]:
                outcomes.append('1')
                i += 1
            else:
                outcomes.append('0')
                i += 1
        i = 0
    # Split the flat label list into one row per MeSH term.
    outcomes = [outcomes[i:i + n] for i in range(0, len(outcomes), n)]
    return outcomes
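# A hypothetical, equivalent rewrite of outcomes() using comprehensions
# (same labels in the same order; mesh_list is passed in for clarity):
def outcomes_compact(data, train, mesh_list):
    bin_list = [mesh(data, val) for val in train]
    return [['1' if term in row else '0' for row in bin_list]
            for term in mesh_list]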
def linear_svm(data, train, test, mesh, func):
    stuff = {}
    pmids_list = pmids(data)
    for val in pmids_list:
        stuff.update({val: func(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    predictions = {m: [] for m in mesh}
    work = []
    # Appending one row at a time copies the growing frame on every iteration.
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    k = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i + m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    return predictions
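# A sketch of a faster way to build the feature matrix, using the
# DictVectorizer that is already imported above (a hypothetical helper, not
# the assignment's required approach): one sparse matrix for the whole
# corpus instead of row-by-row DataFrame appends. Assumes func(data, pmid)
# returns a {token: weight} dict, as unigrams() and tfidf() do.
def feature_matrix(data, pmid_order, func):
    vec = DictVectorizer()
    # fit_transform returns a scipy sparse matrix; LinearSVC accepts it directly.
    X = vec.fit_transform([func(data, pmid) for pmid in pmid_order])
    return X, vec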
# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, unigrams)
    # End CODE
    return predictions
# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, tfidf)
    # End CODE
    return predictions
# Problem 6 [10 points]
def kmeans(data, k):
    clusters = {}
    # Begin CODE
    stuff = {}
    pmid_list = pmids(data)
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    # The cluster count comes from the k parameter.
    km = KMeans(n_clusters=k, random_state=0, init='random').fit(X)
    labels = km.labels_
    clusters = {pmid_list[i]: int(labels[i]) for i in range(len(pmid_list))}
    # End CODE
    return clusters
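# A hypothetical faster variant of kmeans() using MiniBatchKMeans, which
# usually runs much faster than full KMeans on a 10,000-row unigram matrix
# at some cost in cluster quality (a sketch, not the assignment's required
# estimator):
def kmeans_fast(data, k):
    from sklearn.cluster import MiniBatchKMeans
    pmid_list = pmids(data)
    X = pd.DataFrame.from_dict({p: unigrams(data, p) for p in pmid_list},
                               orient="index").fillna(0)
    labels = MiniBatchKMeans(n_clusters=k, random_state=0).fit(X).labels_
    return {pmid_list[i]: int(labels[i]) for i in range(len(pmid_list))}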
# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    predictions = {m: [] for m in mesh}
    work = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    k = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i + m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions
# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    pmid_list = pmids(data)
    # Note: train and test are re-derived here from the full PMID list,
    # overriding the arguments passed in.
    tts = int(len(pmid_list) * 0.8)
    train = pmid_list[:tts]
    test = pmid_list[tts:]
    stuff = {}
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    k_stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X2 = pd.DataFrame.from_dict(k_stuff, orient="index")
    X = X.join(X2, how='outer')
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    predictions = {m: [] for m in mesh}
    work = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i + m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions
# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
    evaluation = {}
    # Begin CODE
    outcome = outcomes(data, test)
    final = []
    i = 0
    k = 0
    while i < len(outcome):
        while k < len(outcome[i]):
            if outcome[i][k] == '1':
                outcome[i][k] = test[k]
                k += 1
            else:
                k += 1
        k = 0
        i += 1
    for val in outcome:
        final.append(list(filter(lambda a: a != '0', val)))
    dic = {}
    i = 0
    for key in mesh_predict:
        gold_vals = [pmid in final[i] for pmid in test]
        predict_vals = [pmid in mesh_predict[key] for pmid in test]
        recall = recall_score(gold_vals, predict_vals, average='macro')
        accuracy = accuracy_score(gold_vals, predict_vals)
        precision = precision_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))
        f1 = f1_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))
        dic.update({key: {'accuracy': float(accuracy), 'precision': float(precision), 'recall': float(recall), 'f1': float(f1)}})
        i += 1  # advance to the gold row for the next MeSH term
    evaluation.update(dic)
    # End CODE
    return evaluation
# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

    # Comment out some file names to speed up the development process, but
    # ultimately you want to uncomment the filenames so you ensure that your code
    # works with all files. The assertions below assume that medline.0.txt.gz is
    # in the list.
    file_list = []
    file_list.append('medline.0.txt.gz')
    file_list.append('medline.1.txt.gz')
    file_list.append('medline.2.txt.gz')
    file_list.append('medline.3.txt.gz')
    file_list.append('medline.4.txt.gz')
    file_list.append('medline.5.txt.gz')
    file_list.append('medline.6.txt.gz')
    file_list.append('medline.7.txt.gz')
    file_list.append('medline.8.txt.gz')
    file_list.append('medline.9.txt.gz')

    pmid_list = ['22999938', '23010078', '23018989']

    print('::: Problem A :::')
    data = read_data(file_list)

    print('::: Problem C :::')
    _pmids = pmids(data)
    for pmid in pmid_list:
        if pmid not in _pmids:
            util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)
    tts = int(len(_pmids) * 0.8)
    train = _pmids[:tts]
    test = _pmids[tts:]

    print('::: Problem 1 :::')
    one_ret = unigrams(data, pmid_list[0])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(99, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
    one_ret = unigrams(data, pmid_list[1])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(95, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
    one_ret = unigrams(data, pmid_list[2])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(133, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

    print('::: Problem 2 :::')
    two_ret = tfidf(data, pmid_list[0])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(99, len(two_ret), '2')
    util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
    two_ret = tfidf(data, pmid_list[1])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(95, len(two_ret), '2')
    util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
    two_ret = tfidf(data, pmid_list[2])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(133, len(two_ret), '2')
    util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

    print('::: Problem 3 :::')
    three_ret = mesh(data, pmid_list[0])
    GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[1])
    GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[2])
    GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

    print('::: Problem 4 :::')
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    mesh_set = set()
    for pmid in _pmids:
        mesh_set.update(mesh(data, pmid))
    for m in mesh_list:
        if m not in mesh_set:
            util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
    four_ret = svm_predict_unigram(data, train, test, mesh_list)
    util_5353.assert_dict(four_ret, '4')
    for m in mesh_list:
        util_5353.assert_dict_key(four_ret, m, '4')
        util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
    util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

    print('::: Problem 5 :::')
    five_ret = svm_predict_tfidf(data, train, test, mesh_list)
    util_5353.assert_dict(five_ret, '5')
    for m in mesh_list:
        util_5353.assert_dict_key(five_ret, m, '5')
        util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
    util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

    print('::: Problem 6 :::')
    K = 10
    six_ret = kmeans(data, K)
    util_5353.assert_dict(six_ret, '6')
    util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
    for pmid in _pmids:
        util_5353.assert_dict_key(six_ret, pmid, '6')
        util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

    print('::: Problem 7 :::')
    seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
    util_5353.assert_dict(seven_ret, '7')
    for m in mesh_list:
        util_5353.assert_dict_key(seven_ret, m, '7')
        util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
    util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

    print('::: Problem 8 :::')
    eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
    util_5353.assert_dict(eight_ret, '8')
    for m in mesh_list:
        util_5353.assert_dict_key(eight_ret, m, '8')
        util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
    util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

    print(':: Problem 9 ::')
    nine_ret4 = evaluate(data, test, four_ret)
    nine_ret5 = evaluate(data, test, five_ret)
    nine_ret7 = evaluate(data, test, seven_ret)
    nine_ret8 = evaluate(data, test, eight_ret)
    for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
        util_5353.assert_dict(nine_ret, '9')
        for m in mesh_list:
            util_5353.assert_dict_key(nine_ret, m, '9')
            util_5353.assert_dict(nine_ret[m], '9')
            for k in ['accuracy', 'precision', 'recall', 'f1']:
                util_5353.assert_dict_key(nine_ret[m], k, '9')
                util_5353.assert_float(nine_ret[m][k], '9')
                util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

    print('~~~ All Tests Pass ~~~')
When I run the program on all 10 files (10,000 PMIDs), it has taken over 7 hours and is still only part way through the method for Problem 5, svm_predict_tfidf.
Is there a way to speed this up?
My professor says his version of the svm_predict_tfidf method takes 4.5 minutes to run with all 10,000 IDs.
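To see where the time actually goes before optimizing, the standard-library profiler can be run on a subset of the data (a sketch; `data`, `train`, `test`, and `mesh_list` are as in the `__main__` block above):

import cProfile
cProfile.run('svm_predict_tfidf(data, train, test, mesh_list)', sort='cumtime')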
python performance machine-learning clustering natural-language-processing
Comments:

Have you tested this with smaller inputs? Does the code not work (infinite loop), or is it just slow? – Ludisposed, Dec 17 at 17:25

Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers. – 200_success, Dec 17 at 17:26
print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')
print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')
print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')
print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')
print('::: Problem 9 :::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')
print('~~~ All Tests Pass ~~~')
When I run the program on all 10 files (10,000 PMIDs), it has taken over 7 hours to get only partway through the Problem 5 method, svm_predict_tfidf. Is there a way to speed this up?
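One thing I suspect (but haven't verified) is that my tfidf function rebuilds the corpus-wide token counts from scratch on every call, so predicting for ~10,000 PMIDs re-tokenizes every Title and Abstract roughly 10,000 times. Below is a sketch of how I think the counts could be computed once and reused; corpus_counts and tfidf_cached are hypothetical names, not functions from my code above, and the arithmetic is meant to match my existing tfidf:

def corpus_counts(data):
    # One pass over the whole corpus, counting every token occurrence
    # (the same totals my tfidf rebuilds on each call).
    counts = {}
    for key in data:
        for tok in tokenize(data[key]['Ti'][0]) + tokenize(data[key]['Ab'][0]):
            counts[tok] = counts.get(tok, 0) + 1
    return counts

def tfidf_cached(data, pmid, counts):
    # Same formula as my tfidf, but reusing the precomputed corpus counts.
    N = len(data)
    words = tokenize(data[pmid]['Ti'][0]) + tokenize(data[pmid]['Ab'][0])
    tf = {}
    for tok in words:
        tf[tok] = tf.get(tok, 0) + 1
    return {tok: tf[tok] * math.log(N / counts[tok]) for tok in tf}

If that's right, svm_predict_tfidf would call corpus_counts(data) once and pass the result into every per-PMID call, turning thousands of full corpus scans into one.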
My professor says it takes him about 4.5 minutes to run his version of svm_predict_tfidf on all 10,000 IDs, so I assume something in my approach is structurally slow.
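I also wonder whether building df1 and df2 one row at a time with DataFrame.append is part of the problem, since I believe each append copies the entire frame, making those loops quadratic in the number of rows. Because I build X with orient="index", it is indexed by PMID, so I think the two loops could be replaced by a single label-based selection (again just a sketch, not tested):

# Select all training/test rows at once instead of appending row by row.
df1 = X.loc[train]
df2 = X.loc[test]

That should also keep the row order aligned with train and test, which my prediction-unpacking loops assume.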
python performance machine-learning clustering natural-language-processing
asked Dec 17 at 13:54 – Michael Martin (new contributor)
edited Dec 17 at 17:24 – 200_success
Have you tested this with smaller inputs? Does the code not work (infinite loop), or just slow? – Ludisposed, Dec 17 at 17:25
Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers. – 200_success, Dec 17 at 17:26