Using sklearn to predict which MeSH terms should be assigned to PubMed articles


























I am working on a class project where we have 10 gzip files of PubMed data. Each file contains 1,000 PMIDs, and each PMID has its own features such as Title, Abstract, Authors, and assigned MeSH terms.
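
The records follow the MEDLINE flat-file layout that my regexes parse; a made-up record (all field values invented purely for illustration) looks roughly like this:

PMID- 12345678
TI  - A hypothetical article title that is long enough to wrap onto a
      continuation line indented by six spaces.
AB  - A hypothetical abstract, wrapped the same way as the title whenever
      it exceeds the line width.
MH  - Female
MH  - Humans
MH  - *Breast Neoplasms/genetics/pathology
SO  - J Hypothetical. 2012;1(1):1-5.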



I am a novice at Python. The code below finds, for every PMID, the Title and Abstract words, computes the unigrams and tf-idf scores for both, and then uses those features to train a linear SVC that predicts which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
    data = None
    # Begin CODE
    data = {}
    contents = []
    for filename in filenames:
        with gzip.open(filename, 'rt') as f:
            contents.append(f.read())
    tween = []
    pmid_list = []
    for d in contents:
        tween.extend(re.findall('^PMID- (.*?)SO  - ', d, re.DOTALL | re.MULTILINE))
        pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
    for i in range(len(tween)):
        mh = re.findall('^MH  - (.*)$', tween[i], re.MULTILINE)
        # Join MEDLINE continuation lines (wrapped fields are indented six spaces)
        content = tween[i].replace('\n      ', ' ')
        ti = re.findall('^TI  - (.*)$', content, re.MULTILINE)
        ab = re.findall('^AB  - (.*)$', content, re.MULTILINE)
        data.update({pmid_list[i]: {'Ti': ti, 'Ab': ab, 'Mh': mh}})
    # End CODE
    return data

# Problem B [0 points]
tokenizer = re.compile(r'\w+|[^\s\w]+')
def tokenize(text):
    return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
    pmids = []
    # Begin CODE
    for key in data:
        pmids.append(key)
    # End CODE
    return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
    unigrams = {}
    # Begin CODE
    article = data[pmid]
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    unique_words = list(set(title + abstract))
    unigrams = dict(zip(unique_words, [1.0] * len(unique_words)))
    # End CODE
    return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
    tfidf = {}
    # Begin CODE
    article = data[pmid]
    N = len(data)
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    pmid_words = title + abstract
    pmid_counts = {}
    for i in pmid_words:
        pmid_counts[i] = pmid_counts.get(i, 0) + 1

    doc_words = []
    for key in data:
        doc_words.extend(tokenize(data[key]['Ti'][0]))
        doc_words.extend(tokenize(data[key]['Ab'][0]))

    doc_counts = dict()
    for i in doc_words:
        doc_counts[i] = doc_counts.get(i, 0) + 1

    for val in pmid_words:
        tfidf.update({val: pmid_counts[val] * math.log(N / doc_counts[val])})
    # End CODE
    return tfidf
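
# NOTE (added observation): tfidf() re-tokenizes and re-counts the entire
# corpus (doc_words/doc_counts) on every call, so computing tf-idf for all
# 10,000 PMIDs scans the collection 10,000 times. See the sketch after the
# question for a version that counts the corpus once.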

# Problem 3 [10 points]
def mesh(data, pmid):
    mesh = []
    # Begin CODE
    work = []
    article = data[pmid]
    for term in article['Mh']:
        work.extend(tokenize(term))
    doc_words = []
    i = 0
    while i < len(article['Mh']):
        if '/' in article['Mh'][i]:
            x = article['Mh'][i]
            x = x.split('/')
            doc_words.append(x[0])
            i += 1
        else:
            doc_words.append(article['Mh'][i])
            i += 1
    mesh = [s.replace('*', '') for s in doc_words]
    # End CODE
    return mesh

def outcomes(data, train):
    bin_list = []
    n = len(train)
    for val in train:
        bin_list.append(mesh(data, val))
    i = 0
    outcomes = []
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    for val in mesh_list:
        while i < len(bin_list):
            if val in bin_list[i]:
                outcomes.append('1')
                i += 1
            else:
                outcomes.append('0')
                i += 1
        i = 0
    outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]
    return outcomes

def linear_svm(data, train, test, mesh, func):
    stuff = {}
    pmids_list = pmids(data)
    for val in pmids_list:
        stuff.update({val: func(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    return predictions
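
# NOTE (added observation): pandas DataFrame.append copies the whole frame on
# every call, so building df1/df2 one row at a time is quadratic in the number
# of PMIDs; selecting all rows at once, e.g. X.loc[train] and X.loc[test],
# avoids the repeated copies.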

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, unigrams)
    # End CODE
    return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, tfidf)
    # End CODE
    return predictions

# Problem 6 [10 points]
def kmeans(data, k):
    clusters = {}
    # Begin CODE
    stuff = {}
    pmid_list = pmids(data)
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    # Use the k argument rather than a hard-coded 10 clusters
    km = KMeans(n_clusters=k, random_state=0, init='random').fit(X)
    labels = km.labels_
    clusters = {pmid_list[i]: int(labels[i]) for i in range(len(pmid_list))}
    # End CODE
    return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    pmid_list = pmids(data)
    # Note: this recomputes the train/test split, overwriting the arguments
    tts = int(len(pmid_list) * 0.8)
    train = pmid_list[:tts]
    test = pmid_list[tts:]
    stuff = {}
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    k_stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X2 = pd.DataFrame.from_dict(k_stuff, orient="index")
    X = X.join(X2, how='outer')
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
    evaluation = {}
    # Begin CODE
    outcome = outcomes(data, test)
    final = []
    i = 0
    k = 0
    while i < len(outcome):
        while k < len(outcome[i]):
            if outcome[i][k] == '1':
                outcome[i][k] = test[k]
                k += 1
            else:
                k += 1
        k = 0
        i += 1
    for val in outcome:
        final.append(list(filter(lambda a: a != '0', val)))
    dic = {}
    i = 0
    for key in mesh_predict:
        gold_vals = [pmid in final[i] for pmid in test]
        predict_vals = [pmid in mesh_predict[key] for pmid in test]

        recall = recall_score(gold_vals, predict_vals, average='macro')
        accuracy = accuracy_score(gold_vals, predict_vals)
        precision = precision_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))
        f1 = f1_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))

        dic.update({key: {'accuracy': float(accuracy), 'precision': float(precision), 'recall': float(recall), 'f1': float(f1)}})
        i += 1  # advance to the gold labels for the next MeSH term
    evaluation.update(dic)
    # End CODE
    return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

    # Comment out some file names to speed up the development process, but
    # ultimately you want to uncomment the filenames so you ensure that your code
    # works with all files. The assertions below assume that medline.0.txt.gz is
    # in the list.
    file_list = []
    file_list.append('medline.0.txt.gz')
    file_list.append('medline.1.txt.gz')
    file_list.append('medline.2.txt.gz')
    file_list.append('medline.3.txt.gz')
    file_list.append('medline.4.txt.gz')
    file_list.append('medline.5.txt.gz')
    file_list.append('medline.6.txt.gz')
    file_list.append('medline.7.txt.gz')
    file_list.append('medline.8.txt.gz')
    file_list.append('medline.9.txt.gz')

    pmid_list = ['22999938', '23010078', '23018989']

    print('::: Problem A :::')
    data = read_data(file_list)

    print('::: Problem C :::')
    _pmids = pmids(data)
    for pmid in pmid_list:
        if pmid not in _pmids:
            util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

    tts = int(len(_pmids) * 0.8)
    train = _pmids[:tts]
    test = _pmids[tts:]

    print('::: Problem 1 :::')
    one_ret = unigrams(data, pmid_list[0])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(99, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
    one_ret = unigrams(data, pmid_list[1])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(95, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
    one_ret = unigrams(data, pmid_list[2])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(133, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

    print('::: Problem 2 :::')
    two_ret = tfidf(data, pmid_list[0])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(99, len(two_ret), '2')
    util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
    two_ret = tfidf(data, pmid_list[1])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(95, len(two_ret), '2')
    util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
    two_ret = tfidf(data, pmid_list[2])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(133, len(two_ret), '2')
    util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

    print('::: Problem 3 :::')
    three_ret = mesh(data, pmid_list[0])
    GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[1])
    GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[2])
    GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

    print('::: Problem 4 :::')
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    mesh_set = set()
    for pmid in _pmids:
        mesh_set.update(mesh(data, pmid))
    for m in mesh_list:
        if m not in mesh_set:
            util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
    four_ret = svm_predict_unigram(data, train, test, mesh_list)
    util_5353.assert_dict(four_ret, '4')
    for m in mesh_list:
        util_5353.assert_dict_key(four_ret, m, '4')
        util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
    util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

    print('::: Problem 5 :::')
    five_ret = svm_predict_tfidf(data, train, test, mesh_list)
    util_5353.assert_dict(five_ret, '5')
    for m in mesh_list:
        util_5353.assert_dict_key(five_ret, m, '5')
        util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
    util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

    print('::: Problem 6 :::')
    K = 10
    six_ret = kmeans(data, K)
    util_5353.assert_dict(six_ret, '6')
    util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
    for pmid in _pmids:
        util_5353.assert_dict_key(six_ret, pmid, '6')
        util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

    print('::: Problem 7 :::')
    seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
    util_5353.assert_dict(seven_ret, '7')
    for m in mesh_list:
        util_5353.assert_dict_key(seven_ret, m, '7')
        util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
    util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

    print('::: Problem 8 :::')
    eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
    util_5353.assert_dict(eight_ret, '8')
    for m in mesh_list:
        util_5353.assert_dict_key(eight_ret, m, '8')
        util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
    util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

    print(':: Problem 9 ::')
    nine_ret4 = evaluate(data, test, four_ret)
    nine_ret5 = evaluate(data, test, five_ret)
    nine_ret7 = evaluate(data, test, seven_ret)
    nine_ret8 = evaluate(data, test, eight_ret)
    for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
        util_5353.assert_dict(nine_ret, '9')
        for m in mesh_list:
            util_5353.assert_dict_key(nine_ret, m, '9')
            util_5353.assert_dict(nine_ret[m], '9')
            for k in ['accuracy', 'precision', 'recall', 'f1']:
                util_5353.assert_dict_key(nine_ret[m], k, '9')
                util_5353.assert_float(nine_ret[m][k], '9')
                util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

    print('~~~ All Tests Pass ~~~')


When I run the program on all 10 files (10,000 PMIDs), it takes over 7 hours just to get partway through the Problem 5 method, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of the svm_predict_tfidf method on all 10,000 IDs.
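
My own guess at the hot spots: tfidf re-counts the whole corpus once per PMID, and the feature matrix is built by appending one DataFrame row at a time. Here is an untested sketch of the restructuring I have in mind (the names corpus_counts, tfidf_fast, and feature_matrix are mine, not part of the assignment) that counts the corpus once and vectorizes everything in a single DictVectorizer call:

from sklearn.feature_extraction import DictVectorizer

def corpus_counts(data):
    # Count every title/abstract token across the whole corpus exactly once.
    counts = {}
    for article in data.values():
        for word in tokenize(article['Ti'][0]) + tokenize(article['Ab'][0]):
            counts[word] = counts.get(word, 0) + 1
    return counts

def tfidf_fast(data, pmid, doc_counts):
    # Same scoring as tfidf() above, but reuses the precomputed corpus counts.
    N = len(data)
    article = data[pmid]
    words = tokenize(article['Ti'][0]) + tokenize(article['Ab'][0])
    tf = {}
    for word in words:
        tf[word] = tf.get(word, 0) + 1
    return {w: tf[w] * math.log(N / doc_counts[w]) for w in tf}

def feature_matrix(data):
    # Vectorize all per-PMID dicts in one shot; the result is a sparse
    # matrix, which LinearSVC accepts directly.
    ids = pmids(data)
    doc_counts = corpus_counts(data)
    vec = DictVectorizer()
    X = vec.fit_transform([tfidf_fast(data, p, doc_counts) for p in ids])
    return ids, X

# Row lookup instead of repeated DataFrame.append:
# ids, X = feature_matrix(data)
# row = {p: i for i, p in enumerate(ids)}
# X_train = X[[row[p] for p in train]]
# X_test = X[[row[p] for p in test]]

Would something along these lines account for the difference, or is there more to it?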










Have you tested this with smaller inputs? Does the code not work (infinite loop), or is it just slow? – Ludisposed, Dec 17 at 17:25





Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers. – 200_success, Dec 17 at 17:26
















-1














I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
evaluation = {}
# Begin CODE
outcome = outcomes(data, test)
final =
i = 0
k = 0
while i < len(outcome):
while k < len(outcome[i]):
if outcome[i][k] == '1':
outcome[i][k] = test[k]
k+=1
else:
k+=1
k = 0
i+=1
for val in outcome:
final.append(list(filter(lambda a: a != '0', val)))
dic = {}
i = 0
for key in mesh_predict:
gold_vals = [pmid in final[i] for pmid in test]
predict_vals = [pmid in mesh_predict[key] for pmid in test]

recall = recall_score(gold_vals, predict_vals, average='macro')
accuracy = accuracy_score(gold_vals, predict_vals)
precision = precision_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
f1 = f1_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))

dic.update({key:{'accuracy': float(accuracy), 'precision': float(precision),'recall':float(recall),'f1':float(f1)}})
evaluation.update(dic)

# End CODE
return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

# Comment out some file names to speed up the development process, but
# ultimately you want to uncomment the filenames so you ensure that your code
# works will all files. The assertions below assume that medline.0.txt.gz is
# in the list.
file_list =
file_list.append('medline.0.txt.gz')
file_list.append('medline.1.txt.gz')
file_list.append('medline.2.txt.gz')
file_list.append('medline.3.txt.gz')
file_list.append('medline.4.txt.gz')
file_list.append('medline.5.txt.gz')
file_list.append('medline.6.txt.gz')
file_list.append('medline.7.txt.gz')
file_list.append('medline.8.txt.gz')
file_list.append('medline.9.txt.gz')

pmid_list = ['22999938', '23010078', '23018989']

print('::: Problem A :::')
data = read_data(file_list)

print('::: Problem C :::')
_pmids = pmids(data)
for pmid in pmid_list:
if pmid not in _pmids:
util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

tts = int(len(_pmids) * 0.8)
train = _pmids[:tts]
test = _pmids[tts:]

print('::: Problem 1 :::')
one_ret = unigrams(data, pmid_list[0])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(99, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
one_ret = unigrams(data, pmid_list[1])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(95, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
one_ret = unigrams(data, pmid_list[2])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(133, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

print('::: Problem 2 :::')
two_ret = tfidf(data, pmid_list[0])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(99, len(two_ret), '2')
util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
two_ret = tfidf(data, pmid_list[1])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(95, len(two_ret), '2')
util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
two_ret = tfidf(data, pmid_list[2])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(133, len(two_ret), '2')
util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

print('::: Problem 3 :::')
three_ret = mesh(data, pmid_list[0])
GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[1])
GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[2])
GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

print('::: Problem 4 :::')
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
mesh_set = set()
for pmid in _pmids:
mesh_set.update(mesh(data, pmid))
for m in mesh_list:
if m not in mesh_set:
util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
four_ret = svm_predict_unigram(data, train, test, mesh_list)
util_5353.assert_dict(four_ret, '4')
for m in mesh_list:
util_5353.assert_dict_key(four_ret, m, '4')
util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

print(':: Problem 9 ::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

print('~~~ All Tests Pass ~~~')


When I run the program for all 10 documents - 10,000 PMIDs, it has taken over 7 hours to get part way through the method for problem 5, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of svm_predict_tfidf method with all 10,000 IDs.










share|improve this question









New contributor




Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 1




    Have you tested this with smaller inputs? Does the code not work (infinete loop), or just slow?
    – Ludisposed
    Dec 17 at 17:25






  • 2




    Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers.
    – 200_success
    Dec 17 at 17:26














-1












-1








-1







I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
evaluation = {}
# Begin CODE
outcome = outcomes(data, test)
final =
i = 0
k = 0
while i < len(outcome):
while k < len(outcome[i]):
if outcome[i][k] == '1':
outcome[i][k] = test[k]
k+=1
else:
k+=1
k = 0
i+=1
for val in outcome:
final.append(list(filter(lambda a: a != '0', val)))
dic = {}
i = 0
for key in mesh_predict:
gold_vals = [pmid in final[i] for pmid in test]
predict_vals = [pmid in mesh_predict[key] for pmid in test]

recall = recall_score(gold_vals, predict_vals, average='macro')
accuracy = accuracy_score(gold_vals, predict_vals)
precision = precision_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
f1 = f1_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))

dic.update({key:{'accuracy': float(accuracy), 'precision': float(precision),'recall':float(recall),'f1':float(f1)}})
evaluation.update(dic)

# End CODE
return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

# Comment out some file names to speed up the development process, but
# ultimately you want to uncomment the filenames so you ensure that your code
# works will all files. The assertions below assume that medline.0.txt.gz is
# in the list.
file_list =
file_list.append('medline.0.txt.gz')
file_list.append('medline.1.txt.gz')
file_list.append('medline.2.txt.gz')
file_list.append('medline.3.txt.gz')
file_list.append('medline.4.txt.gz')
file_list.append('medline.5.txt.gz')
file_list.append('medline.6.txt.gz')
file_list.append('medline.7.txt.gz')
file_list.append('medline.8.txt.gz')
file_list.append('medline.9.txt.gz')

pmid_list = ['22999938', '23010078', '23018989']

print('::: Problem A :::')
data = read_data(file_list)

print('::: Problem C :::')
_pmids = pmids(data)
for pmid in pmid_list:
if pmid not in _pmids:
util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

tts = int(len(_pmids) * 0.8)
train = _pmids[:tts]
test = _pmids[tts:]

print('::: Problem 1 :::')
one_ret = unigrams(data, pmid_list[0])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(99, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
one_ret = unigrams(data, pmid_list[1])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(95, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
one_ret = unigrams(data, pmid_list[2])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(133, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

print('::: Problem 2 :::')
two_ret = tfidf(data, pmid_list[0])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(99, len(two_ret), '2')
util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
two_ret = tfidf(data, pmid_list[1])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(95, len(two_ret), '2')
util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
two_ret = tfidf(data, pmid_list[2])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(133, len(two_ret), '2')
util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

print('::: Problem 3 :::')
three_ret = mesh(data, pmid_list[0])
GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[1])
GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[2])
GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

print('::: Problem 4 :::')
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
mesh_set = set()
for pmid in _pmids:
mesh_set.update(mesh(data, pmid))
for m in mesh_list:
if m not in mesh_set:
util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
four_ret = svm_predict_unigram(data, train, test, mesh_list)
util_5353.assert_dict(four_ret, '4')
for m in mesh_list:
util_5353.assert_dict_key(four_ret, m, '4')
util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

print(':: Problem 9 ::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

print('~~~ All Tests Pass ~~~')


When I run the program for all 10 documents - 10,000 PMIDs, it has taken over 7 hours to get part way through the method for problem 5, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of svm_predict_tfidf method with all 10,000 IDs.










share|improve this question









New contributor




Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
    evaluation = {}
    # Begin CODE
    outcome = outcomes(data, test)
    final = []
    i = 0
    k = 0
    # Turn each '1' in the gold outcome vectors back into the matching PMID
    while i < len(outcome):
        while k < len(outcome[i]):
            if outcome[i][k] == '1':
                outcome[i][k] = test[k]
            k += 1
        k = 0
        i += 1
    for val in outcome:
        final.append(list(filter(lambda a: a != '0', val)))
    dic = {}
    i = 0
    # Assumes mesh_predict iterates in the same order as the gold mesh list
    for key in mesh_predict:
        gold_vals = [pmid in final[i] for pmid in test]
        predict_vals = [pmid in mesh_predict[key] for pmid in test]

        recall = recall_score(gold_vals, predict_vals, average='macro')
        accuracy = accuracy_score(gold_vals, predict_vals)
        precision = precision_score(gold_vals, predict_vals, average='macro',
                                    labels=np.unique(predict_vals))
        f1 = f1_score(gold_vals, predict_vals, average='macro',
                      labels=np.unique(predict_vals))

        dic.update({key: {'accuracy': float(accuracy), 'precision': float(precision),
                          'recall': float(recall), 'f1': float(f1)}})
        i += 1
    evaluation.update(dic)
    # End CODE
    return evaluation
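
# Note on Problem 9: sklearn.metrics.precision_recall_fscore_support could
# compute precision, recall, and f1 in one call per term (untested sketch):
#
#     from sklearn.metrics import precision_recall_fscore_support
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         gold_vals, predict_vals, average='macro')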

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

    # Comment out some file names to speed up the development process, but
    # ultimately you want to uncomment the filenames so you ensure that your code
    # works with all files. The assertions below assume that medline.0.txt.gz is
    # in the list.
    file_list = []
    file_list.append('medline.0.txt.gz')
    file_list.append('medline.1.txt.gz')
    file_list.append('medline.2.txt.gz')
    file_list.append('medline.3.txt.gz')
    file_list.append('medline.4.txt.gz')
    file_list.append('medline.5.txt.gz')
    file_list.append('medline.6.txt.gz')
    file_list.append('medline.7.txt.gz')
    file_list.append('medline.8.txt.gz')
    file_list.append('medline.9.txt.gz')

    pmid_list = ['22999938', '23010078', '23018989']

    print('::: Problem A :::')
    data = read_data(file_list)

    print('::: Problem C :::')
    _pmids = pmids(data)
    for pmid in pmid_list:
        if pmid not in _pmids:
            util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

    tts = int(len(_pmids) * 0.8)
    train = _pmids[:tts]
    test = _pmids[tts:]

    print('::: Problem 1 :::')
    one_ret = unigrams(data, pmid_list[0])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(99, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
    one_ret = unigrams(data, pmid_list[1])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(95, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
    one_ret = unigrams(data, pmid_list[2])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(133, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

    print('::: Problem 2 :::')
    two_ret = tfidf(data, pmid_list[0])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(99, len(two_ret), '2')
    util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
    two_ret = tfidf(data, pmid_list[1])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(95, len(two_ret), '2')
    util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
    two_ret = tfidf(data, pmid_list[2])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(133, len(two_ret), '2')
    util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

    print('::: Problem 3 :::')
    three_ret = mesh(data, pmid_list[0])
    GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[1])
    GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[2])
    GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

    print('::: Problem 4 :::')
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    mesh_set = set()
    for pmid in _pmids:
        mesh_set.update(mesh(data, pmid))
    for m in mesh_list:
        if m not in mesh_set:
            util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
    four_ret = svm_predict_unigram(data, train, test, mesh_list)
    util_5353.assert_dict(four_ret, '4')
    for m in mesh_list:
        util_5353.assert_dict_key(four_ret, m, '4')
        util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
    util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

    print('::: Problem 5 :::')
    five_ret = svm_predict_tfidf(data, train, test, mesh_list)
    util_5353.assert_dict(five_ret, '5')
    for m in mesh_list:
        util_5353.assert_dict_key(five_ret, m, '5')
        util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
    util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

    print('::: Problem 6 :::')
    K = 10
    six_ret = kmeans(data, K)
    util_5353.assert_dict(six_ret, '6')
    util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
    for pmid in _pmids:
        util_5353.assert_dict_key(six_ret, pmid, '6')
        util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

    print('::: Problem 7 :::')
    seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
    util_5353.assert_dict(seven_ret, '7')
    for m in mesh_list:
        util_5353.assert_dict_key(seven_ret, m, '7')
        util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
    util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

    print('::: Problem 8 :::')
    eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
    util_5353.assert_dict(eight_ret, '8')
    for m in mesh_list:
        util_5353.assert_dict_key(eight_ret, m, '8')
        util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
    util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

    print(':: Problem 9 ::')
    nine_ret4 = evaluate(data, test, four_ret)
    nine_ret5 = evaluate(data, test, five_ret)
    nine_ret7 = evaluate(data, test, seven_ret)
    nine_ret8 = evaluate(data, test, eight_ret)
    for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
        util_5353.assert_dict(nine_ret, '9')
        for m in mesh_list:
            util_5353.assert_dict_key(nine_ret, m, '9')
            util_5353.assert_dict(nine_ret[m], '9')
            for k in ['accuracy', 'precision', 'recall', 'f1']:
                util_5353.assert_dict_key(nine_ret[m], k, '9')
                util_5353.assert_float(nine_ret[m][k], '9')
                util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

    print('~~~ All Tests Pass ~~~')


When I run the program on all 10 files (10,000 PMIDs), it has taken over 7 hours and is still only part way through the Problem 5 method, svm_predict_tfidf.

Is there a way to speed this up?
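
One thing I suspect (but have not verified) is the row-by-row DataFrame building in Problems 4-8: each df.append copies the whole frame, so the cost grows quadratically with the number of rows. I believe .loc with a list of labels would select all the rows in one shot, something like:

    # untested sketch: select every train/test row at once instead of
    # growing df1/df2 one append at a time
    df1 = X.loc[train]
    df2 = X.loc[test]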



My professor says his version of svm_predict_tfidf runs over all 10,000 IDs in about 4.5 minutes.
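
Since that is so much faster than mine, I assume the tf-idf features can be built once for the whole corpus instead of per PMID. A sketch of what I have in mind (untested; note that sklearn's smoothed idf and l2 normalization differ from the count * log(N/df) weighting the assignment asks for, so the Problem 2 assertions would still need the hand-rolled version):

    from sklearn.feature_extraction.text import TfidfVectorizer

    # one string of title + abstract per PMID, in a fixed order
    pmid_list = pmids(data)
    docs = [' '.join(data[p]['Ti'] + data[p]['Ab']) for p in pmid_list]

    # reuse the tokenize() function above; the result is a sparse matrix
    # with one row per PMID, built in a single pass over the corpus
    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    X_tfidf = vectorizer.fit_transform(docs)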







python performance machine-learning clustering natural-language-processing






edited Dec 17 at 17:24 by 200_success
asked Dec 17 at 13:54 by Michael Martin

Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering. Check out our Code of Conduct.

  • 1
    Have you tested this with smaller inputs? Does the code not work (infinite loop), or just slow?
    – Ludisposed Dec 17 at 17:25

  • 2
    Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers.
    – 200_success Dec 17 at 17:26













