Using sklearn to predict which MeSH terms should be assigned to PubMed articles


























I am working on a class project where we have 10 gzip files of PubMed data. Each file contains 1,000 PMIDs, and each PMID has its own features such as Title, Abstract, Authors, and assigned MeSH terms.
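
The records follow the MEDLINE flat-file layout that my regexes parse; a made-up record (all field values invented purely for illustration) looks roughly like this:

PMID- 12345678
TI  - A hypothetical article title that is long enough to wrap onto a
      continuation line indented by six spaces.
AB  - A hypothetical abstract, wrapped the same way as the title whenever
      it exceeds the line width.
MH  - Female
MH  - Humans
MH  - *Breast Neoplasms/genetics/pathology
SO  - J Hypothetical. 2012;1(1):1-5.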



I am a novice at Python. The code below finds, for every PMID, the Title and Abstract words, computes the unigrams and tf-idf scores for both, and then uses those features to train a linear SVC that predicts which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
    data = None
    # Begin CODE
    data = {}
    contents = []
    for filename in filenames:
        with gzip.open(filename, 'rt') as f:
            contents.append(f.read())
    tween = []
    pmid_list = []
    for d in contents:
        tween.extend(re.findall('^PMID- (.*?)SO  - ', d, re.DOTALL | re.MULTILINE))
        pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
    for i in range(len(tween)):
        mh = re.findall('^MH  - (.*)$', tween[i], re.MULTILINE)
        # Join MEDLINE continuation lines (wrapped fields are indented six spaces)
        content = tween[i].replace('\n      ', ' ')
        ti = re.findall('^TI  - (.*)$', content, re.MULTILINE)
        ab = re.findall('^AB  - (.*)$', content, re.MULTILINE)
        data.update({pmid_list[i]: {'Ti': ti, 'Ab': ab, 'Mh': mh}})
    # End CODE
    return data

# Problem B [0 points]
tokenizer = re.compile(r'\w+|[^\s\w]+')
def tokenize(text):
    return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
    pmids = []
    # Begin CODE
    for key in data:
        pmids.append(key)
    # End CODE
    return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
    unigrams = {}
    # Begin CODE
    article = data[pmid]
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    unique_words = list(set(title + abstract))
    unigrams = dict(zip(unique_words, [1.0] * len(unique_words)))
    # End CODE
    return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
    tfidf = {}
    # Begin CODE
    article = data[pmid]
    N = len(data)
    title = tokenize(article['Ti'][0])
    abstract = tokenize(article['Ab'][0])
    pmid_words = title + abstract
    pmid_counts = {}
    for i in pmid_words:
        pmid_counts[i] = pmid_counts.get(i, 0) + 1

    doc_words = []
    for key in data:
        doc_words.extend(tokenize(data[key]['Ti'][0]))
        doc_words.extend(tokenize(data[key]['Ab'][0]))

    doc_counts = dict()
    for i in doc_words:
        doc_counts[i] = doc_counts.get(i, 0) + 1

    for val in pmid_words:
        tfidf.update({val: pmid_counts[val] * math.log(N / doc_counts[val])})
    # End CODE
    return tfidf
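
# NOTE (added observation): tfidf() re-tokenizes and re-counts the entire
# corpus (doc_words/doc_counts) on every call, so computing tf-idf for all
# 10,000 PMIDs scans the collection 10,000 times. See the sketch after the
# question for a version that counts the corpus once.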

# Problem 3 [10 points]
def mesh(data, pmid):
    mesh = []
    # Begin CODE
    work = []
    article = data[pmid]
    for term in article['Mh']:
        work.extend(tokenize(term))
    doc_words = []
    i = 0
    while i < len(article['Mh']):
        if '/' in article['Mh'][i]:
            x = article['Mh'][i]
            x = x.split('/')
            doc_words.append(x[0])
            i += 1
        else:
            doc_words.append(article['Mh'][i])
            i += 1
    mesh = [s.replace('*', '') for s in doc_words]
    # End CODE
    return mesh

def outcomes(data, train):
    bin_list = []
    n = len(train)
    for val in train:
        bin_list.append(mesh(data, val))
    i = 0
    outcomes = []
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    for val in mesh_list:
        while i < len(bin_list):
            if val in bin_list[i]:
                outcomes.append('1')
                i += 1
            else:
                outcomes.append('0')
                i += 1
        i = 0
    outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]
    return outcomes

def linear_svm(data, train, test, mesh, func):
    stuff = {}
    pmids_list = pmids(data)
    for val in pmids_list:
        stuff.update({val: func(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    return predictions
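
# NOTE (added observation): pandas DataFrame.append copies the whole frame on
# every call, so building df1/df2 one row at a time is quadratic in the number
# of PMIDs; selecting all rows at once, e.g. X.loc[train] and X.loc[test],
# avoids the repeated copies.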

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, unigrams)
    # End CODE
    return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    predictions = linear_svm(data, train, test, mesh, tfidf)
    # End CODE
    return predictions

# Problem 6 [10 points]
def kmeans(data, k):
    clusters = {}
    # Begin CODE
    stuff = {}
    pmid_list = pmids(data)
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X = X.replace({np.nan: 0})
    # Use the k argument rather than a hard-coded 10 clusters
    km = KMeans(n_clusters=k, random_state=0, init='random').fit(X)
    labels = km.labels_
    clusters = {pmid_list[i]: int(labels[i]) for i in range(len(pmid_list))}
    # End CODE
    return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
    predictions = {m: [] for m in mesh}
    # Begin CODE
    pmid_list = pmids(data)
    # Note: this recomputes the train/test split, overwriting the arguments
    tts = int(len(pmid_list) * 0.8)
    train = pmid_list[:tts]
    test = pmid_list[tts:]
    stuff = {}
    for val in pmid_list:
        stuff.update({val: unigrams(data, val)})
    k_stuff = kmeans(data, k)
    X = pd.DataFrame.from_dict(stuff, orient="index")
    X2 = pd.DataFrame.from_dict(k_stuff, orient="index")
    X = X.join(X2, how='outer')
    X = X.replace({np.nan: 0})
    outcome_data = outcomes(data, train)
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    clf = LinearSVC()
    predict = []
    for val in train:
        df1 = df1.append(X.loc[val])
    for val in test:
        df2 = df2.append(X.loc[val])
    for val in outcome_data:
        clf.fit(df1, val)
        predict.append(list(clf.predict(df2)))
    work = []
    i = 0
    m = len(test)
    final = []
    for val in predict:
        while i < len(val):
            if val[i] == '1':
                work.append(test[i])
                i += 1
            else:
                work.append('0')
                i += 1
        i = 0
    work = [work[i:i+m] for i in range(0, len(work), m)]
    for val in work:
        final.append(list(filter(lambda a: a != '0', val)))
    predictions = {m: [] for m in mesh}
    for i in range(0, 10):
        predictions.update({mesh[i]: final[i]})
    # End CODE
    return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
    evaluation = {}
    # Begin CODE
    outcome = outcomes(data, test)
    final = []
    i = 0
    k = 0
    while i < len(outcome):
        while k < len(outcome[i]):
            if outcome[i][k] == '1':
                outcome[i][k] = test[k]
                k += 1
            else:
                k += 1
        k = 0
        i += 1
    for val in outcome:
        final.append(list(filter(lambda a: a != '0', val)))
    dic = {}
    i = 0
    for key in mesh_predict:
        gold_vals = [pmid in final[i] for pmid in test]
        predict_vals = [pmid in mesh_predict[key] for pmid in test]

        recall = recall_score(gold_vals, predict_vals, average='macro')
        accuracy = accuracy_score(gold_vals, predict_vals)
        precision = precision_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))
        f1 = f1_score(gold_vals, predict_vals, average='macro', labels=np.unique(predict_vals))

        dic.update({key: {'accuracy': float(accuracy), 'precision': float(precision), 'recall': float(recall), 'f1': float(f1)}})
        i += 1  # advance to the gold labels for the next MeSH term
    evaluation.update(dic)
    # End CODE
    return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

    # Comment out some file names to speed up the development process, but
    # ultimately you want to uncomment the filenames so you ensure that your code
    # works with all files. The assertions below assume that medline.0.txt.gz is
    # in the list.
    file_list = []
    file_list.append('medline.0.txt.gz')
    file_list.append('medline.1.txt.gz')
    file_list.append('medline.2.txt.gz')
    file_list.append('medline.3.txt.gz')
    file_list.append('medline.4.txt.gz')
    file_list.append('medline.5.txt.gz')
    file_list.append('medline.6.txt.gz')
    file_list.append('medline.7.txt.gz')
    file_list.append('medline.8.txt.gz')
    file_list.append('medline.9.txt.gz')

    pmid_list = ['22999938', '23010078', '23018989']

    print('::: Problem A :::')
    data = read_data(file_list)

    print('::: Problem C :::')
    _pmids = pmids(data)
    for pmid in pmid_list:
        if pmid not in _pmids:
            util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

    tts = int(len(_pmids) * 0.8)
    train = _pmids[:tts]
    test = _pmids[tts:]

    print('::: Problem 1 :::')
    one_ret = unigrams(data, pmid_list[0])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(99, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
    one_ret = unigrams(data, pmid_list[1])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(95, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
    one_ret = unigrams(data, pmid_list[2])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(133, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

    print('::: Problem 2 :::')
    two_ret = tfidf(data, pmid_list[0])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(99, len(two_ret), '2')
    util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
    two_ret = tfidf(data, pmid_list[1])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(95, len(two_ret), '2')
    util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
    two_ret = tfidf(data, pmid_list[2])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(133, len(two_ret), '2')
    util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

    print('::: Problem 3 :::')
    three_ret = mesh(data, pmid_list[0])
    GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[1])
    GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[2])
    GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

    print('::: Problem 4 :::')
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    mesh_set = set()
    for pmid in _pmids:
        mesh_set.update(mesh(data, pmid))
    for m in mesh_list:
        if m not in mesh_set:
            util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
    four_ret = svm_predict_unigram(data, train, test, mesh_list)
    util_5353.assert_dict(four_ret, '4')
    for m in mesh_list:
        util_5353.assert_dict_key(four_ret, m, '4')
        util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
    util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

    print('::: Problem 5 :::')
    five_ret = svm_predict_tfidf(data, train, test, mesh_list)
    util_5353.assert_dict(five_ret, '5')
    for m in mesh_list:
        util_5353.assert_dict_key(five_ret, m, '5')
        util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
    util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

    print('::: Problem 6 :::')
    K = 10
    six_ret = kmeans(data, K)
    util_5353.assert_dict(six_ret, '6')
    util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
    for pmid in _pmids:
        util_5353.assert_dict_key(six_ret, pmid, '6')
        util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

    print('::: Problem 7 :::')
    seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
    util_5353.assert_dict(seven_ret, '7')
    for m in mesh_list:
        util_5353.assert_dict_key(seven_ret, m, '7')
        util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
    util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

    print('::: Problem 8 :::')
    eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
    util_5353.assert_dict(eight_ret, '8')
    for m in mesh_list:
        util_5353.assert_dict_key(eight_ret, m, '8')
        util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
    util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

    print(':: Problem 9 ::')
    nine_ret4 = evaluate(data, test, four_ret)
    nine_ret5 = evaluate(data, test, five_ret)
    nine_ret7 = evaluate(data, test, seven_ret)
    nine_ret8 = evaluate(data, test, eight_ret)
    for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
        util_5353.assert_dict(nine_ret, '9')
        for m in mesh_list:
            util_5353.assert_dict_key(nine_ret, m, '9')
            util_5353.assert_dict(nine_ret[m], '9')
            for k in ['accuracy', 'precision', 'recall', 'f1']:
                util_5353.assert_dict_key(nine_ret[m], k, '9')
                util_5353.assert_float(nine_ret[m][k], '9')
                util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

    print('~~~ All Tests Pass ~~~')


When I run the program on all 10 files (10,000 PMIDs), it takes over 7 hours just to get partway through the Problem 5 method, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of the svm_predict_tfidf method on all 10,000 IDs.
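
My own guess at the hot spots: tfidf re-counts the whole corpus once per PMID, and the feature matrix is built by appending one DataFrame row at a time. Here is an untested sketch of the restructuring I have in mind (the names corpus_counts, tfidf_fast, and feature_matrix are mine, not part of the assignment) that counts the corpus once and vectorizes everything in a single DictVectorizer call:

from sklearn.feature_extraction import DictVectorizer

def corpus_counts(data):
    # Count every title/abstract token across the whole corpus exactly once.
    counts = {}
    for article in data.values():
        for word in tokenize(article['Ti'][0]) + tokenize(article['Ab'][0]):
            counts[word] = counts.get(word, 0) + 1
    return counts

def tfidf_fast(data, pmid, doc_counts):
    # Same scoring as tfidf() above, but reuses the precomputed corpus counts.
    N = len(data)
    article = data[pmid]
    words = tokenize(article['Ti'][0]) + tokenize(article['Ab'][0])
    tf = {}
    for word in words:
        tf[word] = tf.get(word, 0) + 1
    return {w: tf[w] * math.log(N / doc_counts[w]) for w in tf}

def feature_matrix(data):
    # Vectorize all per-PMID dicts in one shot; the result is a sparse
    # matrix, which LinearSVC accepts directly.
    ids = pmids(data)
    doc_counts = corpus_counts(data)
    vec = DictVectorizer()
    X = vec.fit_transform([tfidf_fast(data, p, doc_counts) for p in ids])
    return ids, X

# Row lookup instead of repeated DataFrame.append:
# ids, X = feature_matrix(data)
# row = {p: i for i, p in enumerate(ids)}
# X_train = X[[row[p] for p in train]]
# X_test = X[[row[p] for p in test]]

Would something along these lines account for the difference, or is there more to it?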










Have you tested this with smaller inputs? Does the code not work (infinite loop), or is it just slow? – Ludisposed, Dec 17 at 17:25





Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers. – 200_success, Dec 17 at 17:26
















-1














I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
evaluation = {}
# Begin CODE
outcome = outcomes(data, test)
final =
i = 0
k = 0
while i < len(outcome):
while k < len(outcome[i]):
if outcome[i][k] == '1':
outcome[i][k] = test[k]
k+=1
else:
k+=1
k = 0
i+=1
for val in outcome:
final.append(list(filter(lambda a: a != '0', val)))
dic = {}
i = 0
for key in mesh_predict:
gold_vals = [pmid in final[i] for pmid in test]
predict_vals = [pmid in mesh_predict[key] for pmid in test]

recall = recall_score(gold_vals, predict_vals, average='macro')
accuracy = accuracy_score(gold_vals, predict_vals)
precision = precision_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
f1 = f1_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))

dic.update({key:{'accuracy': float(accuracy), 'precision': float(precision),'recall':float(recall),'f1':float(f1)}})
evaluation.update(dic)

# End CODE
return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

# Comment out some file names to speed up the development process, but
# ultimately you want to uncomment the filenames so you ensure that your code
# works will all files. The assertions below assume that medline.0.txt.gz is
# in the list.
file_list =
file_list.append('medline.0.txt.gz')
file_list.append('medline.1.txt.gz')
file_list.append('medline.2.txt.gz')
file_list.append('medline.3.txt.gz')
file_list.append('medline.4.txt.gz')
file_list.append('medline.5.txt.gz')
file_list.append('medline.6.txt.gz')
file_list.append('medline.7.txt.gz')
file_list.append('medline.8.txt.gz')
file_list.append('medline.9.txt.gz')

pmid_list = ['22999938', '23010078', '23018989']

print('::: Problem A :::')
data = read_data(file_list)

print('::: Problem C :::')
_pmids = pmids(data)
for pmid in pmid_list:
if pmid not in _pmids:
util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

tts = int(len(_pmids) * 0.8)
train = _pmids[:tts]
test = _pmids[tts:]

print('::: Problem 1 :::')
one_ret = unigrams(data, pmid_list[0])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(99, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
one_ret = unigrams(data, pmid_list[1])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(95, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
one_ret = unigrams(data, pmid_list[2])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(133, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

print('::: Problem 2 :::')
two_ret = tfidf(data, pmid_list[0])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(99, len(two_ret), '2')
util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
two_ret = tfidf(data, pmid_list[1])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(95, len(two_ret), '2')
util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
two_ret = tfidf(data, pmid_list[2])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(133, len(two_ret), '2')
util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

print('::: Problem 3 :::')
three_ret = mesh(data, pmid_list[0])
GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[1])
GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[2])
GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

print('::: Problem 4 :::')
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
mesh_set = set()
for pmid in _pmids:
mesh_set.update(mesh(data, pmid))
for m in mesh_list:
if m not in mesh_set:
util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
four_ret = svm_predict_unigram(data, train, test, mesh_list)
util_5353.assert_dict(four_ret, '4')
for m in mesh_list:
util_5353.assert_dict_key(four_ret, m, '4')
util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

print(':: Problem 9 ::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

print('~~~ All Tests Pass ~~~')


When I run the program for all 10 documents - 10,000 PMIDs, it has taken over 7 hours to get part way through the method for problem 5, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of svm_predict_tfidf method with all 10,000 IDs.










share|improve this question









New contributor




Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 1




    Have you tested this with smaller inputs? Does the code not work (infinete loop), or just slow?
    – Ludisposed
    Dec 17 at 17:25






  • 2




    Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers.
    – 200_success
    Dec 17 at 17:26














-1












-1








-1







I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
evaluation = {}
# Begin CODE
outcome = outcomes(data, test)
final =
i = 0
k = 0
while i < len(outcome):
while k < len(outcome[i]):
if outcome[i][k] == '1':
outcome[i][k] = test[k]
k+=1
else:
k+=1
k = 0
i+=1
for val in outcome:
final.append(list(filter(lambda a: a != '0', val)))
dic = {}
i = 0
for key in mesh_predict:
gold_vals = [pmid in final[i] for pmid in test]
predict_vals = [pmid in mesh_predict[key] for pmid in test]

recall = recall_score(gold_vals, predict_vals, average='macro')
accuracy = accuracy_score(gold_vals, predict_vals)
precision = precision_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
f1 = f1_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))

dic.update({key:{'accuracy': float(accuracy), 'precision': float(precision),'recall':float(recall),'f1':float(f1)}})
evaluation.update(dic)

# End CODE
return evaluation

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

# Comment out some file names to speed up the development process, but
# ultimately you want to uncomment the filenames so you ensure that your code
# works will all files. The assertions below assume that medline.0.txt.gz is
# in the list.
file_list =
file_list.append('medline.0.txt.gz')
file_list.append('medline.1.txt.gz')
file_list.append('medline.2.txt.gz')
file_list.append('medline.3.txt.gz')
file_list.append('medline.4.txt.gz')
file_list.append('medline.5.txt.gz')
file_list.append('medline.6.txt.gz')
file_list.append('medline.7.txt.gz')
file_list.append('medline.8.txt.gz')
file_list.append('medline.9.txt.gz')

pmid_list = ['22999938', '23010078', '23018989']

print('::: Problem A :::')
data = read_data(file_list)

print('::: Problem C :::')
_pmids = pmids(data)
for pmid in pmid_list:
if pmid not in _pmids:
util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

tts = int(len(_pmids) * 0.8)
train = _pmids[:tts]
test = _pmids[tts:]

print('::: Problem 1 :::')
one_ret = unigrams(data, pmid_list[0])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(99, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
one_ret = unigrams(data, pmid_list[1])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(95, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
one_ret = unigrams(data, pmid_list[2])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(133, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

print('::: Problem 2 :::')
two_ret = tfidf(data, pmid_list[0])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(99, len(two_ret), '2')
util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
two_ret = tfidf(data, pmid_list[1])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(95, len(two_ret), '2')
util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
two_ret = tfidf(data, pmid_list[2])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(133, len(two_ret), '2')
util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

print('::: Problem 3 :::')
three_ret = mesh(data, pmid_list[0])
GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[1])
GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[2])
GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

print('::: Problem 4 :::')
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
mesh_set = set()
for pmid in _pmids:
mesh_set.update(mesh(data, pmid))
for m in mesh_list:
if m not in mesh_set:
util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
four_ret = svm_predict_unigram(data, train, test, mesh_list)
util_5353.assert_dict(four_ret, '4')
for m in mesh_list:
util_5353.assert_dict_key(four_ret, m, '4')
util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

print(':: Problem 9 ::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

print('~~~ All Tests Pass ~~~')


When I run the program for all 10 documents - 10,000 PMIDs, it has taken over 7 hours to get part way through the method for problem 5, svm_predict_tfidf.



Is there a way to speed this up?



My professor says it takes him 4.5 minutes to run his version of svm_predict_tfidf method with all 10,000 IDs.










share|improve this question









New contributor




Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms.



I am a novice at Python and have written the below code to find for every PMID , the Title and Abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.



import gzip
import math
import re
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time

import util_5353

# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents =
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween =
pmid_list =
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})


return data

# Problem B [0 points]
tokenizer = re.compile('w+|[^sw]+')
def tokenize(text):
return tokenizer.findall(text.lower())

# Problem C [0 points]
def pmids(data):
pmids =
# Begin CODE

for key in data:
pmids.append(key)
# End CODE
return pmids

# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE

article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))

# End CODE
return unigrams

# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE

article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1

doc_words =
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))

doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1

for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})

# End CODE
return tfidf

# Problem 3 [10 points]
def mesh(data, pmid):
mesh =
# Begin CODE

work =
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words =
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]

# End CODE
return mesh

def outcomes(data, train):

bin_list =
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes =
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]

return outcomes

def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions

# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions

# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m: for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions

# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters

# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}

stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
k = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m: for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict =
predictions = {m: for m in mesh}
work =
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work =
i = 0
m = len(test)
final =
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m: for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions

# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
    evaluation = {}
    # Begin CODE
    outcome = outcomes(data, test)
    final = []
    i = 0
    k = 0
    # Turn each '1' in the gold outcome vectors back into the matching PMID
    while i < len(outcome):
        while k < len(outcome[i]):
            if outcome[i][k] == '1':
                outcome[i][k] = test[k]
            k += 1
        k = 0
        i += 1
    for val in outcome:
        final.append(list(filter(lambda a: a != '0', val)))
    dic = {}
    i = 0
    # Assumes mesh_predict iterates in the same order as the gold mesh list
    for key in mesh_predict:
        gold_vals = [pmid in final[i] for pmid in test]
        predict_vals = [pmid in mesh_predict[key] for pmid in test]

        recall = recall_score(gold_vals, predict_vals, average='macro')
        accuracy = accuracy_score(gold_vals, predict_vals)
        precision = precision_score(gold_vals, predict_vals, average='macro',
                                    labels=np.unique(predict_vals))
        f1 = f1_score(gold_vals, predict_vals, average='macro',
                      labels=np.unique(predict_vals))

        dic.update({key: {'accuracy': float(accuracy), 'precision': float(precision),
                          'recall': float(recall), 'f1': float(f1)}})
        i += 1
    evaluation.update(dic)
    # End CODE
    return evaluation
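
# Note on Problem 9: sklearn.metrics.precision_recall_fscore_support could
# compute precision, recall, and f1 in one call per term (untested sketch):
#
#     from sklearn.metrics import precision_recall_fscore_support
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         gold_vals, predict_vals, average='macro')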

# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':

    # Comment out some file names to speed up the development process, but
    # ultimately you want to uncomment the filenames so you ensure that your code
    # works with all files. The assertions below assume that medline.0.txt.gz is
    # in the list.
    file_list = []
    file_list.append('medline.0.txt.gz')
    file_list.append('medline.1.txt.gz')
    file_list.append('medline.2.txt.gz')
    file_list.append('medline.3.txt.gz')
    file_list.append('medline.4.txt.gz')
    file_list.append('medline.5.txt.gz')
    file_list.append('medline.6.txt.gz')
    file_list.append('medline.7.txt.gz')
    file_list.append('medline.8.txt.gz')
    file_list.append('medline.9.txt.gz')

    pmid_list = ['22999938', '23010078', '23018989']

    print('::: Problem A :::')
    data = read_data(file_list)

    print('::: Problem C :::')
    _pmids = pmids(data)
    for pmid in pmid_list:
        if pmid not in _pmids:
            util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)

    tts = int(len(_pmids) * 0.8)
    train = _pmids[:tts]
    test = _pmids[tts:]

    print('::: Problem 1 :::')
    one_ret = unigrams(data, pmid_list[0])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(99, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
    one_ret = unigrams(data, pmid_list[1])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(95, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
    one_ret = unigrams(data, pmid_list[2])
    util_5353.assert_dict(one_ret, '1')
    util_5353.assert_int_eq(133, len(one_ret), '1')
    util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')

    print('::: Problem 2 :::')
    two_ret = tfidf(data, pmid_list[0])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(99, len(two_ret), '2')
    util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
    two_ret = tfidf(data, pmid_list[1])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(95, len(two_ret), '2')
    util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
    two_ret = tfidf(data, pmid_list[2])
    util_5353.assert_dict(two_ret, '2')
    util_5353.assert_int_eq(133, len(two_ret), '2')
    util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')

    print('::: Problem 3 :::')
    three_ret = mesh(data, pmid_list[0])
    GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[1])
    GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
    three_ret = mesh(data, pmid_list[2])
    GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
    util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)

    print('::: Problem 4 :::')
    mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
                 'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
    mesh_set = set()
    for pmid in _pmids:
        mesh_set.update(mesh(data, pmid))
    for m in mesh_list:
        if m not in mesh_set:
            util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
    four_ret = svm_predict_unigram(data, train, test, mesh_list)
    util_5353.assert_dict(four_ret, '4')
    for m in mesh_list:
        util_5353.assert_dict_key(four_ret, m, '4')
        util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
    util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')

    print('::: Problem 5 :::')
    five_ret = svm_predict_tfidf(data, train, test, mesh_list)
    util_5353.assert_dict(five_ret, '5')
    for m in mesh_list:
        util_5353.assert_dict_key(five_ret, m, '5')
        util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
    util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')

    print('::: Problem 6 :::')
    K = 10
    six_ret = kmeans(data, K)
    util_5353.assert_dict(six_ret, '6')
    util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
    for pmid in _pmids:
        util_5353.assert_dict_key(six_ret, pmid, '6')
        util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')

    print('::: Problem 7 :::')
    seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
    util_5353.assert_dict(seven_ret, '7')
    for m in mesh_list:
        util_5353.assert_dict_key(seven_ret, m, '7')
        util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
    util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')

    print('::: Problem 8 :::')
    eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
    util_5353.assert_dict(eight_ret, '8')
    for m in mesh_list:
        util_5353.assert_dict_key(eight_ret, m, '8')
        util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
        util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
    util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')

    print(':: Problem 9 ::')
    nine_ret4 = evaluate(data, test, four_ret)
    nine_ret5 = evaluate(data, test, five_ret)
    nine_ret7 = evaluate(data, test, seven_ret)
    nine_ret8 = evaluate(data, test, eight_ret)
    for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
        util_5353.assert_dict(nine_ret, '9')
        for m in mesh_list:
            util_5353.assert_dict_key(nine_ret, m, '9')
            util_5353.assert_dict(nine_ret[m], '9')
            for k in ['accuracy', 'precision', 'recall', 'f1']:
                util_5353.assert_dict_key(nine_ret[m], k, '9')
                util_5353.assert_float(nine_ret[m][k], '9')
                util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')

    print('~~~ All Tests Pass ~~~')


When I run the program on all 10 files (10,000 PMIDs), it has taken over 7 hours and is still only part way through the Problem 5 method, svm_predict_tfidf.

Is there a way to speed this up?
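
One thing I suspect (but have not verified) is the row-by-row DataFrame building in Problems 4-8: each df.append copies the whole frame, so the cost grows quadratically with the number of rows. I believe .loc with a list of labels would select all the rows in one shot, something like:

    # untested sketch: select every train/test row at once instead of
    # growing df1/df2 one append at a time
    df1 = X.loc[train]
    df2 = X.loc[test]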



My professor says his version of svm_predict_tfidf runs over all 10,000 IDs in about 4.5 minutes.
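
Since that is so much faster than mine, I assume the tf-idf features can be built once for the whole corpus instead of per PMID. A sketch of what I have in mind (untested; note that sklearn's smoothed idf and l2 normalization differ from the count * log(N/df) weighting the assignment asks for, so the Problem 2 assertions would still need the hand-rolled version):

    from sklearn.feature_extraction.text import TfidfVectorizer

    # one string of title + abstract per PMID, in a fixed order
    pmid_list = pmids(data)
    docs = [' '.join(data[p]['Ti'] + data[p]['Ab']) for p in pmid_list]

    # reuse the tokenize() function above; the result is a sparse matrix
    # with one row per PMID, built in a single pass over the corpus
    vectorizer = TfidfVectorizer(tokenizer=tokenize)
    X_tfidf = vectorizer.fit_transform(docs)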







python performance machine-learning clustering natural-language-processing






edited Dec 17 at 17:24 by 200_success
asked Dec 17 at 13:54 by Michael Martin

Michael Martin is a new contributor to this site. Take care in asking for clarification, commenting, and answering. Check out our Code of Conduct.

  • 1
    Have you tested this with smaller inputs? Does the code not work (infinite loop), or just slow?
    – Ludisposed Dec 17 at 17:25

  • 2
    Welcome to Code Review! I suggest that you include a sample of the data files, to help any reviewers.
    – 200_success Dec 17 at 17:26













