Python Program Generating N-Gram Language Model












0












$begingroup$


I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.



Any suggestion would be really helpful.



def N_Gram(corpus, n):
corpus = ''.join(corpus)
corpus = corpus.split(' ')
output = {}
for i in range(len(corpus)-n+1):
g = ' '.join(corpus[i:i+n])
output.setdefault(g,0)
output[g] += 1
return output

def Uni_Generation():
corpus = ReadFile()
uni = N_Gram(corpus, 1)
print(uni)
final = unsmoothed_totalcount(uni)
print(final)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while sentence.split()[-1] != '</s>': #last word is not </s>
sentence += return_random_selected_item(final, uni)
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
#print(sentence)
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list

def unsmoothed_totalcount(n_gram_dict):
#get total count of words
keyList = n_gram_dict.keys()
final = 0
for x in n_gram_dict:
for i,a in enumerate(keyList):
if a == x:
prev_word = keyList[i-1]
prev_count = n_gram_dict[prev_word] # get the previous word count
final += prev_count
return final

def find_next_word(N_Gram_dic, histo):
#find all bigram start with histo, use the count and do everything
corpus = ReadFile()
n_count_dic = {}
uni_dic= N_Gram(corpus, 1)
if histo.count(' ') == 0: # there is no history for it
#print("get here")
n_count_dic = uni_dic
else:
keyList = N_Gram_dic.keys()
#print(keyList)
leng = histo.count(' ') + 1
#print("histo is ")
#print(histo)
for a in keyList:
#word = ' '.join(a.split()[:leng])
#print("word is " + word)
if(histo == a):

# if histo == ' '.join(a.split()[:leng]):
n_count_dic[a] = N_Gram_dic[a]
#print(n_count_dic)
final = unsmoothed_totalcount(n_count_dic)
#print("i did my best")
#print(final)
return return_random_selected_item(final, n_count_dic)

def return_random_selected_item(total_count, n_count_dict):
print(total_count)
r = random.randint(1,total_count)
for x in n_count_dict:
f1 = n_count_dict[x]
if r - f1 <= 0 : # if the word choosen is not ending token
return x.split()[-1] + ' '
if r > f1:
r = r - f1


and the main part is here



def N_Gram_Generation(n):
corpus = ReadFile()
if n == 1:
return Uni_Generation()

uni_gram = N_Gram(corpus, 1)
n_Gram = N_Gram(corpus, n)
N_m1_Gram = N_Gram(corpus, n-1)
#final = unsmoothed_totalcount(N_m1_Gram)
sentence_list = # the list of 5 sentences

for b in xrange(0,5):
sentence = '<s> '
while len(sentence.split()) < n:
word = find_next_word(uni_gram, sentence.split()[-1])
sentence += word
while sentence.split()[-1] != '</s>':
# list = sentence.split()
# his= list[-(n-1):]
# histor = ' '.join(his)
# print(histor)
next_word = find_next_word(n_Gram, sentence.split()[-1])
if next_word != '</s>':
sentence += next_word
if next_word == '</s>':
break
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'

sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list


Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!









share









$endgroup$

















    0












    $begingroup$


    I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.



    Any suggestion would be really helpful.



    def N_Gram(corpus, n):
    corpus = ''.join(corpus)
    corpus = corpus.split(' ')
    output = {}
    for i in range(len(corpus)-n+1):
    g = ' '.join(corpus[i:i+n])
    output.setdefault(g,0)
    output[g] += 1
    return output

    def Uni_Generation():
    corpus = ReadFile()
    uni = N_Gram(corpus, 1)
    print(uni)
    final = unsmoothed_totalcount(uni)
    print(final)
    sentence_list = # the list of 5 sentences
    for b in xrange(0,5):
    sentence = '<s> '
    while sentence.split()[-1] != '</s>': #last word is not </s>
    sentence += return_random_selected_item(final, uni)
    if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
    sentence += '</s>'
    #print(sentence)
    sentence = post_processing(sentence)
    sentence_list.append(sentence)
    return sentence_list

    def unsmoothed_totalcount(n_gram_dict):
    #get total count of words
    keyList = n_gram_dict.keys()
    final = 0
    for x in n_gram_dict:
    for i,a in enumerate(keyList):
    if a == x:
    prev_word = keyList[i-1]
    prev_count = n_gram_dict[prev_word] # get the previous word count
    final += prev_count
    return final

    def find_next_word(N_Gram_dic, histo):
    #find all bigram start with histo, use the count and do everything
    corpus = ReadFile()
    n_count_dic = {}
    uni_dic= N_Gram(corpus, 1)
    if histo.count(' ') == 0: # there is no history for it
    #print("get here")
    n_count_dic = uni_dic
    else:
    keyList = N_Gram_dic.keys()
    #print(keyList)
    leng = histo.count(' ') + 1
    #print("histo is ")
    #print(histo)
    for a in keyList:
    #word = ' '.join(a.split()[:leng])
    #print("word is " + word)
    if(histo == a):

    # if histo == ' '.join(a.split()[:leng]):
    n_count_dic[a] = N_Gram_dic[a]
    #print(n_count_dic)
    final = unsmoothed_totalcount(n_count_dic)
    #print("i did my best")
    #print(final)
    return return_random_selected_item(final, n_count_dic)

    def return_random_selected_item(total_count, n_count_dict):
    print(total_count)
    r = random.randint(1,total_count)
    for x in n_count_dict:
    f1 = n_count_dict[x]
    if r - f1 <= 0 : # if the word choosen is not ending token
    return x.split()[-1] + ' '
    if r > f1:
    r = r - f1


    and the main part is here



    def N_Gram_Generation(n):
    corpus = ReadFile()
    if n == 1:
    return Uni_Generation()

    uni_gram = N_Gram(corpus, 1)
    n_Gram = N_Gram(corpus, n)
    N_m1_Gram = N_Gram(corpus, n-1)
    #final = unsmoothed_totalcount(N_m1_Gram)
    sentence_list = # the list of 5 sentences

    for b in xrange(0,5):
    sentence = '<s> '
    while len(sentence.split()) < n:
    word = find_next_word(uni_gram, sentence.split()[-1])
    sentence += word
    while sentence.split()[-1] != '</s>':
    # list = sentence.split()
    # his= list[-(n-1):]
    # histor = ' '.join(his)
    # print(histor)
    next_word = find_next_word(n_Gram, sentence.split()[-1])
    if next_word != '</s>':
    sentence += next_word
    if next_word == '</s>':
    break
    if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
    sentence += '</s>'

    sentence = post_processing(sentence)
    sentence_list.append(sentence)
    return sentence_list


    Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
    Thanks!









    share









    $endgroup$















      0












      0








      0





      $begingroup$


      I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.



      Any suggestion would be really helpful.



      def N_Gram(corpus, n):
      corpus = ''.join(corpus)
      corpus = corpus.split(' ')
      output = {}
      for i in range(len(corpus)-n+1):
      g = ' '.join(corpus[i:i+n])
      output.setdefault(g,0)
      output[g] += 1
      return output

      def Uni_Generation():
      corpus = ReadFile()
      uni = N_Gram(corpus, 1)
      print(uni)
      final = unsmoothed_totalcount(uni)
      print(final)
      sentence_list = # the list of 5 sentences
      for b in xrange(0,5):
      sentence = '<s> '
      while sentence.split()[-1] != '</s>': #last word is not </s>
      sentence += return_random_selected_item(final, uni)
      if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
      sentence += '</s>'
      #print(sentence)
      sentence = post_processing(sentence)
      sentence_list.append(sentence)
      return sentence_list

      def unsmoothed_totalcount(n_gram_dict):
      #get total count of words
      keyList = n_gram_dict.keys()
      final = 0
      for x in n_gram_dict:
      for i,a in enumerate(keyList):
      if a == x:
      prev_word = keyList[i-1]
      prev_count = n_gram_dict[prev_word] # get the previous word count
      final += prev_count
      return final

      def find_next_word(N_Gram_dic, histo):
      #find all bigram start with histo, use the count and do everything
      corpus = ReadFile()
      n_count_dic = {}
      uni_dic= N_Gram(corpus, 1)
      if histo.count(' ') == 0: # there is no history for it
      #print("get here")
      n_count_dic = uni_dic
      else:
      keyList = N_Gram_dic.keys()
      #print(keyList)
      leng = histo.count(' ') + 1
      #print("histo is ")
      #print(histo)
      for a in keyList:
      #word = ' '.join(a.split()[:leng])
      #print("word is " + word)
      if(histo == a):

      # if histo == ' '.join(a.split()[:leng]):
      n_count_dic[a] = N_Gram_dic[a]
      #print(n_count_dic)
      final = unsmoothed_totalcount(n_count_dic)
      #print("i did my best")
      #print(final)
      return return_random_selected_item(final, n_count_dic)

      def return_random_selected_item(total_count, n_count_dict):
      print(total_count)
      r = random.randint(1,total_count)
      for x in n_count_dict:
      f1 = n_count_dict[x]
      if r - f1 <= 0 : # if the word choosen is not ending token
      return x.split()[-1] + ' '
      if r > f1:
      r = r - f1


      and the main part is here



      def N_Gram_Generation(n):
      corpus = ReadFile()
      if n == 1:
      return Uni_Generation()

      uni_gram = N_Gram(corpus, 1)
      n_Gram = N_Gram(corpus, n)
      N_m1_Gram = N_Gram(corpus, n-1)
      #final = unsmoothed_totalcount(N_m1_Gram)
      sentence_list = # the list of 5 sentences

      for b in xrange(0,5):
      sentence = '<s> '
      while len(sentence.split()) < n:
      word = find_next_word(uni_gram, sentence.split()[-1])
      sentence += word
      while sentence.split()[-1] != '</s>':
      # list = sentence.split()
      # his= list[-(n-1):]
      # histor = ' '.join(his)
      # print(histor)
      next_word = find_next_word(n_Gram, sentence.split()[-1])
      if next_word != '</s>':
      sentence += next_word
      if next_word == '</s>':
      break
      if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
      sentence += '</s>'

      sentence = post_processing(sentence)
      sentence_list.append(sentence)
      return sentence_list


      Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
      Thanks!









      share









      $endgroup$




      I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.



      Any suggestion would be really helpful.



      def N_Gram(corpus, n):
      corpus = ''.join(corpus)
      corpus = corpus.split(' ')
      output = {}
      for i in range(len(corpus)-n+1):
      g = ' '.join(corpus[i:i+n])
      output.setdefault(g,0)
      output[g] += 1
      return output

      def Uni_Generation():
      corpus = ReadFile()
      uni = N_Gram(corpus, 1)
      print(uni)
      final = unsmoothed_totalcount(uni)
      print(final)
      sentence_list = # the list of 5 sentences
      for b in xrange(0,5):
      sentence = '<s> '
      while sentence.split()[-1] != '</s>': #last word is not </s>
      sentence += return_random_selected_item(final, uni)
      if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
      sentence += '</s>'
      #print(sentence)
      sentence = post_processing(sentence)
      sentence_list.append(sentence)
      return sentence_list

      def unsmoothed_totalcount(n_gram_dict):
      #get total count of words
      keyList = n_gram_dict.keys()
      final = 0
      for x in n_gram_dict:
      for i,a in enumerate(keyList):
      if a == x:
      prev_word = keyList[i-1]
      prev_count = n_gram_dict[prev_word] # get the previous word count
      final += prev_count
      return final

      def find_next_word(N_Gram_dic, histo):
      #find all bigram start with histo, use the count and do everything
      corpus = ReadFile()
      n_count_dic = {}
      uni_dic= N_Gram(corpus, 1)
      if histo.count(' ') == 0: # there is no history for it
      #print("get here")
      n_count_dic = uni_dic
      else:
      keyList = N_Gram_dic.keys()
      #print(keyList)
      leng = histo.count(' ') + 1
      #print("histo is ")
      #print(histo)
      for a in keyList:
      #word = ' '.join(a.split()[:leng])
      #print("word is " + word)
      if(histo == a):

      # if histo == ' '.join(a.split()[:leng]):
      n_count_dic[a] = N_Gram_dic[a]
      #print(n_count_dic)
      final = unsmoothed_totalcount(n_count_dic)
      #print("i did my best")
      #print(final)
      return return_random_selected_item(final, n_count_dic)

      def return_random_selected_item(total_count, n_count_dict):
      print(total_count)
      r = random.randint(1,total_count)
      for x in n_count_dict:
      f1 = n_count_dict[x]
      if r - f1 <= 0 : # if the word choosen is not ending token
      return x.split()[-1] + ' '
      if r > f1:
      r = r - f1


      and the main part is here



      def N_Gram_Generation(n):
      corpus = ReadFile()
      if n == 1:
      return Uni_Generation()

      uni_gram = N_Gram(corpus, 1)
      n_Gram = N_Gram(corpus, n)
      N_m1_Gram = N_Gram(corpus, n-1)
      #final = unsmoothed_totalcount(N_m1_Gram)
      sentence_list = # the list of 5 sentences

      for b in xrange(0,5):
      sentence = '<s> '
      while len(sentence.split()) < n:
      word = find_next_word(uni_gram, sentence.split()[-1])
      sentence += word
      while sentence.split()[-1] != '</s>':
      # list = sentence.split()
      # his= list[-(n-1):]
      # histor = ' '.join(his)
      # print(histor)
      next_word = find_next_word(n_Gram, sentence.split()[-1])
      if next_word != '</s>':
      sentence += next_word
      if next_word == '</s>':
      break
      if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
      sentence += '</s>'

      sentence = post_processing(sentence)
      sentence_list.append(sentence)
      return sentence_list


      Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
      Thanks!







      python





      share












      share










      share



      share










      asked 4 mins ago









      Yuhe ZhuYuhe Zhu

      233




      233






















          0






          active

          oldest

          votes











          Your Answer





          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "196"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: false,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212668%2fpython-program-generating-n-gram-language-model%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212668%2fpython-program-generating-n-gram-language-model%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          Сан-Квентин

          Алькесар

          Josef Freinademetz