twitter data mining script in python

up vote
2
down vote

favorite

I have written a simple script that searches twitter for keywords and saves them to a csv file if they contain those words. It can be found on my github here.

How can I improve this code so that it is more efficient and follows coding standards?

"""

Script that goes through English tweets that are filtered by security words and posted in the last hour, and stores the polarity, id, date-time, query, username and text in a CSV file.

"""



import tweepy

import datetime, time, csv, codecs

from textblob import TextBlob

import cleanit



# --- Twitter API authorisation ---------------------------------------------
# Placeholder credentials; replace them with real keys before running.
consumer_key = "xxx"
consumer_secret = "xxx"
access_token = "xxx"
access_token_secret = "xxx"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# --- Module-level accumulators ---------------------------------------------
# Parallel lists: storing_data() appends one entry to text_list, id_list,
# name_list, created_list and polarityy per stored tweet, so index N of each
# describes the same tweet.  big_list and query_list are initialised here but
# are not referenced by the rest of the visible script.
# Each must start as an empty list; a bare "name =" is a syntax error.
big_list = []
text_list = []
id_list = []
name_list = []
created_list = []
query_list = []
polarityy = []

# Cursor into the lists above, used by the main loop to track which stored
# tweets have already been written to the CSV file.
t = 0

# Words in this list are used as search terms for tweepy.Cursor.
security_words = [
    'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
    'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
]

# A search term that also appears in double_meaning_words is ambiguous: the
# tweet is only stored when its text also contains a word from gen_words.
double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
gen_words = [
    "attack", "security", "hit", "detected", "protected", "injection",
    "data", "exploit", "router", 'ransomware', 'phishing', 'wannacry',
]

def storing_data(stat):
    """Record one accepted tweet in the module-level parallel lists.

    Appends the cleaned text, id, screen name, creation time and a polarity
    label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
    ``created_list`` and ``polarityy``, so index N of every list describes
    the same tweet.

    Args:
        stat: Status object exposing ``text``, ``id``, ``user.screen_name``
            and ``created_at`` (a value with a ``strftime`` method).
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.  The text is kept as ``str``: csv.writer would
    # write a ``bytes`` value as its "b'...'" repr.
    text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
    id_list.append(str(stat.id))
    name_list.append(str(stat.user.screen_name))
    created_list.append(stat.created_at.strftime('%c'))

    # Sentiment score of the raw (uncleaned) text, as computed by TextBlob.
    polarity = TextBlob(stat.text).sentiment.polarity

    # Scores in [-1, 0] are labelled "4", everything else "0".
    # NOTE(review): confirm this mapping is intended - some sentiment
    # datasets use 0 for negative and 4 for positive, i.e. the opposite.
    polarityy.append("4" if -1 <= polarity <= 0 else "0")



def rejects(stat):
    """Append the text of a filtered-out tweet to ``rejects.csv``.

    The file is opened in append mode in the current working directory and
    one single-column row is written per call.

    Args:
        stat: Status object exposing a ``text`` attribute.
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.
    with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
        csv.writer(rejectfile).writerow([stat.text])





# Main polling loop: each cycle searches Twitter once per security word,
# files every tweet as either stored or rejected, appends newly stored rows
# to the dataset CSV, then sleeps for 30 minutes (1800 s).
while True:
    print('running', datetime.datetime.now())
    with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
        logger = csv.writer(logfile)
        for i in security_words:
            # Ambiguous search terms need a confirming word in the tweet.
            needs_context = i in double_meaning_words

            # Search English tweets for the term, at most 40 per term.
            # Keep the loop variable a module-level name called ``status``:
            # other code in this file may refer to it by that name.
            for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                # Twitter search is case-insensitive, so compare in lower case.
                lowered = status.text.lower()

                # Any one of these signals marks a retweet.  The original
                # test joined the negated checks with "or", so a tweet was
                # only skipped when every signal was present at once.
                is_retweet = (
                    hasattr(status, 'retweeted_status')
                    or status.retweeted
                    or 'RT @' in status.text
                )

                if is_retweet:
                    keep = False
                elif needs_context:
                    keep = any(word in lowered for word in gen_words)
                else:
                    keep = True

                # Each tweet goes to exactly one destination.
                if keep:
                    storing_data(status)
                else:
                    rejects(status)

                # Write the rows added since the last flush and advance the
                # cursor ``t`` past them.
                pending = zip(
                    polarityy[t:],
                    id_list[t:],
                    created_list[t:],
                    name_list[t:],
                    text_list[t:],
                )
                for row in pending:
                    logger.writerow(row)
                t = len(polarityy)
    time.sleep(1800)

asked Nov 30 at 2:27

dmnte

111

add a comment |

up vote
2
down vote

favorite

I have written a simple script that searches twitter for keywords and saves them to a csv file if they contain those words. It can be found on my github here.

How can I improve this code so that it is more efficient and follows coding standards?

"""

Script that goes through English tweets that are filtered by security words and posted in the last hour, and stores the polarity, id, date-time, query, username and text in a CSV file.

"""



import tweepy

import datetime, time, csv, codecs

from textblob import TextBlob

import cleanit



# --- Twitter API authorisation ---------------------------------------------
# Placeholder credentials; replace them with real keys before running.
consumer_key = "xxx"
consumer_secret = "xxx"
access_token = "xxx"
access_token_secret = "xxx"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# --- Module-level accumulators ---------------------------------------------
# Parallel lists: storing_data() appends one entry to text_list, id_list,
# name_list, created_list and polarityy per stored tweet, so index N of each
# describes the same tweet.  big_list and query_list are initialised here but
# are not referenced by the rest of the visible script.
# Each must start as an empty list; a bare "name =" is a syntax error.
big_list = []
text_list = []
id_list = []
name_list = []
created_list = []
query_list = []
polarityy = []

# Cursor into the lists above, used by the main loop to track which stored
# tweets have already been written to the CSV file.
t = 0

# Words in this list are used as search terms for tweepy.Cursor.
security_words = [
    'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
    'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
]

# A search term that also appears in double_meaning_words is ambiguous: the
# tweet is only stored when its text also contains a word from gen_words.
double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
gen_words = [
    "attack", "security", "hit", "detected", "protected", "injection",
    "data", "exploit", "router", 'ransomware', 'phishing', 'wannacry',
]

def storing_data(stat):
    """Record one accepted tweet in the module-level parallel lists.

    Appends the cleaned text, id, screen name, creation time and a polarity
    label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
    ``created_list`` and ``polarityy``, so index N of every list describes
    the same tweet.

    Args:
        stat: Status object exposing ``text``, ``id``, ``user.screen_name``
            and ``created_at`` (a value with a ``strftime`` method).
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.  The text is kept as ``str``: csv.writer would
    # write a ``bytes`` value as its "b'...'" repr.
    text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
    id_list.append(str(stat.id))
    name_list.append(str(stat.user.screen_name))
    created_list.append(stat.created_at.strftime('%c'))

    # Sentiment score of the raw (uncleaned) text, as computed by TextBlob.
    polarity = TextBlob(stat.text).sentiment.polarity

    # Scores in [-1, 0] are labelled "4", everything else "0".
    # NOTE(review): confirm this mapping is intended - some sentiment
    # datasets use 0 for negative and 4 for positive, i.e. the opposite.
    polarityy.append("4" if -1 <= polarity <= 0 else "0")



def rejects(stat):
    """Append the text of a filtered-out tweet to ``rejects.csv``.

    The file is opened in append mode in the current working directory and
    one single-column row is written per call.

    Args:
        stat: Status object exposing a ``text`` attribute.
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.
    with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
        csv.writer(rejectfile).writerow([stat.text])





# Main polling loop: each cycle searches Twitter once per security word,
# files every tweet as either stored or rejected, appends newly stored rows
# to the dataset CSV, then sleeps for 30 minutes (1800 s).
while True:
    print('running', datetime.datetime.now())
    with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
        logger = csv.writer(logfile)
        for i in security_words:
            # Ambiguous search terms need a confirming word in the tweet.
            needs_context = i in double_meaning_words

            # Search English tweets for the term, at most 40 per term.
            # Keep the loop variable a module-level name called ``status``:
            # other code in this file may refer to it by that name.
            for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                # Twitter search is case-insensitive, so compare in lower case.
                lowered = status.text.lower()

                # Any one of these signals marks a retweet.  The original
                # test joined the negated checks with "or", so a tweet was
                # only skipped when every signal was present at once.
                is_retweet = (
                    hasattr(status, 'retweeted_status')
                    or status.retweeted
                    or 'RT @' in status.text
                )

                if is_retweet:
                    keep = False
                elif needs_context:
                    keep = any(word in lowered for word in gen_words)
                else:
                    keep = True

                # Each tweet goes to exactly one destination.
                if keep:
                    storing_data(status)
                else:
                    rejects(status)

                # Write the rows added since the last flush and advance the
                # cursor ``t`` past them.
                pending = zip(
                    polarityy[t:],
                    id_list[t:],
                    created_list[t:],
                    name_list[t:],
                    text_list[t:],
                )
                for row in pending:
                    logger.writerow(row)
                t = len(polarityy)
    time.sleep(1800)

asked Nov 30 at 2:27

dmnte

111

add a comment |

up vote
2
down vote

favorite

I have written a simple script that searches twitter for keywords and saves them to a csv file if they contain those words. It can be found on my github here.

How can I improve this code so that it is more efficient and follows coding standards?

"""

Script that goes through English tweets that are filtered by security words and posted in the last hour, and stores the polarity, id, date-time, query, username and text in a CSV file.

"""



import tweepy

import datetime, time, csv, codecs

from textblob import TextBlob

import cleanit



# --- Twitter API authorisation ---------------------------------------------
# Placeholder credentials; replace them with real keys before running.
consumer_key = "xxx"
consumer_secret = "xxx"
access_token = "xxx"
access_token_secret = "xxx"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# --- Module-level accumulators ---------------------------------------------
# Parallel lists: storing_data() appends one entry to text_list, id_list,
# name_list, created_list and polarityy per stored tweet, so index N of each
# describes the same tweet.  big_list and query_list are initialised here but
# are not referenced by the rest of the visible script.
# Each must start as an empty list; a bare "name =" is a syntax error.
big_list = []
text_list = []
id_list = []
name_list = []
created_list = []
query_list = []
polarityy = []

# Cursor into the lists above, used by the main loop to track which stored
# tweets have already been written to the CSV file.
t = 0

# Words in this list are used as search terms for tweepy.Cursor.
security_words = [
    'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
    'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
]

# A search term that also appears in double_meaning_words is ambiguous: the
# tweet is only stored when its text also contains a word from gen_words.
double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
gen_words = [
    "attack", "security", "hit", "detected", "protected", "injection",
    "data", "exploit", "router", 'ransomware', 'phishing', 'wannacry',
]

def storing_data(stat):
    """Record one accepted tweet in the module-level parallel lists.

    Appends the cleaned text, id, screen name, creation time and a polarity
    label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
    ``created_list`` and ``polarityy``, so index N of every list describes
    the same tweet.

    Args:
        stat: Status object exposing ``text``, ``id``, ``user.screen_name``
            and ``created_at`` (a value with a ``strftime`` method).
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.  The text is kept as ``str``: csv.writer would
    # write a ``bytes`` value as its "b'...'" repr.
    text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
    id_list.append(str(stat.id))
    name_list.append(str(stat.user.screen_name))
    created_list.append(stat.created_at.strftime('%c'))

    # Sentiment score of the raw (uncleaned) text, as computed by TextBlob.
    polarity = TextBlob(stat.text).sentiment.polarity

    # Scores in [-1, 0] are labelled "4", everything else "0".
    # NOTE(review): confirm this mapping is intended - some sentiment
    # datasets use 0 for negative and 4 for positive, i.e. the opposite.
    polarityy.append("4" if -1 <= polarity <= 0 else "0")



def rejects(stat):
    """Append the text of a filtered-out tweet to ``rejects.csv``.

    The file is opened in append mode in the current working directory and
    one single-column row is written per call.

    Args:
        stat: Status object exposing a ``text`` attribute.
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.
    with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
        csv.writer(rejectfile).writerow([stat.text])





# Main polling loop: each cycle searches Twitter once per security word,
# files every tweet as either stored or rejected, appends newly stored rows
# to the dataset CSV, then sleeps for 30 minutes (1800 s).
while True:
    print('running', datetime.datetime.now())
    with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
        logger = csv.writer(logfile)
        for i in security_words:
            # Ambiguous search terms need a confirming word in the tweet.
            needs_context = i in double_meaning_words

            # Search English tweets for the term, at most 40 per term.
            # Keep the loop variable a module-level name called ``status``:
            # other code in this file may refer to it by that name.
            for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                # Twitter search is case-insensitive, so compare in lower case.
                lowered = status.text.lower()

                # Any one of these signals marks a retweet.  The original
                # test joined the negated checks with "or", so a tweet was
                # only skipped when every signal was present at once.
                is_retweet = (
                    hasattr(status, 'retweeted_status')
                    or status.retweeted
                    or 'RT @' in status.text
                )

                if is_retweet:
                    keep = False
                elif needs_context:
                    keep = any(word in lowered for word in gen_words)
                else:
                    keep = True

                # Each tweet goes to exactly one destination.
                if keep:
                    storing_data(status)
                else:
                    rejects(status)

                # Write the rows added since the last flush and advance the
                # cursor ``t`` past them.
                pending = zip(
                    polarityy[t:],
                    id_list[t:],
                    created_list[t:],
                    name_list[t:],
                    text_list[t:],
                )
                for row in pending:
                    logger.writerow(row)
                t = len(polarityy)
    time.sleep(1800)

asked Nov 30 at 2:27

dmnte

111

I have written a simple script that searches twitter for keywords and saves them to a csv file if they contain those words. It can be found on my github here.

How can I improve this code so that it is more efficient and follows coding standards?

"""

Script that goes through English tweets that are filtered by security words and posted in the last hour, and stores the polarity, id, date-time, query, username and text in a CSV file.

"""



import tweepy

import datetime, time, csv, codecs

from textblob import TextBlob

import cleanit



# --- Twitter API authorisation ---------------------------------------------
# Placeholder credentials; replace them with real keys before running.
consumer_key = "xxx"
consumer_secret = "xxx"
access_token = "xxx"
access_token_secret = "xxx"

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# --- Module-level accumulators ---------------------------------------------
# Parallel lists: storing_data() appends one entry to text_list, id_list,
# name_list, created_list and polarityy per stored tweet, so index N of each
# describes the same tweet.  big_list and query_list are initialised here but
# are not referenced by the rest of the visible script.
# Each must start as an empty list; a bare "name =" is a syntax error.
big_list = []
text_list = []
id_list = []
name_list = []
created_list = []
query_list = []
polarityy = []

# Cursor into the lists above, used by the main loop to track which stored
# tweets have already been written to the CSV file.
t = 0

# Words in this list are used as search terms for tweepy.Cursor.
security_words = [
    'phishing', 'dos', 'botnet', 'xss', 'smb', 'wannacry', 'heartbleed',
    'ransomware', 'trojan', 'spyware', 'exploit', 'virus', 'malware', 'mitm',
]

# A search term that also appears in double_meaning_words is ambiguous: the
# tweet is only stored when its text also contains a word from gen_words.
double_meaning_words = ['petya', 'smb', 'dos', 'infosec', 'hacker', 'backdoor']
gen_words = [
    "attack", "security", "hit", "detected", "protected", "injection",
    "data", "exploit", "router", 'ransomware', 'phishing', 'wannacry',
]

def storing_data(stat):
    """Record one accepted tweet in the module-level parallel lists.

    Appends the cleaned text, id, screen name, creation time and a polarity
    label for ``stat`` to ``text_list``, ``id_list``, ``name_list``,
    ``created_list`` and ``polarityy``, so index N of every list describes
    the same tweet.

    Args:
        stat: Status object exposing ``text``, ``id``, ``user.screen_name``
            and ``created_at`` (a value with a ``strftime`` method).
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.  The text is kept as ``str``: csv.writer would
    # write a ``bytes`` value as its "b'...'" repr.
    text_list.append(str(cleanit.tweet_cleaner_updated(stat.text)))
    id_list.append(str(stat.id))
    name_list.append(str(stat.user.screen_name))
    created_list.append(stat.created_at.strftime('%c'))

    # Sentiment score of the raw (uncleaned) text, as computed by TextBlob.
    polarity = TextBlob(stat.text).sentiment.polarity

    # Scores in [-1, 0] are labelled "4", everything else "0".
    # NOTE(review): confirm this mapping is intended - some sentiment
    # datasets use 0 for negative and 4 for positive, i.e. the opposite.
    polarityy.append("4" if -1 <= polarity <= 0 else "0")



def rejects(stat):
    """Append the text of a filtered-out tweet to ``rejects.csv``.

    The file is opened in append mode in the current working directory and
    one single-column row is written per call.

    Args:
        stat: Status object exposing a ``text`` attribute.
    """
    # Read from the argument rather than a global ``status`` so the function
    # works for any caller.
    with open('rejects.csv', "a", newline='', encoding='utf-8') as rejectfile:
        csv.writer(rejectfile).writerow([stat.text])





# Main polling loop: each cycle searches Twitter once per security word,
# files every tweet as either stored or rejected, appends newly stored rows
# to the dataset CSV, then sleeps for 30 minutes (1800 s).
while True:
    print('running', datetime.datetime.now())
    with open('sec_tweet_dataset_5.csv', "a", newline='', encoding='utf-8') as logfile:
        logger = csv.writer(logfile)
        for i in security_words:
            # Ambiguous search terms need a confirming word in the tweet.
            needs_context = i in double_meaning_words

            # Search English tweets for the term, at most 40 per term.
            # Keep the loop variable a module-level name called ``status``:
            # other code in this file may refer to it by that name.
            for status in tweepy.Cursor(api.search, i, lang="en").items(40):
                # Twitter search is case-insensitive, so compare in lower case.
                lowered = status.text.lower()

                # Any one of these signals marks a retweet.  The original
                # test joined the negated checks with "or", so a tweet was
                # only skipped when every signal was present at once.
                is_retweet = (
                    hasattr(status, 'retweeted_status')
                    or status.retweeted
                    or 'RT @' in status.text
                )

                if is_retweet:
                    keep = False
                elif needs_context:
                    keep = any(word in lowered for word in gen_words)
                else:
                    keep = True

                # Each tweet goes to exactly one destination.
                if keep:
                    storing_data(status)
                else:
                    rejects(status)

                # Write the rows added since the last flush and advance the
                # cursor ``t`` past them.
                pending = zip(
                    polarityy[t:],
                    id_list[t:],
                    created_list[t:],
                    name_list[t:],
                    text_list[t:],
                )
                for row in pending:
                    logger.writerow(row)
                t = len(polarityy)
    time.sleep(1800)

python python-3.x pandas twitter

asked Nov 30 at 2:27

dmnte

111

asked Nov 30 at 2:27

dmnte

111

asked Nov 30 at 2:27

dmnte

111

asked Nov 30 at 2:27

dmnte

111

asked Nov 30 at 2:27

dmnte

111

add a comment |

1 Answer
1

active

oldest

votes

up vote
1
down vote

These following rules are pretty general and take time to internalize. I hope you can apply some of them to your code anyways.

Global variables (variables you don't declare in functions, but at the top level) should be avoided. Constants (variables which you never change) are okay. Instead of changing/mutating global variables in your functions, try to rewrite the functions so they take input and return something.

Try to break your code up into more functions.

Give descriptive variable names (What does "t" do in your code?).

Read through PEP 8 (https://www.python.org/dev/peps/pep-0008/) and try to apply it to your code.

answered Nov 30 at 14:05

Noah Haasis

1112

add a comment |

Your Answer

// If the "editor" module is in use, load "mathjaxEditing" and register a
// creation callback that prepares every new Markdown editor for MathJax.
StackExchange.ifUsing("editor", function () {
    return StackExchange.using("mathjaxEditing", function () {
        StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            // The delimiter pair is backslash-dollar.  It needs a doubled
            // backslash in the string literal: "\$" evaluates to a bare "$".
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\\$", "\\$"]]);
        });
    });
}, "mathjax-editing");

// If the "editor" module is in use, load "externalEditor" and then
// "snippets", and call StackExchange.snippets.init() once both are loaded.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Ready-callback for the answer form: initialises the tag renderer, then
// creates the answer editor once the "externalEditor" module has loaded.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "196"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Passes the answer-editor options to StackExchange.prepareEditor.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The HTML strings need escaped quotes and \u003c / \u003e
                // escapes for the angle brackets; without the backslashes
                // the literals end early and the script does not parse.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});

draft saved

draft discarded

Sign up or log in

// Register a ready-callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

// Register a ready-callback that calls openid.initPostLogin with the
// '.new-post-login' selector, the URL-encoded address of this question's
// "#new-answer" anchor, and the string 'question_page'.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208740%2ftwitter-data-mining-script-in-python%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
1
down vote

These following rules are pretty general and take time to internalize. I hope you can apply some of them to your code anyways.

Try to break your code up into more functions.

Give descriptive variable names (What does "t" do in your code?).

Read through PEP 8 (https://www.python.org/dev/peps/pep-0008/) and try to apply it to your code.

answered Nov 30 at 14:05

Noah Haasis

1112

add a comment |

up vote
1
down vote

These following rules are pretty general and take time to internalize. I hope you can apply some of them to your code anyways.

Try to break your code up into more functions.

Give descriptive variable names (What does "t" do in your code?).

Read through PEP 8 (https://www.python.org/dev/peps/pep-0008/) and try to apply it to your code.

answered Nov 30 at 14:05

Noah Haasis

1112

add a comment |

up vote
1
down vote

These following rules are pretty general and take time to internalize. I hope you can apply some of them to your code anyways.

Try to break your code up into more functions.

Give descriptive variable names (What does "t" do in your code?).

Read through PEP 8 (https://www.python.org/dev/peps/pep-0008/) and try to apply it to your code.

answered Nov 30 at 14:05

Noah Haasis

1112

These following rules are pretty general and take time to internalize. I hope you can apply some of them to your code anyways.

Try to break your code up into more functions.

Give descriptive variable names (What does "t" do in your code?).

Read through PEP 8 (https://www.python.org/dev/peps/pep-0008/) and try to apply it to your code.

answered Nov 30 at 14:05

Noah Haasis

1112

answered Nov 30 at 14:05

Noah Haasis

1112

answered Nov 30 at 14:05

Noah Haasis

1112

answered Nov 30 at 14:05

Noah Haasis

1112

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

Some of your past answers have not been well-received, and you're in danger of being blocked from answering.

Please pay close attention to the following guidance:

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

// Register a ready-callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

// Register a ready-callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Register a ready-callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Register a ready-callback that calls helpers.onClickDraftSave with the
// '#login-link' selector.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Gfrktyl