Proxy scraper and multithreaded checker























This is my first "serious" project after learning Python for a while.
The purpose of this script is to scrape proxies and check whether they work against HTTPS websites. The main functionality is:




  • Get the command-line args and apply them (if there are any)

  • Get links to scrape from

  • Scrape for proxies and save them to a file

  • Check the scraped proxies using concurrency (threading) while saving the hits to a new file.


I've heard one of the best ways to learn is to get feedback, and I don't have anyone close to me who does anything related to programming. I hope you can help me out and give me harsh feedback.





proxy_tool.py



import requests
import re
import data
import time
import sys
import os
import argparse
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

# Gets a list of URLs with proxies

def get_links():
    links = []
    keyword = 'server-list'
    index_url = 'http://www.proxyserverlist24.top/'
    page = requests.get(index_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    temp_links = soup.find_all('a')
    for atag in temp_links:
        link = atag.get('href')
        if atag.get('href') is None:
            pass
        elif keyword in link and '#' not in link and link not in links:
            links.append(link)
    return links

# Scrape most recently uploaded proxies and return a list of proxies
# according to the maximum amount entered by the user (default 800)

def scrape(links):
    url = links[0]
    page = requests.get(url)
    ip_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', page.text)
    return max_proxies(ip_list, data.max)

# Save scraped list into a file

def save_scraped(ip_list):
    if os.path.isfile(data.filename):
        os.remove(data.filename)
    with open(data.filename, 'a') as wfile:
        for ip in ip_list:
            wfile.write(ip)
            wfile.write('\n')
    print('[!] {} Proxies were scraped and saved ! '.format(len(ip_list)))

# Maximum amount of proxies to scrape

def max_proxies(ip_list, max):
    ip_list = ip_list.copy()
    return ip_list[0:max]

# Check if proxy is alive and gets a 200 response

def is_good(p):
    proxy = {'https': '{}'.format(p)}
    try:
        r = requests.get(data.url, proxies=proxy, headers=data.headers, timeout=data.timeout)
        if r.status_code is 200:
            hits_count(p)
            save_hits(p)
    except (requests.exceptions.Timeout,
            requests.exceptions.ProxyError,
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError) as e:
        pass

# Save working proxy to a file

def save_hits(p):
    with open('{} Checked ProxyList.txt'.format(data.date), 'a') as wfile:
        wfile.write(p)
        wfile.write('\n')

# Count hits to display when script finished executing

def hits_count(p):
    data.hits += 1
    print('[+] HIT - {}'.format(p))

def hits():
    print('[!] {} Proxies checked and saved !'.format(data.hits))

def check_args(args=None):
    parser = argparse.ArgumentParser(description='A script to quickly get alive HTTPS proxies')
    parser.add_argument('-u', '--url', type=str, help='url to check proxy against', required=False, default='https://www.google.com')
    parser.add_argument('-m', '--max', type=int, help='maximum proxies to scrape', required=False, default=800)
    parser.add_argument('-t', '--timeout', type=int, help='set proxy timeout limit', required=False, default=8)
    parser.add_argument('-st', '--set-threads', type=int, help='set number of threads to run', required=False, default=30)

    results = parser.parse_args(args)
    return(results.url, results.max, results.timeout, results.set_threads)

# Check multiple proxies at once from a given proxy list

def check(p_list):
    pool = ThreadPool(data.num_threads)
    pool.map(is_good, p_list)
    pool.close()
    pool.join()

def main():
    # get_links() returns a list of links, which is passed to scrape(),
    # which returns a proxy list to save in a file
    save_scraped(scrape(get_links()))

    p_list = open(data.filename).read().splitlines()
    check(p_list)
    hits()

if __name__ == "__main__":
    # Set user input
    data.url, data.max, data.timeout, data.num_threads = check_args(sys.argv[1:])
    main()


data.py



This is responsible for holding data.



import random
import datetime

user_agent_list = [
# Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

headers = {'User-Agent': random.choice(user_agent_list)}

date = datetime.datetime.today().strftime('%Y-%m-%d')
filename = '{} ProxyList.txt'.format(date)
threads = []
url = 'https://www.google.com'
timeout = 8
hits = 0
num_threads = 30
max = 800




I have some specific questions:




  1. Is using an external module to hold data like I did in data.py considered good practice, or should I create the variables in main()? (It looks cleaner this way.)

  2. Is using a lot of functions, even for small things like hits() or hits_count(), considered good practice?

  3. The way I implemented save_scraped(scrape(get_links())) looks pretty messy to me, but I tried to avoid using global variables; is that good practice?

  4. By changing to asyncio instead of threading could I achieve faster performance while checking the proxies?

  5. Is my PEP standard conformance okay?


That's all I can think of right now. Feel free to suggest anything, from more Pythonic code to a better implementation of a function, or whatever comes to mind.










      python beginner python-3.x






asked Oct 24 at 16:49 by shaike (112), edited Oct 25 at 11:54






















          1 Answer
































  1. Of course. You can also do another thing: keep all the user agents in a separate file (e.g. ua.txt) and then combine the rest of the code with the main file. With a one-liner you can load all the user agents into user_agent_list:

         for line in open('ua.txt').read().splitlines():
             user_agent_list.append(line.strip())

     But if you like it that way, just keep it. Remember, code is all about the person who writes it, not the one who reads it. (An actual one-liner version is sketched after this list.)

  2. You should not use different functions for different things unless you need to call/use them more than once.

  3. Of course. But you should add some comments to let the code reviewer know what you're up to in that part of the code. :) (See the small example below.)

  4. Instead of changing over to asyncio, you should read How to Wield Threaded Asynchronous Magic (see the sketch after this list).

  5. Yes, but you should add docstrings. To test your code against the PEP 8 standard, use a linter such as flake8.
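On point 1: the loop above can indeed be collapsed into a single line. A minimal sketch, assuming ua.txt sits next to the script with one user agent per line (the file name is the answer's example, not something the script currently has):

    with open('ua.txt') as ua_file:
        user_agent_list = [line.strip() for line in ua_file if line.strip()]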

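On point 3: besides comments, another way to keep save_scraped(scrape(get_links())) readable without global variables is simply to name the intermediate results. A tiny sketch using the script's own functions:

    links = get_links()        # index pages that list proxies
    proxies = scrape(links)    # ip:port strings, capped at data.max
    save_scraped(proxies)      # written to data.filename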

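On point 4: the threaded-asynchronous pattern the linked article describes amounts to driving the same blocking calls through a thread pool from an asyncio event loop. A minimal sketch of the checker in that style, assuming requests stays as the HTTP client (this is an illustration, not code from the article, and plain threads will perform about the same here since the work is network-bound):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    import requests

    def is_good(proxy, url='https://www.google.com', timeout=8):
        # Blocking check: return the proxy if it answers with HTTP 200, else None.
        try:
            r = requests.get(url, proxies={'https': proxy}, timeout=timeout)
            return proxy if r.status_code == 200 else None
        except requests.RequestException:
            return None

    async def check(proxies, num_threads=30):
        # Fan the blocking checks out onto a thread pool and await them all.
        loop = asyncio.get_event_loop()
        with ThreadPoolExecutor(max_workers=num_threads) as pool:
            results = await asyncio.gather(
                *(loop.run_in_executor(pool, is_good, p) for p in proxies))
        return [p for p in results if p is not None]

    # hits = asyncio.get_event_loop().run_until_complete(check(p_list))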
          Hope this helps.






answered Oct 25 at 9:57 by The Infected Drake (1012), edited Oct 26 at 11:10
          • About the user agent list: I've actually found a library that handles all of those strings pretty nicely, fake-useragent. I also added comments to the code to make it more readable. I didn't know about flake8 or linters; I will surely use them. Thank you for your insight, much appreciated!
            – shaike, Oct 25 at 12:02












          • Another note here: avoid using external libraries unless you need them, as they make your execution runtime considerably slower. For example, avoid importing and using an external library such as fake-useragent here when you can have your own set of selected user agents. Edit: I had missed a link in point 4, which I noticed just now; check the updated answer.
            – The Infected Drake, Oct 26 at 11:06
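For reference, a minimal sketch of how the fake-useragent package mentioned in the first comment is typically used (assuming it is installed; UserAgent and its random attribute are the library's documented API):

    from fake_useragent import UserAgent

    ua = UserAgent()
    headers = {'User-Agent': ua.random}  # a random real-world User-Agent string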












