Proxy scraper and multithreaded checker

This is my first "serious" project after learning Python for a while.
The purpose of this script is to scrape proxies and check whether they work against HTTPS websites. The main functionality is:




  • Get the command-line arguments and apply them (if any are given)

  • Get links to scrape from

  • Scrape for proxies and save them to a file

  • Check the scraped proxies using concurrency (threading) while saving the hits to a new file.


I've heard that one of the best ways to learn is to get feedback, and I don't have anyone close to me who does anything related to programming. I hope you can help me out and give me harsh feedback.





proxy_tool.py



import requests
import re
import data
import time
import sys
import os
import argparse
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool

# Get's list of url's with proxies

def get_links():
    links = []
    keyword = 'server-list'
    index_url = 'http://www.proxyserverlist24.top/'
    page = requests.get(index_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    temp_links = soup.find_all('a')
    for atag in temp_links:
        link = atag.get('href')
        if atag.get('href') is None:
            pass
        elif keyword in link and '#' not in link and link not in links:
            links.append(link)
    return links

# Scrape most recently uploaded proxies and returns a list of proxies
# according to the maximum amount entered by the user (default 800)

def scrape(links):
    url = links[0]
    page = requests.get(url)
    ip_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', page.text)
    return max_proxies(ip_list,data.max)

# Save scraped list into a file

def save_scraped(ip_list):
    if os.path.isfile(data.filename):
        os.remove(data.filename)
    with open(data.filename,'a') as wfile:
        for ip in ip_list:
            wfile.write(ip)
            wfile.write('\n')
    print('[!] {} Proxies were scraped and saved ! '.format(len(ip_list)))

# Maximum amount of proxies to scrape

def max_proxies(ip_list, max):
    ip_list = ip_list.copy()
    return ip_list[0:max]

# Check if proxy is alive and gets a 200 response

def is_good(p):
    proxy = {'https' : '{}'.format(p)}
    try:
        r = requests.get(data.url,proxies=proxy,headers=data.headers,timeout=data.timeout)
        if r.status_code is 200:
            hits_count(p)
            save_hits(p)
    except (requests.exceptions.Timeout,
            requests.exceptions.ProxyError,
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError) as e:
        pass

# Save working proxy to a file

def save_hits(p):
    with open('{} Checked ProxyList.txt'.format(data.date),'a') as wfile:
        wfile.write(p)
        wfile.write('\n')

# Count hits to display when script finished executing

def hits_count(p):
    data.hits += 1
    print('[+] HIT - {}'.format(p))

def hits():
    print('[!] {} Proxies checked and saved !'.format(data.hits))

def check_args(args=None):
    parser = argparse.ArgumentParser(description='A script to quickly get alive HTTPS proxies')
    parser.add_argument('-u', '--url', type=str, help='url to check proxy against', required=False, default='https://www.google.com')
    parser.add_argument('-m', '--max', type=int, help='maximum proxies to scrape', required=False, default=800)
    parser.add_argument('-t', '--timeout', type=int, help='set proxy timeout limit', required=False, default=8)
    parser.add_argument('-st', '--set-threads', type=int, help='set number of threads to run', required=False, default=30)

    results = parser.parse_args(args)
    return(results.url, results.max, results.timeout, results.set_threads)

# Check multiple proxies at once from a given proxy list

def check(p_list):
    pool = ThreadPool(data.num_threads)
    pool.map(is_good,p_list)
    pool.close()
    pool.join()

def main():
    # Get_links returns a list with links which is passed to scrape() to scrape from
    # which returns a proxy list to save in a file
    save_scraped(scrape(get_links()))

    p_list = open(data.filename).read().splitlines()
    check(p_list)
    hits()

if __name__ == "__main__":
    # Set user input
    data.url, data.max, data.timeout, data.num_threads = check_args(sys.argv[1:])
    main()


data.py



This is responsible for holding data.



import random
import datetime

user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Firefox
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

headers = {'User-Agent': random.choice(user_agent_list)}

date = datetime.datetime.today().strftime('%Y-%m-%d')
filename = '{} ProxyList.txt'.format(date)
url = 'https://www.google.com'
timeout = 8
hits = 0
num_threads = 30
max = 800




I have some specific questions:




  1. Is using an external module to hold data, like I did with data.py, considered good practice, or should I create the variables in main()? (It looks cleaner this way.)

  2. Is using a lot of functions, even for small things like hits() or hits_count(), considered good practice?

  3. The way I implemented save_scraped(scrape(get_links())) looks pretty messy to me, but I tried to avoid using global variables; is that good practice?

  4. Could I achieve faster performance while checking the proxies by switching from threading to asyncio?

  5. Is my PEP 8 conformance okay?


That's all I can think of right now. Feel free to suggest anything, from more Pythonic code to a better implementation of a function, or whatever comes to your mind.










python beginner python-3.x

asked Oct 24 at 16:49, edited Oct 25 at 11:54 – shaike

1 Answer

  1. Of course. You can also do another thing: keep all the user agents in a separate file (e.g. ua.txt) and merge the rest of data.py into the main file. With a one-liner you can fetch all the user agents into user_agent_list (a list-comprehension version is sketched after this list):

         for line in open('ua.txt').read().splitlines():
             user_agent_list.append(line.strip())

     But if you like it the way it is, just keep it. Remember, code is all about the person who writes it, not the one who reads it.

  2. You should not break small things out into separate functions unless you need to call/use them more than once.

  3. Of course. But you should add some comments to let the reviewer know what you're up to in that part of the code. :)

  4. Instead of switching over to asyncio, you should get to know "How to Wield Threaded Asynchronous Magic" (a thread-pool sketch follows this list).

  5. Yes, but you should add docstrings (a small example follows below). To test your code against PEP 8, use a linter such as flake8.
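
For point 1, the two-line loop can indeed become a genuine one-liner with a list comprehension. A minimal sketch, assuming a ua.txt file that holds one user-agent string per line (the file is the answer's example, not something the posted code ships with):

     # Read ua.txt, strip whitespace and skip blank lines (assumes one user agent per line).
     user_agent_list = [line.strip() for line in open('ua.txt') if line.strip()]

Wrapping the open() call in a with-block would also close the file explicitly, but the version above mirrors the style main() already uses for reading the proxy list back in.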
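
For point 4, note that asyncio alone would not make the checker faster, because requests is a blocking library; an async rewrite would also need an async HTTP client. Staying with threads, the standard library's concurrent.futures gives the same thread-pool behaviour as multiprocessing.dummy with a more common interface. A minimal sketch of check() written that way, reusing is_good() and data.num_threads from the posted code (an alternative spelling, not what the original script does):

     from concurrent.futures import ThreadPoolExecutor

     def check(p_list):
         # Run is_good() over every proxy with a pool of worker threads;
         # leaving the with-block waits for all submitted checks to finish.
         with ThreadPoolExecutor(max_workers=data.num_threads) as executor:
             executor.map(is_good, p_list)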
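
For point 5, a docstring is simply a string literal placed as the first statement of a function. As an illustration (the wording is mine, not the author's), get_links() from proxy_tool.py could describe itself like this:

     def get_links():
         """Return the 'server-list' page URLs found on the proxy index page.

         Tags without an href, anchor links containing '#' and duplicate
         URLs are skipped.
         """

flake8 can then be run over both files (flake8 proxy_tool.py data.py) to flag the remaining PEP 8 issues.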


          Hope this helps.






answered Oct 25 at 9:57, edited Oct 26 at 11:10 – The Infected Drake

  • About the user agent list, I've actually found a library that handles all of those strings pretty nicely: fake-useragent. I also added comments to the code to make it more readable. I didn't know about flake8 or linters; I will surely use them. Thank you for your insight, much appreciated!
    – shaike
    Oct 25 at 12:02












  • Another note here: avoid using external libraries unless you need them, as they make your execution runtime considerably slower. For example, avoid importing and using an external library such as fake-useragent here, when you can have your own set of selected user agents. Edit: I missed a link in point 4, which I noticed just now; check the updated answer.
    – The Infected Drake
    Oct 26 at 11:06
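
For reference, the fake-useragent library mentioned in these comments is typically used roughly like this (a sketch of its commonly documented interface, from memory; details may vary between versions):

     from fake_useragent import UserAgent

     ua = UserAgent()                      # load the library's collection of real-world user agents
     headers = {'User-Agent': ua.random}   # pick a random user agent for each request

Whether the extra dependency is worth it over a hand-picked list is exactly the trade-off the second comment describes.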












