Proxy scraper and multithreaded checker
This is my first "serious" project after learning Python for a while.
The purpose of this script is to scrape proxies and check whether they work against HTTPS websites. The main functionality is:
- Get the command-line args and set them (if there are any)
- Get links to scrape from
- Scrape proxies and save them to a file
- Check the scraped proxies using concurrency (threading) while saving the hits to a new file
I've heard one of the best ways to learn is to get feedback, and I don't have anyone close to me who knows anything about programming. I hope you guys can help me out and give me harsh feedback.
proxy_tool.py
import requests
import re
import data
import time
import sys
import os
import argparse
from bs4 import BeautifulSoup
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool


# Gets a list of URLs that contain proxy lists
def get_links():
    links = []
    keyword = 'server-list'
    index_url = 'http://www.proxyserverlist24.top/'
    page = requests.get(index_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    temp_links = soup.find_all('a')
    for atag in temp_links:
        link = atag.get('href')
        if link is None:
            pass
        elif keyword in link and '#' not in link and link not in links:
            links.append(link)
    return links


# Scrape the most recently uploaded proxy list and return a list of proxies,
# capped at the maximum amount entered by the user (default 800)
def scrape(links):
    url = links[0]
    page = requests.get(url)
    ip_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}:[0-9]+', page.text)
    return max_proxies(ip_list, data.max)


# Save the scraped list into a file
def save_scraped(ip_list):
    if os.path.isfile(data.filename):
        os.remove(data.filename)
    with open(data.filename, 'a') as wfile:
        for ip in ip_list:
            wfile.write(ip)
            wfile.write('\n')
    print('[!] {} Proxies were scraped and saved!'.format(len(ip_list)))


# Cap the list at the maximum amount of proxies to scrape
def max_proxies(ip_list, max):
    ip_list = ip_list.copy()
    return ip_list[0:max]


# Check if a proxy is alive and returns a 200 response
def is_good(p):
    proxy = {'https': '{}'.format(p)}
    try:
        r = requests.get(data.url, proxies=proxy, headers=data.headers, timeout=data.timeout)
        if r.status_code == 200:
            hits_count(p)
            save_hits(p)
    except (requests.exceptions.Timeout,
            requests.exceptions.ProxyError,
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError):
        pass


# Save a working proxy to a file
def save_hits(p):
    with open('{} Checked ProxyList.txt'.format(data.date), 'a') as wfile:
        wfile.write(p)
        wfile.write('\n')


# Count hits to display when the script finishes executing
def hits_count(p):
    data.hits += 1
    print('[+] HIT - {}'.format(p))


def hits():
    print('[!] {} Proxies checked and saved!'.format(data.hits))


def check_args(args=None):
    parser = argparse.ArgumentParser(description='A script to quickly get alive HTTPS proxies')
    parser.add_argument('-u', '--url', type=str, help='url to check proxy against', required=False, default='https://www.google.com')
    parser.add_argument('-m', '--max', type=int, help='maximum proxies to scrape', required=False, default=800)
    parser.add_argument('-t', '--timeout', type=int, help='set proxy timeout limit', required=False, default=8)
    parser.add_argument('-st', '--set-threads', type=int, help='set number of threads to run', required=False, default=30)
    results = parser.parse_args(args)
    return results.url, results.max, results.timeout, results.set_threads


# Check multiple proxies at once from a given proxy list
def check(p_list):
    pool = ThreadPool(data.num_threads)
    pool.map(is_good, p_list)
    pool.close()
    pool.join()


def main():
    # get_links() returns a list of links which is passed to scrape(),
    # which returns a proxy list that is saved to a file
    save_scraped(scrape(get_links()))
    p_list = open(data.filename).read().splitlines()
    check(p_list)
    hits()


if __name__ == "__main__":
    # Set user input
    data.url, data.max, data.timeout, data.num_threads = check_args(sys.argv[1:])
    main()
data.py
This is responsible for holding data.
import random
import datetime

user_agent_list = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]

headers = {'User-Agent': random.choice(user_agent_list)}
date = datetime.datetime.today().strftime('%Y-%m-%d')
filename = '{} ProxyList.txt'.format(date)
url = 'https://www.google.com'
timeout = 8
hits = 0
num_threads = 30
max = 800
I have some specific questions:
- Is using an external module to hold data, like I did in data.py, considered good practice, or should I create the variables in main()? (It looks cleaner this way.)
- Is using a lot of functions, even for small things like hits() or hits_count(), considered good practice?
- The way I implemented save_scraped(scrape(get_links())) looks pretty messy to me, but I tried to avoid using global variables; is that good practice?
- By changing to asyncio instead of threading, could I achieve faster performance while checking the proxies?
- Is my PEP 8 conformance okay?
That's all I can think of right now. Feel free to suggest anything, from more Pythonic code to a better implementation of a function, or whatever else comes to mind.
python beginner python-3.x
asked Oct 24 at 16:49 (edited Oct 25 at 11:54) – shaike
1 Answer
Of course. You can also do another thing: keep all the user agents in a separate file (e.g. ua.txt) and then combine the rest of the code with the main file. With a one-liner, you can load all the user agents into user_agent_list:
user_agent_list = []
for line in open('ua.txt').read().splitlines():
    user_agent_list.append(line.strip())
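Or, to make it an actual one-liner, a list comprehension does the same thing (just a sketch, assuming ua.txt holds one user-agent string per line; blank lines are skipped):

# Minimal sketch: build the whole list in one expression from ua.txt
with open('ua.txt') as ua_file:
    user_agent_list = [line.strip() for line in ua_file if line.strip()]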
But if you like it that way, just keep it. Remember, code is all about the person who writes it, not the one who reads it.
You should not use different functions for different things unless you need to call/use them more than once.
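For instance, hits_count() and save_hits() are only ever called together from is_good(), so they could be folded into a single helper (a rough sketch reusing the names from your code; record_hit is just a suggested name):

# Sketch: one helper instead of two single-use functions
def record_hit(p):
    data.hits += 1
    print('[+] HIT - {}'.format(p))
    with open('{} Checked ProxyList.txt'.format(data.date), 'a') as wfile:
        wfile.write(p + '\n')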
- Of course. But you should give some comments to let the code reviewer know what you're up to in that part of the code. :)
- Instead of changing over to asyncio, you should know How to Wield Threaded Asynchronous Magic; a small standard-library sketch follows this list.
- Yes, but you should add docstrings. To test your code against the PEP 8 standard, use a linter such as flake8.
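As a rough illustration of staying with threads, here is a minimal sketch using concurrent.futures from the standard library (not code from the linked article; is_good() and data.num_threads come from the question's script):

from concurrent.futures import ThreadPoolExecutor

def check_with_executor(p_list):
    # Each proxy is checked in a worker thread; list() drains the results
    # so the call only returns once every proxy has been checked.
    with ThreadPoolExecutor(max_workers=data.num_threads) as executor:
        list(executor.map(is_good, p_list))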
Hope this helps.
answered Oct 25 at 9:57 (edited Oct 26 at 11:10) – The Infected Drake
About the user agent list, I've actually found a library that handles all of those strings pretty nicely: fake-useragent. I also added comments to the code to make it more readable. I didn't know about flake8 or linters; I will surely use them. Thank you for your insight, much appreciated!
– shaike
Oct 25 at 12:02
Another note here: avoid using external libraries unless you need them, since they make your execution runtime considerably slower. For example, avoid importing and using an external library such as fake-useragent here when you can have your own set of selected user agents. Edit: I missed a link in point 4, which I noticed just now. Check the updated answer.
– The Infected Drake
Oct 26 at 11:06