一个简易的ip代理池
用py写了一个简单的ip代理池,源码如下:
从xici爬取ip
tip: 爬取脚本会在当前目录下自动生成 HTTP_IP.txt、HTTPS_IP.txt 和 socks_IP.txt;运行验证脚本前,需要先在当前目录下创建 enable_ip 文件夹,可用的代理会写入 enable_ip\enable_ip.txt
# -- coding: utf-8 --
from bs4 import BeautifulSoup
import requests
import os
def get_ip_list(url):
    """Scrape one proxy-list page and append its entries to per-protocol files.

    Every table row after the first is read as a proxy entry: cell 1 is the
    IP, cell 2 the port and cell 5 the protocol. Each entry is written as
    ``ip,port,protocol`` to ``HTTP_IP.txt``, ``HTTPS_IP.txt`` and/or
    ``socks_IP.txt`` (all opened in append mode) depending on its protocol.

    Args:
        url: Address of the proxy-list page to download.
    """
    # No explicit Host header: requests derives it from ``url``. The previous
    # hard-coded value named a different site than the one being requested.
    headers = {
        'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
        'Connection': 'Keep-Alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
    }
    web_data = requests.get(url=url, headers=headers)
    # Compare the numeric status code; testing membership on the Response
    # object itself never matches and reported a failure on every request.
    if web_data.status_code != 200:
        print("访问失败,可能是禁了ip,当前访问页面状态码:%s" % (web_data.status_code))
    soup = BeautifulSoup(web_data.text, 'html.parser')
    rows = soup.find_all('tr')
    # ``with`` guarantees all three files are closed even if a row fails.
    with open("HTTP_IP.txt", 'a') as fhttp, \
            open("HTTPS_IP.txt", 'a') as fhttps, \
            open("socks_IP.txt", 'a') as fsocks:
        # The first row is skipped, as in the original index loop from 1.
        for row in rows[1:]:
            tds = row.find_all('td')
            if len(tds) < 6:
                # Not a full proxy row (e.g. a spacer row); nothing to read.
                continue
            ip, port, proto = tds[1].text, tds[2].text, tds[5].text
            print("IP:{} port:{} noun:{}".format(ip, port, proto))
            item = str(ip) + ',' + str(port) + ',' + str(proto) + '\n'
            # The length check keeps "HTTPS" rows out of the plain HTTP file.
            if 'HTTP' in item and len(proto) == 4:
                fhttp.write(item)
            if 'HTTPS' in item:
                fhttps.write(item)
            if 'socks4/5' in item:
                fsocks.write(item)
if __name__ == "__main__":
    # Remove output left over from a previous run, because get_ip_list opens
    # these files in append mode and would otherwise duplicate entries.
    # (Opening them with 'w' here instead would avoid the need to delete.)
    for stale_file in ("HTTP_IP.txt", "HTTPS_IP.txt", "socks_IP.txt"):
        if os.path.exists(stale_file):
            os.remove(stale_file)

    # Crawl list pages 1 through 10 of the proxy site.
    for page in range(1, 11):
        get_ip_list("https://www.xicidaili.com/nn/" + str(page))
验证ip可用性
# -*- coding: utf-8 -*-
import requests
from queue import Queue
import sys
import threading
import telnetlib
class Check_IP(threading.Thread):
    """Worker thread that checks queued proxies for TCP reachability.

    Each queue entry is an ``ip,port,type`` string. A proxy counts as usable
    when a TCP connection to ``ip:port`` can be opened within 6 seconds.
    Usable entries are collected in the class-level ``results`` list, which
    is shared by all workers, and can be saved with ``write_enable_ip``.
    """

    # Shared by every worker so results accumulate across threads instead of
    # being reset each time another thread starts running.
    results = []

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        """Drain the queue, recording every entry that accepts a connection."""
        import socket
        from queue import Empty

        while True:
            # get_nowait() avoids the empty()/get() race in which another
            # worker takes the last item and this one blocks forever.
            try:
                url = self._queue.get_nowait()
            except Empty:
                break
            try:
                ip, port, _types = url.split(',', 2)
                # Only reachability matters; close the socket right away.
                with socket.create_connection((ip, int(port)), timeout=6):
                    pass
            except (OSError, ValueError, OverflowError):
                # Connection refused/timed out, or the entry was malformed.
                print("%s不可用" % (url))
            else:
                print("%s可用" % (url))
                Check_IP.results.append(url + '\n')

    @staticmethod
    def write_enable_ip():
        """Write all usable proxies found so far to the enable_ip file.

        NOTE(review): the path uses a Windows separator and requires the
        ``enable_ip`` directory to already exist.
        """
        with open("enable_ip\\enable_ip.txt", 'w') as fw:
            fw.writelines(Check_IP.results)
def start(txt, count):
    """Check every proxy listed in a file using a pool of worker threads.

    Args:
        txt: Path of a UTF-8 text file with one ``ip,port,type`` entry per
            line. Lines that do not have exactly three comma-separated
            fields (including blank lines) are skipped.
        count: Number of ``Check_IP`` worker threads to run; converted with
            ``int()``.

    The call returns once every worker thread has finished.
    """
    queue = Queue()
    with open('%s' % txt, 'r', encoding='utf-8') as fr:
        for raw_line in fr:
            fields = raw_line.rstrip('\n').split(',')
            if len(fields) != 3:
                # Blank or malformed line; skip it rather than crash.
                continue
            queue.put(','.join(fields))

    threads = [Check_IP(queue) for _ in range(int(count))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
if __name__ == "__main__":
    # Number of checker threads to run.
    count = 8
    # File holding the proxies of the type to be validated.
    txt = 'HTTPS_IP.txt'
    start(txt, count)
    # Save the proxies that passed the check.
    Check_IP.write_enable_ip()
我是项目地址
password:31ex
-------------已经到底啦!-------------