爬虫实战----从免费IP代理网站获取连接率较好的可用IP

xiaoxiao2021-02-27  158

# coding:utf-8
"""Scrape a free proxy listing and pick a proxy that accepts TCP connections.

date: 2017-08-03
version: 2.0
"""
import os
import random
import re
import socket
import urllib.request


class GetIP():
    """Collect ``host:port`` proxy candidates and probe them for reachability.

    The candidate list is fetched once, at construction time, and stored in
    ``self.iplist``.
    """

    def __init__(self):
        self.iplist = self.get_ip_port()

    def get_ip_port(self):
        """Fetch http://www.xicidaili.com/wt/ and extract fast proxies.

        :return: list of ``"host:port"`` strings whose listed speed is below
            0.5; an empty list when the site cannot be fetched or decoded
            (the failure message is printed instead of being returned, so
            callers always receive a list).
        """
        url = "http://www.xicidaili.com/wt/"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url=url, headers=headers)
        try:
            # The context manager closes the HTTP response even if read()
            # or decode() fails.
            with urllib.request.urlopen(req, timeout=1) as resp:
                page = resp.read().decode('utf-8')
        except Exception:
            # Best-effort fetch: any network, HTTP or decoding problem means
            # "no candidates", but Ctrl-C / SystemExit still propagate.
            print(url + "网站不可达")
            return []
        ip = self._parse_proxies(page)
        print("我被执行了")
        return ip

    @staticmethod
    def _parse_proxies(page, max_speed=0.5):
        """Extract ``"host:port"`` entries from the listing HTML.

        :param page: decoded HTML text of the listing page
        :param max_speed: keep only entries whose speed value is strictly
            below this threshold
        :return: list of ``"host:port"`` strings, in page order
        """
        # Every dotted quad in the page is treated as a proxy host.
        hosts = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', page)
        # Ports are table cells that contain nothing but 2-5 digits.
        port_cells = re.findall(r'<td>\d{2,5}</td>', page)
        ports = [re.findall(r'\d{2,5}', cell)[0] for cell in port_cells]
        # Only every second title="x.y" value is used as the speed.
        # NOTE(review): presumably each row carries two such titles (speed,
        # then connect time) -- confirm against the live page markup.
        titles = re.findall(r'title="\d{1,3}\.\d{1,4}', page)
        speeds = [
            float(re.findall(r'\d{1,3}\.\d{1,4}', title)[0])
            for title in titles[::2]
        ]
        # zip() stops at the shortest list, so a page where the three
        # scrapes disagree in length cannot raise IndexError.
        return [
            host + ":" + port
            for host, port, speed in zip(hosts, ports, speeds)
            if speed < max_speed
        ]

    def ping(self, ip, timeout=2):
        """Check whether a TCP connection to ``ip`` can be established.

        :param ip: address in the form ``139.208.85.232:80``
        :param timeout: seconds to wait before giving up (default 2)
        :return: 1 if the connection succeeded, 0 otherwise
        :raises ValueError: if the port part of ``ip`` is not an integer
        """
        print(ip)
        parts = ip.split(":")
        address = (parts[0], int(parts[-1]))
        try:
            # The context manager closes the socket on every path,
            # including a failed connect.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as cs:
                cs.settimeout(float(timeout))
                status = cs.connect_ex(address)
        except (OSError, OverflowError, ValueError):
            # connect_ex reports ordinary connect failures through its
            # return value but still raises for e.g. an unresolvable host
            # or an out-of-range port; treat those as "not usable".
            return 0
        return 1 if status == 0 else 0

    def useful_ip(self):
        """Return one reachable proxy from ``self.iplist``.

        Candidates are tried in random order, each at most once, so the
        search always terminates.

        :return: a ``"host:port"`` string that passed :meth:`ping`, or
            ``None`` when the list is empty or no candidate is reachable
        """
        iplist = self.iplist
        print(iplist)
        candidates = list(iplist)
        random.shuffle(candidates)
        for ip in candidates:
            if self.ping(ip):
                return ip
        return None


if __name__ == '__main__':
    AA = GetIP()
    print(AA.get_ip_port())
    # for i in range(0, 100):
    #     AA.useful_ip()
转载请注明原文地址: https://www.6miu.com/read-13602.html

最新回复(0)