#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name    : getFreeProxy.py
Description  : fetch free proxies from public sources
Author       : JHao
Date         : 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25:
-------------------------------------------------
"""
import re
import sys
import json
from time import sleep

import requests

sys.path.append('..')
from Util.WebRequest import WebRequest
from Util.utilFunction import getHtmlTree
from DB.DbClient import DbClient
from Config.setting import USEFUL_PROXY_COUNT
from Config.setting import ZHIMA_PROXY_API

# silence urllib3's InsecureRequestWarning while debugging with verify=False
requests.packages.urllib3.disable_warnings()
# shared record template; every getter fills it and yields a JSON snapshot
# (the "anonimity" [sic] key is kept unchanged so stored records stay consistent)
proxy_dict = {
    "proxy": "",
    "region": "",
    "anonimity": "",
    "proxy_type": "",
    "source": ""
}
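# A filled-in record serializes to something like (values illustrative only):
#   {"proxy": "1.2.3.4:8080", "region": "", "anonimity": "高匿",
#    "proxy_type": "HTTP", "source": ""}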
db = DbClient()


def init_proxy_dict():
    """Reset every field of the shared proxy_dict template to an empty string."""
    for key in proxy_dict.keys():
        proxy_dict[key] = ""
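# Note that all getters below mutate this single module-level dict and yield a
# JSON snapshot of it, so init_proxy_dict() must run before each record is
# filled; otherwise fields from the previous proxy (e.g. proxy_type) would leak
# into the next one.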
class GetFreeProxy(object):
    """
    Free-proxy getters: each method is a generator yielding JSON-serialized
    proxy records.
    """
    # @staticmethod
    # def freeProxy01():
    #     """
    #     data5u proxy http://www.data5u.com/
    #     almost none of these work any more
    #     :return:
    #     """
    #     url_list = [
    #         'http://www.data5u.com/',
    #         'http://www.data5u.com/free/gngn/index.shtml',
    #         'http://www.data5u.com/free/gnpt/index.shtml'
    #     ]
    #     key = 'ABCDEFGHIZ'
    #     for url in url_list:
    #         html_tree = getHtmlTree(url)
    #         ul_list = html_tree.xpath('//ul[@class="l2"]')
    #         for ul in ul_list:
    #             try:
    #                 ip = ul.xpath('./span[1]/li/text()')[0]
    #                 classnames = ul.xpath('./span[2]/li/attribute::class')[0]
    #                 classname = classnames.split(' ')[1]
    #                 port_sum = 0
    #                 for c in classname:
    #                     port_sum *= 10
    #                     port_sum += key.index(c)
    #                 port = port_sum >> 3
    #                 yield '{}:{}'.format(ip, port)
    #             except Exception as e:
    #                 print(e)
    @staticmethod
    def freeProxy02(count=20):
        """
        66ip proxy http://www.66ip.cn/
        :param count: number of proxies to request
        :return:
        """
        urls = [
            "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=",
            "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s"
            "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip"
        ]
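        # 66ip fronts its API with a JavaScript challenge: the first response is
        # a <script> that, once evaluated, sets a "__jsl_clearance" cookie which
        # must accompany every later request. The block below rewrites that
        # script into a plain function, runs it with PyExecJS, and extracts the
        # cookie value instead of executing it in a browser.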
        try:
            import execjs  # PyExecJS, only needed for this getter
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'
            }
            session = requests.session()
            # fetch the challenge script and wrap it into a callable function
            src = session.get("http://www.66ip.cn/", headers=headers).text
            src = src.split("</script>")[0] + '}'
            src = src.replace("<script>", "function test() {")
            # patch the eval/retry loop so it returns the generated code once it
            # contains the cookie assignment (or after 10 attempts)
            src = src.replace("while(z++)try{eval(",
                              ';var num=10;while(z++)try{var tmp=')
            src = src.replace(
                ");break}",
                ";num--;if(tmp.search('cookie') != -1 | num<0){return tmp}}")
            ctx = execjs.compile(src)
            src = ctx.call("test")
            # cut out the "document.cookie=..." expression and evaluate it
            src = src[src.find("document.cookie="):src.find("};if((")]
            src = src.replace("document.cookie=", "")
            src = "function test() {var window={}; return %s }" % src
            cookie = execjs.compile(src).call('test')
            js_cookie = cookie.split(";")[0].split("=")[-1]
        except Exception as e:
            print(e)
            return
        for url in urls:
            try:
                html = session.get(url.format(count),
                                   cookies={"__jsl_clearance": js_cookie},
                                   headers=headers).text
                ips = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}",
                                 html)
                for ip in ips:
                    init_proxy_dict()
                    proxy_dict["proxy"] = ip.strip()
                    yield json.dumps(proxy_dict)
            except Exception as e:
                print(e)
    @staticmethod
    def freeProxy03(page_count=1):
        """
        Xici proxy http://www.xicidaili.com
        :return:
        """
        url_list = [
            'http://www.xicidaili.com/nn/',  # high anonymity
            # 'http://www.xicidaili.com/nt/',  # transparent
        ]
        for each_url in url_list:
            for i in range(1, page_count + 1):
                page_url = each_url + str(i)
                tree = getHtmlTree(page_url)
                proxy_list = tree.xpath(
                    './/table[@id="ip_list"]//tr[position()>1]')
                for proxy in proxy_list:
                    try:
                        init_proxy_dict()
                        proxy_dict["proxy"] = ':'.join(
                            proxy.xpath('./td/text()')[0:2])
                        proxy_dict["proxy_type"] = proxy.xpath('./td/text()')[5]
                        yield json.dumps(proxy_dict)
                    except Exception:
                        pass
    @staticmethod
    def freeProxy04():
        """
        goubanjia http://www.goubanjia.com/
        :return:
        """
        url = "http://www.goubanjia.com/"
        tree = getHtmlTree(url)
        proxy_list = tree.xpath('//td[@class="ip"]')
        proxy_attr = tree.xpath('//tr[@class="success" or @class="warning"]')
        # The page embeds decoy digits, so a naive scrape picks up extra digits
        # or '.' characters; filter out any <p style="display:none;"> content.
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                        and not(contains(@style, 'display:none'))
                        and not(contains(@class, 'port'))
                        ]/text()
                    """
        for each_proxy, each_attr in zip(proxy_list, proxy_attr):
            try:
                # the ':' separator sits bare under the <td>; the other pieces
                # live in div/span/p, so join the text nodes to get the IP first
                ip_addr = ''.join(each_proxy.xpath(xpath_str))
                # The port shown in the HTML is a random decoy; the real port is
                # encoded in the letters after "port" in the class attribute,
                # e.g. <span class="port CFACE">9054</span>, where CFACE decodes
                # to 3128.
                port = 0
                for _ in each_proxy.xpath(".//span[contains(@class, 'port')]"
                                          "/attribute::class")[0]. \
                        replace("port ", ""):
                    port *= 10
                    port += (ord(_) - ord('A'))
                port //= 8
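                # Worked example for "CFACE": each letter maps to ord(c) - ord('A'),
                # i.e. C=2, F=5, A=0, C=2, E=4 -> digits 25024; 25024 // 8 = 3128.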
                init_proxy_dict()
                proxy_dict["proxy"] = '{}:{}'.format(ip_addr, port)
                proxy_dict["anonimity"] = each_attr.xpath(".//td/text()")[2]
                proxy_dict["proxy_type"] = each_attr.xpath(".//td/text()")[3]
                yield json.dumps(proxy_dict)
            except Exception:
                pass
    @staticmethod
    def freeProxy05():
        """
        Kuaidaili https://www.kuaidaili.com
        """
        url_list = [
            'https://www.kuaidaili.com/free/inha/'
            # 'https://www.kuaidaili.com/free/intr/'
        ]
        for url in url_list:
            tree = getHtmlTree(url)
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)  # must sleep, otherwise the second request returns no data
            for tr in proxy_list[1:]:
                init_proxy_dict()
                proxy_dict['proxy'] = ':'.join(tr.xpath('./td/text()')[0:2])
                proxy_dict['proxy_type'] = tr.xpath('./td/text()')[3]
                proxy_dict['anonimity'] = tr.xpath('./td/text()')[2]
                yield json.dumps(proxy_dict)
    # @staticmethod
    # def freeProxy06():
    #     """
    #     coderbusy proxy https://proxy.coderbusy.com/
    #     :return:
    #     """
    #     urls = ['https://proxy.coderbusy.com/']
    #     for url in urls:
    #         tree = getHtmlTree(url)
    #         proxy_list = tree.xpath('.//table//tr')
    #         for tr in proxy_list[1:]:
    #             init_proxy_dict()
    #             proxy_dict['proxy'] = ':'.join(tr.xpath('./td/text()')[0:2])
    #             yield json.dumps(proxy_dict)
    @staticmethod
    def freeProxy07():
        """
        ip3366 cloud proxy http://www.ip3366.net/free/
        :return:
        """
        urls = [
            'http://www.ip3366.net/free/?stype=1',
            "http://www.ip3366.net/free/?stype=2"
        ]
        for url in urls:
            tree = getHtmlTree(url)
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)
            for tr in proxy_list[1:]:
                init_proxy_dict()
                proxy_dict['proxy'] = ':'.join(tr.xpath('./td/text()')[0:2])
                proxy_dict['proxy_type'] = tr.xpath('./td/text()')[3]
                proxy_dict['anonimity'] = tr.xpath('./td/text()')[2]
                yield json.dumps(proxy_dict)
        # regex-based fallback:
        # request = WebRequest()
        # r = request.get(url, timeout=10)
        # proxies = re.findall(
        #     r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
        #     r.text)
        # for proxy in proxies:
        #     init_proxy_dict()
        #     proxy_dict['proxy'] = ":".join(proxy)
        #     yield json.dumps(proxy_dict)
    # @staticmethod
    # def freeProxy08():
    #     """
    #     IPhai (IP海) http://www.iphai.com/free/ng
    #     :return:
    #     """
    #     urls = [
    #         'http://www.iphai.com/free/ng', 'http://www.iphai.com/free/np',
    #         'http://www.iphai.com/free/wg', 'http://www.iphai.com/free/wp'
    #     ]
    #     request = WebRequest()
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(
    #             r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>',
    #             r.text)
    #         for proxy in proxies:
    #             init_proxy_dict()
    #             proxy_dict['proxy'] = ":".join(proxy)
    #             yield json.dumps(proxy_dict)
    @staticmethod
    def freeProxy09(page_count=1):
        """
        jiangxianli free proxy list http://ip.jiangxianli.com/?page=
        :return:
        """
        for i in range(1, page_count + 1):
            url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
            html_tree = getHtmlTree(url)
            for index, tr in enumerate(html_tree.xpath("//table//tr")):
                if index == 0:
                    continue  # skip the header row
                init_proxy_dict()
                proxy_dict['proxy'] = ":".join(tr.xpath("./td/text()")[0:2]).strip()
                proxy_dict['proxy_type'] = tr.xpath("./td/text()")[3]
                proxy_dict['anonimity'] = tr.xpath("./td/text()")[2]
                yield json.dumps(proxy_dict)
    # @staticmethod
    # def freeProxy10():
    #     """
    #     cn-proxy (hosted outside the GFW)
    #     :return:
    #     """
    #     urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
    #     request = WebRequest()
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', r.text)
    #         for proxy in proxies:
    #             yield ':'.join(proxy)

    # @staticmethod
    # def freeProxy11():
    #     """
    #     https://proxy-list.org/english/index.php
    #     :return:
    #     """
    #     urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
    #     request = WebRequest()
    #     import base64
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
    #         for proxy in proxies:
    #             yield base64.b64decode(proxy).decode()

    # @staticmethod
    # def freeProxy12():
    #     urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
    #     request = WebRequest()
    #     for url in urls:
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
    #         for proxy in proxies:
    #             yield ':'.join(proxy)
    @staticmethod
    def freeProxy13(max_page=2):
        """
        Qiyun proxy http://www.qydaili.com/free/?action=china&page=1
        :param max_page:
        :return:
        """
        base_url = 'http://www.qydaili.com/free/?action=china&page='
        for page in range(1, max_page + 1):
            url = base_url + str(page)
            tree = getHtmlTree(url)
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)
            for tr in proxy_list[1:]:
                init_proxy_dict()
                proxy_dict['proxy'] = ':'.join(tr.xpath('./td/text()')[0:2])
                proxy_dict['proxy_type'] = tr.xpath('./td/text()')[3]
                proxy_dict['anonimity'] = tr.xpath('./td/text()')[2]
                yield json.dumps(proxy_dict)
            # regex-based fallback:
            # r = request.get(url, timeout=10)
            # proxies = re.findall(
            #     r'<td.*?>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td.*?>(\d+)</td>',
            #     r.text)
            # for proxy in proxies:
            #     init_proxy_dict()
            #     proxy_dict['proxy'] = ':'.join(proxy)
            #     yield json.dumps(proxy_dict)
    # @staticmethod
    # def freeProxy14(max_page=2):
    #     """
    #     89ip free proxy http://www.89ip.cn/index.html
    #     :param max_page:
    #     :return:
    #     """
    #     base_url = 'http://www.89ip.cn/index_{}.html'
    #     request = WebRequest()
    #     for page in range(1, max_page + 1):
    #         url = base_url.format(page)
    #         r = request.get(url, timeout=10)
    #         proxies = re.findall(
    #             r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
    #             r.text)
    #         for proxy in proxies:
    #             init_proxy_dict()
    #             proxy_dict['proxy'] = ':'.join(proxy)
    #             yield json.dumps(proxy_dict)
    @staticmethod
    def freeProxy15():
        """
        Xila proxy http://www.xiladaili.com
        """
        urls = [
            # 'http://www.xiladaili.com/putong/',
            "http://www.xiladaili.com/gaoni/"
            # "http://www.xiladaili.com/http/",
            # "http://www.xiladaili.com/https/"
        ]
        for url in urls:
            tree = getHtmlTree(url)
            proxy_list = tree.xpath('.//table//tr')
            sleep(1)
            for tr in proxy_list[1:]:
                init_proxy_dict()
                proxy_dict['proxy'] = tr.xpath('./td/text()')[0]
                # strip the Chinese label, keeping only the protocol text
                proxy_dict['proxy_type'] = re.sub(r'[\u4e00-\u9fa5]', '', tr.xpath('./td/text()')[1])
                proxy_dict['anonimity'] = tr.xpath('./td/text()')[2]
                yield json.dumps(proxy_dict)
    @staticmethod
    def zhimaProxy():
        """
        Zhima paid proxy API: top up the pool when it runs low.
        """
        # db.changeTable('ProxyPool:useful_proxy_3')
        db.changeTable('ProxyPool:useful_proxy_63')
        # only buy new proxies once fewer than half the target count remain
        if db.getNumber() < USEFUL_PROXY_COUNT / 2:
            # response shape assumed from the fields accessed below:
            #   {"success": true, "data": [{"ip": "1.2.3.4", "port": 8080}, ...]}
            rsp = json.loads(requests.get(ZHIMA_PROXY_API).text)
            if rsp['success']:
                for proxy in rsp['data']:
                    init_proxy_dict()  # reset stale fields, matching the other getters
                    proxy_dict['proxy'] = proxy['ip'] + ':' + str(proxy['port'])
                    yield json.dumps(proxy_dict)
if __name__ == '__main__':
from CheckProxy import CheckProxy
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy02)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy03)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy04)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy05)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy06)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy07)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy08)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy09)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy13)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy14)
# CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy15)
CheckProxy.checkAllGetProxyFunc()
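    # A minimal consumption sketch for illustration: each getter is a generator
    # of JSON strings, so a caller could do e.g.
    #     for record in GetFreeProxy.freeProxy05():
    #         print(json.loads(record)["proxy"])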