osc/deploy/ProxyPool/Util/utilFunction.py
2025-05-28 19:16:17 +08:00

96 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
File Name: utilFunction.py
Description: tool function
Author: JHao
Date: 2016/11/25
-------------------------------------------------
Change Activity:
2016/11/25: added robustCrawl, verifyProxy, getHtmlTree
-------------------------------------------------
"""
import requests
from lxml import etree
from Util.WebRequest import WebRequest
def robustCrawl(func):
    """
    Decorator that makes a crawl function fail-safe.

    The wrapped callable is invoked with the caller's arguments. Any
    ``Exception`` it raises is logged and swallowed, and ``None`` is
    returned instead, so one failing crawler cannot abort its caller.

    :param func: callable to protect
    :return: wrapper that forwards ``*args``/``**kwargs`` to ``func`` and
             carries its ``__name__``/``__doc__``
    """
    import functools
    import logging

    logger = logging.getLogger(__name__)

    # functools.wraps copies the metadata of the wrapped function, so logs
    # and introspection show the real crawler name instead of "decorate".
    @functools.wraps(func)
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberately best-effort: record the failure, do not re-raise.
            logger.info(
                "crawl failed in %s",
                getattr(func, "__name__", repr(func)),
                exc_info=True,
            )
            return None

    return decorate
def verifyProxyFormat(proxy):
    """
    Check that a proxy is written as a valid IPv4 ``ip:port`` pair.

    The whole value must be four dot-separated decimal octets (each 0-255),
    a colon, and a port in the range 1-65535. Nothing may precede or follow
    the address.

    :param proxy: candidate proxy as ``str``; UTF-8 ``bytes`` are decoded first
    :return: True if the value is a well-formed ``ip:port``, otherwise False
             (including for undecodable bytes and non-text input)
    """
    import re

    if isinstance(proxy, bytes):
        try:
            proxy = proxy.decode("utf8")
        except UnicodeDecodeError:
            return False
    if not isinstance(proxy, str):
        return False

    # re.ASCII restricts \d to 0-9; fullmatch rejects any surrounding text.
    matched = re.fullmatch(
        r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})",
        proxy,
        flags=re.ASCII,
    )
    if matched is None:
        return False

    # The pattern alone would accept e.g. 999.999.999.999:99999, so the
    # numeric ranges are checked explicitly.
    *octets, port = (int(part) for part in matched.groups())
    return all(octet <= 255 for octet in octets) and 1 <= port <= 65535
def getHtmlTree(url, **kwargs):
    """
    Download a page and parse it into an lxml HTML tree.

    :param url: address of the page to fetch
    :param kwargs: accepted but not used by this function
    :return: result of ``etree.HTML`` applied to the response ``content``
    """
    # Headers sent with every request made through this helper.
    browser_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    # TODO: fetch a proxy server and route this request through it
    page = WebRequest().get(url=url, header=browser_headers)
    return etree.HTML(page.content)
def tcpConnect(proxy, timeout=None):
    """
    Test whether a TCP connection (three-way handshake) to the proxy succeeds.

    :param proxy: proxy address as ``"ip:port"``
    :param timeout: optional connect timeout in seconds; when ``None`` the
                    socket's timeout setting is left untouched
    :return: True if ``connect_ex`` reports success (0), otherwise False
    :raises ValueError: if ``proxy`` is not exactly ``ip:port`` or the port
                        is not an integer
    """
    from socket import socket, AF_INET, SOCK_STREAM

    # Parse first so a malformed address never allocates a socket.
    ip, port = proxy.split(':')
    port = int(port)

    s = socket(AF_INET, SOCK_STREAM)
    try:
        if timeout is not None:
            s.settimeout(timeout)
        return s.connect_ex((ip, port)) == 0
    finally:
        # Close on every path so repeated checks do not exhaust file descriptors.
        s.close()
def validUsefulProxy(proxy):
    """
    Check whether a proxy can be used to fetch a page.

    Issues a GET for http://www.baidu.com through the proxy with a
    10-second timeout. Any exception raised by the request is treated as
    "not usable".

    :param proxy: proxy address as ``str`` or UTF-8 ``bytes``
    :return: True if the response status code is 200, otherwise False
    """
    address = proxy.decode("utf8") if isinstance(proxy, bytes) else proxy
    proxy_map = {"http": "http://{proxy}".format(proxy=address)}
    try:
        response = requests.get('http://www.baidu.com', proxies=proxy_map, timeout=10, verify=False)
    except Exception:
        # Best-effort probe: a failed request simply means the proxy is unusable.
        return False
    else:
        return response.status_code == 200