96 lines
2.5 KiB
Python
96 lines
2.5 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
# !/usr/bin/env python
|
|||
|
|
"""
|
|||
|
|
-------------------------------------------------
|
|||
|
|
File Name: utilFunction.py
|
|||
|
|
Description : tool function
|
|||
|
|
Author : JHao
|
|||
|
|
date: 2016/11/25
|
|||
|
|
-------------------------------------------------
|
|||
|
|
Change Activity:
|
|||
|
|
2016/11/25: add robustCrawl, verifyProxy, getHtmlTree
|
|||
|
|
-------------------------------------------------
|
|||
|
|
"""
|
|||
|
|
import requests
|
|||
|
|
from lxml import etree
|
|||
|
|
|
|||
|
|
from Util.WebRequest import WebRequest
|
|||
|
|
|
|||
|
|
|
|||
|
|
def robustCrawl(func):
    """
    Decorator that stops a crawl function from propagating exceptions.

    The wrapper calls ``func`` with the caller's arguments. If ``func``
    raises any ``Exception``, the failure is logged as a warning and the
    wrapper returns ``None`` instead of re-raising.

    :param func: callable to protect
    :return: wrapper that returns ``func``'s result, or ``None`` when
             ``func`` raised
    """
    # Imported locally so this block does not rely on the module's import list.
    import functools
    import logging

    logger = logging.getLogger(__name__)

    @functools.wraps(func)  # keep func's __name__ / __doc__ on the wrapper
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Best-effort by design: report the failure, then fall through to
            # the implicit ``return None`` rather than re-raising.
            logger.warning("crawl function %s failed: %r",
                           getattr(func, "__name__", repr(func)), e)

    return decorate
|
|||
|
|
|
|||
|
|
|
|||
|
|
def verifyProxyFormat(proxy):
    """
    Check that a proxy address has the form ``ip:port``.

    The whole value must be a dotted-quad IPv4 address, a colon and a port,
    with nothing before or after. Each octet must be at most 255 and the
    port at most 65535.

    :param proxy: proxy address as ``str``, or ``bytes`` holding UTF-8 text
    :return: ``True`` if the address is well formed, otherwise ``False``
    """
    import re

    if isinstance(proxy, bytes):
        try:
            proxy = proxy.decode("utf8")
        except UnicodeDecodeError:
            # Bytes that are not valid text cannot be a valid address.
            return False

    # [0-9] rather than \d: \d also matches non-ASCII Unicode digits.
    # \Z rather than $: $ would tolerate a trailing newline.
    matched = re.match(
        r"([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3}):([0-9]{1,5})\Z",
        proxy)
    if matched is None:
        return False

    # The pattern only bounds the digit count; check the numeric ranges too.
    fields = [int(group) for group in matched.groups()]
    port = fields.pop()
    return all(octet <= 255 for octet in fields) and port <= 65535
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getHtmlTree(url, **kwargs):
    """
    Fetch ``url`` through ``WebRequest`` and parse the response as HTML.

    A fixed set of browser-like request headers is sent with the request.

    :param url: address of the page to fetch
    :param kwargs: accepted but not used by this function
    :return: result of ``etree.HTML`` applied to the response content
    """
    request_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    # TODO: obtain a proxy server and route this request through it
    response = WebRequest().get(url=url, header=request_headers)
    return etree.HTML(response.content)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def tcpConnect(proxy):
    """
    Test whether a TCP connection (three-way handshake) to the proxy succeeds.

    :param proxy: address as ``ip:port``; ``str`` or ``bytes`` holding UTF-8 text
    :return: ``True`` if ``connect_ex`` reports success (0), otherwise ``False``
    :raises ValueError: if ``proxy`` does not split into exactly two parts on
                        ``:`` or the port part is not an integer
    """
    from socket import socket, AF_INET, SOCK_STREAM

    if isinstance(proxy, bytes):
        proxy = proxy.decode("utf8")

    # Parse before creating the socket so a malformed address cannot leak one.
    ip, port = proxy.split(':')
    port = int(port)

    s = socket(AF_INET, SOCK_STREAM)
    try:
        result = s.connect_ex((ip, port))
    finally:
        # Release the file descriptor whether or not the connect succeeded.
        s.close()
    return result == 0
|
|||
|
|
|
|||
|
|
|
|||
|
|
def validUsefulProxy(proxy):
    """
    Check whether a proxy is usable by requesting a page through it.

    Issues a GET for ``http://www.baidu.com`` with ``proxy`` configured as
    the HTTP proxy and a 10 second timeout.

    :param proxy: proxy address; ``bytes`` input is decoded as UTF-8 first
    :return: ``True`` only when the response status code is 200; ``False``
             for any other status or when the request raises an exception
    """
    address = proxy.decode("utf8") if isinstance(proxy, bytes) else proxy
    proxy_map = {"http": "http://{proxy}".format(proxy=address)}
    try:
        response = requests.get('http://www.baidu.com', proxies=proxy_map, timeout=10, verify=False)
        return response.status_code == 200
    except Exception:
        # Any failure (connection error, timeout, ...) means the proxy is unusable.
        return False
|
|||
|
|
|