# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
   File Name:     utilFunction.py
   Description :  tool function
   Author :       JHao
   date:          2016/11/25
-------------------------------------------------
   Change Activity:
                   2016/11/25: add robustCrawl, verifyProxy, getHtmlTree
-------------------------------------------------
"""
import functools
import logging
import re
import socket

import requests
from lxml import etree

from Util.WebRequest import WebRequest

# Underscore-prefixed so ``from utilFunction import *`` does not export them.
_logger = logging.getLogger(__name__)

# ``\Z`` anchors at the very end of the string (unlike ``$`` it does not
# tolerate a trailing newline); used with ``match`` so the whole string must
# be "a.b.c.d:port".
_PROXY_REGEX = re.compile(
    r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3}):(\d{1,5})\Z"
)


def robustCrawl(func):
    """
    Decorator that prevents ``func`` from propagating exceptions.

    :param func: the callable to protect
    :return: a wrapper that returns ``func``'s result, or ``None`` when
             ``func`` raises an ``Exception``
    """
    @functools.wraps(func)  # keep the wrapped function's name/docstring
    def decorate(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Deliberately best-effort: the failure is recorded (at INFO, so
            # it is silent unless logging is configured) and None is returned.
            _logger.info("crawl failed in %s: %r",
                         getattr(func, "__name__", func), e)
            return None

    return decorate


def verifyProxyFormat(proxy):
    """
    Check that ``proxy`` is written as ``ip:port``.

    The whole string must be four dot-separated decimal octets (each 0-255),
    a colon, and a port in the range 1-65535.

    :param proxy: candidate proxy, text or UTF-8 encoded bytes
    :return: True when the format is valid, otherwise False (including for
             non-string input and bytes that are not valid UTF-8)
    """
    if isinstance(proxy, bytes):
        try:
            proxy = proxy.decode("utf8")
        except UnicodeDecodeError:
            return False
    if not isinstance(proxy, type(u"")):
        return False

    matched = _PROXY_REGEX.match(proxy)
    if matched is None:
        return False

    numbers = [int(group) for group in matched.groups()]
    octets, port = numbers[:4], numbers[4]
    # The regex alone would accept e.g. "999.999.999.999:99999".
    return all(octet <= 255 for octet in octets) and 0 < port <= 65535


def getHtmlTree(url, **kwargs):
    """
    Fetch ``url`` and parse the response body into an lxml HTML tree.

    :param url: address to request
    :param kwargs: accepted for interface compatibility; currently unused
    :return: the result of ``etree.HTML`` applied to the response content
    """
    header = {'Connection': 'keep-alive',
              'Cache-Control': 'max-age=0',
              'Upgrade-Insecure-Requests': '1',
              'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
              'Accept-Encoding': 'gzip, deflate, sdch',
              'Accept-Language': 'zh-CN,zh;q=0.8',
              }
    # TODO: obtain a proxy server and send the request through it
    wr = WebRequest()
    html = wr.get(url=url, header=header).content
    return etree.HTML(html)


def tcpConnect(proxy, timeout=None):
    """
    Attempt a TCP connection (three-way handshake) to ``proxy``.

    :param proxy: text of the form ``ip:port``
    :param timeout: optional socket timeout in seconds; ``None`` (default)
                    keeps the socket in blocking mode
    :return: True when ``connect_ex`` reports success (0), otherwise False
    :raises ValueError: when ``proxy`` is not exactly ``ip:port`` or the port
                        is not an integer
    """
    # Parse before creating the socket so malformed input cannot leak one.
    ip, port = proxy.split(':')
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        if timeout is not None:
            s.settimeout(timeout)
        return s.connect_ex((ip, int(port))) == 0
    finally:
        # Always release the descriptor, whatever the outcome.
        s.close()


def validUsefulProxy(proxy):
    """
    Check whether ``proxy`` can be used to fetch a page.

    Sends a GET request to http://www.baidu.com through the proxy with a
    10 second timeout.

    :param proxy: ``ip:port`` as text or UTF-8 encoded bytes
    :return: True when the response status code is 200; False for any other
             status or when any exception occurs
    """
    try:
        # Decoding happens inside the try so undecodable bytes yield False.
        if isinstance(proxy, bytes):
            proxy = proxy.decode("utf8")
        proxies = {"http": "http://{proxy}".format(proxy=proxy)}
        r = requests.get('http://www.baidu.com', proxies=proxies, timeout=10, verify=False)
        return r.status_code == 200
    except Exception as e:
        # Any failure means the proxy is unusable; keep a trace for debugging.
        _logger.debug("proxy %r failed validation: %r", proxy, e)
        return False