# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""
-------------------------------------------------
   File Name:     GetFreeProxy.py
   Description :  Fetch free proxies
   Author :       JHao
   date:          2016/11/25
-------------------------------------------------
   Change Activity:
                   2016/11/25:
-------------------------------------------------
"""
import re
import sys
import json
import requests
from time import sleep

# The parent directory must be on sys.path before the project-local imports
# below are resolved, so this statement has to stay ahead of them.
sys.path.append('..')

from Util.WebRequest import WebRequest
from Util.utilFunction import getHtmlTree
import requests
from DB.DbClient import DbClient
from Config.setting import USEFUL_PROXY_COUNT
from Config.setting import ZHIMA_PROXY_API

# for debug to disable insecureWarning
requests.packages.urllib3.disable_warnings()

# Module-level record template; every field starts out as an empty string.
# NOTE(review): the "anonimity" spelling is a runtime key and is kept as-is;
# confirm against the readers of this dict before ever renaming it.
proxy_dict = {
    "proxy": "",
    "region": "",
    "anonimity": "",
    "proxy_type": "",
    "source": "",
}

# Database client instance created once at import time.
db = DbClient()


def init_proxy_dict():
    """
    Reset every field of the module-level ``proxy_dict`` to an empty string.

    The dict is modified in place: its keys are kept and only the values are
    overwritten.
    :return: None
    """
    blank_fields = dict.fromkeys(proxy_dict, "")
    proxy_dict.update(blank_fields)


class GetFreeProxy(object): """ proxy getter """ # @staticmethod # def freeProxy01(): # """ # 无忧代理 http://www.data5u.com/ # 几乎没有能用的 # :return: # """ # url_list = [ # 'http://www.data5u.com/', # 'http://www.data5u.com/free/gngn/index.shtml', # 'http://www.data5u.com/free/gnpt/index.shtml' # ] # key = 'ABCDEFGHIZ' # for url in url_list: # html_tree = getHtmlTree(url) # ul_list = html_tree.xpath('//ul[@class="l2"]') # for ul in ul_list: # try: # ip = ul.xpath('./span[1]/li/text()')[0] # classnames = ul.xpath('./span[2]/li/attribute::class')[0] # classname = classnames.split(' ')[1] # port_sum = 0 # for c in classname: # port_sum *= 10 # port_sum += key.index(c) # port = port_sum >> 3 # yield '{}:{}'.format(ip, port) # except Exception as e: # print(e) @staticmethod def freeProxy02(count=20): """ 代理66 http://www.66ip.cn/ :param count: 提取数量 :return: """ urls = [ "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s" "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip" ] try: import 
execjs import requests headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 'Accept': '*/*', 'Connection': 'keep-alive', 'Accept-Language': 'zh-CN,zh;q=0.8' } session = requests.session() src = session.get("http://www.66ip.cn/", headers=headers).text src = src.split("")[0] + '}' src = src.replace("