2025-05-28 19:16:17 +08:00

172 lines
7.0 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import logging
import time
from importlib import import_module
from urllib.parse import urlsplit

import selenium
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support.ui import WebDriverWait

from .http import SeleniumRequest
class SeleniumMiddleware:
    """Scrapy middleware handling the requests using selenium

    Requests that are instances of ``SeleniumRequest`` are loaded in a remote
    selenium ``WebDriver`` session and answered with an ``HtmlResponse`` built
    from the rendered page. Every other request is passed through untouched.
    """

    # The browser proxy preferences are written on the first proxied request
    # and again once more than this many proxied requests have been served.
    PROXY_REFRESH_INTERVAL = 20

    # User agent forced onto the browser whenever the proxy preferences are written.
    PROXY_USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)'

    # Message fragment used to recognise a remote session that no longer exists.
    NO_SESSION_MARKER = 'No active session with ID'

    # NOTE(review): this script relies on the Firefox-only ``Components`` API
    # exposed on ``about:config``; it is not expected to work with Chrome.
    _PROXY_SCRIPT_TEMPLATE = '''
var prefs = Components.classes["@mozilla.org/preferences-service;1"].getService(Components.interfaces.nsIPrefBranch);
prefs.setIntPref("network.proxy.type", 1);
prefs.setCharPref("network.proxy.http", "{ip}");
prefs.setIntPref("network.proxy.http_port", {port});
prefs.setCharPref("network.proxy.ssl", "{ip}");
prefs.setIntPref("network.proxy.ssl_port", {port});
prefs.setCharPref("network.proxy.ftp", "{ip}");
prefs.setIntPref("network.proxy.ftp_port", {port});
prefs.setBoolPref("general.useragent.site_specific_overrides", true);
prefs.setBoolPref("general.useragent.updates.enabled", true);
prefs.setCharPref("general.useragent.override", "{user_agent}");
'''

    _logger = logging.getLogger(__name__)

    def __init__(self, driver_name, driver_executable_path, driver_arguments,
                 browser_executable_path):
        """Initialize the selenium webdriver

        Parameters
        ----------
        driver_name: str
            The selenium ``WebDriver`` to use (``firefox`` or ``chrome``,
            matched case-insensitively)
        driver_executable_path: str
            Passed to the remote ``WebDriver`` as its ``command_executor``
        driver_arguments: list
            A list of arguments to initialize the driver; ``None`` is
            treated as an empty list
        browser_executable_path: str
            The path of the executable binary of the browser. Not used:
            the browser is driven through a remote ``WebDriver``.

        Raises
        ------
        NotConfigured
            If ``driver_name`` is neither ``firefox`` nor ``chrome``
        """
        # Normalise once so the options-module import and the capability
        # lookup agree on the spelling of the driver name.
        driver_name = driver_name.lower()
        capabilities_by_name = {
            'firefox': DesiredCapabilities.FIREFOX,
            'chrome': DesiredCapabilities.CHROME,
        }
        if driver_name not in capabilities_by_name:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME IS NOT RIGHT'
            )
        # Keep a private copy: the DesiredCapabilities entries are shared
        # class-level dicts, and the same capabilities are needed again
        # whenever the remote session has to be restarted.
        self._capabilities = capabilities_by_name[driver_name].copy()

        driver_options_module = import_module(f'selenium.webdriver.{driver_name}.options')
        driver_options = driver_options_module.Options()
        for argument in driver_arguments or ():
            driver_options.add_argument(argument)

        self.proxy_count = 0
        self.driver = selenium.webdriver.remote.webdriver.WebDriver(
            command_executor=driver_executable_path,
            desired_capabilities=self._capabilities,
            options=driver_options,
        )

    @classmethod
    def from_crawler(cls, crawler):
        """Initialize the middleware with the crawler settings

        Reads ``SELENIUM_DRIVER_NAME``, ``SELENIUM_DRIVER_EXECUTABLE_PATH``,
        ``SELENIUM_BROWSER_EXECUTABLE_PATH`` and ``SELENIUM_DRIVER_ARGUMENTS``
        and connects ``spider_closed`` to the matching crawler signal.

        Raises
        ------
        NotConfigured
            If the driver name or the driver executable path is not set
        """
        settings = crawler.settings
        driver_name = settings.get('SELENIUM_DRIVER_NAME')
        driver_executable_path = settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
        browser_executable_path = settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
        # getlist() yields [] when the setting is missing, so an unset
        # SELENIUM_DRIVER_ARGUMENTS no longer breaks the argument loop.
        driver_arguments = settings.getlist('SELENIUM_DRIVER_ARGUMENTS')

        if not driver_name or not driver_executable_path:
            raise NotConfigured(
                'SELENIUM_DRIVER_NAME and SELENIUM_DRIVER_EXECUTABLE_PATH must be set'
            )

        middleware = cls(
            driver_name=driver_name,
            driver_executable_path=driver_executable_path,
            driver_arguments=driver_arguments,
            browser_executable_path=browser_executable_path
        )
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    @staticmethod
    def _parse_proxy(proxy):
        """Return ``(host, port)`` extracted from a ``scheme://host:port`` proxy URL.

        Raises
        ------
        ValueError
            If the URL carries no host or no (valid) port
        """
        parts = urlsplit(proxy)
        host, port = parts.hostname, parts.port
        if not host or port is None:
            raise ValueError(f'Proxy must look like scheme://host:port, got {proxy!r}')
        return host, port

    @classmethod
    def _build_proxy_script(cls, ip, port):
        """Return the script that writes the proxy and user-agent preferences."""
        return cls._PROXY_SCRIPT_TEMPLATE.format(
            ip=ip, port=port, user_agent=cls.PROXY_USER_AGENT
        )

    def _configure_proxy(self, proxy):
        """Write the proxy preferences for ``proxy`` into the running browser."""
        ip, port = self._parse_proxy(proxy)
        self.driver.get("about:config")
        self.driver.execute_script(self._build_proxy_script(ip, port))

    def _apply_proxy(self, proxy):
        """Count a proxied request, (re)writing the browser proxy when it is due.

        Nothing happens when ``proxy`` is empty or ``None``.
        """
        if not proxy:
            return
        if self.proxy_count == 0 or self.proxy_count > self.PROXY_REFRESH_INTERVAL:
            self.proxy_count = 0
            self._configure_proxy(proxy)
        self.proxy_count += 1

    def _restart_session(self):
        """Start a new remote session with the capabilities chosen at construction.

        The proxy counter is reset so the next proxied request writes the
        proxy preferences again.
        """
        self._logger.warning('Selenium session is gone; starting a new one')
        # NOTE(review): only the base capabilities are passed here, as in the
        # original code; confirm whether the driver arguments must be
        # re-applied to the new session as well.
        self.driver.start_session(capabilities=self._capabilities.copy())
        self.proxy_count = 0

    def _load(self, url):
        """Navigate to ``url``; on a page-load timeout stop loading and keep the page."""
        try:
            self.driver.get(url)
        except selenium.common.exceptions.TimeoutException:
            self.driver.execute_script('window.stop()')

    def process_request(self, request, spider):
        """Process a request using the selenium driver if applicable

        Returns ``None`` for anything that is not a ``SeleniumRequest``;
        otherwise returns an ``HtmlResponse`` holding the rendered page
        source. The driver is exposed as ``request.meta['driver']`` and,
        when requested, a PNG screenshot as ``request.meta['screenshot']``.
        """
        if not isinstance(request, SeleniumRequest):
            return None

        proxy = request.meta.get('proxy')
        try:
            self._apply_proxy(proxy)
            self._load(request.url)
        except selenium.common.exceptions.InvalidSessionIdException:
            # The session died: open a new one and retry once, so the response
            # is built from the requested page rather than from an empty one.
            self._restart_session()
            self._apply_proxy(proxy)
            self._load(request.url)

        # Scrapy allows cookies either as a mapping or as a list of dicts.
        cookies = request.cookies
        if isinstance(cookies, dict):
            cookies = [
                {'name': cookie_name, 'value': cookie_value}
                for cookie_name, cookie_value in cookies.items()
            ]
        for cookie in cookies:
            self.driver.add_cookie(cookie)

        if request.wait_until:
            WebDriverWait(self.driver, request.wait_time).until(
                request.wait_until
            )

        if request.screenshot:
            request.meta['screenshot'] = self.driver.get_screenshot_as_png()

        if request.script:
            self.driver.execute_script(request.script)

        body = self.driver.page_source.encode('utf-8')

        # Expose the driver via the "meta" attribute
        request.meta['driver'] = self.driver

        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )

    def process_exception(self, request, exception, spider):
        """Restart the remote session and retry when the session no longer exists

        Returns the request for rescheduling when ``exception`` is a
        ``WebDriverException`` whose message contains
        ``No active session with ID``; returns ``None`` otherwise.
        """
        if not isinstance(exception, WebDriverException):
            return None

        # Prefer ``msg``; ``args`` is empty when the exception was built
        # without a positional message.
        message = getattr(exception, 'msg', None)
        if message is None and exception.args:
            message = exception.args[0]
        if self.NO_SESSION_MARKER not in str(message or ''):
            return None

        self._restart_session()
        # The request was already scheduled once; without this flag the retry
        # would presumably be discarded by the scheduler's duplicate filter.
        request.dont_filter = True
        return request

    def spider_closed(self):
        """Shutdown the driver when spider is closed"""
        try:
            self.driver.quit()
        except Exception:
            # Best effort: a failed shutdown must not break spider closing.
            self._logger.warning('Failed to quit the selenium driver', exc_info=True)