57 lines
2.1 KiB
Python
57 lines
2.1 KiB
Python
import logging
|
|
import time
|
|
|
|
import selenium
|
|
import os
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
from selenium.webdriver.firefox.options import Options
|
|
|
|
# Browser options for the remote session.
# The session is requested with Chrome capabilities, so the options object
# must be Chrome's: arguments added to Firefox options are sent under
# "moz:firefoxOptions", which Chrome ignores (i.e. '--headless' had no effect).
driver_options = webdriver.ChromeOptions()
driver_options.add_argument('--headless')
# Optional Chrome flags, left disabled as in the original script:
# driver_options.add_argument('--no-sandbox')
# driver_options.add_argument('--disable-dev-shm-usage')
# driver_options.add_argument('--disable-gpu')

# Remote WebDriver session shared by the rest of the script.
# NOTE: this connects to the remote endpoint as soon as the module is loaded.
browser = webdriver.Remote(
    command_executor="http://96.45.186.11:25555",
    desired_capabilities=DesiredCapabilities.CHROME,
    options=driver_options,
)
|
|
|
|
# Root URL of the site; not referenced in the visible part of this script.
root_url = 'https://www.congress.gov'
# Directory that downloaded photos are written to; files already present
# there are treated as done and skipped.
photo_save_path = r"/root/key-person/photo"
|
|
|
|
def read_photo_urls(list_path):
    """Return ``(line_number, url)`` pairs for every non-blank line of a file.

    Line numbers are 1-based and count blank lines too, so they match the
    position of each URL in the file. Surrounding whitespace is stripped
    from each URL.
    """
    with open(list_path, 'r', encoding='utf-8') as urls_file:
        return [
            (line_num, line.strip())
            for line_num, line in enumerate(urls_file, start=1)
            if line.strip()
        ]


def main(urls_path=r"./photo-urls.txt", max_new_photos=10):
    """Save a PNG screenshot of the image behind each listed URL.

    Uses the module-level ``browser`` session and ``photo_save_path``
    directory. URLs whose file name already exists in the directory are
    skipped. The run stops after ``max_new_photos`` new files have been
    saved, and the browser session is always closed on the way out.
    """
    count = 0
    try:
        os.makedirs(photo_save_path, exist_ok=True)
        # A set gives O(1) membership tests for the "already saved" check.
        existing_file_names = set(os.listdir(photo_save_path))

        for line_num, url in read_photo_urls(urls_path):
            save_file_name = url.split('/')[-1]
            if save_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(url)
            except Exception as e:
                print(repr(e))
                # Best-effort recovery: open a fresh session and move on.
                browser.start_session(capabilities=DesiredCapabilities.CHROME)
                continue
            time.sleep(0.5)

            try:
                # Capture the screenshot BEFORE creating the output file, so
                # a failed lookup cannot leave an empty file behind that
                # later runs would skip as "already existing".
                # find_element('xpath', ...) is the call the find_element_by_xpath
                # shortcut wraps; the shortcut was removed in Selenium 4.3.
                img = browser.find_element('xpath', '//body/img[1]')
                img_screenshot = img.screenshot_as_png
                target_path = os.path.join(photo_save_path, save_file_name)
                with open(target_path, 'wb') as image_file:
                    image_file.write(img_screenshot)
            except Exception as e:
                # WARNING (not DEBUG) so the failure is visible with the
                # default logging configuration.
                logging.warning("[Line %d] %s: %r", line_num, save_file_name, e)
            else:
                count += 1
                existing_file_names.add(save_file_name)
                print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
                time.sleep(0.5)

            if count >= max_new_photos:
                break
    finally:
        browser.quit()


if __name__ == "__main__":
    main()
|