57 lines
2.1 KiB
Python
Raw Normal View History

2025-05-28 19:16:17 +08:00
import logging
import time
import selenium
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
# Browser options for the remote session. Chrome options are used because the
# session is requested with DesiredCapabilities.CHROME below; Firefox options
# would be serialized under a Firefox-specific capability key and the
# '--headless' argument would never reach the Chrome browser.
driver_options = webdriver.ChromeOptions()
driver_options.add_argument('--headless')
# Optional Chrome flags, currently disabled:
# driver_options.add_argument('--no-sandbox')
# driver_options.add_argument('--disable-dev-shm-usage')
# driver_options.add_argument('--disable-gpu')

# Remote WebDriver session shared by the whole script. It is created at import
# time and talks to the Selenium server at the hard-coded address below.
browser = selenium.webdriver.remote.webdriver.WebDriver(
    command_executor="http://96.45.186.11:25555",
    desired_capabilities=DesiredCapabilities.CHROME,
    options=driver_options,
)

# Site root; not referenced in the visible part of this script.
root_url = 'https://www.congress.gov'
# Directory where downloaded photos are written; it must already exist
# because the main block lists its contents before downloading.
photo_save_path = r"/root/key-person/photo"
if __name__ == "__main__":
    # Download photos listed in ./photo-urls.txt (one URL per line) by opening
    # each URL in the remote browser and saving a PNG screenshot of the first
    # <img> under <body>. Files already present in photo_save_path are skipped,
    # and a single run stops after 10 newly saved photos.
    try:
        with open(r"./photo-urls.txt", 'r', encoding='utf-8') as photo_urls_file:
            photo_urls = photo_urls_file.read().split('\n')

        count = 0
        # Snapshot of the target directory taken once up front; a set keeps
        # the per-URL membership test cheap.
        existing_file_names = set(os.listdir(photo_save_path))

        for line_num, raw_url in enumerate(photo_urls, start=1):
            url = raw_url.strip()
            if not url:
                # Blank lines (e.g. from a trailing newline) are not URLs;
                # navigating to them would fail and force a session restart.
                continue

            save_file_name = url.split('/')[-1]
            if save_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(url)
            except Exception as e:
                # Navigation failed: report it, open a fresh session and move
                # on to the next URL.
                print(repr(e))
                browser.start_session(capabilities=DesiredCapabilities.CHROME)
                continue
            time.sleep(0.5)

            try:
                # Capture the image BEFORE creating the output file so that a
                # failed capture does not leave an empty file behind, which a
                # later run would mistake for an already-downloaded photo.
                img = browser.find_element_by_xpath('//body/img[1]')
                img_screenshot = img.screenshot_as_png
                with open(os.path.join(photo_save_path, save_file_name), 'wb') as image_file:
                    image_file.write(img_screenshot)
            except Exception as e:
                # WARNING level so the failure is visible with the default
                # logging configuration (DEBUG messages are dropped).
                logging.warning("[Line %s] failed to save %s: %r",
                                line_num, save_file_name, e)
            else:
                count += 1
                print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
                time.sleep(0.5)

            # Per-run cap on the number of newly saved photos.
            if count >= 10:
                break
    finally:
        # Always release the remote browser session, even when an error
        # interrupts the run.
        browser.quit()