"""Save screenshots of the images listed in ``./photo-urls.txt``.

Each line of the URL file is opened in a remote Selenium browser, the first
``<img>`` directly under ``<body>`` is captured as a PNG screenshot, and the
bytes are written to ``photo_save_path`` under the last path segment of the
URL.  Files already present in ``photo_save_path`` are skipped, and a run
stops after ``_MAX_SAVED_PER_RUN`` successful saves.
"""
import logging
import os
import time

import selenium
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

driver_options = Options()
driver_options.add_argument('--headless')

# NOTE(review): the options object comes from the Firefox module while the
# requested capabilities are Chrome's -- confirm which browser the remote
# node at this address actually serves.
browser = selenium.webdriver.remote.webdriver.WebDriver(
    command_executor="http://96.45.186.11:25555",
    desired_capabilities=DesiredCapabilities.CHROME,
    options=driver_options,
)
root_url = 'https://www.congress.gov'
photo_save_path = r"/root/key-person/photo"

# Upper bound on the number of images saved in a single run.
_MAX_SAVED_PER_RUN = 10


def _save_current_image(target_path):
    """Write a PNG screenshot of the page's first body-level image.

    The screenshot is captured *before* the output file is opened, so a
    failed element lookup does not leave an empty file behind that a later
    run would mistake for an already-downloaded photo.
    """
    img = browser.find_element_by_xpath('//body/img[1]')
    png_bytes = img.screenshot_as_png
    with open(target_path, 'wb') as image_file:
        image_file.write(png_bytes)


def _main():
    """Walk the URL list and save every image that is not on disk yet."""
    with open(r"./photo-urls.txt", 'r', encoding='utf-8') as photo_urls_file:
        photo_urls = photo_urls_file.read().split('\n')

    # A set gives O(1) membership tests; it is also updated as files are
    # saved so a URL repeated in the list is not downloaded twice.
    existing_file_names = set(os.listdir(photo_save_path))
    count = 0

    try:
        for line_num, url in enumerate(photo_urls, start=1):
            # A trailing newline in the file yields an empty entry; loading
            # it would only fail and trigger a pointless session restart.
            if not url.strip():
                continue

            save_file_name = url.split('/')[-1]
            if save_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(url)
            except Exception as e:
                # Best effort: report, open a fresh session, move on.
                print(repr(e))
                browser.start_session(capabilities=DesiredCapabilities.CHROME)
                continue
            time.sleep(0.5)

            try:
                _save_current_image(os.path.join(photo_save_path, save_file_name))
            except Exception as e:
                # Keep going on a single failure, but make it visible: the
                # root logger drops DEBUG records unless configured.
                logging.warning(
                    "[Line %d] failed to save %s: %r",
                    line_num, save_file_name, e,
                )
            else:
                existing_file_names.add(save_file_name)
                count += 1
                print("[No. {}] [Line {}] 保存文件 {} 成功".format(
                    count, line_num, save_file_name))
                time.sleep(0.5)

            if count >= _MAX_SAVED_PER_RUN:
                break
    finally:
        # Always release the remote session, even on an unexpected error.
        browser.quit()


if __name__ == "__main__":
    _main()