# osc/research/nuofang-db/key-person/congress/photo-fetcher.py

import logging
import time

import selenium
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

# Browser options for the remote session. Chrome options are used so that they
# match the CHROME capabilities requested below; Firefox options would be
# serialized under a Firefox-specific key and the headless flag would never
# reach a Chrome node.
driver_options = webdriver.ChromeOptions()
driver_options.add_argument('--headless')
# Optional Chrome flags, currently disabled:
# driver_options.add_argument('--no-sandbox')
# driver_options.add_argument('--disable-dev-shm-usage')
# driver_options.add_argument('--disable-gpu')

# Session on the remote WebDriver endpoint; created at import time and shared
# by the rest of the script. `webdriver.Remote` is the public alias of
# `selenium.webdriver.remote.webdriver.WebDriver`.
browser = webdriver.Remote(command_executor="http://96.45.186.11:25555",
                           desired_capabilities=DesiredCapabilities.CHROME,
                           options=driver_options)

# Base URL of the site the photos come from.
root_url = 'https://www.congress.gov'
# Directory where downloaded photos are stored; it must already exist.
photo_save_path = r"/root/key-person/photo"

if __name__ == "__main__":
    # Script entry point: read photo URLs (one per line) from ./photo-urls.txt,
    # open each in the remote browser, screenshot the first <img> directly
    # under <body>, and save it as PNG bytes under photo_save_path using the
    # last path segment of the URL as the file name. Files that already exist
    # are skipped, and the run stops after 10 newly saved photos.
    with open(r"./photo-urls.txt", 'r', encoding='utf-8') as photo_urls_file:
        photo_urls = photo_urls_file.read().split('\n')

    # A set gives O(1) membership tests for the "already downloaded" check.
    existing_file_names = set(os.listdir(photo_save_path))
    count = 0

    try:
        # line_num counts every line of the input file, blank ones included,
        # so the printed numbers match the file.
        for line_num, url in enumerate(photo_urls, start=1):
            url = url.strip()
            if not url:
                # Blank line (e.g. from the trailing newline): nothing to fetch.
                continue

            save_file_name = url.split('/')[-1]
            if save_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(url)
            except Exception as e:
                # Navigation failed; report it, open a fresh session and move
                # on to the next URL.
                print(repr(e))
                browser.start_session(capabilities=DesiredCapabilities.CHROME)
                continue
            time.sleep(0.5)

            try:
                # Capture the screenshot BEFORE opening the output file, so a
                # failed lookup/screenshot does not leave an empty file behind
                # that later runs would treat as already downloaded.
                img = browser.find_element_by_xpath('//body/img[1]')
                img_screenshot = img.screenshot_as_png
                with open(os.path.join(photo_save_path, save_file_name), 'wb') as image_file:
                    image_file.write(img_screenshot)
            except Exception as e:
                # Best-effort: report the failure the same way as navigation
                # errors and keep going (logging.debug would be silent here,
                # since no logging configuration is visible in this file).
                print(repr(e))
                continue

            count += 1
            print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
            time.sleep(0.5)
            if count >= 10:
                break
    finally:
        # Always release the remote session, even if the loop raised.
        browser.quit()