# osc/research/nuofang-db/key-person/congress/photo-fetcher.py

import logging
import time

import selenium
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

# Browser options passed to the remote session; only headless mode is enabled.
# Flags left disabled by the author: --no-sandbox, --disable-dev-shm-usage,
# --disable-gpu.
# NOTE(review): `Options` is imported from selenium.webdriver.firefox.options
# while the requested capabilities are DesiredCapabilities.CHROME -- confirm
# that this combination is intended.
driver_options = Options()
driver_options.add_argument('--headless')

# Open a session on the remote Selenium server. This runs at import time, so
# the server must be reachable whenever this module is loaded.
browser = webdriver.Remote(
    command_executor="http://96.45.186.11:25555",
    desired_capabilities=DesiredCapabilities.CHROME,
    options=driver_options,
)

# Directory the captured photos are written to.
photo_save_path = r"/root/key-person/photo"
# Site root; not referenced by the code visible in this file.
root_url = 'https://www.congress.gov'

if __name__ == "__main__":
    # Script entry point: read photo URLs from ./photo-urls.txt (one per
    # line), open each in the remote browser, and save a PNG screenshot of the
    # first <img> directly under <body> into `photo_save_path`, named after the
    # last path segment of the URL. Files that already exist are skipped, and
    # the run stops after `max_downloads` successful saves.
    max_downloads = 10

    # The context manager closes the list file even if reading fails.
    with open(r"./photo-urls.txt", 'r', encoding='utf-8') as photo_urls_file:
        photo_urls = photo_urls_file.read().split('\n')

    # A set gives O(1) membership tests; it is also updated as files are saved
    # so a URL repeated in the list is not fetched twice in one run.
    existing_file_names = set(os.listdir(photo_save_path))
    count = 0

    try:
        for line_num, raw_url in enumerate(photo_urls, start=1):
            url = raw_url.strip()
            if not url:
                # Blank line (typically the trailing newline of the list file):
                # nothing to fetch, and no reason to restart the session.
                continue

            save_file_name = url.split('/')[-1]
            if not save_file_name:
                # URL ends with '/', so there is no file name to save under.
                logging.warning("[Line %d] no file name in URL %s", line_num, url)
                continue
            if save_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(url)
            except Exception as e:
                # Navigation failed: report it, start a fresh session and move
                # on to the next URL.
                print(repr(e))
                browser.start_session(capabilities=DesiredCapabilities.CHROME)
                continue
            time.sleep(0.5)

            # Capture the image BEFORE opening the output file. Otherwise a
            # failed lookup leaves an empty file behind, and later runs would
            # skip this URL as "already existing".
            try:
                img = browser.find_element_by_xpath('//body/img[1]')
                img_screenshot = img.screenshot_as_png
            except Exception as e:
                # WARNING rather than DEBUG so the failure is visible with the
                # default logging configuration.
                logging.warning("[Line %d] failed to capture %s: %r", line_num, url, e)
                continue

            try:
                with open(os.path.join(photo_save_path, save_file_name), 'wb') as image_file:
                    image_file.write(img_screenshot)
            except OSError as e:
                logging.warning("[Line %d] failed to write %s: %r", line_num, save_file_name, e)
                continue

            existing_file_names.add(save_file_name)
            count += 1
            print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
            time.sleep(0.5)
            if count >= max_downloads:
                break
    finally:
        # Always release the remote browser session, even on an unexpected
        # error or Ctrl-C.
        browser.quit()
init 2025-05-28 19:16:17 +08:00			`import logging`
			`import time`

			`import selenium`
			`import os`
			`from selenium import webdriver`
			`from selenium.webdriver.common.desired_capabilities import DesiredCapabilities`
			`from selenium.webdriver.firefox.options import Options`

			`driver_options = Options()`
			`driver_options.add_argument('--headless')`
			`# driver_options.add_argument('--no-sandbox')`
			`# driver_options.add_argument('--disable-dev-shm-usage')`
			`# driver_options.add_argument('--disable-gpu')`
			`browser = selenium.webdriver.remote.webdriver.WebDriver(command_executor="http://96.45.186.11:25555",`
			`desired_capabilities=DesiredCapabilities.CHROME,`
			`options=driver_options)`

			`root_url = 'https://www.congress.gov'`
			`photo_save_path = r"/root/key-person/photo"`

			`if __name__ == "__main__":`
			`photo_urls_file = open(r"./photo-urls.txt", 'r', encoding='utf-8')`
			`photo_urls = photo_urls_file.read().split('\n')`
			`photo_urls_file.close()`
			`uris_count = len(photo_urls)`
			`count = 0`
			`line_num = 0`
			`existing_file_names = os.listdir(photo_save_path)`
			`for url in photo_urls:`
			`line_num += 1`
			`save_file_name = url.split('/')[-1]`
			`if save_file_name in existing_file_names:`
			`print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))`
			`continue`
			`try:`
			`browser.get(url)`
			`except Exception as e:`
			`print(repr(e))`
			`browser.start_session(capabilities=DesiredCapabilities.CHROME)`
			`continue`
			`time.sleep(0.5)`
			`try:`
			`with open('{}/{}'.format(photo_save_path, save_file_name), 'wb') as image_file:`
			`img = browser.find_element_by_xpath('//body/img[1]')`
			`img_screenshot = img.screenshot_as_png`
			`image_file.write(img_screenshot)`
			`image_file.close()`
			`count += 1`
			`print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))`
			`time.sleep(0.5)`
			`except Exception as e:`
			`logging.debug(repr(e))`
			`if count >= 10:`
			`break`
			`browser.quit()`