"""Download congress.gov pages listed in a URI file and save them as HTML.

Each line of the URI file is a site-relative path (optionally followed by a
query string).  The page is rendered through a remote Selenium Firefox
session and its source is written to ``html_save_path``.  Pages whose output
file already exists are skipped, so the script can be re-run to resume.
"""
import logging
import os
import time

import selenium
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options

driver_options = Options()
driver_options.add_argument('--headless')
# driver_options.add_argument('--no-sandbox')
# driver_options.add_argument('--disable-dev-shm-usage')
# driver_options.add_argument('--disable-gpu')

# NOTE: the remote session is created at import time, exactly as before, so
# that the module-level name ``browser`` keeps existing for any importer.
browser = selenium.webdriver.remote.webdriver.WebDriver(
    command_executor="http://96.45.186.11:4444",
    desired_capabilities=DesiredCapabilities.FIREFOX,
    options=driver_options,
)

root_url = 'https://www.congress.gov'
html_save_path = r"E:\yuxin\us-congress-html\pages"
person_uris_path = r"E:\yuxin\us-congress-html\person-uris.txt"

# Pause applied after loading a page and again after saving it.
# NOTE(review): presumably gives the page time to render and throttles
# requests to the site -- confirm before changing.
pause_seconds = 0.5


def _save_name_for(uri):
    """Return the output file stem for a site-relative *uri*.

    The first character and the first path segment are dropped and the
    remaining segments are joined with ``-``; e.g.
    ``/member/john-doe/D000001`` -> ``john-doe-D000001``.  Returns an empty
    string when nothing is left (blank line or single-segment path).
    """
    return "-".join(uri[1:].split('/')[1:])


def main():
    """Fetch every not-yet-saved URI and write its page source to disk.

    Failures on a single page are reported and the loop moves on to the
    next URI.  The browser session is always shut down on exit, including
    when an unexpected exception propagates.
    """
    with open(person_uris_path, 'r', encoding='utf-8') as person_uris_file:
        # split('\n') (not splitlines) keeps the reported line numbers
        # aligned with the physical lines of the file.
        person_uris = person_uris_file.read().split('\n')

    # A set gives O(1) membership tests; it is also updated after each save
    # so a URI repeated later in the list is not fetched twice.
    existing_file_names = set(os.listdir(html_save_path))
    count = 0

    try:
        for line_num, raw_uri in enumerate(person_uris, start=1):
            uri = raw_uri.split("?")[0]  # drop the query string
            save_file_name = _save_name_for(uri)

            if not save_file_name:
                # Blank lines (e.g. the trailing newline of the list file)
                # would otherwise fetch the site root and save it as ".html".
                if uri.strip():
                    logging.warning(
                        "[Line %d] cannot derive a file name from %r; skipped",
                        line_num, uri,
                    )
                continue

            html_file_name = save_file_name + '.html'
            if html_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue

            try:
                browser.get(root_url + uri)
            except Exception as e:
                # Best-effort recovery: report, open a fresh session and
                # move on; the URI will be retried on the next run.
                print(repr(e))
                browser.start_session(capabilities=DesiredCapabilities.FIREFOX)
                continue

            time.sleep(pause_seconds)

            try:
                html_text = browser.page_source
                save_path = os.path.join(html_save_path, html_file_name)
                with open(save_path, 'w', encoding='utf-8') as save_file:
                    save_file.write(html_text)
            except Exception:
                # logging.debug was invisible with the default (WARNING)
                # root logger level; log the failure with its traceback.
                logging.exception(
                    "[Line %d] failed to save %s", line_num, save_file_name
                )
                continue

            count += 1
            existing_file_names.add(html_file_name)
            print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
            time.sleep(pause_seconds)
    finally:
        # Always release the remote session, even on an unexpected error.
        browser.quit()


if __name__ == "__main__":
    main()