2025-05-28 19:16:17 +08:00

56 lines
2.1 KiB
Python

import logging
import time
import selenium
import os
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options
# Site whose pages are downloaded, and the local directory the HTML is saved to.
root_url = 'https://www.congress.gov'
html_save_path = r"E:\yuxin\us-congress-html\pages"

# Firefox is started headless; the remaining flags are currently disabled.
driver_options = Options()
driver_options.add_argument('--headless')
# driver_options.add_argument('--no-sandbox')
# driver_options.add_argument('--disable-dev-shm-usage')
# driver_options.add_argument('--disable-gpu')

# The remote WebDriver session is opened as soon as this module is loaded.
browser = webdriver.Remote(
    command_executor="http://96.45.186.11:4444",
    desired_capabilities=DesiredCapabilities.FIREFOX,
    options=driver_options,
)
def save_name_for_uri(uri):
    """Return the base file name (without ``.html``) used to store *uri*.

    Any query string is dropped, the leading character and the first path
    segment are discarded, and the remaining segments are joined with ``-``.
    An empty string is returned when nothing remains after the first segment
    (for example for a blank line).
    """
    path = uri.split("?")[0]
    return "-".join(path[1:].split("/")[1:])


def main():
    """Download every listed page that has not been saved yet.

    Reads one URI per line from the person-URI list, skips entries whose
    HTML file already exists in ``html_save_path``, loads the remaining ones
    through the module-level ``browser`` and writes the page source to disk.
    The browser session is always closed on exit, even after an uncaught
    error or a keyboard interrupt.
    """
    with open(r"E:\yuxin\us-congress-html\person-uris.txt", 'r', encoding='utf-8') as person_uris_file:
        person_uris = person_uris_file.read().split('\n')

    # A set makes the "already downloaded?" check O(1) per URI.
    existing_file_names = set(os.listdir(html_save_path))
    count = 0
    try:
        for line_num, raw_uri in enumerate(person_uris, start=1):
            uri = raw_uri.strip().split("?")[0]
            save_file_name = save_name_for_uri(uri)
            if not save_file_name:
                # Blank lines (such as the one after a trailing newline) have
                # no usable name; fetching them would store the site root as
                # ".html", so they are skipped.
                continue
            html_file_name = save_file_name + '.html'
            if html_file_name in existing_file_names:
                print("[Line {}] 已存在文件 {}".format(line_num, save_file_name))
                continue
            try:
                browser.get(root_url + uri)
            except Exception as e:
                print(repr(e))
                # Recover by requesting a fresh session and moving on to the
                # next URI; the failed URI is not retried.
                # NOTE(review): the new session is requested with plain
                # Firefox capabilities, not driver_options - confirm intended.
                browser.start_session(capabilities=DesiredCapabilities.FIREFOX)
                continue
            time.sleep(0.5)  # give the page a moment before reading its source
            try:
                html_text = browser.page_source
                with open(os.path.join(html_save_path, html_file_name), 'w', encoding='utf-8') as save_file:
                    save_file.write(html_text)
            except Exception:
                # Best effort: keep going, but report the failure. The default
                # logging configuration hides debug-level messages.
                logging.exception("[Line %s] failed to save %s", line_num, save_file_name)
                continue
            # Remember the file so a duplicate URI is not downloaded again.
            existing_file_names.add(html_file_name)
            count += 1
            print("[No. {}] [Line {}] 保存文件 {} 成功".format(count, line_num, save_file_name))
            time.sleep(0.5)
    finally:
        browser.quit()


if __name__ == "__main__":
    main()