53 lines
2.0 KiB
Python
53 lines
2.0 KiB
Python
import os
|
|
import re
|
|
import shutil
|
|
|
|
import pymysql
|
|
from lxml import etree
|
|
|
|
# Folder whose entries are listed by the script; only names ending in
# '.html' are processed.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"
|
|
|
|
|
|
def trim_space(input_str):
    """Normalize spacing in *input_str*.

    Every ideographic space (U+3000) is first turned into an ordinary
    space, then each run of two or more ordinary spaces is collapsed to a
    single one. Leading and trailing spaces are not removed.
    """
    ascii_spaced = input_str.replace("\u3000", " ")
    return re.sub(' {2,}', ' ', ascii_spaced)
|
|
|
|
|
|
def parse_avatar_info(current_selector):
    """Return the avatar image source found through *current_selector*.

    Queries the ``src`` attribute of every ``<img>`` located under a table
    whose class is exactly ``good-table table1``. All matched values are
    concatenated and surrounding whitespace is stripped; the result is an
    empty string when nothing matches.
    """
    src_values = current_selector.xpath("//table[@class='good-table table1']//img/@src")
    return "".join(src_values).strip()
|
|
|
|
|
|
if __name__ == '__main__':
    # For every saved '.html' page in target_file_path: read the avatar image
    # path from the page, copy that image into the avatar folder, and store
    # the image file name in nfm.m_person_page.images for the matching row
    # (only when that column is still NULL). All updates are committed once,
    # after the last page.
    #
    # NOTE(review): the database credentials are hard-coded here; they should
    # live in environment variables or a config file outside version control.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        # Placeholders let the driver quote the values, which come from
        # scraped HTML and must not be spliced into the statement text.
        sql_update = "UPDATE nfm.m_person_page SET images = %s WHERE id = %s AND images IS NULL"
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue

            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)

            # The last entry of the drop-down menu carries the page's own URL.
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')
            if not url_info:
                # Without the link there is no id to update; skip this page
                # instead of aborting the whole run with an IndexError.
                print("skipped (no page url): " + _path)
                continue
            page_url = url_info[0].replace("#", "")

            avatar_url = parse_avatar_info(selector)
            if not avatar_url:
                continue

            # Characters 37..44 of the URL are used as the person id.
            # NOTE(review): presumably this offset matches the site's fixed
            # URL layout - confirm against real page URLs.
            person_id = page_url[37:45]
            if not re.fullmatch('[0-9]+', person_id):
                # The id was previously inserted unquoted into the SQL, so it
                # must be a plain number; anything else cannot match a row.
                print("skipped (unexpected person id): " + _path)
                continue

            avatar_file_name = avatar_url.split("/")[-1]
            # avatar_url[1:] drops the first character of the src value
            # (presumably the leading '.' of a relative path - confirm) so the
            # remainder can be appended to the page folder.
            shutil.copy(target_file_path + avatar_url[1:],
                        r"E:\yuxin\nuofang-data\person\person-avatar-2")
            cursor.execute(sql_update, (avatar_file_name, int(person_id)))
            print([person_id, avatar_file_name])
        db.commit()
    finally:
        # Close the connection even when a page fails part-way through;
        # uncommitted updates are discarded in that case, as before.
        db.close()
|