51 lines
2.2 KiB
Python
51 lines
2.2 KiB
Python
import json
|
||
import os
|
||
import shutil
|
||
|
||
import pymysql
|
||
from lxml import etree
|
||
|
||
from nfm.mstruct.webpage_parser import parse_base_info
|
||
|
||
# Directory that is scanned for saved ``*.html`` pages; relative ``./...``
# image links found in those pages are resolved against this directory.
target_file_path = r"E:/yuxin/nuofang-data/structure/0420/webpage"
# Directory that locally available images are copied into.
images_path = r"E:/yuxin/nuofang-data/structure/0420/images"

if __name__ == '__main__':
    # Script flow:
    #   1. For every ``*.html`` file in ``target_file_path``, parse it and
    #      collect the hrefs of the ``a.img`` links under ``div#det_right``.
    #   2. Copy images referenced by a relative ``./`` link into
    #      ``images_path``; remember ``http...`` links for the final report.
    #   3. Store the JSON list of image file names in
    #      ``nfm.m_struct_page.images`` for the page's internal code.
    #   4. Print every remembered internet image URL.
    count = 0
    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to configuration / environment variables and rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    internet_image_urls = []
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            # To debug a single page, narrow this filter to that file's name.
            if not _path.endswith('.html'):
                continue
            count += 1

            # ``with`` guarantees the handle is closed even if reading fails;
            # os.path.join avoids the Windows-only '\\' separator.
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            selector = etree.HTML(html_content)
            base_info = parse_base_info(selector)
            # Presumably the page's internal code; it is matched against
            # ``internal_code`` in the UPDATE below.
            internal_id = base_info['内部编号']

            images_uris = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")
            images_file_list = []
            for image_uri in images_uris:
                # The stored name is the last path segment of the link.
                images_file_name = image_uri.split("/")[-1]
                images_file_list.append(images_file_name)
                if image_uri.startswith("./"):
                    # Drop the leading "." so the remainder ("/dir/file")
                    # can be appended to the page directory.
                    image_source_path = target_file_path + image_uri[1:]
                    shutil.copy(image_source_path, images_path + "/" + images_file_name)
                elif image_uri.startswith("http"):
                    internet_image_urls.append(image_uri)

            # NOTE(review): the original indentation was lost; this assumes the
            # log line and UPDATE run for every page, including pages that
            # have no images (which then store "[]").
            images_json = json.dumps(images_file_list, ensure_ascii=False)
            print(
                "[No. {}] {}: images - {}".format(count, internal_id, images_json))
            # Parameterised query: the driver escapes the values, so quotes or
            # backslashes in file names cannot break or alter the statement.
            sql_update = "UPDATE nfm.m_struct_page SET images = %s WHERE internal_code = %s"
            cursor.execute(sql_update, (images_json, internal_id))

        # NOTE(review): assumes a single commit after all pages are processed.
        db.commit()
    finally:
        # Always release the connection, even when a page fails midway.
        db.close()

    for url in internet_image_urls:
        print(url)