# Listing metadata (from the code viewer): 50 lines, 2.1 KiB, Python
import json
|
|
import os
|
|
|
|
import pymysql
|
|
from lxml import etree
|
|
|
|
from nfm.mbase.page_parser import parse_base_info
|
|
|
|
# Directory that holds the saved HTML pages scanned by this script.
target_file_path = r"E:/yuxin/nuofang-data/base/webpage"
# Image directories; they are defined here but not read by the code below.
image_path = r"E:/yuxin/nuofang-data/base/images"
full_image_path = r"E:/yuxin/nuofang-data/base/full_images"


def _full_image_names(hrefs):
    """Return the last path segment of every href that starts with "http".

    Each matching href is also echoed to stdout. Hrefs that do not start
    with "http" are skipped.
    """
    names = []
    for href in hrefs:
        if href.startswith("http"):
            print(href)
            names.append(href.split("/")[-1])
    return names


if __name__ == '__main__':
    # For every *.html file in target_file_path: parse it, collect the file
    # names of the full-size images linked from the "det_right" panel, and
    # store them as a JSON array in nfm.m_base_page.full_size_images for the
    # row whose internal_id matches the page.
    #
    # NOTE(review): host and credentials are hard-coded in source; they should
    # be moved to configuration or environment variables and rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue

            # os.path.join instead of a literal backslash so the script is
            # not tied to Windows; the context manager closes the file even
            # when reading fails.
            page_path = os.path.join(target_file_path, _path)
            with open(page_path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            selector = etree.HTML(html_content)
            base_info = parse_base_info(selector, "")
            internal_id = base_info['内部编号']

            full_images_urls = selector.xpath("//div[@id='det_right']//a[@class='img']/@href")
            images = _full_image_names(full_images_urls)

            # Parameterized query: the driver escapes the values, so quotes in
            # a file name or id can neither break nor inject into the SQL.
            cursor.execute(
                "UPDATE nfm.m_base_page SET full_size_images = %s WHERE internal_id = %s",
                (json.dumps(images, ensure_ascii=False), internal_id),
            )
            # Commit per page so that already processed pages stay saved if a
            # later page fails.
            db.commit()
    finally:
        # Always release the connection, including on errors.
        db.close()
|