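"""Parse scraped person-profile pages into JSON.

Layout assumptions (taken from the pages this script was written against):
base info lives in a <table class="good-table table1">, and the website
info lives in the active tab of <div class="tabInfo">.
"""
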
import os
import json
import re

from lxml import etree

# Directory holding the scraped person pages, one HTML file per person.
target_file_path = r"E:\yuxin\nuofang-data\person\webpage"


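# trim_space normalizes whitespace in extracted text nodes,
# e.g. "a\u3000\u3000b  c" -> "a b c".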
def trim_space(input_str):
    result = re.sub(' {2,}', ' ', input_str.replace("\u3000", " "))
    return result


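# parse_base_info pairs each <td class="b"> label with its sibling value
# cell on the same row of the base-info table.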
def parse_base_info(current_selector):
    base_info_line = current_selector.xpath("//table[@class='good-table table1']/tbody/tr")
    base_info = {}
    for line in base_info_line:
        label_info = line.xpath("./td[@class='b']/text()")
        value_cell = line.xpath("./td[not(@class='b')]")
        # zip() tolerates rows where the label and value counts differ,
        # which positional indexing would turn into an IndexError.
        for label, cell_info in zip(label_info, value_cell):
            cell_text = trim_space("".join(cell_info.xpath(".//text()"))).strip()
            base_info[label] = cell_text
    return base_info


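# parse_website_info reads the active tab: an optional <dt> update-time line,
# one <h2> subtitle per section, and a <dl> per section holding either
# paragraph text (in a <div>) or a source list (in a <dd>).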
def parse_website_info(current_selector):
    website_info = {}
    info_area = current_selector.xpath("//div[@class='tabInfo']/div[@class='cItem active']")
    if not info_area:
        # Return an empty dict rather than a list so the caller's
        # len() check always sees the same type.
        return website_info
    update_time_info = info_area[0].xpath(".//dt//text()")
    if update_time_info:
        update_time = trim_space("".join(update_time_info))
        # Split on the first colon only, so values that contain colons
        # themselves (e.g. timestamps) are not truncated.
        update_time_label, update_time_value = update_time.split(":", 1)
        website_info[update_time_label.strip()] = update_time_value.strip()
    subtitles = info_area[0].xpath("./div/h2/text()")
    subcontents = []
    subcontents_area = info_area[0].xpath("./div/dl")
    for _area in subcontents_area:
        _content_info = _area.xpath("./div//text()")
        if _content_info:
            paragraphs = [trim_space(_text).strip() for _text in _content_info]
            # Paragraphs are joined with a literal "\n" marker, matching the
            # original output format.
            _content = " \\n".join(paragraphs)
            subcontents.append(" " + _content)
        else:
            _source_info = _area.xpath("./dd//text()")
            if _source_info:
                subcontents.append(";".join(_source_info))
    # zip() keeps subtitles and contents paired even if one list is shorter.
    for subtitle, subcontent in zip(subtitles, subcontents):
        website_info[subtitle.replace(":", "")] = subcontent
    return website_info


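# Batch entry point: parse every .html file in target_file_path and write
# one JSON result per page.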
if __name__ == '__main__':
    count = 0
    for _path in os.listdir(target_file_path):
        if _path.endswith('.html'):
            # if _path.endswith('Yancy B. Lindsey_军政主官_全球军事态势情报数据库.html'):
            count += 1
            webpage_info = {}
            # Strip newlines up front so joined text nodes are not broken
            # across source-formatting line breaks.
            with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')
            selector = etree.HTML(html_content)
            title_info = selector.xpath("//div[@class='good-info-box']/div[@class='top cls']/span/text()")
            title = ''.join(title_info).strip()

            # The last entry of the drop menu holds the original source URL;
            # '#' anchors are stripped.
            url_info = selector.xpath('//ul[@class="osc-dropMenu"]/li[last()]/a/@href')[0]
            webpage_info["url_info"] = url_info.replace("#", "")

            base_info = parse_base_info(selector)
            if len(base_info) > 0:
                webpage_info["base_info"] = base_info
            website_info = parse_website_info(selector)
            if len(website_info) > 0:
                webpage_info["website_info"] = website_info

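            # Derive a filesystem-safe file name from the page title; only
            # '/', '"' and "'" are stripped here.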
            result_file_name = title.replace("/", "-").replace('"', '').replace("'", '') + '.json'
            result_file_path = 'E:/yuxin/nuofang-data/person/result0515/' + result_file_name
            with open(result_file_path, 'w', encoding='utf-8') as result_file:
                result_file.write(json.dumps(webpage_info, ensure_ascii=False))
            print("[No. {}] {} written, original url: {}".format(count, title, webpage_info["url_info"]))