osc/research/nuofang-db/surfpac/page_parser.py
2025-05-28 19:16:17 +08:00

60 lines
2.8 KiB
Python

import datetime
import os
import pymysql
from lxml import etree
target_file_path = 'E:/yuxin/surfpac/pages'

# Parameterized statement: the driver quotes/escapes every value, so page text
# containing quotes or backslashes can neither break nor inject into the SQL.
SQL_INSERT = (
    "INSERT INTO nfm.surfpac_person (id, main_title, sub_title, image, related_target, page_date, "
    "page_date_str, page_content, image_url, page_url) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def first_or_empty(values):
    """Return the first item of *values*, or '' when the sequence is empty."""
    return values[0] if values else ''


def extract_related_target(sub_title):
    """Return the text inside the first parenthesised group of *sub_title*.

    Returns '' when there is no '('. When the '(' is never closed, everything
    after it is returned instead of silently dropping the last character.
    """
    i_left = sub_title.find('(')
    if i_left < 0:
        return ''
    # Only a ')' located after the '(' can close the group.
    i_right = sub_title.find(')', i_left + 1)
    if i_right < 0:
        return sub_title[i_left + 1:]
    return sub_title[i_left + 1:i_right]


def format_page_date(page_date_str):
    """Convert a date such as '28 May 2025' to ISO form '2025-05-28'.

    Raises:
        ValueError: if *page_date_str* does not match '%d %B %Y'.
    """
    return datetime.datetime.strptime(page_date_str, '%d %B %Y').strftime('%Y-%m-%d')


def parse_page(html_content):
    """Extract the person fields from one saved HTML page.

    Returns a dict keyed by the column names of ``nfm.surfpac_person``
    (everything except ``id``).

    Raises:
        ValueError: if the page's date text cannot be parsed.
    """
    selector = etree.HTML(html_content)
    main_title = ''.join(selector.xpath("//h1[@class='maintitle']/text()")).strip()
    sub_title = ''.join(selector.xpath("//h4[@class='subtitle']/text()")).strip()
    page_date_str = ''.join(
        selector.xpath("//div[@class='content-wrap']/span[@class='date']/text()")
    ).strip()
    # Keep only the non-blank text nodes, one per line.
    paragraphs = [
        para.strip()
        for para in selector.xpath("//div[@class='body']/div[@class='content-wrap']/text()")
        if para.strip()
    ]
    # Image and page URL are optional on a page; missing ones become ''.
    image_url = first_or_empty(
        selector.xpath("//div[@class='image-wrapper']/img[@class='img-responsive']/@src")
    )
    page_url = first_or_empty(selector.xpath("//div[@class='social']/div/@data-page-url"))
    return {
        'main_title': main_title,
        'sub_title': sub_title,
        'image': image_url.split("/")[-1],
        'related_target': extract_related_target(sub_title),
        'page_date': format_page_date(page_date_str),
        'page_date_str': page_date_str,
        'page_content': '\n'.join(paragraphs),
        'image_url': image_url,
        'page_url': page_url,
    }


def main():
    """Parse every ``.html`` file in ``target_file_path`` and insert one row per page.

    Row ids are assigned sequentially (starting at 1) in ``os.listdir`` order,
    and each row is committed individually so progress survives a later failure.
    """
    # NOTE(review): database credentials are hard-coded in source; they should
    # be moved to configuration/environment and rotated.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        with db.cursor() as cursor:
            count = 0
            for _path in os.listdir(target_file_path):
                if not _path.endswith('.html'):
                    continue
                count += 1
                with open(os.path.join(target_file_path, _path), mode='r', encoding='utf-8') as target_file:
                    html_content = target_file.read().replace('\n', '')
                try:
                    page = parse_page(html_content)
                except ValueError as err:
                    # Name the offending file; the bare strptime message does not.
                    raise ValueError(f"{_path}: {err}") from err
                print(f"[No. {count}] {page['main_title']} - {page['related_target']}")
                cursor.execute(SQL_INSERT, (
                    count, page['main_title'], page['sub_title'], page['image'],
                    page['related_target'], page['page_date'], page['page_date_str'],
                    page['page_content'], page['image_url'], page['page_url'],
                ))
                db.commit()
    finally:
        db.close()


if __name__ == "__main__":
    main()