60 lines
2.8 KiB
Python
60 lines
2.8 KiB
Python
|
|
import datetime
|
||
|
|
import os
|
||
|
|
|
||
|
|
import pymysql
|
||
|
|
from lxml import etree
|
||
|
|
|
||
|
|
# Directory holding the downloaded SURFPAC biography pages (*.html).
target_file_path = 'E:/yuxin/surfpac/pages'

# Parameterized INSERT: values are bound by the driver instead of being
# spliced into the SQL text, so quotes/backslashes in scraped HTML cannot
# break or alter the statement.
INSERT_PERSON_SQL = (
    "INSERT INTO nfm.surfpac_person (id, main_title, sub_title, image, related_target, page_date, "
    "page_date_str, page_content, image_url, page_url) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
)


def extract_related_target(sub_title):
    """Return the text inside the first ``(...)`` group of *sub_title*.

    Returns an empty string when there is no opening parenthesis or when no
    closing parenthesis follows it.
    """
    i_left = sub_title.find('(')
    if i_left < 0:
        return ''
    # Search for the closing parenthesis only after the opening one, so an
    # unbalanced subtitle never produces a truncated or reversed slice.
    i_right = sub_title.find(')', i_left + 1)
    if i_right < 0:
        return ''
    return sub_title[i_left + 1:i_right]


def format_page_date(page_date_str):
    """Convert a ``'%d %B %Y'`` date (e.g. ``'5 March 2021'``) to ``'YYYY-MM-DD'``.

    Raises:
        ValueError: if *page_date_str* does not match the expected format.
    """
    date_obj = datetime.datetime.strptime(page_date_str, '%d %B %Y')
    return date_obj.strftime('%Y-%m-%d')


def first_or_empty(values):
    """Return the first item of *values* as a plain ``str``, or ``''`` if empty."""
    return str(values[0]) if values else ''


def parse_page(html_content):
    """Extract the person fields from one page's HTML.

    Args:
        html_content: full HTML text of the page.

    Returns:
        A dict with the keys ``main_title``, ``sub_title``, ``image``,
        ``related_target``, ``page_date``, ``page_date_str``,
        ``page_content``, ``image_url`` and ``page_url``.

    Raises:
        ValueError: if the page's date text is not in ``'%d %B %Y'`` format.
    """
    selector = etree.HTML(html_content)

    main_title = ''.join(selector.xpath("//h1[@class='maintitle']/text()")).strip()
    sub_title = ''.join(selector.xpath("//h4[@class='subtitle']/text()")).strip()
    page_date_str = ''.join(
        selector.xpath("//div[@class='content-wrap']/span[@class='date']/text()")).strip()

    # Keep only non-blank text nodes, one paragraph per line.
    raw_paragraphs = selector.xpath("//div[@class='body']/div[@class='content-wrap']/text()")
    page_content = '\n'.join(para.strip() for para in raw_paragraphs if para.strip())

    # Both attributes are optional on a page; a missing one is stored as ''.
    image_url = first_or_empty(
        selector.xpath("//div[@class='image-wrapper']/img[@class='img-responsive']/@src"))
    page_url = first_or_empty(selector.xpath("//div[@class='social']/div/@data-page-url"))

    return {
        'main_title': main_title,
        'sub_title': sub_title,
        # File name component of the image URL ('' when there is no image).
        'image': image_url.split("/")[-1],
        'related_target': extract_related_target(sub_title),
        'page_date': format_page_date(page_date_str),
        'page_date_str': page_date_str,
        'page_content': page_content,
        'image_url': image_url,
        'page_url': page_url,
    }


def main():
    """Parse every ``.html`` file in ``target_file_path`` and insert it into MySQL.

    Rows are numbered 1..N in ``os.listdir`` order and written to
    ``nfm.surfpac_person``. All inserts are committed together at the end;
    the connection is closed even if parsing or insertion fails.
    """
    # NOTE(review): host and credentials are hard-coded in source; move them
    # to environment variables or a config file kept out of version control.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        count = 0
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            count += 1

            file_path = os.path.join(target_file_path, _path)
            with open(file_path, mode='r', encoding='utf-8') as target_file:
                html_content = target_file.read().replace('\n', '')

            page = parse_page(html_content)
            print(f"[No. {count}] {page['main_title']} - {page['related_target']}")

            cursor.execute(INSERT_PERSON_SQL, (
                count,
                page['main_title'],
                page['sub_title'],
                page['image'],
                page['related_target'],
                page['page_date'],
                page['page_date_str'],
                page['page_content'],
                page['image_url'],
                page['page_url'],
            ))
        # Single commit after the loop: a failure on any page leaves the
        # table untouched instead of half-populated.
        db.commit()
    finally:
        db.close()


if __name__ == "__main__":
    main()
|