39 lines
1.6 KiB
Python
39 lines
1.6 KiB
Python
import os
|
|
from urllib.parse import urlparse
|
|
|
|
import pymysql
|
|
from lxml import etree
|
|
|
|
from nfm.mact.page_parser import parse_base_info
|
|
|
|
# Directory whose `.html` files are parsed by the script below (listed with
# os.listdir, so sub-directories are not descended into).
target_file_path = r"E:/yuxin/nuofang-data/activity/webpage"
|
|
|
|
# Parameterized statement: values are passed separately to cursor.execute so
# that text scraped from the HTML can never alter the SQL itself.
SQL_UPDATE = "UPDATE nfm.m_act SET info_source_url = %s WHERE internal_id = %s"


def _load_selector(html_path):
    """Parse the HTML file at *html_path* and return the lxml element tree.

    Newlines are removed from the file content before parsing.
    """
    with open(html_path, mode='r', encoding='utf-8') as html_file:
        html_content = html_file.read().replace('\n', '')
    return etree.HTML(html_content)


def main():
    """Write the source link of every page in ``target_file_path`` to the DB.

    For each ``.html`` file in the directory, the text of a fixed anchor
    element is extracted (presumably the page's source URL -- confirm against
    the page layout) and stored in ``nfm.m_act.info_source_url`` for the row
    whose ``internal_id`` equals the page's ``'内部编号'`` entry returned by
    ``parse_base_info``.

    All updates are committed together after the last file. If any file
    fails, the exception propagates, nothing is committed, and the connection
    is still closed.

    Raises:
        KeyError: if ``parse_base_info`` returns no ``'内部编号'`` entry
            (assuming it returns a dict -- confirm).
        OSError: if the directory or a file cannot be read.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to configuration or environment variables.
    db = pymysql.connect(host='39.101.194.63', port=23306,
                         user='root', passwd='passok123A', db='nfm', charset='utf8mb4')
    try:
        cursor = db.cursor()
        count = 0
        for _path in os.listdir(target_file_path):
            if not _path.endswith('.html'):
                continue
            count += 1

            selector = _load_selector(os.path.join(target_file_path, _path))

            # Joined text of the anchor; an empty string when the element is
            # missing, in which case the column is set to ''.
            source_info = selector.xpath("/html/body/div[2]/div[1]/div[2]/div[3]/div/div/div[5]/a/text()")
            source = ''.join(source_info)

            base_info = parse_base_info(selector)
            internal_id = base_info['内部编号']

            cursor.execute(SQL_UPDATE, (source, internal_id))
            print("[No. {}]".format(count))
        db.commit()
    finally:
        # Always release the connection, even when a file fails mid-run.
        db.close()


if __name__ == '__main__':
    main()
|