# Listing metadata: Python, 27 lines, 1.2 KiB
from lxml import etree
|
|
|
|
def parse_comment(node):
    """Extract author, timestamp and text from one ``shreddit-comment`` node.

    Args:
        node: An element exposing ``xpath(expr)`` that returns a list of
            matches (the script passes lxml elements).

    Returns:
        A dict with the keys ``user_name``, ``post_time`` and ``content``,
        or ``None`` when the author attribute or the timestamp is missing.
    """
    # Disabled field extractions, kept for reference:
    #   user_id: .//mod-notes-opener/@user-name
    #   title:   ./@aria-label
    #   portal:  .//a[@data-testid='location-anchor'][1]/@href
    #   url:     'https://www.reddit.com' + .//shreddit-profile-comment[1]/@href
    #   type:    portal[3:-1]
    try:
        author = node.xpath("./@author")[0]
        stamp = node.xpath(".//faceplate-timeago/time/@datetime")[0]
    except IndexError:
        # A required field is absent on this node; the caller skips it.
        return None

    text_parts = node.xpath(".//div[@id='-post-rtjson-content']//text()")
    return {
        'user_name': author,
        # "T" becomes a space and the last five characters are dropped
        # (presumably a fractional-second/zone suffix -- confirm against
        # the actual page data).
        'post_time': stamp.replace("T", " ")[:-5],
        'content': "".join(text_parts).replace("\n", ""),
    }


if __name__ == "__main__":
    # The context manager closes the file even if reading fails.
    with open(r"E:\yuxin\politics.html", 'r', encoding='utf-8') as _file:
        rsp_text = _file.read().replace("\n", "")

    selector = etree.HTML(rsp_text)
    # Alternative node selector kept for reference: "//article"
    for line in selector.xpath("//shreddit-comment"):
        item = parse_comment(line)
        if item is None:
            continue
        try:
            print(item['content'])
        except UnicodeEncodeError:
            # The console encoding cannot represent this text; skip the
            # comment instead of aborting the whole run.
            continue