# osc/research/archive/reddit_parser_profile.py
# (repository viewer header: 27 lines, 1.2 KiB, Python; history entry 2025-05-28 19:16:17 +08:00)
from lxml import etree
if __name__ == "__main__":
    # Ad-hoc check of the Reddit comment XPath selectors against a locally
    # saved HTML page: parse the page, pull author / timestamp / body text out
    # of every <shreddit-comment> element and print the body text.
    import sys

    # The context manager closes the handle even if read() raises.
    with open(r"E:\yuxin\politics.html", 'r', encoding='utf-8') as _file:
        # Newlines are stripped before parsing, as in the original script.
        rsp_text = _file.read().replace("\n", "")

    selector = etree.HTML(rsp_text)
    lines = selector.xpath("//shreddit-comment")
    # Alternative root selector kept for reference:
    # lines = selector.xpath("//article")

    counter = 0  # number of comments whose fields were all extracted
    for line in lines:
        try:
            item = {}
            # Alternative field selectors kept for reference
            # (NOTE(review): presumably for a different page layout - confirm):
            # item['user_id'] = line.xpath(".//mod-notes-opener/@user-name")[0]
            # item['user_name'] = item['user_id']
            # item['title'] = line.xpath("./@aria-label")[0]
            # item['portal'] = line.xpath(".//a[@data-testid='location-anchor'][1]/@href")[0]
            # item['url'] = 'https://www.reddit.com' + line.xpath(".//shreddit-profile-comment[1]/@href")[0]
            # item['type'] = item['portal'][3:-1]
            item['user_name'] = line.xpath("./@author")[0]
            # Replace the ISO "T" separator with a space and drop the last
            # five characters of the timestamp string.
            item['post_time'] = line.xpath(
                ".//faceplate-timeago/time/@datetime"
            )[0].replace("T", " ")[:-5]
            item['content'] = "".join(
                line.xpath(".//div[@id='-post-rtjson-content']//text()")
            ).replace("\n", "")
            counter += 1
            print(item['content'])
        except (IndexError, UnicodeEncodeError) as exc:
            # IndexError: a required attribute/element is missing, so the
            #   XPath result list is empty and [0] fails.
            # UnicodeEncodeError: the console encoding cannot represent the
            #   comment text.
            # Stay best-effort (skip this comment) but report it instead of
            # silently swallowing every exception as the bare except did.
            print("skipped comment: %r" % (exc,), file=sys.stderr)