# osc/research/archive/reddit_parser_profile.py

from lxml import etree

def extract_comment(node):
    """Build a comment record from one ``<shreddit-comment>`` element.

    Args:
        node: An object exposing ``xpath(query)``; in this script, one of the
            lxml elements matched by ``//shreddit-comment``.

    Returns:
        A dict with ``user_name``, ``post_time`` and ``content`` keys, or
        ``None`` when the element has no ``author`` attribute or no
        ``faceplate-timeago/time/@datetime`` value.
    """
    authors = node.xpath("./@author")
    timestamps = node.xpath(".//faceplate-timeago/time/@datetime")
    # Check for missing required fields explicitly instead of relying on an
    # IndexError being swallowed by a blanket exception handler.
    if not authors or not timestamps:
        return None
    text_parts = node.xpath(".//div[@id='-post-rtjson-content']//text()")

    # Disabled selectors carried over from the original script (never executed):
    #   user_id: .//mod-notes-opener/@user-name   (user_name mirrored user_id)
    #   title:   ./@aria-label
    #   portal:  .//a[@data-testid='location-anchor'][1]/@href
    #   url:     'https://www.reddit.com' + .//shreddit-profile-comment[1]/@href
    #   type:    portal[3:-1]
    return {
        'user_name': authors[0],
        # Swap the "T" separator for a space and drop the last five characters
        # (presumably a ".mmmZ" suffix -- TODO confirm against real pages).
        'post_time': timestamps[0].replace("T", " ")[:-5],
        'content': "".join(text_parts).replace("\n", ""),
    }


if __name__ == "__main__":
    import sys

    # The context manager closes the file even when read() raises.
    with open(r"E:\yuxin\politics.html", 'r', encoding='utf-8') as html_file:
        rsp_text = html_file.read().replace("\n", "")
    selector = etree.HTML(rsp_text)
    comment_nodes = selector.xpath("//shreddit-comment")
    # comment_nodes = selector.xpath("//article")
    skipped = 0
    for node in comment_nodes:
        item = extract_comment(node)
        if item is None:
            skipped += 1
            continue
        try:
            print(item['content'])
        except UnicodeEncodeError:
            # stdout's encoding cannot represent this text. The script has
            # always been best-effort here, so skip the comment and carry on.
            skipped += 1
    if skipped:
        # Surface what used to be silently discarded, without touching stdout.
        print("skipped %d of %d comments" % (skipped, len(comment_nodes)),
              file=sys.stderr)