osc/research/archive/reddit_parser_profile.py

from lxml import etree

if __name__ == "__main__":
    # Ad-hoc script: parse a locally saved Reddit HTML page and print the
    # body text of every <shreddit-comment> element found in it.

    # Use a context manager so the file handle is released even if the
    # read fails part-way through.
    with open(r"E:\yuxin\politics.html", 'r', encoding='utf-8') as _file:
        # Newlines are dropped before parsing so the markup is one long line.
        rsp_text = _file.read().replace("\n", "")

    selector = etree.HTML(rsp_text)
    lines = selector.xpath("//shreddit-comment")
    # lines = selector.xpath("//article")

    # Disabled alternative field selectors, kept for reference:
    # item['user_id'] = line.xpath(".//mod-notes-opener/@user-name")[0]
    # item['user_name'] = item['user_id']
    # item['title'] = line.xpath("./@aria-label")[0]
    # item['portal'] = line.xpath(".//a[@data-testid='location-anchor'][1]/@href")[0]
    # item['url'] = 'https://www.reddit.com' + line.xpath(".//shreddit-profile-comment[1]/@href")[0]
    # item['type'] = item['portal'][3:-1]

    for line in lines:
        item = {}
        try:
            item['user_name'] = line.xpath("./@author")[0]
            # "T" separator becomes a space and the last five characters are
            # cut off (presumably a ".mmmZ"-style suffix; confirm against the
            # actual @datetime values in the saved page).
            item['post_time'] = line.xpath(".//faceplate-timeago/time/@datetime")[0].replace("T", " ")[:-5]
        except IndexError:
            # The comment has no author attribute or no timestamp node:
            # skip it, as before, but without hiding unrelated errors.
            continue

        # Joining an empty result list is safe, so this cannot raise IndexError.
        item['content'] = "".join(line.xpath(".//div[@id='-post-rtjson-content']//text()")).replace("\n", "")

        try:
            print(item['content'])
        except UnicodeEncodeError:
            # The console encoding cannot represent this text; keep the
            # script best-effort and move on to the next comment.
            continue
init 2025-05-28 19:16:17 +08:00			`from lxml import etree`

			`if __name__ == "__main__":`
			`_file = open(r"E:\yuxin\politics.html", 'r', encoding='utf-8')`
			`rsp_text = _file.read().replace("\n", "")`
			`_file.close()`
			`selector = etree.HTML(rsp_text)`
			`lines = selector.xpath("//shreddit-comment")`
			`# lines = selector.xpath("//article")`
			`counter = 0`
			`for line in lines:`
			`try:`
			`item = {}`
			`# item['user_id'] = line.xpath(".//mod-notes-opener/@user-name")[0]`
			`# item['user_name'] = item['user_id']`
			`item['user_name'] = line.xpath("./@author")[0]`
			`item['post_time'] = line.xpath(".//faceplate-timeago/time/@datetime")[0].replace("T", " ")[:-5]`
			`# item['title'] = line.xpath("./@aria-label")[0]`
			`item['content'] = "".join(line.xpath(".//div[@id='-post-rtjson-content']//text()")).replace("\n", "")`
			`# item['portal'] = line.xpath(".//a[@data-testid='location-anchor'][1]/@href")[0]`
			`# item['url'] = 'https://www.reddit.com' + line.xpath(".//shreddit-profile-comment[1]/@href")[0]`
			`# item['type'] = item['portal'][3:-1]`
			`counter += 1`
			`print(item['content'])`
			`except:`
			`pass`