"""Print the text of every comment found in a saved Reddit HTML page.

Run as a script.  The page to parse defaults to ``DEFAULT_HTML_PATH`` and can
be overridden by passing a path as the first command-line argument.
"""
import sys

from lxml import etree

# Saved page that is parsed when no path is given on the command line.
DEFAULT_HTML_PATH = r"E:\yuxin\politics.html"


def _parse_comment(node):
    """Build the item dict for one ``<shreddit-comment>`` element.

    Args:
        node: an lxml element selected by ``//shreddit-comment``.

    Returns:
        dict with the keys ``user_name``, ``post_time`` and ``content``.

    Raises:
        IndexError: if the element has no ``author`` attribute or contains no
            ``faceplate-timeago/time`` element with a ``datetime`` attribute.
    """
    # Other fields that were tried on these pages and left disabled:
    #   user_id : .//mod-notes-opener/@user-name
    #   title   : ./@aria-label
    #   portal  : .//a[@data-testid='location-anchor'][1]/@href
    #   url     : 'https://www.reddit.com' + .//shreddit-profile-comment[1]/@href
    #   type    : portal[3:-1]
    item = {}
    item['user_name'] = node.xpath("./@author")[0]
    # The "T" separator becomes a space and the last five characters are cut
    # off (presumably a fractional-seconds/zone suffix -- TODO confirm against
    # the datetime values actually present in the page).
    item['post_time'] = node.xpath(".//faceplate-timeago/time/@datetime")[0].replace("T", " ")[:-5]
    item['content'] = "".join(node.xpath(".//div[@id='-post-rtjson-content']//text()")).replace("\n", "")
    return item


def main(path=DEFAULT_HTML_PATH):
    """Parse the HTML file at *path* and print each comment's text to stdout.

    Comments lacking an author or a timestamp are skipped.  A one-line summary
    of how many comments were printed goes to stderr so stdout carries only
    comment text.

    Args:
        path: location of the saved HTML page, read as UTF-8.

    Returns:
        The number of comments that were printed.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as html_file:
        rsp_text = html_file.read().replace("\n", "")

    selector = etree.HTML(rsp_text)
    lines = selector.xpath("//shreddit-comment")
    # Alternative container that was tried: selector.xpath("//article")

    counter = 0
    for line in lines:
        try:
            item = _parse_comment(line)
        except IndexError:
            # A required attribute/element is absent; skipping is deliberate
            # (best effort).  Anything else is a real error and propagates.
            continue
        counter += 1
        print(item['content'])

    print("parsed", counter, "of", len(lines), "comments", file=sys.stderr)
    return counter


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_HTML_PATH)