资源简介
亚马逊评论详情页是动态加载的，这里不过多折腾，直接用 Selenium 进行爬取；再用 pandas 写入 CSV 文件，解决乱码、无序问题。
代码片段和文件信息
# coding=utf-8
import time
from selenium import webdriver
from pandas import Dataframe
class Comment(object):
def __init__(self):
    """Open a Chrome WebDriver session and prepare the review accumulator.

    Sets three attributes:
        url:    address of the product-reviews page, stored as a string.
        driver: a Selenium Chrome WebDriver instance.
        temp:   mapping of column name -> list of scraped values; the
                parse_data method appends to these lists.
    """
    # Local import keeps this method self-contained; the accumulator type
    # is only needed here.
    from collections import defaultdict

    self.url = (
        'https://www.amazon.com/PISEN-20000mAh-Portable-Capacity-External/'
        'product-reviews/B075D4SS7F/ref=cm_cr_dp_d_show_all_btm'
        '?ie=UTF8&reviewerType=all_reviews'
    )
    self.driver = webdriver.Chrome()
    # parse_data calls self.temp[<column>].append(...) without creating the
    # column first, so a plain {} raises KeyError on the first review.
    # defaultdict(list) creates each column on first access, keeps insertion
    # order, and is still a dict for any consumer that expects one.
    self.temp = defaultdict(list)
def __del__(self):
self.driver.close()
# self.file.close()
def parse_data(self):
# 获取所有节点列表
time.sleep(3)
node_list = self.driver.find_elements_by_xpath(‘//*[@id=“cm_cr-review_list“]/div/div‘)
# print(len(node_list))
# 遍历列表
for node in node_list:
self.temp[‘level‘].append(node.find_element_by_xpath(‘./div[1]/a[1]‘).get_attribute(‘title‘))
self.temp[‘name‘].append(node.find_element_by_xpath(‘./div[2]/span[1]/a‘).text)
self.temp[‘date‘].append(node.find_element_by_xpath(‘./div[2]/span[4]‘).text)
self.temp[‘colour‘].append(node.find_element
评论
共有 条评论