豆瓣影视短评爬虫记录
1.解决wordcloud乱码:
# The font path must point at a font file that exists on this machine;
# search the system for a .ttc file and copy its path here.
w = wordcloud.WordCloud(
    width=1000,
    height=700,
    font_path="/System/Library/Fonts/PingFang.ttc",
)
2.matplotlib中文乱码:
# Select a sans-serif font that can draw the Chinese labels; shown through
# both the pyplot handle and the top-level matplotlib handle.
plt.rcParams.update({'font.sans-serif': ['Arial Unicode MS']})
matplotlib.rcParams.update({'font.sans-serif': ['Arial Unicode MS']})
3.完整可视化代码:
import matplotlib.pyplot as plt
import matplotlib
import jieba
import jieba.analyse
import xlwt
import xlrd
from wordcloud import WordCloud
import numpy as np
from collections import Counter
# Set the plotting font; the fonts on some Linux systems are problematic.
matplotlib.rcParams.update({
    'font.sans-serif': ['Arial Unicode MS'],
    'axes.unicode_minus': False,
})
# 类似comment 为评论的一些数据 [ ['1','名称','star星','赞同数','评论内容'] ,['2','名称','star星','赞同数','评论内容'] ]元组
def anylasescore(comment):
    """Draw a pie chart of the share of 1- to 5-star ratings and save it.

    Args:
        comment: Iterable of rows shaped like
            ``[index, name, star, votes, text]``. Only ``row[2]`` (the star
            rating) is read. Rows whose star value is missing, cannot be
            converted to ``int``, or lies outside 0-5 are skipped.

    Side effects:
        Prints the per-star counts, writes ``score.png`` and shows the
        figure. When no row carries a 1-5 star rating nothing is drawn,
        because the percentages would require a division by zero.
    """
    score = [0, 0, 0, 0, 0, 0]  # occurrences of 0, 1, 2, 3, 4 and 5 stars
    for row in comment:
        try:
            star = int(row[2])  # the star column may be stored as text
        except (IndexError, TypeError, ValueError):
            continue  # e.g. blank rows or comments without a rating
        # Reject out-of-range values explicitly; a negative number would
        # otherwise index the list from its end and corrupt the counts.
        if 0 <= star <= 5:
            score[star] += 1
    print(score)

    rated = sum(score[1:])  # only 1-5 stars appear in the chart
    if rated == 0:
        print('no 1-5 star ratings found; skipping the pie chart')
        return

    label = '1分', '2分', '3分', '4分', '5分'
    color = 'blue', 'orange', 'yellow', 'green', 'red'  # one colour per slice
    # score[1:] lines the counts up with the labels: slice 0 is "1分",
    # slice 4 is "5分".  The percentages add up to 100.
    size = [n * 100 / rated for n in score[1:]]
    # Offset of each slice from the centre, proportional to its share.
    explode = [n / rated / 10 for n in score[1:]]

    pie = plt.pie(size, colors=color, explode=explode, labels=label,
                  shadow=True, autopct='%1.1f%%')
    for font in pie[1]:  # slice labels
        font.set_size(8)
    for digit in pie[2]:  # percentage texts produced by autopct
        digit.set_size(8)
    plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.title(u'各个评分占比', fontsize=12)
    plt.legend(loc=0, bbox_to_anchor=(0.82, 1))
    # Shrink the legend text.
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize=6)
    plt.savefig("score.png")
    plt.show()
def getzhifang(map):
    """Plot the 15 most frequent words as a bar chart, save it and show it.

    Args:
        map: Object providing ``most_common(n)`` that yields
            ``(word, count)`` pairs, e.g. a ``collections.Counter``.

    Side effects:
        Writes ``zhifang.png`` and opens the figure window.
    """
    top = map.most_common(15)  # the 15 largest counts
    # A bar chart needs an x and a y coordinate per bar; convert both
    # columns to numpy arrays.
    words = np.array([word for word, _ in top])
    counts = np.array([times for _, times in top])
    bar_width = 0.6

    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # so Chinese labels render
    plt.figure(figsize=(8, 6))  # figure size with an 8:6 ratio
    plt.bar(words, counts, bar_width, color='blue', label='热门词频统计', alpha=0.8)
    plt.xlabel("词频")
    plt.ylabel("次数")
    plt.savefig('zhifang.png')
    plt.show()
def getciyun_most(map):
    """Render word clouds from the most frequent words and save them.

    The 150 most frequent words are written to ``img.jpg`` (and handed to
    ``plt.imshow``). Words ranked 151-300, when there are any, are written
    to ``img2.jpg``.

    Args:
        map: Object providing ``most_common(n)`` that yields
            ``(word, count)`` pairs, e.g. a ``collections.Counter``.

    Side effects:
        Prints the space-joined first batch of words and writes the image
        files. With no words at all, nothing is rendered.
    """
    words = [word for word, _ in map.most_common(300)]
    if not words:
        # WordCloud.generate() raises when the text contains no words.
        print('no words to plot; skipping the word clouds')
        return

    first_text = ' '.join(words[:150])  # WordCloud expects space-separated words
    print(first_text)

    wc = WordCloud(
        background_color="white",
        width=1500,
        height=1200,
        # A font with CJK glyphs is required, otherwise the words are drawn
        # as empty boxes.
        font_path="/System/Library/Fonts/PingFang.ttc",
        max_font_size=150,
        random_state=50,  # fixed seed
    )

    cloud = wc.generate(first_text)
    plt.imshow(cloud)
    cloud.to_file("img.jpg")

    # Second image from the next 150 words; skipped when fewer than 151
    # words are available, since generating from empty text would raise.
    rest = words[150:300]
    if rest:
        wc.generate(' '.join(rest)).to_file("img2.jpg")
    plt.axis("off")
def anylaseword(comment):
    """Count word frequencies in the comment texts and plot them.

    Each row's fifth column (``row[4]``) is segmented with jieba. Segments
    longer than one character are counted. Words seen fewer than 5 times,
    and the stop words listed below, are dropped before the counter is
    passed to ``getzhifang`` and ``getciyun_most``.

    Args:
        comment: Iterable of rows shaped like
            ``[index, name, star, votes, text]``. Rows that are too short
            or whose text column is not a string are skipped.
    """
    # Words that carry no meaning for this analysis and are filtered out.
    stop_words = ['这个', '一个', '不少', '起来', '没有', '就是', '不是', '那个', '还是',
                  '剧情', '这样', '那样', '这种', '那种', '故事', '人物', '什么']
    print(stop_words)
    ignored = set(stop_words)  # O(1) membership tests during filtering

    c = Counter()
    for row in comment:
        text = row[4] if len(row) > 4 else None
        if not isinstance(text, str):
            continue  # a non-text cell cannot be segmented
        for word in jieba.cut(text, cut_all=False):
            # Keep multi-character segments only and drop the line break.
            if len(word) > 1 and word != '\r\n':
                c[word] += 1

    # Keep words seen at least 5 times that are not stop words.
    c = Counter({word: times for word, times in c.items()
                 if times >= 5 and word not in ignored})
    print(len(c), c)
    getzhifang(c)  # bar chart of the most frequent words
    getciyun_most(c)  # word clouds
def anylase(path='comments.xls'):
    """Load crawled comments from an .xls workbook and run both analyses.

    Args:
        path: Workbook to read; defaults to ``comments.xls``. The crawler
            section of these notes saves its output as ``test.xls``, so
            pass that path (or rename the file) to analyse a fresh crawl.

    Every row of the first sheet is read as a list of cell values and the
    resulting list of rows is handed to ``anylasescore`` and
    ``anylaseword``.
    """
    book = xlrd.open_workbook(path)
    sheet = book.sheets()[0]  # first sheet of the workbook
    # sheet.nrows is the number of rows; row_values(i) returns row i's cells.
    comment = [sheet.row_values(i) for i in range(sheet.nrows)]
    anylasescore(comment)
    anylaseword(comment)
# Run the analysis only when this script is executed directly.
if __name__ == "__main__":
    anylase()
4.完整爬虫代码:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import xlwt
import xlrd
# 账号密码
def login(username, password):
    """Log in to Douban and return the response cookies as a plain dict.

    Args:
        username: Account name sent as the ``name`` form field.
        password: Account password sent as the ``password`` form field.

    Returns:
        dict: Cookies set by the login response, as produced by
        ``requests.utils.dict_from_cookiejar``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
        'Origin': 'https://accounts.douban.com',
        'content-Type': 'application/x-www-form-urlencoded',
        'x-requested-with': 'XMLHttpRequest',
        'accept': 'application/json',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive',
        'Host': 'accounts.douban.com',
    }
    # Form fields carried by the login request.
    data = {
        'ck': '',
        'name': username,
        'password': password,
        'remember': 'false',
        'ticket': '',
    }
    # The payload is deliberately not printed: it contains the cleartext
    # password.  TLS certificate verification is left at its default
    # (enabled) because credentials travel over this connection, and a
    # timeout keeps a stalled connection from hanging the script forever.
    # requests form-encodes a dict payload itself.
    req = requests.post(url, headers=header, data=data, timeout=10)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies
def _parse_comment(item):
    """Extract ``(name, star, votes, text)`` from one ``.comment-item`` node."""
    name = item.a.get('title')  # commenter name
    # Second-to-last character of the first class on the second <span> in
    # .comment-info; presumably a class such as "allstar40" whose tens digit
    # is the star count - verify against the current page markup.
    star = item.select_one('.comment-info').select('span')[1].get('class')[0][-2]
    votes = item.select_one('.votes').text  # number of "useful" votes
    text = item.select_one('.short').text  # comment body
    return name, star, votes, text


def getcomment(cookies, mvid, filename='test.xls'):
    """Crawl the short comments of one movie and save them to an .xls file.

    Pages of 20 comments are requested until a request fails, the response
    is not the expected JSON, or a page contains no comments. Each comment
    becomes one sheet row: index, name, star, votes, text. Row 0 is left
    empty and data starts at row 1.

    Args:
        cookies: Cookies of a logged-in session (as returned by ``login``),
            sent with every request.
        mvid: Douban movie id used to build the comments URL.
        filename: Output workbook path; defaults to ``test.xls``.
    """
    # Browser-like request header.
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }
    w = xlwt.Workbook(encoding='ascii')  # writable workbook
    ws = w.add_sheet('sheet1')
    start = 0
    index = 1  # next sheet row to write
    try:
        while True:
            # Build the page URL; start advances by 20 per page.
            url = ('https://movie.douban.com/subject/{}/comments?start={}'
                   '&limit=20&sort=new_score&status=P&comments_only=1'
                   ).format(mvid, start)
            start += 20
            try:
                req = requests.get(url, cookies=cookies, headers=header, timeout=10)
                # The response is JSON; the markup is under the `html` key.
                html = req.json()['html']
            except (requests.RequestException, ValueError, KeyError, TypeError) as e:
                # Network failure or a non-JSON/unexpected answer ends the
                # crawl.
                print(e)
                break
            node = BeautifulSoup(html, 'lxml').select('.comment-item')
            if not node:
                break  # an empty page means there is nothing left to fetch
            for va in node:
                try:
                    name, star, votes, comment = _parse_comment(va)
                except (AttributeError, IndexError, TypeError) as e:
                    # Skip a single malformed comment instead of aborting
                    # the whole crawl.
                    print(e)
                    continue
                print(name, star, votes, comment)
                for col, value in enumerate((index, name, star, votes, comment)):
                    ws.write(index, col, value)
                index += 1
    finally:
        # Save whatever was collected, even if an unexpected error occurred.
        w.save(filename)
# Interactive entry point: log in, then crawl the comments of one movie.
if __name__ == '__main__':
    import getpass  # local import: only the interactive entry point needs it

    username = input('输入账号:')
    # getpass prompts for the password without echoing it to the terminal.
    password = getpass.getpass('输入密码:')
    cookies = login(username, password)
    mvid = input('电影的id为:')
    getcomment(cookies, mvid)
5.参考文章:
https://juejin.im/post/6886997083052572686
https://blog.csdn.net/qq_32590631/article/details/80509741
https://blog.csdn.net/wlher/article/details/98186741