Python 练手项目：抓取豆瓣《陈情令》评论，并制作词云图

2023-09-19 阅读 23 评论 0

摘要：一、项目简介 1. 内容：循环抓取豆瓣影评中所有观众对《陈情令》的评论，存储在文本文档中，并运用可视化库--词云对其进行分析。 2. 目标网站：https://movie.douban.com/subject/27195020/comments?start= 3. 使用软件：pycharm 4. 使用 python3.7 版本

一、项目简介

1.内容：循环抓取豆瓣影评中所有观众对《陈情令》的评论，存储在文本文档中，并运用可视化库--词云对其进行分析。

2.目标网站：

https://movie.douban.com/subject/27195020/comments?start=

3.使用软件：pycharm

4.使用 python3.7 版本

5.涉及的python类库：requests、lxml、wordcloud、numpy、PIL、jieba

二、具体思路

1.安装、导入相应的类库(本机已安装类库)

import requests
from lxml import etree #xpath
from wordcloud import WordCloud
import PIL.Image as image  #引入读取图片的工具
import numpy as np
import jieba   # 分词

2.确定网页，获取请求头，解决反爬机制，并且循环获取所有页面

# Fetch the HTML source of a page.
def getPage(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` header; the
    surrounding article describes this as the workaround for the site's
    anti-scraping check.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server. Without a timeout a
            stalled connection would block the crawl indefinitely.

    Returns:
        The decoded response body (``Response.text``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"
                      " AppleWebKit/537.36 (KHTML, like Gecko)"
                      " Chrome/63.0.3239.132 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Build the URL of every comment page.
def all_page(pages=10, page_size=20):
    """Return the list of comment-page URLs to crawl.

    Args:
        pages: Number of page URLs to generate (default 10, as in the
            original hard-coded range).
        page_size: Step between consecutive ``start`` offsets (default 20,
            as in the original hard-coded range).

    Returns:
        A list of URLs whose ``start`` value runs 0, page_size,
        2 * page_size, ... for ``pages`` entries.
    """
    base_url = "https://movie.douban.com/subject/27195020/comments?start="
    return [
        base_url + str(offset)
        for offset in range(0, pages * page_size, page_size)
    ]

3.运用xpath获取短评

# Parse every page and collect the short comments.
def parse():
    """Download each comment page and extract its short comments.

    Prints a progress line after each page.

    Returns:
        A list with one entry per page; each entry is the list of text
        nodes matched on that page.
    """
    all_comment = []
    for number, url in enumerate(all_page(), start=1):
        html = etree.HTML(getPage(url))
        # The published snippet had '//div[@]', which is not valid XPath
        # (the attribute test was lost). Restored as class="comment".
        # NOTE(review): confirm the class name against the live page markup.
        comment = html.xpath('//div[@class="comment"]//p/span/text()')
        all_comment.append(comment)
        print('第'+str(number)+'页解析并保存成功')
    return all_comment

4.存入txt文档

# Save the comments to a text file.
def save_to_txt():
    """Append every parsed page of comments to '陈情令评论集.txt'.

    Each page's comment list is written as its ``str()`` form on its own
    line. The file is opened in append mode, so repeated runs add to
    whatever the file already contains.
    """
    result = parse()
    # Open the file once for all pages; the ``with`` block closes it, so no
    # explicit close() call is needed.
    with open('陈情令评论集.txt', 'a+', encoding='utf-8') as f:
        for page_comments in result:
            f.write(str(page_comments) + '\n')

5.将文档的短评进行分词

# Segment the crawled text into words.
def trans_CN(text):
    """Segment ``text`` with jieba and join the pieces with single spaces.

    Args:
        text: The string to segment.

    Returns:
        One string containing every segment produced by ``jieba.cut``,
        separated by a space.
    """
    return " ".join(jieba.cut(text))

6.制作词云

# Build the word cloud image.
def getWordCloud(path_txt="陈情令评论集.txt", path_jpg="1.jpg",
                 path_font="C:\\Windows\\Fonts\\msyh.ttc",
                 path_out='陈情令评论词云.jpg'):
    """Render a word cloud from the saved comments and write it to disk.

    Args:
        path_txt: UTF-8 text file holding the comments.
        path_jpg: Image whose pixel array is passed to WordCloud as the
            mask (the article calls it the word-cloud shape picture).
        path_font: Font file passed to WordCloud as ``font_path``. The
            default is a Windows path; pass another font on other systems.
        path_out: File the rendered word cloud is written to.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(path_txt, encoding='utf-8') as f:
        text = f.read()

    # Blank out filler words before segmentation, in the original order.
    for filler in ("真的", "什么", "但是", "而且", "那么", "就是", "可以", "不是"):
        text = text.replace(filler, " ")

    text = trans_CN(text)

    # np.array copies the pixel data, so the image can be closed right away.
    with image.open(path_jpg) as img:
        mask = np.array(img)

    wordcloud = WordCloud(
        background_color='white',
        mask=mask,
        scale=15,
        max_font_size=80,
        font_path=path_font,
    ).generate(text)

    wordcloud.to_file(path_out)

三、代码生成

#!/usr/bin/env python
#-*- coding:utf-8 -*-
#author : Only  time:2019/8/3 0002


import requests
from lxml import etree #xpath
from wordcloud import WordCloud
import PIL.Image as image  #引入读取图片的工具
import numpy as np
import jieba   # 分词


# Fetch the HTML source of a page.
def getPage(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` header; the
    surrounding article describes this as the workaround for the site's
    anti-scraping check.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server. Without a timeout a
            stalled connection would block the crawl indefinitely.

    Returns:
        The decoded response body (``Response.text``).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"
                      " AppleWebKit/537.36 (KHTML, like Gecko)"
                      " Chrome/63.0.3239.132 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text


# Build the URL of every comment page.
def all_page(pages=10, page_size=20):
    """Return the list of comment-page URLs to crawl.

    Args:
        pages: Number of page URLs to generate (default 10, as in the
            original hard-coded range).
        page_size: Step between consecutive ``start`` offsets (default 20,
            as in the original hard-coded range).

    Returns:
        A list of URLs whose ``start`` value runs 0, page_size,
        2 * page_size, ... for ``pages`` entries.
    """
    base_url = "https://movie.douban.com/subject/27195020/comments?start="
    return [
        base_url + str(offset)
        for offset in range(0, pages * page_size, page_size)
    ]


# Parse every page and collect the short comments.
def parse():
    """Download each comment page and extract its short comments.

    Prints a progress line after each page.

    Returns:
        A list with one entry per page; each entry is the list of text
        nodes matched on that page.
    """
    all_comment = []
    for number, url in enumerate(all_page(), start=1):
        html = etree.HTML(getPage(url))
        # The published snippet had '//div[@]', which is not valid XPath
        # (the attribute test was lost). Restored as class="comment".
        # NOTE(review): confirm the class name against the live page markup.
        comment = html.xpath('//div[@class="comment"]//p/span/text()')
        all_comment.append(comment)
        print('第'+str(number)+'页解析并保存成功')
    return all_comment


# Save the comments to a text file.
def save_to_txt():
    """Append every parsed page of comments to '陈情令评论集.txt'.

    Each page's comment list is written as its ``str()`` form on its own
    line. The file is opened in append mode, so repeated runs add to
    whatever the file already contains.
    """
    result = parse()
    # Open the file once for all pages; the ``with`` block closes it, so no
    # explicit close() call is needed.
    with open('陈情令评论集.txt', 'a+', encoding='utf-8') as f:
        for page_comments in result:
            f.write(str(page_comments) + '\n')


# Segment the crawled text into words.
def trans_CN(text):
    """Segment ``text`` with jieba and join the pieces with single spaces.

    Args:
        text: The string to segment.

    Returns:
        One string containing every segment produced by ``jieba.cut``,
        separated by a space.
    """
    return " ".join(jieba.cut(text))


# Build the word cloud image.
def getWordCloud(path_txt="陈情令评论集.txt", path_jpg="1.jpg",
                 path_font="C:\\Windows\\Fonts\\msyh.ttc",
                 path_out='陈情令评论词云.jpg'):
    """Render a word cloud from the saved comments and write it to disk.

    Args:
        path_txt: UTF-8 text file holding the comments.
        path_jpg: Image whose pixel array is passed to WordCloud as the
            mask (the article calls it the word-cloud shape picture).
        path_font: Font file passed to WordCloud as ``font_path``. The
            default is a Windows path; pass another font on other systems.
        path_out: File the rendered word cloud is written to.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(path_txt, encoding='utf-8') as f:
        text = f.read()

    # Blank out filler words before segmentation, in the original order.
    for filler in ("真的", "什么", "但是", "而且", "那么", "就是", "可以", "不是"):
        text = text.replace(filler, " ")

    text = trans_CN(text)

    # np.array copies the pixel data, so the image can be closed right away.
    with image.open(path_jpg) as img:
        mask = np.array(img)

    wordcloud = WordCloud(
        background_color='white',
        mask=mask,
        scale=15,
        max_font_size=80,
        font_path=path_font,
    ).generate(text)

    wordcloud.to_file(path_out)


# Script entry point.
def main():
    """Crawl and save the comments, then render the word cloud."""
    save_to_txt()
    print('所有页面保存成功')

    getWordCloud()
    print('词云制作成功')


if __name__ == '__main__':
    main()