首頁>技術>

# Request one page of the review list, pause briefly, then pull the <li>
# entries that hold the individual reviews.
resp = requests.get(self.comments_url % page, headers=self.headers)
print('訪問第', page, '頁,狀態是', resp.status_code, '。')
# Random short pause so consecutive requests are not fired back-to-back.
time.sleep(random.random())
tree = etree.HTML(resp.text)
contents = tree.xpath('//ul[contains(@class, "taptap-review-list")]/li')

然後遍歷列表解析出各個欄位:

# Parse one review entry into its fields.
# Fix: the original did `xpath(...)[0] or '無名氏'`, which indexes BEFORE the
# fallback — an entry with no user node raised IndexError instead of using
# the placeholder name.
user_nodes = content.xpath('.//a[@class="taptap-user-name"]/text()')
user = (user_nodes[0] if user_nodes else '') or '無名氏'
# The inline style encodes the rating as a width; chars [7:9] look like the
# pixel count and /14 maps it to a 0-5 star scale — assumption from the
# constants, TODO confirm against the live page markup.
score = content.xpath('.//div[@class="item-text-score"]/i[@class="colored"]/@style')[0][7:9]
score = int(score) / 14
comment_time = content.xpath('(.//span)[4]/text()')[0]
# A review body may span several <p> tags; join them with newlines.
comment = content.xpath('(.//div[@class="item-text-body"])[1]/p/text()')
comment = '\n'.join(comment)

最後把資料存入檔案供之後使用:

# Bundle the parallel lists into a DataFrame and persist everything; the
# comment column is also written on its own for the segmentation step.
comment_df = pd.DataFrame(
    {'user': users, 'score': scores, 'time': times, 'comment': comments}
)
comment_df.to_csv('./tables/taptap_comments.csv')
comment_df['comment'].to_csv('./tables/comments.csv', index=False)
分詞

爬蟲拿到了資料,接下來就要對資料進行分詞,這裡使用的是jieba庫:

# Teach jieba the project-specific vocabulary, segment the crawled comments,
# and load the ignore list (one word per line).
jieba.load_userdict('./dictionary/my_dict.txt')
with open('./tables/comments.csv', 'r', encoding='utf-8') as f:
    word_list = jieba.cut(f.read())
with open('./dictionary/ignore_dict.txt', 'r', encoding='utf-8') as f:
    ignore_words = f.read().splitlines()

其中載入使用者詞典宣告一些jieba中沒有的詞,再用忽略詞典過濾掉一些無意義的詞:

# Filter the segmented tokens.
# Fix: strip whitespace BEFORE the ignore-list test — the original checked
# membership first, so a token carrying a trailing newline/space never matched
# the ignore list. A set also makes each membership test O(1).
ignore_set = set(ignore_words)
for word in word_list:
    word = re.sub(r'[\n ]', '', word)
    if word and word not in ignore_set:
        words.append(word)

之後對分詞後的資料進行詞頻分析:

# Count token frequencies and print the 10 most common.
# Fix: slice instead of `for i in range(10)` — the original raised IndexError
# whenever there were fewer than 10 distinct words.
frq = {}
for word in words:
    frq[word] = frq.get(word, 0) + 1
items = sorted(frq.items(), key=lambda x: x[1], reverse=True)
print('詞頻前10統計如下:')
for word, count in items[:10]:
    print(word, ':', count)
詞雲

最後呼叫WordCloud庫生成詞雲即可:

# Build the word cloud from the joined tokens and display it.
# Fix: dropped the stray trailing "123" — fused source line numbers from the
# article extraction, not valid Python.
wordle = word_cloud.generate(wordle_data)
image = wordle.to_image()
image.show()
結果

詞雲:

詞頻:

詞頻前10統計如下:
鍾離 : 337
角色 : 205
氪 : 187
玩家 : 182
好 : 135
人 : 131
原神 : 130
真的 : 126
讓 : 123
這個 : 118

詞頻統計這裡,忽略詞典還不完善,還有無意義的詞沒過濾掉,不想寫了,就這樣吧。詞雲用了晴寶的蒙版,刻晴,永遠的lp!(雖然我還沒抽到)

(免疫)

原始碼

主檔案genshin_wordle.py:

import requests
import time
import random
import jieba
import re
import pandas as pd
import numpy as np
from lxml import etree
from PIL import Image
from wordcloud import WordCloud


class genshin():
    """Crawl TapTap reviews of Genshin Impact, segment them with jieba,
    print a word-frequency top 10, and render a word cloud image."""

    def __init__(self):
        # Desktop browser User-Agent so TapTap serves the regular page.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        # %d is substituted with the page number while crawling.
        self.comments_url = 'https://www.taptap.com/app/168332/review?order=default&page=%d#review-list'
        # Filled by segmentation(), consumed by generate_wordle().
        # Replaces the original `global wordle_data` plumbing.
        self.wordle_data = ''

    def crawl_comments(self):
        """Scrape 10 pages of reviews and write them to CSV files."""
        users = []
        scores = []
        times = []
        comments = []
        # Crawl 10 pages of reviews.
        for page in range(10):
            response = requests.get(self.comments_url % page, headers=self.headers)
            print('訪問第', page, '頁,狀態是', response.status_code, '。')
            # Random short pause between requests.
            time.sleep(random.random())
            html = etree.HTML(response.text)
            contents = html.xpath('//ul[contains(@class, "taptap-review-list")]/li')
            # Walk the reviews on this page.
            for content in contents:
                # Fix: guard against a missing user node — `[0] or '無名氏'`
                # indexed before the fallback and raised IndexError.
                user_nodes = content.xpath('.//a[@class="taptap-user-name"]/text()')
                user = (user_nodes[0] if user_nodes else '') or '無名氏'
                # The style attribute encodes the rating as a width; chars
                # [7:9] hold the pixel count and /14 maps it onto a 0-5 star
                # scale — assumption from the constants, TODO confirm.
                score = content.xpath('.//div[@class="item-text-score"]/i[@class="colored"]/@style')[0][7:9]
                score = int(score) / 14
                comment_time = content.xpath('(.//span)[4]/text()')[0]
                # A body may span several <p> tags; join them with newlines.
                comment = content.xpath('(.//div[@class="item-text-body"])[1]/p/text()')
                comment = '\n'.join(comment)
                # Store one complete record.
                users.append(user)
                scores.append(score)
                times.append(comment_time)
                comments.append(comment)
        # Bundle the data and write it out.
        comment_dir = {'user': users, 'score': scores, 'time': times, 'comment': comments}
        comment_df = pd.DataFrame(comment_dir)
        comment_df.to_csv('./tables/taptap_comments.csv')
        comment_df['comment'].to_csv('./tables/comments.csv', index=False)
        print(comment_df)

    def word_frequency(self, words):
        """Print the 10 most frequent tokens (fewer if the list is short)."""
        frq = {}
        for word in words:
            frq[word] = frq.get(word, 0) + 1
        items = sorted(frq.items(), key=lambda x: x[1], reverse=True)
        print('詞頻前10統計如下:')
        # Fix: slice instead of range(10) — no IndexError when there are
        # fewer than 10 distinct words.
        for word, count in items[:10]:
            print(word, ':', count)

    def segmentation(self):
        """Segment the crawled comments and run the frequency report."""
        # Load the custom dictionary, then segment the comment file.
        jieba.load_userdict('./dictionary/my_dict.txt')
        with open('./tables/comments.csv', 'r', encoding='utf-8') as f:
            word_list = jieba.cut(f.read())
        with open('./dictionary/ignore_dict.txt', 'r', encoding='utf-8') as f:
            # A set gives O(1) membership tests in the filter loop.
            ignore_words = set(f.read().splitlines())
        words = []
        for word in word_list:
            # Fix: strip whitespace BEFORE the ignore test, so tokens with a
            # trailing newline/space still match the ignore list.
            word = re.sub(r'[\n ]', '', word)
            if word and word not in ignore_words:
                words.append(word)
        self.wordle_data = ','.join(words)
        print(self.wordle_data)
        self.word_frequency(words)

    def generate_wordle(self):
        """Render the word cloud over the mask image, show and save it."""
        wordle_mask = np.array(Image.open('./images/keqing.jpg'))
        word_cloud = WordCloud(
            font_path='./fonts/simhei.ttf',
            background_color="white",
            mask=wordle_mask,
            max_words=300,
            min_font_size=5,
            max_font_size=100,
            width=500,
            height=350,
        )
        wordle = word_cloud.generate(self.wordle_data)
        image = wordle.to_image()
        image.show()
        wordle.to_file('./images/genshin_wordle.png')


genshin = genshin()
genshin.crawl_comments()
genshin.segmentation()
genshin.generate_wordle()

自定義詞典my_dict.txt:

璃月
迪盧克
刻晴
阿貝多
米哈遊

忽略詞典ignore_dict.txt:

11
最新評論
  • BSA-TRITC(10mg/ml) TRITC-BSA 牛血清白蛋白改性標記羅丹明
  • Python小遊戲:亂點鴛鴦譜生成七言絕句