response = requests.get(self.comments_url % page, headers=self.headers)
print('Visiting page', page, '- status:', response.status_code)
time.sleep(random.random())
html = etree.HTML(response.text)
contents = html.xpath('//ul[contains(@class, "taptap-review-list")]/li')
Then iterate over the list and parse the individual fields from each entry:
user = content.xpath('.//a[@class="taptap-user-name"]/text()')[0] or '無名氏'
score = content.xpath('.//div[@class="item-text-score"]/i[@class="colored"]/@style')[0][7:9]
score = int(score) / 14
comment_time = content.xpath('(.//span)[4]/text()')[0]
comment = content.xpath('(.//div[@class="item-text-body"])[1]/p/text()')
comment = '\n'.join(comment)
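Note that the score is not plain text: it is read off the inline width style of the star bar. Judging from the slice [7:9] and the division by 14, the attribute presumably looks like 'width: 70px', with each star 14 pixels wide; both values are inferences from the code above, not anything documented by TapTap. A worked example:

style = 'width: 70px'    # hypothetical attribute value for a 5-star review
width = style[7:9]       # characters 7-8 are the digits: '70'
print(int(width) / 14)   # 5.0 stars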
Finally, save the data to files for later use:
comment_dir = {'user': users, 'score': scores, 'time': times, 'comment': comments}
comment_df = pd.DataFrame(comment_dir)
comment_df.to_csv('./tables/taptap_comments.csv')
comment_df['comment'].to_csv('./tables/comments.csv', index=False)
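As a quick sanity check (a sketch, not part of the original script), the saved table can be read back with pandas; index_col=0 skips the unnamed index column that to_csv wrote out:

import pandas as pd

df = pd.read_csv('./tables/taptap_comments.csv', index_col=0)
print(df.head())   # first few rows: user, score, time, comment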
Segmentation
With the data scraped, the next step is word segmentation, using the jieba library:
jieba.load_userdict('./dictionary/my_dict.txt')
with open('./tables/comments.csv', 'r', encoding='utf-8') as f:
    word_list = jieba.cut(f.read())
with open('./dictionary/ignore_dict.txt', 'r', encoding='utf-8') as f:
    ignore_words = f.read().splitlines()
The user dictionary declares words jieba doesn't know about, and the ignore dictionary is then used to filter out meaningless words:
for word in word_list:
    if word not in ignore_words:
        word = re.sub(r'[\n ]', '', word)
        if len(word) < 1:
            continue
        words.append(word)
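For reference, a minimal sketch of what jieba.cut yields. The example sentence is mine, and the exact tokens depend on which dictionaries are loaded:

import jieba

jieba.load_userdict('./dictionary/my_dict.txt')   # teach jieba the game terms first
tokens = jieba.cut('刻晴是璃月的角色')            # returns a generator of tokens
print(list(tokens))
# expected, roughly: ['刻晴', '是', '璃月', '的', '角色']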
Next, run a word frequency analysis on the segmented data:
frq = {}
for word in words:
    frq[word] = frq.get(word, 0) + 1
items = list(frq.items())
items.sort(key=lambda x: x[1], reverse=True)
print('Top 10 words by frequency:')
for i in range(10):
    word, count = items[i]
    print(word, ':', count)
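The same tally can be written more compactly with the standard library's collections.Counter; this is an equivalent sketch, not what the script above uses:

from collections import Counter

for word, count in Counter(words).most_common(10):
    print(word, ':', count)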
Word Cloud
Finally, call the WordCloud library to generate the word cloud:
wordle = word_cloud.generate(wordle_data)
image = wordle.to_image()
image.show()
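The word_cloud object is configured in the full source below (mask image, word limits, and so on). The one setting worth calling out for Chinese text is font_path: WordCloud's bundled default font has no CJK glyphs, so without a font such as simhei.ttf every word renders as empty boxes. A minimal standalone sketch, with a made-up output path:

from wordcloud import WordCloud

word_cloud = WordCloud(font_path='./fonts/simhei.ttf', background_color='white')
wordle = word_cloud.generate('示例 詞雲 測試 示例')
wordle.to_file('./images/demo_wordle.png')   # hypothetical path, for illustration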
The resulting word cloud:
Word frequencies:
Top 10 words by frequency:
鍾離 : 337
角色 : 205
氪 : 187
玩家 : 182
好 : 135
人 : 131
原神 : 130
真的 : 126
讓 : 123
這個 : 118
On the word frequency side, the ignore dictionary is still incomplete and some meaningless words haven't been filtered out yet; I don't feel like polishing it further, so it stays as is. The word cloud uses a Keqing mask. Keqing, my forever waifu! (Even though I still haven't pulled her.)
(Immune)
Source code, main file genshin_wordle.py:
import requests
import time
import random
import jieba
import re
import pandas as pd
import numpy as np
from lxml import etree
from PIL import Image
from wordcloud import WordCloud


class genshin():
    def __init__(self):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'}
        self.comments_url = 'https://www.taptap.com/app/168332/review?order=default&page=%d#review-list'

    def crawl_comments(self):
        users = []
        scores = []
        times = []
        comments = []
        # Crawl 10 pages of reviews
        for page in range(10):
            response = requests.get(self.comments_url % page, headers=self.headers)
            print('Visiting page', page, '- status:', response.status_code)
            time.sleep(random.random())
            html = etree.HTML(response.text)
            contents = html.xpath('//ul[contains(@class, "taptap-review-list")]/li')
            # Iterate over the reviews on this page
            for content in contents:
                # Parse the individual fields
                user = content.xpath('.//a[@class="taptap-user-name"]/text()')[0] or '無名氏'
                score = content.xpath('.//div[@class="item-text-score"]/i[@class="colored"]/@style')[0][7:9]
                score = int(score) / 14
                comment_time = content.xpath('(.//span)[4]/text()')[0]
                comment = content.xpath('(.//div[@class="item-text-body"])[1]/p/text()')
                comment = '\n'.join(comment)
                # Append one complete record to the arrays
                users.append(user)
                scores.append(score)
                times.append(comment_time)
                comments.append(comment)
        # Pack up the data and write it to files
        comment_dir = {'user': users, 'score': scores, 'time': times, 'comment': comments}
        comment_df = pd.DataFrame(comment_dir)
        comment_df.to_csv('./tables/taptap_comments.csv')
        comment_df['comment'].to_csv('./tables/comments.csv', index=False)
        print(comment_df)

    def word_frequency(self, words):
        frq = {}
        for word in words:
            frq[word] = frq.get(word, 0) + 1
        items = list(frq.items())
        items.sort(key=lambda x: x[1], reverse=True)
        print('Top 10 words by frequency:')
        for i in range(10):
            word, count = items[i]
            print(word, ':', count)

    def segmentation(self):
        # Load the custom and ignore dictionaries, then segment
        jieba.load_userdict('./dictionary/my_dict.txt')
        with open('./tables/comments.csv', 'r', encoding='utf-8') as f:
            word_list = jieba.cut(f.read())
        with open('./dictionary/ignore_dict.txt', 'r', encoding='utf-8') as f:
            ignore_words = f.read().splitlines()
        words = []
        # Walk through the segmented words
        for word in word_list:
            if word not in ignore_words:
                word = re.sub(r'[\n ]', '', word)
                if len(word) < 1:
                    continue
                words.append(word)
        global wordle_data
        wordle_data = ','.join(words)
        print(wordle_data)
        self.word_frequency(words)

    def generate_wordle(self):
        # Configure the word cloud parameters
        wordle_mask = np.array(Image.open('./images/keqing.jpg'))
        word_cloud = WordCloud(
            font_path='./fonts/simhei.ttf',
            background_color="white",
            mask=wordle_mask,
            max_words=300,
            min_font_size=5,
            max_font_size=100,
            width=500,
            height=350,
        )
        global wordle_data
        wordle = word_cloud.generate(wordle_data)
        image = wordle.to_image()
        image.show()
        wordle.to_file('./images/genshin_wordle.png')


genshin = genshin()
genshin.crawl_comments()
genshin.segmentation()
genshin.generate_wordle()
Custom dictionary my_dict.txt:
璃月
迪盧克
刻晴
阿貝多
米哈遊
Ignore dictionary ignore_dict.txt:
最新評論