到了一定年齡,父母可能會催你找女朋友,結婚。大多數的父母催婚,是父母漸漸老了,想讓你找個人照顧你,有熱飯吃,生病了有人照顧,在外面不被人欺負。當然,也有一部分來自周圍人的壓力,跟你同齡的孩子差不多都結婚了,你父母的壓力自然就來了,跟父母給孩子報課外輔導班的心理一樣。很多時候讓你成家立業,在父母看來,幫你完成成家的任務,父母的一大任務算是完成了。不然單身的男女沒個家,在父母心裡始終是個心結,這種心情,在小城鎮特別突出。父母幫你完成了結婚的任務,就不需要像以前那樣辛辛苦苦奔波賺錢了。催婚,第一,是父母對你的關心。第二,是父母的私心(雖然有時候這種私心是被動的私心)。第三,父母養育任務的完成,要開始享受生活了。
所以,今天作者就來爬取下交友網站,看看小姐姐的擇偶觀。結合博主的年齡,博主的篩選條件是:重慶,年齡21-27歲,未婚小姐姐。大姐姐們的擇偶觀我並不關心。對技術不感興趣的,下拉到後面看結論。
技術部分:網站選取
世紀佳緣得到的資訊如圖,對擇偶條件未怎麼提及。所以該網站放棄。
世紀佳緣爬取程式碼
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu(餘登武)
# @Date : 2021/2/15
# @email:[email protected]
"""Scrape jiayuan.com search-result pages and append the profiles to a CSV file."""
import json
import threading

import pandas as pd
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException

# Search endpoint; the single %s placeholder receives the page number.
SEARCH_URL = ('http://search.jiayuan.com/v2/search_v2.php'
              '?key=&sex=f&stc=2:18.24,3:155.170,23:1&sn=default&sv=1&p=%s&f=select')

OUTPUT_CSV = '世紀佳緣小姐姐資訊.csv'

# Keys copied from every entry of the response's 'userInfo' list, in CSV column order.
PROFILE_FIELDS = (
    'uid', 'nickname', 'sex', 'age', 'work_location', 'height', 'education',
    'matchCondition', 'marriage', 'income', 'shortnote', 'image',
)

# NOTE(review): the Cookie below is a hard-coded login session copied from a browser.
# It is a credential and will expire; consider loading it from the environment instead.
HEADERS = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36',
    'Cookie': 'guider_quick_search=on; SESSION_HASH=0ea6881596be6958acab86601f12c97fdc211b1d; jy_refer=www.baidu.com; accessID=20210215105936945652; user_access=1; Qs_lvt_336351=1613358012; _gscu_1380850711=133580687kjmry14; _gscbrs_1380850711=1; COMMON_HASH=03ccf3f907328da89142987423a9215b; stadate1=271766541; myloc=50%7C5001; myage=25; mysex=m; myuid=271766541; myincome=40; Qs_pv_336351=1107463009737048200%2C2408036345389375500%2C1494836557490850800%2C3408124253653422600%2C1396723355418865400; PHPSESSID=3699194bbb0a1fb7c7f3c46c813f162c; pop_avatar=1; PROFILE=272766541%3A%25E6%2580%25BB%25E8%25A3%2581%25E4%25BD%2599%3Am%3Aimages1.jyimg.com%2Fw4%2Fglobal%2Fi%3A0%3A%3A1%3Azwzp_m.jpg%3A1%3A1%3A50%3A10%3A3; main_search:272766541=%7C%7C%7C00; RAW_HASH=PBalPtMnGoSGsXuyDvb3BTznuvyG8MajCm%2AWrcDW%2Av1YkfseTjLUbLLCCHeQJ0B25bjAa%2Ak4IbveQI5X4uzQhvvD3qbP6ajy90MEyOpZDZzznTM.; is_searchv2=1; pop_1557218166=1613364302492; pop_time=1613363558012',
}

# Worker threads append to the same CSV file; serialise the writes.
_CSV_LOCK = threading.Lock()


def get_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded body when the server answers 200, otherwise ``None``
        (non-200 status, timeout, connection failure or any other
        requests error).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except (ReadTimeout, ConnectionError):
        # Timeout or network problem: give up on this page (no retry).
        return None
    except RequestException:
        # Any other requests-level failure.
        return None
    # The original author found this encoding necessary to decode the body.
    response.encoding = 'unicode_escape'
    if response.status_code == 200:
        return response.text
    return None


def pase_page(url, page=None):
    """Download one search-result page and append its profiles to the CSV.

    Args:
        url: Search URL for a single result page.
        page: Page number used in the progress message. When omitted the
            URL itself is printed instead.
    """
    html = get_page(url)
    if html is None:
        print('解析失敗')
        return
    try:
        # strict=False tolerates raw control characters inside JSON strings,
        # which otherwise raise "Invalid control character".
        payload = json.loads(str(html), strict=False)
    except ValueError:
        # The body was not JSON at all (e.g. an error or login page).
        print('解析失敗')
        return
    rows = [[person[field] for field in PROFILE_FIELDS]
            for person in payload['userInfo']]
    with _CSV_LOCK:
        # mode='a+' appends, so repeated runs keep adding rows to the same file.
        pd.DataFrame(rows).to_csv(OUTPUT_CSV, mode='a+', index=False, header=False)
    print('當前頁數{0}'.format(url if page is None else page))


def main():
    """Crawl the result pages three at a time, one thread per page."""
    for first_page in range(1, 5000, 3):
        workers = [
            threading.Thread(
                target=pase_page,
                kwargs={'url': SEARCH_URL % page, 'page': page},
            )
            for page in (first_page, first_page + 1, first_page + 2)
        ]
        for worker in workers:
            worker.start()
        # Wait for the batch so that at most three requests are in flight.
        for worker in workers:
            worker.join()


if __name__ == '__main__':
    main()
所以我爬取其他網站http://www.lovewzly.com/jiaoyou.html
網站主頁圖
點開一個小姐姐,發現有擇偶觀資訊可以提取。發現該小姐姐的網址連結為http://www.lovewzly.com/user/4270839.html
在主頁中檢視原始碼,我們可以發現小姐姐的網頁地址連結可以從主頁圖data-uid分析得到。
於是我們可以認為,爬取主頁,得到所有小姐姐的data-uid,然後遍歷每一個data-uid,根據data-uid拼接小姐姐網頁的地址。然後分析該小姐姐的擇偶觀。
網頁分析:從sources中,我們可以找到城市、年齡、星座等的數字標籤。這些標籤我們用來自己動手寫函式,用於篩選。
網頁連結如圖
下拉檢視引數。因為我只勾選了幾個條件,所以網頁連結呈現出的引數少。
檢視資料,如圖
從上發現數據沒有我想要的擇偶要求。 所以我在此網頁只取userid。然後構建小姐姐網頁地址如http://www.lovewzly.com/user/4276242.html,再從該網頁中提取小姐姐的資訊和擇偶條件.
程式碼1:根據條件提取小姐姐的userid。本次程式語言:python。其他語言也在學,但尚未成長為我的主語言,還不能殺敵。
該程式碼中我只設定了篩選條件:小姐姐年齡,性別,城市,是否婚配。
"""Collect user ids from lovewzly.com that match interactively chosen filters."""
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
import pandas as pd
import numpy as np

SEARCH_API = 'http://www.lovewzly.com/api/user/pc/list/search'
OUTPUT_CSV = '小姐姐資訊userid.csv'
MAX_PAGES = 100  # most searches have far fewer pages than this

# (startage, endage) brackets accepted by the search form.
AGE_RANGES = ((21, 30), (31, 40))

GENDER_IDS = {'男': 1, '女': 2}

# Numeric city codes as used by the site's search parameters.
CITY_IDS = {
    '北京': 52, '深圳': 77, '廣州': 76, '福州': 53, '廈門': 60, '杭州': 383,
    '青島': 284, '長沙': 197, '濟南': 283, '南京': 220, '香港': 395,
    '上海': 321, '成都': 322, '武漢': 180, '蘇州': 221, '重慶': 394,
    '南昌': 233, '南寧': 97, '合肥': 3401, '鄭州': 149, '佛山': 80,
    '珠海': 96, '昆明': 397, '石家莊': 138, '天津': 143,
}

MARRY_IDS = {'未婚': 1, '離異': 3, '喪偶': 2}


def _ask_choice(prompt, choices):
    """Prompt until the answer is a key of *choices*; return the mapped value."""
    while True:
        answer = input(prompt).strip()
        if answer in choices:
            return choices[answer]
        print('輸入無效,可選:{}'.format('、'.join(choices)))


def set_age():
    """Ask for the desired age and return its ``(startage, endage)`` bracket.

    Re-prompts when the input is not an integer or is outside 21-40, instead
    of failing later with an unbound variable.
    """
    while True:
        answer = input("請輸入對方的期望年齡:")
        try:
            age = int(answer)
        except ValueError:
            print('請輸入整數年齡')
            continue
        for startage, endage in AGE_RANGES:
            if startage <= age <= endage:
                return startage, endage
        print('年齡需在21到40之間')


def set_sex():
    """Ask for the desired gender and return the site's gender code."""
    return _ask_choice("請輸入對方的期望性別:", GENDER_IDS)


def set_city():
    """Ask for the desired city and return the site's city code."""
    return _ask_choice("請輸入對方的期望城市:", CITY_IDS)


def marry():
    """Ask for the desired marital status and return the site's code."""
    print('請輸入是否婚配。輸入字元如:未婚,離異,喪偶')
    return _ask_choice("輸入是否婚配:", MARRY_IDS)


def get_info(page, startage, endage, gender, cityid, marryid):
    """Fetch one page of search results.

    Example request:
    http://www.lovewzly.com/api/user/pc/list/search?startage=21&endage=30&gender=2&cityid=394&marry=1&page=1

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        status is not 200, or the body is not valid JSON.
    """
    params = {
        'startage': startage,
        'endage': endage,
        'gender': gender,
        'cityid': cityid,
        'marry': marryid,
        'page': page,
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(SEARCH_API, params=params, timeout=10)
        if response.status_code == 200:
            return response.json()
        return None
    except ReadTimeout:
        print('Timeout')
        return None
    except ConnectionError:
        print('Connection error')
        return None
    except (RequestException, ValueError):
        # ValueError covers an undecodable JSON body on older requests versions.
        print('Error')
        return None


def main():
    """Ask for the filters, then append every matching userid to the CSV."""
    print("請輸入你的篩選條件,開始本次姻緣:")
    startage, endage = set_age()
    gender = set_sex()
    cityid = set_city()
    marryid = marry()

    for page in range(1, MAX_PAGES + 1):
        result = get_info(page, startage, endage, gender, cityid, marryid)
        if result is None:
            # Request failed; skip this page rather than crash.
            continue
        users = (result.get('data') or {}).get('list') or []
        if not users:
            # An empty page means the results are exhausted.
            break
        userids = [item['userid'] for item in users]
        # One append per page, one userid per line.
        pd.Series(userids).to_csv(OUTPUT_CSV, mode='a+', index=False, header=False)


if __name__ == '__main__':
    main()
    print('讀取結束')
程式圖
結果圖。在此條件下,在該網站只找到454個重慶小姐姐。
再次執行程式,目標城市:成都,上海。將所有結果整合成一張
程式碼2:根據userid提取小姐姐的個人資訊和擇偶觀。任選一個小姐姐點擊,審查元素,發現資訊直接顯示在網頁原始碼中,沒有經過渲染等。所以該部分資訊提取沒有難度,不再細講。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Author: yudengwu(餘登武)
# @Date : 2021/2/15
# @email:[email protected]
"""Download lovewzly.com profile pages and save profile + partner preferences to CSV."""
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

OUTPUT_CSV = '相親網站小姐姐資料.csv'
USERID_FILE = '小姐姐資訊.txt'

HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}

# Pulls the src attribute out of the stringified <img> tag list.
_PHOTO_SRC = re.compile('.+src="(.+)"')

# Shared CSS path fragments of the profile page.
_LEFT = 'div.cm-wrapin.user-warpin > div.clearfix > div.users-left'
_BASIC = _LEFT + ' > div:nth-child(1) > div.view.fl.c6 > ul > li:nth-child({}) > span'
_HABITS = _LEFT + ' > div.clearfix.user-detail > div:nth-child(2) > div.body > ul > li:nth-child({})'
_WANTED = _LEFT + ' > div.clearfix.user-detail > div:nth-child(3) > div.body > ul > li:nth-child({})'

# (CSV column, CSS selector) in output order. The first block is the woman's
# own profile, the second block is what she asks of a partner.
FIELD_SELECTORS = (
    ('nickname', '.view.fl.c6 .nick.c3e'),
    ('age', '.f18.c3e.p2 .age.s1'),
    ('height', '.f18.c3e.p2 .height'),
    ('education', '.f18.c3e.p2 .education'),
    ('present_address', _BASIC.format(1)),
    ('professional', _BASIC.format(7)),
    ('income', _BASIC.format(8)),
    ('photo', _LEFT + ' > div:nth-child(1) > div.photo.fl > div.imgwrap > ul > li:nth-child(1) > img'),
    ('smoking', _HABITS.format(2)),        # minds a partner who smokes?
    ('drinking', _HABITS.format(4)),       # minds a partner who drinks?
    ('children', _HABITS.format(3)),       # minds a partner with children?
    ('age_man', '#userid > ' + _WANTED.format(1)),
    ('height_man', '#userid > ' + _WANTED.format(2)),
    ('money_man', '#userid > ' + _WANTED.format(3)),
    ('study_man', _WANTED.format(4)),
    ('professional_man', '#userid > ' + _WANTED.format(8)),
    ('present_addressman', '#userid > ' + _WANTED.format(6)),
)
COLUMNS = [column for column, _ in FIELD_SELECTORS]


def get_page(url):
    """Fetch *url* and return its HTML text.

    Returns:
        The body decoded as UTF-8 when the server answers 200, otherwise
        ``None`` (non-200 status or any requests-level failure).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.RequestException:
        # requests' own exceptions are not subclasses of the builtin
        # ConnectionError, so they have to be caught through this base class.
        print('程式錯誤')
        return None
    response.encoding = 'utf-8'
    if response.status_code == 200:
        return response.text
    return None


def _extract_profile(html):
    """Return one CSV row (a list of strings ordered like COLUMNS) from *html*."""
    soup = BeautifulSoup(html, 'lxml')
    row = []
    for column, selector in FIELD_SELECTORS:
        nodes = soup.select(selector)
        if column == 'photo':
            # Keep a plain string: a nested list here makes the row ragged.
            found = _PHOTO_SRC.findall(str(nodes))
            row.append(found[0] if found else '')
        else:
            row.append(''.join(node.get_text() for node in nodes))
    return row


def pase_page(url, i):
    """Scrape the profile at *url* and append it to the output CSV.

    Args:
        url: Profile page address.
        i: Zero-based position of this profile in the run; the header row is
            written only for ``i == 0``.
    """
    html = get_page(url)
    if html is None:
        # Test for None before any str() conversion, otherwise the failure
        # branch can never run and an empty row is written.
        print('解析錯誤')
        return
    frame = pd.DataFrame([_extract_profile(html)], columns=COLUMNS)
    # mode='a+' appends one row per profile.
    frame.to_csv(OUTPUT_CSV, mode='a+', index=False, header=(i == 0))


def main():
    """Read the userids (one per line) and scrape each profile page."""
    with open(USERID_FILE, encoding='gbk') as f:
        # Blank lines would otherwise produce a bogus '/user/.html' URL.
        userids = [line.strip() for line in f if line.strip()]
    for i, userid in enumerate(userids):
        print(i)
        base_url = 'http://www.lovewzly.com/user/' + str(userid) + '.html'
        pase_page(base_url, i)


if __name__ == '__main__':
    main()
得到的資料如下,2500多條資料
資料分析部分:這一部分我懶得寫程式碼了,有些累了。簡單操作,透過表格的資料透視表來簡單分析下。(參見:資料透視表教程)
對對象工資的分析:在有工資欄位的資料內,58.78%的小姐姐要求對象月入1萬以上。(錢果然還是萬能的,前段時間聽說離我很近的一個成功企業家出軌一個比他女兒稍微大點的小姐姐,禽獸呀)
單身小姐姐的學歷分佈以本科和專科生居多。果然學歷越低越不容易單身。
檢視各個學歷階段小姐姐對對象工資的要求:本科生和專科生要求對象月入1萬的人數分別為294和188。
檢視小姐姐與對象工資的區別:橫座標為小姐姐的工資,縱座標為對象工資統計個數。月收入5千到1萬的小姐姐,基本都要求對象月收入1萬以上。
小姐姐對對象的學歷要求:橫座標為小姐姐學歷,縱座標為對象學歷。
y軸為小姐姐學歷,x軸為對象學歷。學歷為本科的小姐姐還有不少人要求對象學歷為初中、高中或專科。
小姐姐對對象的身高要求:欄位太多了,簡單截圖看下,人數較多的幾個值集中在170-180。
透過此文,我發現了小姐姐們的擇偶觀。此刻的我很膨脹,我覺得她們都配不上我。