Python資料預處理(一)

首頁>技術>大漠孤駝2021-02-21 10:13

Python資料預處理(一)

資料預處理、資料選擇、數值操作

import numpy as np

import pandas as pd

#df=pd.read_excel(r'd:\mypythontest\customer.xlsx',sheet_name=0,index_col=0,usecols=[0,1,2,4,5])

# Load the first worksheet (sheet_name=0) of the customer workbook into a
# DataFrame; every commented-out example below operates on this `df`.
df = pd.read_excel(
    r'd:\mypythontest\customer.xlsx',
    sheet_name=0,
)

# Display the loaded table so the effect of later steps can be compared.
print(df)

#print(df.describe()) #describe()方法可以獲取所有數值型別欄位的分佈值

#print(df.info())

#print(df.isnull()) #判斷哪個值是缺失數值，是則返回True

#df=df.fillna(0) #用0填充NaN值

#df=df.fillna({'性別':'男'}) #用鍵值對對性別進行填充

#df=df.fillna({'性別':'男','年齡':30}) #對性別和年齡進行填充（年齡應填入數值30而非字串'30'，否則該列無法再做數值比較）

#df=df.drop_duplicates(subset=['客戶姓名','唯一識別碼'],keep='last')

#df["年齡"].dtype() #錯誤❌：dtype是屬性而非方法，不能加括號呼叫，應寫成 df["年齡"].dtype

#df["唯一識別碼"].astype("float64") #強制型別轉換（astype返回新的Series，不會修改原資料；要保留結果需寫成 df["唯一識別碼"]=df["唯一識別碼"].astype("float64")）

#df.index=np.arange(7) #為表新增行索引，np.arange(7)產生0-6的等差數列

#df.set_index("訂單編號") #重新設定行索引（set_index返回新的DataFrame，不會修改原df；要保留結果需寫成 df=df.set_index("訂單編號")）

#print(df['訂單編號']) #這種傳入列名選擇資料的方式稱為普通索引

#print(df.iloc[:,:3]) #傳入這些連續的位置區間，用iloc取得多列資料--亦稱切片索引

#df=df.rename(index={0:"零",1:"一",2:"二",3:"三"}) #重新命名行索引

#df=df.rename(columns={"訂單編號":"新訂單編號","客戶姓名":"新客戶姓名"},index={0:"零",1:"一",2:"二"})

#同時重新命名行索引和列索引

#print(df.loc[["一","二"]]) #普通索引，選擇行

#print(df.iloc[0,1]) #位置索引，選擇第0行、第1列的單一元素（若要選擇第0、1兩行，應寫成 df.iloc[[0,1]]）

#print(df[(df['年齡']<200) & (df['唯一識別碼']<102)])

#print(df.loc[["一","三"],["訂單編號","唯一識別碼"]])

#print(df[df['唯一識別碼']<104][["訂單編號","唯一識別碼"]]) #布林索引+普通索引

#print(df.iloc[:3,1:3]) #切片索引+切片索引

#from warnings import simplefilter #消除警告

#simplefilter(action='ignore',category=FutureWarning)

#print(df.ix[0:3,["客戶姓名","唯一識別碼"]]) #切片索引+普通索引（注意：.ix 自pandas 0.20起棄用，並已在1.0中移除；新版請改用 df.loc[0:3,["客戶姓名","唯一識別碼"]]）

#df["年齡"].replace([240,260,280],33,inplace=True) #inplace改變原資料

#df["年齡"].replace({240:32,260:33,280:34},inplace=True)

#df=df.sort_values(by=["收入"],ascending=False) #ascending: True升序 False降序

#df=df.sort_values(by=["性別"],na_position="first") #na_position: first表示NaN排在最前面，預設為last（排在最後）

#df=df.sort_values(by=["家屬數","收入"],ascending=[True,False]) #按照多列數值排序

#df['排名']=df["銷售ID"].rank(method="average") #對指定列進行排名，增加'排名'列

#df.insert(2,"銷售排名",df["銷售ID"].rank(method="average"))#上述語句的第二種實現方法

"""method的引數：average 返回重複值的平均排名

first 按照值在所有的待排列資料中出現的先後順序排名

min 與excel中RANK.EQ功能一樣，返回重複值的最佳排名

max 與min相反，取重複值對應的最大排名

"""

#df=df.drop(columns=['家屬數','排名']) #columns指定引數，不需要axis引數

#df=df.drop(df.index[[4,5]],axis=0)

#df=df[df['年齡']<40] #篩選

#print(df['銷售ID'].value_counts())

#print(df['銷售ID'].value_counts(normalize=True,sort=False))#normalize為所佔比

#print(df['銷售ID'].unique())

#print(df['年齡'].isin([31,21])) #在指定列中查詢

#print(df.isin(["A2",31])) #針對全表查詢

#print(pd.cut(df['年齡'],bins=[0,30,50,300])) #bins引數指明切分區間

#print(pd.qcut(df["年齡"],3)) #指明切分個數，系統自動切分，基本原則是每組個數儘可能相等

#df.insert(2,"商品類別",["cat01","cat02","cat03","cat04","cat05","cat06",""])

劇多

Python資料預處理(一)