# 缺失值剔除(單個變數)
def missing_delete_var(df,threshold=None):
    """
    Drop every variable (column) whose missing rate reaches ``threshold``.

    df: input data set; it is copied, never modified by this function
    threshold: missing-rate cut-off; columns whose ``missing_pct`` is
        greater than or equal to it are removed
    return: copy of ``df`` without the dropped columns

    Relies on ``missing_cal(df)``, defined outside this block; its result is
    read through the ``col`` and ``missing_pct`` attributes, so it is
    presumably a per-column missing-rate table -- confirm against its
    definition.
    """
    df2 = df.copy()
    missing_df = missing_cal(df)
    # Filter once and reuse the result for both the names and the count.
    over_threshold = missing_df[missing_df.missing_pct>=threshold]
    missing_col = list(over_threshold.col)
    df2 = df2.drop(missing_col,axis=1)
    print("缺失率超過{}的變數個數為{}".format(threshold,over_threshold.shape[0]))
    return df2
# 缺失值剔除(單個樣本)
def missing_delete_user(df,threshold=None):
    """
    Drop every sample (row) that has at least ``threshold`` missing values.

    df: input data set; it is copied, never modified by this function
    threshold: number of missing values in a row at or above which the row
        is removed
    return: copy of ``df`` without the dropped rows
    """
    df2 = df.copy()
    missing_series = df.isnull().sum(axis=1)
    # Select rows with a positional boolean mask. Collecting enumerate()
    # positions and matching them against index *labels* is only correct for
    # a default 0..n-1 index and drops the wrong rows for any other index.
    too_many_missing = (missing_series>=threshold).values
    df2 = df2[~too_many_missing]
    print("缺失變數個數在{}以上的使用者數有{}個".format(threshold,int(too_many_missing.sum())))
    return df2
# 三、類別型變數缺失值填充
def fillna_cate_var(df,col_list,fill_type=None):
    """
    Fill the missing values of categorical variables.

    df: input data set; it is copied, never modified by this function
    col_list: list of column names to fill
    fill_type: "class" fills with the literal "unknown" (missing becomes its
        own category); "mode" fills with the most frequent value of the
        column; any other value leaves the columns untouched
    return: filled copy of ``df``
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=="class":
            df2[col] = df2[col].fillna("unknown")
        elif fill_type=="mode":
            modes = df2[col].mode()
            # mode() is empty when every value in the column is missing;
            # indexing it would raise, so such a column is left as it is.
            if not modes.empty:
                df2[col] = df2[col].fillna(modes.iloc[0])
    return df2
# 四、數值型變數缺失值填充
# 數值型變數的填充
# 針對缺失率在5%以下的變數用中位數填充
# 缺失率在5%--15%的變數用隨機森林填充,可先對缺失率較低的變數用中位數填充,再用沒有缺失的樣本來對變數作隨機森林填充
# 缺失率超過15%的變數建議當做一個類別
def fillna_num_var(df,col_list,fill_type=None,filled_df=None):
    """
    Fill the missing values of numeric variables.

    df: input data set; it is copied, never modified by this function
    col_list: list of column names to fill
    fill_type: "median" fills with the column median; "class" fills with the
        sentinel -999 (missing becomes its own category); "rf" predicts the
        missing values with a random forest; any other value leaves the
        columns untouched
    filled_df: feature columns used when fill_type is "rf"; assumed to share
        the row index of ``df`` and to hold no missing values -- confirm
        with callers
    return: filled copy of ``df``
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=="median":
            df2[col] = df2[col].fillna(df2[col].median())
        elif fill_type=="class":
            df2[col] = df2[col].fillna(-999)
        elif fill_type=="rf":
            if filled_df is None:
                raise ValueError("filled_df is required when fill_type is 'rf'")
            is_missing = df2[col].isnull()
            # Nothing to predict: calling predict() on zero rows would raise.
            if not is_missing.any():
                continue
            # Keep the target out of the features so the concatenated frame
            # never carries two columns with the same name.
            features = filled_df.drop([col],axis=1,errors="ignore")
            rf_df = pd.concat([df2[col],features],axis=1)
            known = rf_df[rf_df[col].notnull()]
            unknown = rf_df[rf_df[col].isnull()]
            x_train = known.drop([col],axis=1)
            y_train = known[col]
            x_pre = unknown.drop([col],axis=1)
            # Fixed seed keeps the imputed values reproducible between runs.
            rf = RandomForestRegressor(random_state=0)
            rf.fit(x_train,y_train)
            y_pre = rf.predict(x_pre)
            df2.loc[is_missing,col] = y_pre
    return df2
# 缺失值剔除(單個變數)
def missing_delete_var(df,threshold=None):
    """
    Remove the columns of ``df`` whose missing rate is at or above ``threshold``.

    df: input data set; a copy is returned and the argument is not modified
        by this function
    threshold: missing-rate cut-off compared against ``missing_pct`` with >=
    return: copy of ``df`` without the flagged columns

    The per-column statistics come from ``missing_cal(df)``, defined outside
    this block; only its ``col`` and ``missing_pct`` attributes are read here.
    """
    df2 = df.copy()
    stats = missing_cal(df)
    flagged = stats[stats.missing_pct>=threshold]
    drop_count = flagged.shape[0]
    drop_cols = list(flagged.col)
    df2 = df2.drop(drop_cols,axis=1)
    print("缺失率超過{}的變數個數為{}".format(threshold,drop_count))
    return df2
# 缺失值剔除(單個樣本)
def missing_delete_user(df,threshold=None):
    """
    Drop every sample (row) that has at least ``threshold`` missing values.

    df: input data set; it is copied, never modified by this function
    threshold: number of missing values in a row at or above which the row
        is removed
    return: copy of ``df`` without the dropped rows
    """
    df2 = df.copy()
    missing_series = df.isnull().sum(axis=1)
    # Select rows with a positional boolean mask. Collecting enumerate()
    # positions and matching them against index *labels* is only correct for
    # a default 0..n-1 index and drops the wrong rows for any other index.
    too_many_missing = (missing_series>=threshold).values
    df2 = df2[~too_many_missing]
    print("缺失變數個數在{}以上的使用者數有{}個".format(threshold,int(too_many_missing.sum())))
    return df2
# 三、類別型變數缺失值填充
def fillna_cate_var(df,col_list,fill_type=None):
    """
    Fill the missing values of categorical variables.

    df: input data set; it is copied, never modified by this function
    col_list: list of column names to fill
    fill_type: "class" fills with the literal "unknown" (missing becomes its
        own category); "mode" fills with the most frequent value of the
        column; any other value leaves the columns untouched
    return: filled copy of ``df``
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=="class":
            df2[col] = df2[col].fillna("unknown")
        elif fill_type=="mode":
            modes = df2[col].mode()
            # mode() is empty when every value in the column is missing;
            # indexing it would raise, so such a column is left as it is.
            if not modes.empty:
                df2[col] = df2[col].fillna(modes.iloc[0])
    return df2
# 四、數值型變數缺失值填充
# 數值型變數的填充
# 針對缺失率在5%以下的變數用中位數填充
# 缺失率在5%--15%的變數用隨機森林填充,可先對缺失率較低的變數用中位數填充,再用沒有缺失的樣本來對變數作隨機森林填充
# 缺失率超過15%的變數建議當做一個類別
def fillna_num_var(df,col_list,fill_type=None,filled_df=None):
    """
    Fill the missing values of numeric variables.

    df: input data set; it is copied, never modified by this function
    col_list: list of column names to fill
    fill_type: "median" fills with the column median; "class" fills with the
        sentinel -999 (missing becomes its own category); "rf" predicts the
        missing values with a random forest; any other value leaves the
        columns untouched
    filled_df: feature columns used when fill_type is "rf"; assumed to share
        the row index of ``df`` and to hold no missing values -- confirm
        with callers
    return: filled copy of ``df``
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type=="median":
            df2[col] = df2[col].fillna(df2[col].median())
        elif fill_type=="class":
            df2[col] = df2[col].fillna(-999)
        elif fill_type=="rf":
            if filled_df is None:
                raise ValueError("filled_df is required when fill_type is 'rf'")
            is_missing = df2[col].isnull()
            # Nothing to predict: calling predict() on zero rows would raise.
            if not is_missing.any():
                continue
            # Keep the target out of the features so the concatenated frame
            # never carries two columns with the same name.
            features = filled_df.drop([col],axis=1,errors="ignore")
            rf_df = pd.concat([df2[col],features],axis=1)
            known = rf_df[rf_df[col].notnull()]
            unknown = rf_df[rf_df[col].isnull()]
            x_train = known.drop([col],axis=1)
            y_train = known[col]
            x_pre = unknown.drop([col],axis=1)
            # Fixed seed keeps the imputed values reproducible between runs.
            rf = RandomForestRegressor(random_state=0)
            rf.fit(x_train,y_train)
            y_pre = rf.predict(x_pre)
            df2.loc[is_missing,col] = y_pre
    return df2