現有一個 csv檔案,包含"CNUM"和"COMPANY"兩列,資料裡包含空行,且有內容重複的行資料。
要求:
1)去掉空行;
2)重複行資料只保留一行有效資料;
3)修改"COMPANY"列的名稱為"Company_New‘;
4)並在其後增加六列,分別為"C_col",‘D_col",‘E_col",‘F_col",‘G_col",‘H_col"。
一,使用 python Pandas來處理: import pandas as pd
import numpy as np
from pandas import DataFrame,Series
def deal_with_data(filepath,newpath):
file_obj=open(filepath)
df=pd.read_csv(file_obj) # 讀取csv檔案,建立 DataFrame
df=df.reindex(columns=["CNUM","COMPANY","C_col","D_col","E_col","F_col","G_col","H_col"],fill_value=None) # 重新指定列索引
df.rename(columns={"COMPANY":"Company_New"}, inplace = True) # 修改列名
df=df.dropna(axis=0,how="all") # 去除 NAN 即檔案中的空行
df["CNUM"] = df["CNUM"].astype("int32") # 將 CNUM 列的資料型別指定為 int32
df = df.drop_duplicates(subset=["CNUM", "Company_New"], keep="first") # 去除重複行
df.to_csv(newpath,index=False,encoding="GBK")
file_obj.close()
if __name__=="__main__":
file_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY.csv"
file_save_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY_OUTPUT.csv"
deal_with_data(file_path,file_save_path)
二,使用 VBA來處理: Option Base 1
Option Explicit
Sub main()
On Error GoTo error_handling
Dim wb As Workbook
Dim wb_out As Workbook
Dim sht As Worksheet
Dim sht_out As Worksheet
Dim rng As Range
Dim usedrows As Byte
Dim usedrows_out As Byte
Dim dict_cnum_company As Object
Dim str_file_path As String
Dim str_new_file_path As String
"assign values to variables:
str_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY.csv"
str_new_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"
Set wb = checkAndAttachWorkbook(str_file_path)
Set sht = wb.Worksheets("CNUM_COMPANY")
Set wb_out = Workbooks.Add
wb_out.SaveAs str_new_file_path, xlCSV "create a csv file
Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")
Set dict_cnum_company = CreateObject("Scripting.Dictionary")
usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))
"rename the header "COMPANY" to "Company_New",remove blank & duplicate lines/rows.
Dim cnum_company As String
cnum_company = ""
For Each rng In sht.Range("A1", "A" & usedrows)
If VBA.Trim(rng.Offset(0, 1).Value) = "COMPANY" Then
rng.Offset(0, 1).Value = "Company_New"
End If
cnum_company = rng.Value & "-" & rng.Offset(0, 1).Value
If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset(0, 1).Value) Then
dict_cnum_company.Add rng.Value & "-" & rng.Offset(0, 1).Value, ""
Next rng
"loop the keys of dict split the keyes by "-" into cnum array and company array.
Dim index_dict As Byte
Dim arr_cnum()
Dim arr_Company()
For index_dict = 0 To UBound(dict_cnum_company.keys)
ReDim Preserve arr_cnum(1 To UBound(dict_cnum_company.keys) + 1)
ReDim Preserve arr_Company(1 To UBound(dict_cnum_company.keys) + 1)
arr_cnum(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(0)
arr_Company(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(1)
Debug.Print index_dict
Next
"assigns the value of the arrays to the celles.
sht_out.Range("A1", "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum)
sht_out.Range("B1", "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company)
"add 6 columns to output csv file:
Dim arr_columns() As Variant
arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col") "
sht_out.Range("C1:H1") = arr_columns
Call checkAndCloseWorkbook(str_file_path, False)
Call checkAndCloseWorkbook(str_new_file_path, True)
Exit Sub
error_handling:
Call checkAndCloseWorkbook(str_new_file_path, False)
End Sub
" 輔助函式:
"Get last row of Column N in a Worksheet
Function getLastValidRow(in_ws As Worksheet, in_col As String)
getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row
End Function
Function checkAndAttachWorkbook(in_wb_path As String) As Workbook
Dim mywb As String
mywb = in_wb_path
For Each wb In Workbooks
If LCase(wb.FullName) = LCase(mywb) Then
Set checkAndAttachWorkbook = wb
Exit Function
Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)
Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)
wb.Close savechanges:=in_saved
三,輸出結果:
兩種方法輸出結果相同:
四,比較總結:
Python pandas 內建了大量處理資料的方法,我們不需要重複造輪子,用起來很方便,程式碼簡潔的多。
Excel VBA 處理這個需求,使用了 陣列,字典等資料結構(實際需求中,資料量往往很大,所以一些地方沒有直接使用遍歷單元格的方法),以及處理字串,陣列和字典的很多方法,對檔案的操作也很複雜,一旦出錯,除錯起來比python也較困難,程式碼已經儘量最佳化,但還是遠比 Python要多。
現有一個 csv檔案,包含"CNUM"和"COMPANY"兩列,資料裡包含空行,且有內容重複的行資料。
要求:
1)去掉空行;
2)重複行資料只保留一行有效資料;
3)修改"COMPANY"列的名稱為"Company_New‘;
4)並在其後增加六列,分別為"C_col",‘D_col",‘E_col",‘F_col",‘G_col",‘H_col"。
一,使用 python Pandas來處理: import pandas as pd
import numpy as np
from pandas import DataFrame,Series
def deal_with_data(filepath,newpath):
file_obj=open(filepath)
df=pd.read_csv(file_obj) # 讀取csv檔案,建立 DataFrame
df=df.reindex(columns=["CNUM","COMPANY","C_col","D_col","E_col","F_col","G_col","H_col"],fill_value=None) # 重新指定列索引
df.rename(columns={"COMPANY":"Company_New"}, inplace = True) # 修改列名
df=df.dropna(axis=0,how="all") # 去除 NAN 即檔案中的空行
df["CNUM"] = df["CNUM"].astype("int32") # 將 CNUM 列的資料型別指定為 int32
df = df.drop_duplicates(subset=["CNUM", "Company_New"], keep="first") # 去除重複行
df.to_csv(newpath,index=False,encoding="GBK")
file_obj.close()
if __name__=="__main__":
file_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY.csv"
file_save_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY_OUTPUT.csv"
deal_with_data(file_path,file_save_path)
二,使用 VBA來處理: Option Base 1
Option Explicit
Sub main()
On Error GoTo error_handling
Dim wb As Workbook
Dim wb_out As Workbook
Dim sht As Worksheet
Dim sht_out As Worksheet
Dim rng As Range
Dim usedrows As Byte
Dim usedrows_out As Byte
Dim dict_cnum_company As Object
Dim str_file_path As String
Dim str_new_file_path As String
"assign values to variables:
str_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY.csv"
str_new_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"
Set wb = checkAndAttachWorkbook(str_file_path)
Set sht = wb.Worksheets("CNUM_COMPANY")
Set wb_out = Workbooks.Add
wb_out.SaveAs str_new_file_path, xlCSV "create a csv file
Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")
Set dict_cnum_company = CreateObject("Scripting.Dictionary")
usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))
"rename the header "COMPANY" to "Company_New",remove blank & duplicate lines/rows.
Dim cnum_company As String
cnum_company = ""
For Each rng In sht.Range("A1", "A" & usedrows)
If VBA.Trim(rng.Offset(0, 1).Value) = "COMPANY" Then
rng.Offset(0, 1).Value = "Company_New"
End If
cnum_company = rng.Value & "-" & rng.Offset(0, 1).Value
If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset(0, 1).Value) Then
dict_cnum_company.Add rng.Value & "-" & rng.Offset(0, 1).Value, ""
End If
Next rng
"loop the keys of dict split the keyes by "-" into cnum array and company array.
Dim index_dict As Byte
Dim arr_cnum()
Dim arr_Company()
For index_dict = 0 To UBound(dict_cnum_company.keys)
ReDim Preserve arr_cnum(1 To UBound(dict_cnum_company.keys) + 1)
ReDim Preserve arr_Company(1 To UBound(dict_cnum_company.keys) + 1)
arr_cnum(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(0)
arr_Company(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(1)
Debug.Print index_dict
Next
"assigns the value of the arrays to the celles.
sht_out.Range("A1", "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum)
sht_out.Range("B1", "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company)
"add 6 columns to output csv file:
Dim arr_columns() As Variant
arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col") "
sht_out.Range("C1:H1") = arr_columns
Call checkAndCloseWorkbook(str_file_path, False)
Call checkAndCloseWorkbook(str_new_file_path, True)
Exit Sub
error_handling:
Call checkAndCloseWorkbook(str_file_path, False)
Call checkAndCloseWorkbook(str_new_file_path, False)
End Sub
" 輔助函式:
"Get last row of Column N in a Worksheet
Function getLastValidRow(in_ws As Worksheet, in_col As String)
getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row
End Function
Function checkAndAttachWorkbook(in_wb_path As String) As Workbook
Dim wb As Workbook
Dim mywb As String
mywb = in_wb_path
For Each wb In Workbooks
If LCase(wb.FullName) = LCase(mywb) Then
Set checkAndAttachWorkbook = wb
Exit Function
End If
Next
Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)
Set checkAndAttachWorkbook = wb
End Function
Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)
Dim wb As Workbook
Dim mywb As String
mywb = in_wb_path
For Each wb In Workbooks
If LCase(wb.FullName) = LCase(mywb) Then
wb.Close savechanges:=in_saved
Exit Function
End If
Next
End Function
三,輸出結果:
兩種方法輸出結果相同:
四,比較總結:
Python pandas 內建了大量處理資料的方法,我們不需要重複造輪子,用起來很方便,程式碼簡潔的多。
Excel VBA 處理這個需求,使用了 陣列,字典等資料結構(實際需求中,資料量往往很大,所以一些地方沒有直接使用遍歷單元格的方法),以及處理字串,陣列和字典的很多方法,對檔案的操作也很複雜,一旦出錯,除錯起來比python也較困難,程式碼已經儘量最佳化,但還是遠比 Python要多。