回覆列表
  • 1 # 科學史話

    現有一個 csv檔案,包含"CNUM"和"COMPANY"兩列,資料裡包含空行,且有內容重複的行資料。

    要求:

    1)去掉空行;

    2)重複行資料只保留一行有效資料;

    3)修改"COMPANY"列的名稱為"Company_New‘;

    4)並在其後增加六列,分別為"C_col",‘D_col",‘E_col",‘F_col",‘G_col",‘H_col"。

    一,使用 python Pandas來處理: import pandas as pd

    import numpy as np

    from pandas import DataFrame,Series

    def deal_with_data(filepath,newpath):

    file_obj=open(filepath)

    df=pd.read_csv(file_obj) # 讀取csv檔案,建立 DataFrame

    df=df.reindex(columns=["CNUM","COMPANY","C_col","D_col","E_col","F_col","G_col","H_col"],fill_value=None) # 重新指定列索引

    df.rename(columns={"COMPANY":"Company_New"}, inplace = True) # 修改列名

    df=df.dropna(axis=0,how="all") # 去除 NAN 即檔案中的空行

    df["CNUM"] = df["CNUM"].astype("int32") # 將 CNUM 列的資料型別指定為 int32

    df = df.drop_duplicates(subset=["CNUM", "Company_New"], keep="first") # 去除重複行

    df.to_csv(newpath,index=False,encoding="GBK")

    file_obj.close()

    if __name__=="__main__":

    file_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY.csv"

    file_save_path=r"C:\Users\12078\Desktop\python\CNUM_COMPANY_OUTPUT.csv"

    deal_with_data(file_path,file_save_path)

    二,使用 VBA來處理: Option Base 1

    Option Explicit

    Sub main()

    On Error GoTo error_handling

    Dim wb As Workbook

    Dim wb_out As Workbook

    Dim sht As Worksheet

    Dim sht_out As Worksheet

    Dim rng As Range

    Dim usedrows As Byte

    Dim usedrows_out As Byte

    Dim dict_cnum_company As Object

    Dim str_file_path As String

    Dim str_new_file_path As String

    "assign values to variables:

    str_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY.csv"

    str_new_file_path = "C:\Users\12078\Desktop\Python\CNUM_COMPANY_OUTPUT.csv"

    Set wb = checkAndAttachWorkbook(str_file_path)

    Set sht = wb.Worksheets("CNUM_COMPANY")

    Set wb_out = Workbooks.Add

    wb_out.SaveAs str_new_file_path, xlCSV "create a csv file

    Set sht_out = wb_out.Worksheets("CNUM_COMPANY_OUTPUT")

    Set dict_cnum_company = CreateObject("Scripting.Dictionary")

    usedrows = WorksheetFunction.Max(getLastValidRow(sht, "A"), getLastValidRow(sht, "B"))

    "rename the header "COMPANY" to "Company_New",remove blank & duplicate lines/rows.

    Dim cnum_company As String

    cnum_company = ""

    For Each rng In sht.Range("A1", "A" & usedrows)

    If VBA.Trim(rng.Offset(0, 1).Value) = "COMPANY" Then

    rng.Offset(0, 1).Value = "Company_New"

    End If

    cnum_company = rng.Value & "-" & rng.Offset(0, 1).Value

    If VBA.Trim(cnum_company) <> "-" And Not dict_cnum_company.Exists(rng.Value & "-" & rng.Offset(0, 1).Value) Then

    dict_cnum_company.Add rng.Value & "-" & rng.Offset(0, 1).Value, ""

    End If

    Next rng

    "loop the keys of dict split the keyes by "-" into cnum array and company array.

    Dim index_dict As Byte

    Dim arr_cnum()

    Dim arr_Company()

    For index_dict = 0 To UBound(dict_cnum_company.keys)

    ReDim Preserve arr_cnum(1 To UBound(dict_cnum_company.keys) + 1)

    ReDim Preserve arr_Company(1 To UBound(dict_cnum_company.keys) + 1)

    arr_cnum(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(0)

    arr_Company(index_dict + 1) = Split(dict_cnum_company.keys()(index_dict), "-")(1)

    Debug.Print index_dict

    Next

    "assigns the value of the arrays to the celles.

    sht_out.Range("A1", "A" & UBound(arr_cnum)) = Application.WorksheetFunction.Transpose(arr_cnum)

    sht_out.Range("B1", "B" & UBound(arr_Company)) = Application.WorksheetFunction.Transpose(arr_Company)

    "add 6 columns to output csv file:

    Dim arr_columns() As Variant

    arr_columns = Array("C_col", "D_col", "E_col", "F_col", "G_col", "H_col") "

    sht_out.Range("C1:H1") = arr_columns

    Call checkAndCloseWorkbook(str_file_path, False)

    Call checkAndCloseWorkbook(str_new_file_path, True)

    Exit Sub

    error_handling:

    Call checkAndCloseWorkbook(str_file_path, False)

    Call checkAndCloseWorkbook(str_new_file_path, False)

    End Sub

    " 輔助函式:

    "Get last row of Column N in a Worksheet

    Function getLastValidRow(in_ws As Worksheet, in_col As String)

    getLastValidRow = in_ws.Cells(in_ws.Rows.count, in_col).End(xlUp).Row

    End Function

    Function checkAndAttachWorkbook(in_wb_path As String) As Workbook

    Dim wb As Workbook

    Dim mywb As String

    mywb = in_wb_path

    For Each wb In Workbooks

    If LCase(wb.FullName) = LCase(mywb) Then

    Set checkAndAttachWorkbook = wb

    Exit Function

    End If

    Next

    Set wb = Workbooks.Open(in_wb_path, UpdateLinks:=0)

    Set checkAndAttachWorkbook = wb

    End Function

    Function checkAndCloseWorkbook(in_wb_path As String, in_saved As Boolean)

    Dim wb As Workbook

    Dim mywb As String

    mywb = in_wb_path

    For Each wb In Workbooks

    If LCase(wb.FullName) = LCase(mywb) Then

    wb.Close savechanges:=in_saved

    Exit Function

    End If

    Next

    End Function

    三,輸出結果:

    兩種方法輸出結果相同:

    四,比較總結:

    Python pandas 內建了大量處理資料的方法,我們不需要重複造輪子,用起來很方便,程式碼簡潔的多。

    Excel VBA 處理這個需求,使用了 陣列,字典等資料結構(實際需求中,資料量往往很大,所以一些地方沒有直接使用遍歷單元格的方法),以及處理字串,陣列和字典的很多方法,對檔案的操作也很複雜,一旦出錯,除錯起來比python也較困難,程式碼已經儘量最佳化,但還是遠比 Python要多。

  • 中秋節和大豐收的關聯?
  • 春雨綿綿十首詩?