做生信分析,肯定需要面對各種資料,像最常見的txt,csv,固定分隔符的矩陣型別的資料,也有像json,透過鍵-值的方式儲存資料,或者xml標記語言,以標籤的方式儲存資料。
data.json
{ "name" : "ACME", "shares" : 100, "price" : 542.23, "books": ["C++", "Python", "R"]}
c.xml
<?xml version="1.0"?>
<stop>
    <id>14791</id>
    <nm>Clark &amp; Balmoral</nm>
    <sri>
        <rt>22</rt>
        <d>North Bound</d>
        <dd>North Bound</dd>
    </sri>
    <cr>22</cr>
    <pre>
        <pt>5 MIN</pt>
        <fd>Howard</fd>
        <v>1378</v>
        <rn>22</rn>
    </pre>
    <pre>
        <pt>15 MIN</pt>
        <fd>Howard</fd>
        <v>1867</v>
        <rn>22</rn>
    </pre>
</stop>
txt,csv 太常見了就不用說了。那有了這些檔案,又該如何讀取呢?
下面,簡單介紹下 python 的讀寫操作。
1、讀取txt格式資料
內建函式
a.txt
chr2	29449344	29449368	(A)24	0	+
chr2	39573062	39573089	(A)27	0	+
chr2	42553086	42553096	(A)10	0	+
chr2	47641559	47641586	(A)27	0	+
# Read a whitespace/tab-delimited text file with built-in functions only.
file = 'a.txt'

# Open the file for reading.
f = open(file, 'r')
try:
    # Read the whole file as one string and take it one line at a time.
    # splitlines() does not produce a trailing empty entry when the file
    # ends with a newline, so no spurious empty list is printed.
    for line in f.read().splitlines():
        # Columns are tab-separated; split() without an argument splits on
        # any run of one or more whitespace characters.
        line = line.split()
        print(line)
finally:
    # Always close the file, even if processing a line raised.
    f.close()

# [out]:
# ['chr2', '29449344', '29449368', '(A)24', '0', '+']
# ['chr2', '39573062', '39573089', '(A)27', '0', '+']
# ['chr2', '42553086', '42553096', '(A)10', '0', '+']
# ['chr2', '47641559', '47641586', '(A)27', '0', '+']

# Opening the file with the `with` context manager closes it automatically,
# so no manual close() is needed. Iterating the file object streams it line
# by line instead of loading everything into memory.
with open('a.txt', 'r') as f:
    for line in f:
        line = line.split()
        print(line)
2、讀取csv格式資料(其實csv與txt的操作基本互通)
# Built-in functions
# Build CSV-formatted data from the txt file: each whitespace-separated
# input line becomes one comma-separated output line.
# Both files are managed by `with`, so they are closed even on error.
with open('a.txt', 'r') as f, open('b.csv', 'w') as fc:
    for line in f:
        fields = line.split()
        fc.write(','.join(fields))
        fc.write('\n')
b.csv
csv資料的讀取方式和txt類似,只需將split()換成split(',')
csv模組
# employee_birthday.csv
名字,部門,月份
張三,會計,11
李四,IT,3
import csv

# Read employee_birthday.csv row by row; each row is a list of strings.
# newline='' lets the csv module handle line endings itself, including
# newlines embedded inside quoted fields.
with open('employee_birthday.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # The first row holds the column names.
            print(f'列名:\n {", ".join(row)}')
        else:
            print(f'\t{row[0]},{row[1]}部門,{row[2]} 入職。')
        line_count += 1
    # Total number of rows read, header included.
    print(f'行數 {line_count}。')
字典形式讀取
import csv

# Read employee_birthday.csv as dictionaries keyed by the header row.
# newline='' lets the csv module handle line endings itself.
with open('employee_birthday.csv', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # DictReader has already consumed the header line, so the first
            # `row` is the first DATA row. Joining a dict joins its keys,
            # i.e. the column names. Count the header here, then fall
            # through so this data row is printed too and not skipped.
            print(f'列名:\n {", ".join(row)}')
            line_count += 1
        print(f'\t{row["名字"]},{row["部門"]}部門,{row["月份"]} 入職。')
        line_count += 1
    # Header line plus data rows.
    print(f'行數 {line_count}。')
寫入
import csv

# Write rows (lists) to employee_file.csv.
# newline='' is required when writing with the csv module; without it an
# extra blank line appears after every row on platforms that translate
# '\n' to '\r\n' (e.g. Windows).
with open('employee_file.csv', mode='w', newline='') as employee_file:
    employee_writer = csv.writer(
        employee_file,
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    employee_writer.writerow(['John Smith', 'Accounting', 'November'])
    employee_writer.writerow(['Erica Meyers', 'IT', 'March'])
字典形式寫入
import csv

# Write rows given as dictionaries to employee_file2.csv.
# newline='' avoids blank lines between rows on platforms that translate
# '\n' to '\r\n' (e.g. Windows).
with open('employee_file2.csv', mode='w', newline='') as csv_file:
    # Header (column names)
    fieldnames = ['emp_name', 'dept', 'birth_month']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    # Write the header row
    writer.writeheader()
    # Write dictionary rows; the keys must match the header
    writer.writerow({'emp_name': 'John Smith', 'dept': 'Accounting', 'birth_month': 'November'})
    writer.writerow({'emp_name': 'Erica Meyers', 'dept': 'IT', 'birth_month': 'March'})
pandas模組
import pandas as pd

# `sep` sets the delimiter; header=None means the file has no header row;
# `names` supplies the column names to use instead.
txt = pd.read_csv(
    'a.txt',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'repeat', 'num', 'orientation'],
)
# NOTE: the variable name `csv` shadows the standard-library csv module if
# that module was imported earlier in the same session.
csv = pd.read_csv('b.csv', sep=',', header=None)
輸出檔案
# The row index is written by default; pass index=False to omit it.
# `filename` is a placeholder: set it to the desired output path first, and
# use a different path for each call so the second write does not overwrite
# the first.
txt.to_csv(filename, index=False)
csv.to_csv(filename)
3、json資料
# Using the json module
import json

# Convert between a Python dict and a JSON string.
data = {
    'name': 'ACME',
    'shares': 100,
    'price': 542.23,
    'books': ['C++', 'Python', 'R'],
}
json_str = json.dumps(data)
# [out]:
# '{"name": "ACME", "shares": 100, "price": 542.23, "books": ["C++", "Python", "R"]}'

json.loads(json_str)
# [out]:
# {'name': 'ACME', 'shares': 100, 'price': 542.23, 'books': ['C++', 'Python', 'R']}

# Save the dict to a JSON file.
with open('data.json', 'w') as f:
    json.dump(data, f)

# Load the JSON file back into a dict.
with open('data.json', 'r') as f:
    data = json.load(f)
# pandas
import pandas as pd

df = pd.read_json('data.json')
# Bare expression: displays the DataFrame in an interactive session/notebook.
df
會將 json 檔案轉換為 DataFrame 格式資料
4、xml資料
# xml module
from xml.etree.ElementTree import parse, Element

doc = parse('c.xml')
# For every <pre> child of the root, print the text of its pt, fd, v and rn
# sub-elements on one line, separated by tabs. findtext() returns None for
# a missing tag, which print() renders as "None".
for item in doc.iterfind('pre'):
    print(*(item.findtext(tag) for tag in ('pt', 'fd', 'v', 'rn')), sep='\t')
from xml.etree.ElementTree import parse, Element

doc = parse('c.xml')
root = doc.getroot()

# Remove two child elements.
root.remove(root.find('sri'))
root.remove(root.find('cr'))

# Find the position of the <nm> element among the root's children.
# Element.getchildren() was removed in Python 3.9; list(root) is the
# supported way to get the direct children.
idx = list(root).index(root.find('nm'))

# Build a new element <spam>This is a spam</spam> and insert it after <nm>.
e = Element('spam')
e.text = 'This is a spam'
root.insert(idx + 1, e)

# Write the modified XML to a new file, including the XML declaration.
doc.write('newc.xml', xml_declaration=True)
<!-- 輸出結果 -->
<?xml version='1.0' encoding='us-ascii'?><stop> <id>14791</id> <nm>Clark &amp; Balmoral</nm> <spam>This is a spam</spam> <pre> <pt>5 MIN</pt> <fd>Howard</fd> <v>1378</v> <rn>22</rn> </pre> <pre> <pt>15 MIN</pt> <fd>Howard</fd> <v>1867</v> <rn>22</rn> </pre></stop>
最新評論