原文地址:
https://github.com/AsuraDong/Blog/blob/master/Articles/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0/%E6%95%B0%E6%8D%AE%E5%8A%A0%E8%BD%BD%E5%AD%98%E5%82%A8%E5%92%8C%E6%96%87%E4%BB%B6%E6%A0%BC%E5%BC%8F.md
1.读取文本格式数据
import pandas as pd
import numpy as np
import sys
import pymysql
df = pd.read_csv('ex1.csv')
print(df)
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
df = pd.read_table('ex1.csv',sep=',') #可以使用read_table,但必须指定分隔符
# sep还可以是正则表达式
print(df)
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
df = pd.read_csv('ex2.csv',header = None)#不是每一个csv都有header
print(df)
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
df = pd.read_csv('ex2.csv',names=['a','b','c','d','names'])#指定名字
print(df)
   a   b   c   d  names
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
names=['a','b','c','d','names']
df = pd.read_csv('ex2.csv',names=names,index_col='names') #将names做成索引
print(df)
#names列的三个值成为行索引,a、b、c、d各列的值分别与之对应
       a   b   c   d
names
hello  1   2   3   4
world  5   6   7   8
foo    9  10  11  12
df = pd.read_csv('csv_mindex.csv')
print('原始样子:','\n',df)
df = pd.read_csv('csv_mindex.csv',index_col=['keys','key2'])#层次化索引.
#请注意keys和key2的顺序
print(df)
原始样子: 
   keys key2  value1  value2
0  one    a       1       2
1  one    b       3       4
2  two    a       9      10
3  two    c      13      14
           value1  value2
keys key2
one  a          1       2
     b          3       4
two  a          9      10
     c         13      14
df = pd.read_csv('ex4.csv')
print('原始样子:','\n',df)
#跳过文件的第几行
print()
df = pd.read_csv('ex4.csv',skiprows=[0,2])
print(df)
原始样子: 
                                                              # hey!
a                                           b   c   d        message
# just wanted to make things more difficult NaN NaN NaN          NaN
1                                           2   NaN 4          hello

   a  b   c  d message
0  1  2 NaN  4   hello
pd.isnull(df)
# 处理缺失值
df = pd.read_csv('ex4.csv',skiprows=[0,2],na_values=['hello'])# 接收一组用于表示缺失值的字符串
print(df)
print(pd.isnull(df))
   a  b   c  d  message
0  1  2 NaN  4      NaN
       a      b     c      d  message
0  False  False  True  False     True
sentinels = {'message':['foo','NA'],'d':['a','NaN']}# 用一个字典为各列指定不同的NA标记值
df = pd.read_csv('ex4.csv',skiprows=[0,2],na_values=sentinels)
print(df)
   a  b   c  d message
0  1  2 NaN  4   hello
2.逐块读取文本文件
# nrows参数指定只读取指定的行数。算上第一行哦
pd.read_csv('ex1.csv',nrows=4)
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th { text-align: left;}.dataframe tbody tr th { vertical-align: top;}
</style>
|   | a | b | c | d | message |
|---|---|---|---|---|---|
| 0 | 1 | 2 | 3 | 4 | hello |
| 1 | 5 | 6 | 7 | 8 | world |
| 2 | 9 | 10 | 11 | 12 | foo |
# chunksize 指定分块读取
chunks = pd.read_csv('ex1.csv',chunksize=2)
print(chunks)
<pandas.io.parsers.TextFileReader object at 0x0000007D7E4A39B0>
for chunk in chunks:
    print(chunk)
    print('='*10,)
   a  b  c  d message
0  1  2  3  4   hello
1  5  6  7  8   world
==========
   a   b   c   d message
2  9  10  11  12     foo
==========
3.将数据写出到文本格式
data = pd.read_csv('ex1.csv',nrows=3)
data.to_csv('ex1_1.csv') #to_csv写入
data.to_csv('ex1_2.csv',sep='|')# 别的分隔符
data.to_csv('ex1_1.csv',na_rep='NULL')# 缺失值会被替换为na_rep
data.to_csv(sys.stdout,index=False,header=False) # 行、列标签被禁止
# 输出到控制台
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
data.to_csv(sys.stdout,index=False,columns=['a','b'])
a,b
1,2
5,6
9,10
,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo
4.DataFrame
# 可以将json格式的数据传给DataFrame
# 也可以将数据库的rows传给DataFrame
conn = pymysql.Connect(host='172.31.238.166',port=3306,user='luowang',passwd='root', charset='UTF8',db='dyx')
cursor=conn.cursor()
sql='select * from access_log';
cursor.execute(sql)
rows= cursor.fetchall()
print(cursor.description)
(('aid', 3, None, 16, 16, 0, False), ('site_id', 3, None, 16, 16, 0, False), ('count', 3, None, 32, 32, 0, False))
# cursor.description中每个元组的第一个元素保存了列名
# pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
pd.DataFrame(rows,columns=zip(*cursor.description)[0])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-74-05969a36ac33> in <module>()
      1 # cursor.description第一个保存了列的信息
      2 # pd.DataFrame(rows,columns=[i[0] for i in cursor.description])
----> 3 pd.DataFrame(rows,columns=zip(*cursor.description)[0])

TypeError: 'zip' object is not subscriptable
[i[0] for i in cursor.description]
['aid', 'site_id', 'count']
pd.DataFrame(list(rows),columns=[i[0] for i in cursor.description]) #rows必须是list类型
<style> .dataframe thead tr:only-child th { text-align: right; }
.dataframe thead th { text-align: left;}.dataframe tbody tr th { vertical-align: top;}
</style>
|   | aid | site_id | count |
|---|---|---|---|
| 0 | 1 | 1 | 45 |
| 1 | 2 | 3 | 100 |
| 2 | 3 | 1 | 230 |
| 3 | 4 | 2 | 10 |
| 4 | 5 | 5 | 205 |
| 5 | 6 | 4 | 13 |
| 6 | 7 | 3 | 220 |
| 7 | 8 | 5 | 545 |
| 8 | 9 | 3 | 201 |
| 9 | 10 | 10 | 10 |
| 10 | 11 | 11 | 11 |