In [1]:
import csv  # create the CSV file with Python's built-in csv library
# newline='' stops csv.writer emitting blank rows on Windows;
# `with` guarantees the handle is closed even if a write fails
# (the original fp.close() was skipped on any exception).
with open('H:/python数据分析/数据/ch4ex1.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    # header row first, then the data rows, written in one call
    writer.writerows([
        ('id', 'name', 'grade'),
        ('1', 'lucky', '87'),
        ('2', 'peter', '92'),
        ('3', 'lili', '85'),
    ])
In [2]:
!type H:\python数据分析\数据\ch4ex1.csv
'''通过!type方法查看数据,type方法只适用于Windows系统,UNIX系统使用!cat命令。'''
id,name,grade
1,lucky,87
2,peter,92
3,lili,85
In [3]:
import pandas as pd
# Read the CSV with read_csv.  An explicit open() handle is passed
# because the path contains Chinese characters, which can break
# read_csv's own path handling on this platform.
csv_handle = open('H:/python数据分析/数据/ch4ex1.csv')
df = pd.read_csv(csv_handle)
df
Out[3]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [4]:
# read_table also parses CSV files once the delimiter is given explicitly.
table_handle = open('H:/python数据分析/数据/ch4ex1.csv')
df = pd.read_table(table_handle, sep=',')
df
Out[4]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [5]:
# By default the DataFrame gets a fresh 0-based integer row index;
# index_col='id' promotes the id column to the row index instead.
df = pd.read_csv(
    open('H:/python数据分析/数据/ch4ex1.csv'),
    index_col='id',
)
df
Out[5]:
name | grade | |
id | ||
1 | lucky | 87 |
2 | peter | 92 |
3 | lili | 85 |
In [6]:
import csv  # create the CSV file with Python's built-in csv library
# `with` closes the handle even on error; newline='' avoids the extra
# blank line csv.writer would otherwise produce on Windows.
with open('H:/python数据分析/数据/ch4ex2.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows([
        ('school', 'id', 'name', 'grade'),  # header row
        ('a', '1', 'lucky', '87'),
        ('a', '2', 'peter', '92'),
        ('a', '3', 'lili', '85'),
        ('b', '1', 'coco', '78'),
        ('b', '2', 'kevin', '87'),
        ('b', '3', 'heven', '96'),
    ])
In [7]:
!type H:\python数据分析\数据\ch4ex2.csv # 查看数据
school,id,name,grade
a,1,lucky,87
a,2,peter,92
a,3,lili,85
b,1,coco,78
b,2,kevin,87
b,3,heven,96
In [8]:
# Hierarchical (MultiIndex) row labels: pass index_col a list mixing
# column positions and column names.
df = pd.read_csv(
    open('H:/python数据分析/数据/ch4ex2.csv'),
    index_col=[0, 'id'],
)
df
Out[8]:
name | grade | ||
school | id | ||
a | 1 | lucky | 87 |
2 | peter | 92 | |
3 | lili | 85 | |
b | 1 | coco | 78 |
2 | kevin | 87 | |
3 | heven | 96 |
In [9]:
import csv  # create the CSV file with Python's built-in csv library
# This file deliberately has NO header row — it is used below to
# demonstrate header=None and names=.
with open('H:/python数据分析/数据/ch4ex3.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerows([
        ('1', 'lucky', '87'),
        ('2', 'peter', '92'),
        ('3', 'lili', '85'),
    ])
In [10]:
!type H:\python数据分析\数据\ch4ex3.csv # 查看数据
1,lucky,87
2,peter,92
3,lili,85
In [12]:
# With default settings read_csv promotes the first line to the header —
# wrong for this file, which has no header row (1,lucky,87 becomes the
# column labels).
df = pd.read_csv(open('H:/python数据分析/数据/ch4ex3.csv'))
df
Out[12]:
1 | lucky | 87 | |
0 | 2 | peter | 92 |
1 | 3 | lili | 85 |
In [13]:
# header=None declares that the file has no header row: every line is
# kept as data, and pandas assigns default integer column labels
# (0, 1, 2, ...) instead of consuming the first row as the header.
df = pd.read_csv(
    open('H:/python数据分析/数据/ch4ex3.csv'),
    header=None,
)
df
Out[13]:
0 | 1 | 2 | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [14]:
# names= supplies explicit column labels.  When names is given, pandas
# behaves as if header=None: no file row is consumed as a header.
df = pd.read_csv(
    open('H:/python数据分析/数据/ch4ex3.csv'),
    names=['id', 'name', 'grade'],
)
df
Out[14]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [15]:
import csv  # create a CSV file with comment-like junk lines, to demo skiprows
with open('H:/python数据分析/数据/ch4ex4.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    writer.writerow(['#This is grade'])  # junk line before the header
    writer.writerows([
        ('id', 'name', 'grade'),
        ('1', 'lucky', '87'),
        ('2', 'peter', '92'),
        ('3', 'lili', '85'),
    ])
    writer.writerow(['#time'])  # junk trailing line
In [16]:
!type H:\python数据分析\数据\ch4ex4.csv # 查看数据
#This is grade
id,name,grade
1,lucky,87
2,peter,92
3,lili,85
#time
In [17]:
# skiprows=[0, 5] drops the first and last lines (the two '#...' junk
# lines) by 0-based position, leaving the header plus three data rows.
# (An integer skiprows=n would instead skip the first n lines.)
df = pd.read_csv(
    open('H:/python数据分析/数据/ch4ex4.csv'),
    skiprows=[0, 5],
)
df
Out[17]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [19]:
# nrows limits how many data rows are read from the file.
df = pd.read_csv(open('H:/python数据分析/数据/titanic.csv'), nrows=10)
df
Out[19]:
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
In [20]:
# usecols restricts the load to a subset of the columns.
df = pd.read_csv(
    open('H:/python数据分析/数据/titanic.csv'),
    nrows=10,
    usecols=['Survived', 'Sex'],
)
df
Out[20]:
Survived | Sex | |
0 | 0 | male |
1 | 1 | female |
2 | 1 | female |
3 | 1 | female |
4 | 0 | male |
5 | 0 | male |
6 | 0 | male |
7 | 0 | male |
8 | 1 | female |
9 | 1 | female |
In [21]:
# Very large files should be read in chunks.  First inspect the Titanic
# survivor data with info(): it holds 891 rows in total.
df = pd.read_csv(open('H:/python数据分析/数据/titanic.csv'))
df.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
In [22]:
# chunksize makes read_csv return a TextFileReader iterator rather than
# a DataFrame: each iteration yields at most `chunksize` rows, so only
# one chunk's worth of memory is in use at a time — an effective way to
# bound memory consumption on big files.
chunker = pd.read_csv(
    open('H:/python数据分析/数据/titanic.csv'),
    chunksize=100,
)
chunker
Out[22]:
In [23]:
# Baseline: count passengers by gender on the full, non-chunked frame.
df = pd.read_csv(open('H:/python数据分析/数据/titanic.csv'))
df['Sex'].value_counts()
Out[23]:
male 577
female 314
Name: Sex, dtype: int64
In [24]:
from pandas import Series
import pandas as pd

# Iterate over the TextFileReader and accumulate the per-chunk gender
# counts.  The accumulator starts as an explicitly-typed empty Series:
# Series([]) with no dtype is deprecated in modern pandas (it would
# warn and default to an inferred dtype).
chunker = pd.read_csv(open('H:/python数据分析/数据/titanic.csv'), chunksize=100)
sex = Series(dtype='float64')
for chunk in chunker:
    # fill_value=0 treats a label missing from either side as 0,
    # so add() never produces NaN for labels absent in one chunk
    sex = sex.add(chunk['Sex'].value_counts(), fill_value=0)
sex
Out[24]:
male 577.0
female 314.0
dtype: float64
In [25]:
# Create a '?'-separated text file.  In the original transcription the
# '\n' escape sequences were garbled into literal line breaks, which is
# a syntax error; reconstructed here.  write() is used instead of
# writelines() since each call emits a single string.
# NOTE(review): mode 'a+' appends on every re-run, duplicating rows —
# 'w' would be safer for reproducibility; kept as-is to match the
# original behavior.
with open('H:/python数据分析/数据/ch4ex6.txt', 'a+') as fp:
    fp.write('id?name?grade' + '\n')
    fp.write('1?lucky?87' + '\n')
    fp.write('2?peter?92' + '\n')
    fp.write('3?lili?85' + '\n')
In [26]:
!type H:\python数据分析\数据\ch4ex6.txt # 查看数据
id?name?grade
1?lucky?87
2?peter?92
3?lili?85
In [27]:
import pandas as pd
df = pd.read_table(open('H:/python数据分析/数据/ch4ex6.txt'),sep='?') # 读取TXT文件
'''通过read_table函数中的sep参数进行分隔符的指定'''
df
Out[27]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [28]:
!type H:\python数据分析\数据\ch4ex7.txt # 查看TXT文件,以空格隔开的文件
id name grade
1 lucky 87
2 peter 92
3 lili 85
In [29]:
# sep=r'\s+' is a regex matching any run of whitespace, so the
# space-separated file parses correctly.  (The original transcription
# lost the backslash: sep='s+' would split on the literal letter 's'.)
df = pd.read_table(open('H:/python数据分析/数据/ch4ex7.txt'), sep=r'\s+')
df
Out[29]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [30]:
import pandas as pd
# Reload ch4ex1.csv to demonstrate the to_csv writers below.
df = pd.read_csv(open('H:/python数据分析/数据/ch4ex1.csv'))
df
Out[30]:
id | name | grade | |
0 | 1 | lucky | 87 |
1 | 2 | peter | 92 |
2 | 3 | lili | 85 |
In [31]:
'''利用DataFrame的to_csv方法,可以将数据存储到以逗号分隔的CSV文件中'''
df.to_csv('H:/python数据分析/数据/out1.csv')
!type H:python数据分析数据out1.csv
,id,name,grade
0,1,lucky,87
1,2,peter,92
2,3,lili,85
In [32]:
'''通过sep参数指定存储的分隔符,默认情况下会存储行和列索引'''
df.to_csv('H:/python数据分析/数据/out2.csv',sep='?')
!type H:python数据分析数据out2.csv
?id?name?grade
0?1?lucky?87
1?2?peter?92
2?3?lili?85
In [33]:
'''通过设置index和header分别处理行和列索引'''
df.to_csv('H:/python数据分析/数据/out3.csv',index=False)
!type H:python数据分析数据out3.csv
id,name,grade
1,lucky,87
2,peter,92
3,lili,85
页面更新:2024-04-18
本站资料均由网友自行发布提供,仅用于学习交流。如有版权问题,请与我联系,QQ:4156828
© CopyRight 2020-2024 All Rights Reserved. Powered By 71396.com 闽ICP备11008920号-4
闽公网安备35020302034903号