In [87]:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
In [88]:
%matplotlib inline
In [3]:
df1 = DataFrame([[3,5,3],[1,6,np.nan],
['lili',np.nan,'pop'],[np.nan,'a','b']]) # 创建有缺失值的DataFrame
df1
Out[3]:
0 | 1 | 2 | |
0 | 3 | 5 | 3 |
1 | 1 | 6 | NaN |
2 | lili | NaN | pop |
3 | NaN | a | b |
In [4]:
df1.isnull() #True的为缺失值
Out[4]:
0 | 1 | 2 | |
0 | False | False | False |
1 | False | False | True |
2 | False | True | False |
3 | True | False | False |
In [5]:
df1.notnull() #False为缺失值
Out[5]:
0 | 1 | 2 | |
0 | True | True | True |
1 | True | True | False |
2 | True | False | True |
3 | False | True | True |
In [6]:
df1.isnull().sum() # 获取每列的缺失值数量,再通过求和就可以获取整个DataFrame的缺失值数量
Out[6]:
0 1
1 1
2 1
dtype: int64
In [7]:
df1.isnull().sum().sum()
Out[7]:
3
In [9]:
df1.isnull().values.any()
Out[9]:
True
In [10]:
df1.info() # 通过info方法,也可以看出DataFrame每列数据的缺失值情况
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
0 3 non-null object
1 3 non-null object
2 3 non-null object
dtypes: object(3)
memory usage: 176.0+ bytes
In [11]:
df1.dropna() # dropna方法可以删除具有缺失值的行,整行删除,传入how='all',则只会删除全为NaN的那些行
Out[11]:
0 | 1 | 2 | |
0 | 3 | 5 | 3 |
In [17]:
df2 = DataFrame(np.arange(12).reshape(3,4))
df2
Out[17]:
0 | 1 | 2 | 3 | |
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
In [19]:
# .ix was deprecated in pandas 0.20 and removed in 1.0 — use positional .iloc instead
df2.iloc[2, :] = np.nan  # row at position 2 (the third row): set every column to NaN
df2[3] = np.nan          # create a new column labelled 3, filled entirely with NaN
df2
Out[19]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | NaN |
1 | 4.0 | 5.0 | 6.0 | NaN |
2 | NaN | NaN | NaN | NaN |
In [20]:
df2.dropna(how='all') # 传入how='all',则只会删除全为NaN的那些行
Out[20]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | NaN |
1 | 4.0 | 5.0 | 6.0 | NaN |
In [21]:
df2.dropna(how='all',axis=1) #axis=1,轴向为列,删除整列为缺失值的,默认为行删除
Out[21]:
0 | 1 | 2 | |
0 | 0.0 | 1.0 | 2.0 |
1 | 4.0 | 5.0 | 6.0 |
2 | NaN | NaN | NaN |
In [22]:
df2
Out[22]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | NaN |
1 | 4.0 | 5.0 | 6.0 | NaN |
2 | NaN | NaN | NaN | NaN |
In [24]:
df2.fillna(0) # 通过fillna方法可以将缺失值替换为常数值
Out[24]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 |
In [25]:
df2.fillna({1:6,3:0}) # 针对列索引1和列索引3的缺失值进行相应的填充
'''
在fillna中传入字典结构数据,可以针对不同列填充不同的值,
fillna返回的是新对象,不会对原数据进行修改
'''
Out[25]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | NaN | 6.0 | NaN | 0.0 |
In [26]:
df2
Out[26]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | NaN |
1 | 4.0 | 5.0 | 6.0 | NaN |
2 | NaN | NaN | NaN | NaN |
In [27]:
df2.fillna({1:6,3:0},inplace=True)  # with inplace=True the original df2 is modified directly
# df.fillna() notes:
# 1. fillna fills missing values (NaN) in a DataFrame with specified values.
# 2. Syntax: df.fillna(a, inplace=False) — `a` is a constant or a dict.
#    A constant fills every NaN; a dict {column: value} fills each listed
#    column's NaNs with its value, e.g. df.fillna({0:10, 1:20}) fills column 0
#    with 10 and column 1 with 20. inplace defaults to False (a new object is
#    returned); with inplace=True the original object is modified in place.
# 3. Return value: None when inplace=True, otherwise the filled data.
# 4. Additional parameters:
#    4.1 method:
#        (1) method='ffill' (or 'pad') fills a NaN with the previous
#            non-missing value: df.fillna(method='ffill');
#        (2) method='bfill' (or 'backfill') fills a NaN with the next
#            non-missing value: df.fillna(method='bfill');
#    4.2 limit and axis:
#        limit caps how many NaNs are filled per column (or per row);
#        axis picks the direction (axis=0 fills down the rows, axis=1 across
#        the columns), e.g. df.fillna(method='ffill', limit=1, axis=1) fills,
#        in each row, only the first NaN encountered (forward fill + limit=1).
df2
Out[27]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | NaN | 6.0 | NaN | 0.0 |
In [28]:
df2.fillna(method='ffill') # method='ffill'或'pad',表示用前一个非缺失值去填充该缺失值
Out[28]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | 4.0 | 6.0 | 6.0 | 0.0 |
In [29]:
df2
Out[29]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | NaN | 6.0 | NaN | 0.0 |
In [31]:
df2[0] = df2[0].fillna(df2[0].mean()) # 指定第0列缺失值通过平均值作为填充数
df2
Out[31]:
0 | 1 | 2 | 3 | |
0 | 0.0 | 1.0 | 2.0 | 0.0 |
1 | 4.0 | 5.0 | 6.0 | 0.0 |
2 | 2.0 | 6.0 | NaN | 0.0 |
In [32]:
df2.fillna? #fillna的参数,可以通过“? ”进行帮助查询,这也是自我学习最好的方法
In [33]:
data = {
'name':['张三', '李四', '张三', '小明'],
'sex':['female', 'male', 'female', 'male'],
'year':[2001, 2002, 2001, 2002],
'city':['北京', '上海', '北京', '北京']
}
df1 = DataFrame(data)
df1
Out[33]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2001 |
1 | 上海 | 李四 | male | 2002 |
2 | 北京 | 张三 | female | 2001 |
3 | 北京 | 小明 | male | 2002 |
In [34]:
df1.duplicated() # 判断各行是否有重复数据,当每行的每个字段都相同时才会判断为重复项
Out[34]:
0 False
1 False
2 True
3 False
dtype: bool
In [35]:
df1.drop_duplicates() # 删除多余的重复项,当每行的每个字段都相同时才会判断为重复项
Out[35]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2001 |
1 | 上海 | 李四 | male | 2002 |
3 | 北京 | 小明 | male | 2002 |
In [36]:
df1.drop_duplicates(['sex','year']) # 指定部分列作为判断重复项的依据,默认保留的数据为第一个出现的组合
Out[36]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2001 |
1 | 上海 | 李四 | male | 2002 |
In [39]:
df1.drop_duplicates(['sex','year'],keep='last') # 传入keep='last’可以保留最后一个出现的组合
Out[39]:
city | name | sex | year | |
2 | 北京 | 张三 | female | 2001 |
3 | 北京 | 小明 | male | 2002 |
In [41]:
data = {
'name':['张三', '李四', '王五', '小明'],
'sex':['female', 'male', '', 'male'],
'year':[2001, 2003, 2001, 2002],
'city':['北京', '上海', '', '北京']
}
df1 = DataFrame(data)
df1
Out[41]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2001 |
1 | 上海 | 李四 | male | 2003 |
2 | 王五 | 2001 | ||
3 | 北京 | 小明 | male | 2002 |
In [42]:
df1.replace('','不详') # 通过replace可完成替换值的功能,空值替换为不详
Out[42]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2001 |
1 | 上海 | 李四 | male | 2003 |
2 | 不详 | 王五 | 不详 | 2001 |
3 | 北京 | 小明 | male | 2002 |
In [43]:
df1.replace(['',2001],['不详',2002]) #列表格式 多值替换,空值替换为不详,2001替换为2002
Out[43]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2002 |
1 | 上海 | 李四 | male | 2003 |
2 | 不详 | 王五 | 不详 | 2002 |
3 | 北京 | 小明 | male | 2002 |
In [44]:
df1.replace({'':'不详',2001:2002}) #字典格式 多值替换,空值替换为不详,2001替换为2002
Out[44]:
city | name | sex | year | |
0 | 北京 | 张三 | female | 2002 |
1 | 上海 | 李四 | male | 2003 |
2 | 不详 | 王五 | 不详 | 2002 |
3 | 北京 | 小明 | male | 2002 |
In [45]:
data = {
'name':['张三', '李四', '王五', '小明'],
'math':[79, 52, 63, 92]
}
df2 = DataFrame(data)
df2
Out[45]:
math | name | |
0 | 79 | 张三 |
1 | 52 | 李四 |
2 | 63 | 王五 |
3 | 92 | 小明 |
In [46]:
def f(x):
    """Map a numeric math score to a Chinese grade label.

    >= 90 -> '优秀' (excellent); 70-89 -> '良好' (good);
    60-69 -> '合格' (pass); below 60 -> '不合格' (fail).
    """
    if x < 60:
        return '不合格'
    if x < 70:
        return '合格'
    if x < 90:
        return '良好'
    return '优秀'
map()函数接收两个参数,一个是函数,一个是Iterable, map将传入的函数依次作用到序列的每一个元素,并把结果作为新的Iterable返回。 其语法格式为: map(function,iterable...) function---函数名 iterable---一个或多个序列
In [48]:
df2['class'] = df2['math'].map(f) # 新增class列,math列每个元素都运行函数f后填入class列
df2
Out[48]:
math | name | class | |
0 | 79 | 张三 | 良好 |
1 | 52 | 李四 | 不合格 |
2 | 63 | 王五 | 合格 |
3 | 92 | 小明 | 优秀 |
In [49]:
del df2['class'] # 删除class列
df2
Out[49]:
math | name | |
0 | 79 | 张三 |
1 | 52 | 李四 |
2 | 63 | 王五 |
3 | 92 | 小明 |
In [50]:
df2['class'] = df2['math'].apply(f) #apply函数运行f函数
df2
Out[50]:
math | name | class | |
0 | 79 | 张三 | 良好 |
1 | 52 | 李四 | 不合格 |
2 | 63 | 王五 | 合格 |
3 | 92 | 小明 | 优秀 |
In [2]:
df3 = DataFrame(np.arange(10),columns=['X'])
df3['Y'] = 2 * df3['X'] + 0.5
df3.iloc[9,1] = 185
df3
Out[2]:
X | Y | |
0 | 0 | 0.5 |
1 | 1 | 2.5 |
2 | 2 | 4.5 |
3 | 3 | 6.5 |
4 | 4 | 8.5 |
5 | 5 | 10.5 |
6 | 6 | 12.5 |
7 | 7 | 14.5 |
8 | 8 | 16.5 |
9 | 9 | 185.0 |
In [8]:
df3.plot(kind='scatter',x='X',y='Y')
Out[8]:
在数学建模和机器学习中,只有数值型数据才能供算法使用, 对于一些分类变量则需要将其转换为虚拟变量(哑变量)(也就是0,1矩阵), 通过get_dummies函数即可实现该功能
In [10]:
df = DataFrame({
'朝向':['东','南','东','西','北'],
'价格':[1200,2100,2300,2900,1400]
})
df
Out[10]:
价格 | 朝向 | |
0 | 1200 | 东 |
1 | 2100 | 南 |
2 | 2300 | 东 |
3 | 2900 | 西 |
4 | 1400 | 北 |
In [11]:
pd.get_dummies(df['朝向']) # '朝向' 列转换为虚拟变量(哑变量)(也就是0,1矩阵)
Out[11]:
东 | 北 | 南 | 西 | |
0 | 1 | 0 | 0 | 0 |
1 | 0 | 0 | 1 | 0 |
2 | 1 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 1 |
4 | 0 | 1 | 0 | 0 |
In [12]:
df2 = DataFrame({
'朝向':['东/北','西/南','东','西/北','北'],
'价格':[1200,2100,2300,2900,1400]
})
df2
Out[12]:
价格 | 朝向 | |
0 | 1200 | 东/北 |
1 | 2100 | 西/南 |
2 | 2300 | 东 |
3 | 2900 | 西/北 |
4 | 1400 | 北 |
In [16]:
dummies = df2['朝向'].apply(lambda x:Series(x.split('/')).value_counts())
# 对于多类别的数据而言,需要通过apply函数来实现
dummies
Out[16]:
东 | 北 | 南 | 西 | |
0 | 1.0 | 1.0 | NaN | NaN |
1 | NaN | NaN | 1.0 | 1.0 |
2 | 1.0 | NaN | NaN | NaN |
3 | NaN | 1.0 | NaN | 1.0 |
4 | NaN | 1.0 | NaN | NaN |
In [19]:
dummies = dummies.fillna(0).astype(int)
dummies
Out[19]:
东 | 北 | 南 | 西 | |
0 | 1 | 1 | 0 | 0 |
1 | 0 | 0 | 1 | 1 |
2 | 1 | 0 | 0 | 0 |
3 | 0 | 1 | 0 | 1 |
4 | 0 | 1 | 0 | 0 |
In [43]:
price = DataFrame({
'fruit':['apple','banana','orange'],
'price':[23,32,45]
})
amount = DataFrame({
'fruit':['apple','banana','apple','apple','banana','pear'],
'amount':[5,3,6,3,5,7]
})
In [44]:
price
Out[44]:
fruit | price | |
0 | apple | 23 |
1 | banana | 32 |
2 | orange | 45 |
In [45]:
amount
Out[45]:
amount | fruit | |
0 | 5 | apple |
1 | 3 | banana |
2 | 6 | apple |
3 | 3 | apple |
4 | 5 | banana |
5 | 7 | pear |
In [46]:
pd.merge(amount,price) # merge函数是通过一个或多个键(DataFrame的列)将两个DataFrame按行合并起来
Out[46]:
amount | fruit | price | |
0 | 5 | apple | 23 |
1 | 6 | apple | 23 |
2 | 3 | apple | 23 |
3 | 3 | banana | 32 |
4 | 5 | banana | 32 |
In [47]:
pd.merge(amount,price,on='fruit') # 指定键名合并
Out[47]:
amount | fruit | price | |
0 | 5 | apple | 23 |
1 | 6 | apple | 23 |
2 | 3 | apple | 23 |
3 | 3 | banana | 32 |
4 | 5 | banana | 32 |
merge函数常用参数
In [48]:
pd.merge(amount,price,left_on='fruit',right_on='fruit')
'''merge默认为内连接(inner),也就是返回交集。
通过how参数可以选择连接方法:
左连接(left)、右连接(right)和外连接(outer)
'''
Out[48]:
amount | fruit | price | |
0 | 5 | apple | 23 |
1 | 6 | apple | 23 |
2 | 3 | apple | 23 |
3 | 3 | banana | 32 |
4 | 5 | banana | 32 |
In [49]:
pd.merge(amount,price,how='left')
Out[49]:
amount | fruit | price | |
0 | 5 | apple | 23.0 |
1 | 3 | banana | 32.0 |
2 | 6 | apple | 23.0 |
3 | 3 | apple | 23.0 |
4 | 5 | banana | 32.0 |
5 | 7 | pear | NaN |
In [50]:
pd.merge(amount,price,how='right')
Out[50]:
amount | fruit | price | |
0 | 5.0 | apple | 23 |
1 | 6.0 | apple | 23 |
2 | 3.0 | apple | 23 |
3 | 3.0 | banana | 32 |
4 | 5.0 | banana | 32 |
5 | NaN | orange | 45 |
In [52]:
pd.merge(amount,price,how='outer')
Out[52]:
amount | fruit | price | |
0 | 5.0 | apple | 23.0 |
1 | 6.0 | apple | 23.0 |
2 | 3.0 | apple | 23.0 |
3 | 3.0 | banana | 32.0 |
4 | 5.0 | banana | 32.0 |
5 | 7.0 | pear | NaN |
6 | NaN | orange | 45.0 |
In [53]:
price2 = DataFrame({
'fruit':['apple','banana','orange','apple'],
'price':[23,32,45,25]
})
amount2 = DataFrame({
'fruit':['apple','banana','apple','apple','banana','pear'],
'amount':[5,3,6,3,5,7]
})
In [54]:
amount2
Out[54]:
amount | fruit | |
0 | 5 | apple |
1 | 3 | banana |
2 | 6 | apple |
3 | 3 | apple |
4 | 5 | banana |
5 | 7 | pear |
In [55]:
price2
Out[55]:
fruit | price | |
0 | apple | 23 |
1 | banana | 32 |
2 | orange | 45 |
3 | apple | 25 |
In [57]:
pd.merge(amount2,price2)
Out[57]:
amount | fruit | price | |
0 | 5 | apple | 23 |
1 | 5 | apple | 25 |
2 | 6 | apple | 23 |
3 | 6 | apple | 25 |
4 | 3 | apple | 23 |
5 | 3 | apple | 25 |
6 | 3 | banana | 32 |
7 | 5 | banana | 32 |
In [59]:
left = DataFrame({
'key1':['one','one','two'],
'key2':['a','b','a'],
'val1':[2,3,4]
})
right = DataFrame({
'key1':['one','one','two','two'],
'key2':['a','a','a','b'],
'val2':[5,6,7,8]
})
In [60]:
left
Out[60]:
key1 | key2 | val1 | |
0 | one | a | 2 |
1 | one | b | 3 |
2 | two | a | 4 |
In [61]:
right
Out[61]:
key1 | key2 | val2 | |
0 | one | a | 5 |
1 | one | a | 6 |
2 | two | a | 7 |
3 | two | b | 8 |
In [62]:
pd.merge(left,right,on=['key1','key2'],how='outer') # 多键进行合并,即传入一个list即可
Out[62]:
key1 | key2 | val1 | val2 | |
0 | one | a | 2.0 | 5.0 |
1 | one | a | 2.0 | 6.0 |
2 | one | b | 3.0 | NaN |
3 | two | a | 4.0 | 7.0 |
4 | two | b | NaN | 8.0 |
In [63]:
pd.merge(left,right,on='key1')
Out[63]:
key1 | key2_x | val1 | key2_y | val2 | |
0 | one | a | 2 | a | 5 |
1 | one | a | 2 | a | 6 |
2 | one | b | 3 | a | 5 |
3 | one | b | 3 | a | 6 |
4 | two | a | 4 | a | 7 |
5 | two | a | 4 | b | 8 |
In [64]:
pd.merge(left,right,on='key1',suffixes=('_left','_right')) # 重复列名的修改,suffixes方法
Out[64]:
key1 | key2_left | val1 | key2_right | val2 | |
0 | one | a | 2 | a | 5 |
1 | one | a | 2 | a | 6 |
2 | one | b | 3 | a | 5 |
3 | one | b | 3 | a | 6 |
4 | two | a | 4 | a | 7 |
5 | two | a | 4 | b | 8 |
In [66]:
left2 = DataFrame({
'key':['a','a','b','b','c'],
'val1':range(5)
})
right2 = DataFrame({
'val2':[5,7]},
index = ['a','b']
)
In [67]:
left2
Out[67]:
key | val1 | |
0 | a | 0 |
1 | a | 1 |
2 | b | 2 |
3 | b | 3 |
4 | c | 4 |
In [68]:
right2
Out[68]:
val2 | |
a | 5 |
b | 7 |
In [70]:
pd.merge(left2,right2,left_on='key',right_index=True)
'''
连接的键位于DataFrame的行索引上,
可通过传入left_index=True或者right_index=True指定将索引作为连接键来使用
'''
Out[70]:
key | val1 | val2 | |
0 | a | 0 | 5 |
1 | a | 1 | 5 |
2 | b | 2 | 7 |
3 | b | 3 | 7 |
In [71]:
left3 = DataFrame({
'val1':range(4)},
index = ['a','b','a','c']
)
right3 = DataFrame({
'val2':[5,7]},
index = ['a','b']
)
In [72]:
left3
Out[72]:
val1 | |
a | 0 |
b | 1 |
a | 2 |
c | 3 |
In [73]:
right3
Out[73]:
val2 | |
a | 5 |
b | 7 |
In [74]:
left3.join(right3,how='outer') # join方法,可以快速完成按索引合并
Out[74]:
val1 | val2 | |
a | 0 | 5.0 |
a | 2 | 5.0 |
b | 1 | 7.0 |
c | 3 | NaN |
In [3]:
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3],index=['c','d'])
s3 = Series([4,5],index=['e','f'])
In [4]:
pd.concat([s1,s2,s3]) # 需要合并的DataFrame之间没有连接键,通过pandas的concat方法实现
Out[4]:
a 0
b 1
c 2
d 3
e 4
f 5
dtype: int64
In [5]:
pd.concat([s1,s2,s3],axis=1) # 默认情况下,concat是在axis=0上工作的,通过指定轴向也可以按列进行连接
Out[5]:
0 | 1 | 2 | |
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | NaN | 4.0 |
f | NaN | NaN | 5.0 |
In [6]:
s4 = pd.concat([s1*10,s3])
s4
Out[6]:
a 0
b 10
e 4
f 5
dtype: int64
In [8]:
pd.concat([s1,s4],axis=1)
Out[8]:
0 | 1 | |
a | 0.0 | 0 |
b | 1.0 | 10 |
e | NaN | 4 |
f | NaN | 5 |
In [9]:
pd.concat([s1,s4],axis=1,join='inner') # concat默认为外连接(并集),传入join='inner’可以实现内连接
Out[9]:
0 | 1 | |
a | 0 | 0 |
b | 1 | 10 |
In [14]:
pd.concat([s1,s4],axis=1,join='inner',join_axes=[['b','a']]) # 通过join_axes指定使用的索引顺序
Out[14]:
0 | 1 | |
b | 1 | 10 |
a | 0 | 0 |
In [15]:
pd.concat([s1,s4]) # concat只有内连接和外连接
Out[15]:
a 0
b 1
a 0
b 10
e 4
f 5
dtype: int64
In [17]:
pd.concat([s1,s4],keys=['one','two']) # 通过keys参数给连接对象创建一个层次化索引
Out[17]:
one a 0
b 1
two a 0
b 10
e 4
f 5
dtype: int64
In [18]:
pd.concat([s1,s4],axis=1,keys=['one','two']) # 如果按列连接,keys就成了DataFrame的列索引
Out[18]:
one | two | |
a | 0.0 | 0 |
b | 1.0 | 10 |
e | NaN | 4 |
f | NaN | 5 |
In [28]:
df1 = DataFrame({
'val1':range(3)},
index = ['a','b','c']
)
df2 = DataFrame({
'val2':[5,7]},
index = ['a','b']
)
In [29]:
df1
Out[29]:
val1 | |
a | 0 |
b | 1 |
c | 2 |
In [30]:
df2
Out[30]:
val2 | |
a | 5 |
b | 7 |
In [32]:
pd.concat([df1,df2],axis=1,keys=['one','two'])
Out[32]:
one | two | |
val1 | val2 | |
a | 0 | 5.0 |
b | 1 | 7.0 |
c | 2 | NaN |
In [33]:
pd.concat({'one':df1,'two':df2},axis=1) # 通过字典数据也可以完成连接,字典的键就是keys的值
Out[33]:
one | two | |
val1 | val2 | |
a | 0 | 5.0 |
b | 1 | 7.0 |
c | 2 | NaN |
In [34]:
df1 = DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.random.randn(2,2),columns=['d','c'])
In [35]:
df1
Out[35]:
a | b | c | d | |
0 | 0.023541 | -0.694903 | -0.515242 | 0.460737 |
1 | -1.326048 | 0.259269 | -0.685732 | 0.052237 |
2 | -0.110079 | 2.729854 | -0.503138 | -1.721161 |
In [36]:
df2
Out[36]:
d | c | |
0 | 0.995995 | -0.342845 |
1 | 0.848536 | 1.027354 |
In [37]:
pd.concat([df1,df2])
Out[37]:
a | b | c | d | |
0 | 0.023541 | -0.694903 | -0.515242 | 0.460737 |
1 | -1.326048 | 0.259269 | -0.685732 | 0.052237 |
2 | -0.110079 | 2.729854 | -0.503138 | -1.721161 |
0 | NaN | NaN | -0.342845 | 0.995995 |
1 | NaN | NaN | 1.027354 | 0.848536 |
In [38]:
pd.concat([df1,df2],ignore_index=True) # 通过ignore_index=‘True’忽略索引,以达到重排索引的效果
Out[38]:
a | b | c | d | |
0 | 0.023541 | -0.694903 | -0.515242 | 0.460737 |
1 | -1.326048 | 0.259269 | -0.685732 | 0.052237 |
2 | -0.110079 | 2.729854 | -0.503138 | -1.721161 |
3 | NaN | NaN | -0.342845 | 0.995995 |
4 | NaN | NaN | 1.027354 | 0.848536 |
In [39]:
df1 = DataFrame({
'a':[3,np.nan,6,np.nan],
'b':[np.nan,4,6,np.nan]
})
df2 = DataFrame({
'a':range(5),
'b':range(5)
})
In [40]:
df1
Out[40]:
a | b | |
0 | 3.0 | NaN |
1 | NaN | 4.0 |
2 | 6.0 | 6.0 |
3 | NaN | NaN |
In [41]:
df2
Out[41]:
a | b | |
0 | 0 | 0 |
1 | 1 | 1 |
2 | 2 | 2 |
3 | 3 | 3 |
4 | 4 | 4 |
In [42]:
df1.combine_first(df2) # 需要合并的两个DataFrame存在重复的索引,使用combine_first方法
Out[42]:
a | b | |
0 | 3.0 | 0.0 |
1 | 1.0 | 4.0 |
2 | 6.0 | 6.0 |
3 | 3.0 | 3.0 |
4 | 4.0 | 4.0 |
In [48]:
df = DataFrame(np.arange(9).reshape(3,3),
index = ['a','b','c'],
columns=['one','two','three'])
df.index.name = 'alph'
df.columns.name = 'number'
df
Out[48]:
number | one | two | three |
alph | |||
a | 0 | 1 | 2 |
b | 3 | 4 | 5 |
c | 6 | 7 | 8 |
In [50]:
result = df.stack() # stack方法用于将DataFrame的列“旋转”为行;默认情况下,数据重塑的操作都是最内层的
result
Out[50]:
alph number
a one 0
two 1
three 2
b one 3
two 4
three 5
c one 6
two 7
three 8
dtype: int32
In [51]:
result.unstack() # unstack方法用于将DataFrame的行“旋转”为列,默认情况下,数据重塑的操作都是最内层的
Out[51]:
number | one | two | three |
alph | |||
a | 0 | 1 | 2 |
b | 3 | 4 | 5 |
c | 6 | 7 | 8 |
In [52]:
result.unstack(0)
Out[52]:
alph | a | b | c |
number | |||
one | 0 | 3 | 6 |
two | 1 | 4 | 7 |
three | 2 | 5 | 8 |
In [53]:
result.unstack('alph')
Out[53]:
alph | a | b | c |
number | |||
one | 0 | 3 | 6 |
two | 1 | 4 | 7 |
three | 2 | 5 | 8 |
In [54]:
df = DataFrame(np.arange(16).reshape(4,4),
index=[['one','one','two','two'],['a','b','a','b']],
columns=[['apple','apple','orange','orange'],['red','green','red','green']])
df
Out[54]:
apple | orange | ||||
red | green | red | green | ||
one | a | 0 | 1 | 2 | 3 |
b | 4 | 5 | 6 | 7 | |
two | a | 8 | 9 | 10 | 11 |
b | 12 | 13 | 14 | 15 |
In [55]:
df.stack()
Out[55]:
apple | orange | |||
one | a | green | 1 | 3 |
red | 0 | 2 | ||
b | green | 5 | 7 | |
red | 4 | 6 | ||
two | a | green | 9 | 11 |
red | 8 | 10 | ||
b | green | 13 | 15 | |
red | 12 | 14 |
In [56]:
df.unstack()
Out[56]:
apple | orange | |||||||
red | green | red | green | |||||
a | b | a | b | a | b | a | b | |
one | 0 | 4 | 1 | 5 | 2 | 6 | 3 | 7 |
two | 8 | 12 | 9 | 13 | 10 | 14 | 11 | 15 |
In [71]:
data = {
'data':['张三|男', '李四|女', '王五|女', '小明|男'],
}
df = DataFrame(data)
df
Out[71]:
data | |
0 | 张三|男 |
1 | 李四|女 |
2 | 王五|女 |
3 | 小明|男 |
In [67]:
result = df['data'].apply(lambda x:Series(x.split('|')))
# 把数据分成两列,常用的方法是通过函数应用来完成
result
Out[67]:
0 | 1 | |
0 | 张三 | 男 |
1 | 李四 | 女 |
2 | 王五 | 女 |
3 | 小明 | 男 |
In [81]:
new_df = df['data'].str.split('|')
# pandas中字段的str属性可以轻松调用字符串的方法
new_df
Out[81]:
0 [张三, 男]
1 [李四, 女]
2 [王五, 女]
3 [小明, 男]
Name: data, dtype: object
In [82]:
df['name'] = new_df.str[0] # pandas中字段的str属性可以轻松调用字符串的方法
df['sex'] = new_df.str[1] # # pandas中字段的str属性可以轻松调用字符串的方法
df
Out[82]:
data | name | sex | |
0 | 张三|男 | 张三 | 男 |
1 | 李四|女 | 李四 | 女 |
2 | 王五|女 | 王五 | 女 |
3 | 小明|男 | 小明 | 男 |
In [83]:
df2 = DataFrame({
'email':['102345@qq.com','342167@qq.com','65132@qq.com']
})
df2
Out[83]:
0 | 102345@qq.com |
1 | 342167@qq.com |
2 | 65132@qq.com |
In [84]:
df2['email'].str.findall('(.*?)@')
Out[84]:
0 [102345]
1 [342167]
2 [65132]
Name: email, dtype: object
In [85]:
df2['QQ'] = df2['email'].str.findall('(.*?)@').str.get(0)
df2
Out[85]:
0 | 102345@qq.com | 102345 |
1 | 342167@qq.com | 342167 |
2 | 65132@qq.com | 65132 |
In [101]:
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt #导入pyplot绘图模块
import matplotlib as mpl #导入matplotlib绘图库
import seaborn as sns #导入seaborn绘图库
%matplotlib inline
In [107]:
# NOTE(review): the path separators look stripped by the export (probably
# 'H:\python数据分析\数据\iris-data.csv') — verify the real path before running.
iris_data = pd.read_csv(open('H:python数据分析数据iris-data.csv')) # read the data
iris_data.head()
Out[107]:
sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class | |
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
In [108]:
iris_data.shape # 数据大小行数和列数
Out[108]:
(150, 5)
In [110]:
iris_data.describe()
Out[110]:
sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | |
count | 150.000000 | 150.000000 | 150.000000 | 145.000000 |
mean | 5.644627 | 3.054667 | 3.758667 | 1.236552 |
std | 1.312781 | 0.433123 | 1.764420 | 0.755058 |
min | 0.055000 | 2.000000 | 1.000000 | 0.100000 |
25% | 5.100000 | 2.800000 | 1.600000 | 0.400000 |
50% | 5.700000 | 3.000000 | 4.350000 | 1.300000 |
75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
max | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
In [112]:
iris_data['class'].unique() # 去重
# unique函数去除其中重复的元素,并按元素由大到小返回一个新的无元素重复的元组或者列表
Out[112]:
array(['Iris-setosa', 'Iris-setossa', 'Iris-versicolor', 'versicolor',
'Iris-virginica'], dtype=object)
In [115]:
# .ix was removed in pandas 1.0 — use label-based .loc for conditional assignment.
# Normalize the two mistyped class labels seen in unique(): 'versicolor' and 'Iris-setossa'.
iris_data.loc[iris_data['class'] == 'versicolor', 'class'] = 'Iris-versicolor'
iris_data.loc[iris_data['class'] == 'Iris-setossa', 'class'] = 'Iris-setosa'
iris_data['class'].unique()
Out[115]:
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)
In [118]:
sns.pairplot(iris_data, hue='class') # 利用seaborn绘制散点图矩阵
F:Anacondaenvsdata-analysislibsite-packages
umpylibfunction_base.py:748: RuntimeWarning: invalid value encountered in greater_equal
keep = (tmp_a >= mn)
F:Anacondaenvsdata-analysislibsite-packages
umpylibfunction_base.py:749: RuntimeWarning: invalid value encountered in less_equal
keep &= (tmp_a <= mx)
Out[118]:
In [119]:
iris_data.ix[iris_data['class'] == 'Iris-setosa', 'sepal_width_cm'].hist()
Out[119]:
In [125]:
iris_data = iris_data.loc[(iris_data['class'] != 'Iris-setosa') | (iris_data['sepal_width_cm'] >= 2.5)]
iris_data.loc[iris_data['class'] == 'Iris-setosa', 'sepal_width_cm'].hist()
Out[125]:
In [126]:
iris_data.loc[(iris_data['class'] == 'Iris-versicolor') &
(iris_data['sepal_length_cm'] < 1.0)]
Out[126]:
sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class | |
77 | 0.067 | 3.0 | 5.0 | 1.7 | Iris-versicolor |
78 | 0.060 | 2.9 | 4.5 | 1.5 | Iris-versicolor |
79 | 0.057 | 2.6 | 3.5 | 1.0 | Iris-versicolor |
80 | 0.055 | 2.4 | 3.8 | 1.1 | Iris-versicolor |
81 | 0.055 | 2.4 | 3.7 | 1.0 | Iris-versicolor |
In [127]:
iris_data.loc[(iris_data['class'] == 'Iris-versicolor') &
(iris_data['sepal_length_cm'] < 1.0),
'sepal_length_cm'] *= 100.0
In [128]:
iris_data.isnull().sum()
Out[128]:
sepal_length_cm 0
sepal_width_cm 0
petal_length_cm 0
petal_width_cm 5
class 0
dtype: int64
In [131]:
iris_data[iris_data['petal_width_cm'].isnull()]
Out[131]:
sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class | |
7 | 5.0 | 3.4 | 1.5 | NaN | Iris-setosa |
8 | 4.4 | 2.9 | 1.4 | NaN | Iris-setosa |
9 | 4.9 | 3.1 | 1.5 | NaN | Iris-setosa |
10 | 5.4 | 3.7 | 1.5 | NaN | Iris-setosa |
11 | 4.8 | 3.4 | 1.6 | NaN | Iris-setosa |
In [132]:
iris_data.dropna(inplace=True) # 将缺失值进行删除处理
In [133]:
iris_data.to_csv('H:python数据分析数据iris-clean-data.csv', index=False) # 最后对清洗好的数据进行存储
In [135]:
# NOTE(review): path separators look stripped by the export — verify the real path.
# Also, pd.read_csv accepts a path string directly; passing open(...) leaves the
# file handle unclosed.
iris_data = pd.read_csv(open('H:python数据分析数据iris-clean-data.csv'))
iris_data.head()
Out[135]:
sepal_length_cm | sepal_width_cm | petal_length_cm | petal_width_cm | class | |
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
In [136]:
iris_data.shape
Out[136]:
(144, 5)
In [137]:
sns.pairplot(iris_data, hue='class') # 绘制散点矩阵图
Out[137]:
In [145]:
iris_data.boxplot(column='petal_length_cm', by='class',grid=False,figsize=(6,6))
# boxplot用于绘制箱形图,figsize可设置画布的大小
F:Anacondaenvsdata-analysislibsite-packages
umpycorefromnumeric.py:57: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
return getattr(obj, method)(*args, **kwds)
Out[145]:
In [139]:
iris_data.boxplot? # 箱型图查询帮助
页面更新:2024-03-14
本站资料均由网友自行发布提供,仅用于学习交流。如有版权问题,请与我联系,QQ:4156828
© CopyRight 2020-2024 All Rights Reserved. Powered By 71396.com 闽ICP备11008920号-4
闽公网安备35020302034903号