python数据分析pandas基础

from pandas import Series,DataFrame
import pandas as pd

In [4]:

obj = Series([1, -2, 3, -4])
obj

Out[4]:

0    1
1   -2
2    3
3   -4
dtype: int64

In [5]:

obj2 = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj2

Out[5]:

a    1
b   -2
c    3
d   -4
dtype: int64

In [6]:

obj2.values

Out[6]:

array([ 1, -2,  3, -4], dtype=int64)

In [7]:

obj2.index

Out[7]:

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:

obj2['b']

Out[8]:

-2

In [10]:

obj2['c'] = 23
obj2[['c', 'd']]

Out[10]:

c    23
d    -4
dtype: int64

In [11]:

obj2

Out[11]:

a     1
b    -2
c    23
d    -4
dtype: int64

In [12]:

obj2[obj2 < 0 ]

Out[12]:

b   -2
d   -4
dtype: int64

In [13]:

obj2 * 2

Out[13]:

a     2
b    -4
c    46
d    -8
dtype: int64

In [16]:

import numpy as np

In [18]:

np.abs(obj2)

Out[18]:

a     1
b     2
c    23
d     4
dtype: int64

In [20]:

data = {
    '张三':92,
    '李四':78,
    '王五':68,
    '小明':82    
}

In [21]:

obj3 = Series(data)
obj3

Out[21]:

小明    82
张三    92
李四    78
王五    68
dtype: int64

In [22]:

names = ['张三', '李四', '王五', '小明']
obj4 = Series(data, index=names)
obj4

Out[22]:

张三    92
李四    78
王五    68
小明    82
dtype: int64

In [23]:

obj4.name = 'math'
obj4.index.name = 'students'

In [24]:

obj4

Out[24]:

students
张三    92
李四    78
王五    68
小明    82
Name: math, dtype: int64

DataFrame

In [1]:

import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [2]:

data = {
    'name':['张三', '李四', '王五', '小明'],
    'sex':['female', 'female', 'male', 'male'],
    'year':[2001, 2001, 2003, 2002],
    'city':['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

Out[2]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

In [3]:

df = DataFrame(data, columns=['name', 'sex', 'year', 'city'])
df

Out[3]:


name

sex

year

city

0

张三

female

2001

北京

1

李四

female

2001

上海

2

王五

male

2003

广州

3

小明

male

2002

北京

In [4]:

df = DataFrame(data, columns=['name', 'sex', 'year', 'city'],index=['a', 'b', 'c', 'd'])
df

Out[4]:


name

sex

year

city

a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

In [5]:

df.index

Out[5]:

Index(['a', 'b', 'c', 'd'], dtype='object')

In [6]:

df.columns

Out[6]:

Index(['name', 'sex', 'year', 'city'], dtype='object')

In [7]:

data2 = {
    'sex':{'张三':'female','李四':'female','王五':'male'},
    'city':{'张三':'北京','李四':'上海','王五':'广州'}
}
df2 = DataFrame(data2)
df2

Out[7]:


city

sex

张三

北京

female

李四

上海

female

王五

广州

male

In [8]:

df.index.name = 'id'
df.columns.name = 'std_info'

In [9]:

df

Out[9]:

std_info

name

sex

year

city

id





a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

In [10]:

obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj

Out[10]:

a    1
b   -2
c    3
d   -4
dtype: int64

In [11]:

obj.index

Out[11]:

Index(['a', 'b', 'c', 'd'], dtype='object')

In [12]:

df.index

Out[12]:

Index(['a', 'b', 'c', 'd'], dtype='object', name='id')

In [13]:

df.columns

Out[13]:

Index(['name', 'sex', 'year', 'city'], dtype='object', name='std_info')

In [14]:

index = obj.index
index[1] = 'f'  # deliberate failure: pandas Index objects are immutable, so item assignment raises TypeError
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
 in ()
      1 index = obj.index
----> 2 index[1] = 'f'

F:\Anaconda\envs\data-analysis\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
   1668 
   1669     def __setitem__(self, key, value):
-> 1670         raise TypeError("Index does not support mutable operations")
   1671 
   1672     def __getitem__(self, key):

TypeError: Index does not support mutable operations

In [15]:

df

Out[15]:

std_info

name

sex

year

city

id





a

张三

female

2001

北京

b

李四

female

2001

上海

c

王五

male

2003

广州

d

小明

male

2002

北京

In [16]:

'sex' in df.columns

Out[16]:

True

In [17]:

'f' in df.index

Out[17]:

False

In [20]:

obj = Series([1, -2, 3, -4], index=['b', 'a', 'c', 'd'])
obj

Out[20]:

b    1
a   -2
c    3
d   -4
dtype: int64

In [21]:

obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

Out[21]:

a   -2.0
b    1.0
c    3.0
d   -4.0
e    NaN
dtype: float64

In [27]:

obj = Series([1, -2, 3, -4], index=[0,2,3,5])
obj

Out[27]:

0    1
2   -2
3    3
5   -4
dtype: int64

In [28]:

obj2 = obj.reindex(range(6),method='ffill')
obj2

Out[28]:

0    1
1    1
2   -2
3    3
4    3
5   -4
dtype: int64

In [29]:

df = DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['name','id','sex'])
df

Out[29]:


name

id

sex

a

0

1

2

c

3

4

5

d

6

7

8

In [30]:

df2 = df.reindex(['a', 'b', 'c', 'd'])
df2

Out[30]:


name

id

sex

a

0.0

1.0

2.0

b

NaN

NaN

NaN

c

3.0

4.0

5.0

d

6.0

7.0

8.0

In [31]:

df3 = df.reindex(columns=['name', 'year', 'id'], fill_value=0)
df3

Out[31]:


name

year

id

a

0

0

1

c

3

0

4

d

6

0

7

In [49]:

data = {
    'name':['张三', '李四', '王五', '小明'],
    'grade':[68, 78, 63, 92]
}
df = DataFrame(data)
df

Out[49]:


grade

name

0

68

张三

1

78

李四

2

63

王五

3

92

小明

In [50]:

df2 = df.sort_values(by='grade')
df2

Out[50]:


grade

name

2

63

王五

0

68

张三

1

78

李四

3

92

小明

In [51]:

df3 = df2.reset_index()
df3

Out[51]:


index

grade

name

0

2

63

王五

1

0

68

张三

2

1

78

李四

3

3

92

小明

In [52]:

df4 = df2.reset_index(drop=True)
df4

Out[52]:


grade

name

0

63

王五

1

68

张三

2

78

李四

3

92

小明

In [45]:

data = {
    'name':['张三', '李四', '王五', '小明'],
    'sex':['female', 'female', 'male', 'male'],
    'year':[2001, 2001, 2003, 2002],
    'city':['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

Out[45]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

In [47]:

df2 = df.set_index('name')
df2

Out[47]:


city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

In [48]:

df3 = df2.reset_index()
df3

Out[48]:


name

city

sex

year

0

张三

北京

female

2001

1

李四

上海

female

2001

2

王五

广州

male

2003

3

小明

北京

male

2002

索引和选取

In [1]:

import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [3]:

obj = Series([1, -2, 3, -4], index=['a', 'b', 'c', 'd'])
obj

Out[3]:

a    1
b   -2
c    3
d   -4
dtype: int64

In [4]:

# Positional access: ask for it explicitly with .iloc. Plain obj[1] on a
# string-labelled Series relies on the integer-position fallback of
# Series.__getitem__, which newer pandas releases deprecate.
obj.iloc[1]

Out[4]:

-2

In [5]:

obj['b']

Out[5]:

-2

In [6]:

obj[['a','c']]

Out[6]:

a    1
c    3
dtype: int64

In [7]:

obj[0:2]

Out[7]:

a    1
b   -2
dtype: int64

In [8]:

obj['a':'c']

Out[8]:

a    1
b   -2
c    3
dtype: int64

In [53]:

data = {
    'name':['张三', '李四', '王五', '小明'],
    'sex':['female', 'female', 'male', 'male'],
    'year':[2001, 2001, 2003, 2002],
    'city':['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

Out[53]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

In [17]:

df['city']

Out[17]:

0    北京
1    上海
2    广州
3    北京
Name: city, dtype: object

In [18]:

df.name

Out[18]:

0    张三
1    李四
2    王五
3    小明
Name: name, dtype: object

In [20]:

df[['city','sex']]

Out[20]:


city

sex

0

北京

female

1

上海

female

2

广州

male

3

北京

male

In [26]:

df2 = df.set_index('name')
df2

Out[26]:


city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

In [27]:

df2[0:2]

Out[27]:


city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

In [28]:

df2['李四':'王五']

Out[28]:


city

sex

year

name




李四

上海

female

2001

王五

广州

male

2003

In [29]:

df2

Out[29]:


city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

王五

广州

male

2003

小明

北京

male

2002

In [31]:

df2.loc['张三']

Out[31]:

city        北京
sex     female
year      2001
Name: 张三, dtype: object

In [33]:

df2.loc[['张三','王五']]

Out[33]:


city

sex

year

name




张三

北京

female

2001

王五

广州

male

2003

In [35]:

df2.iloc[1]

Out[35]:

city        上海
sex     female
year      2001
Name: 李四, dtype: object

In [36]:

df2.iloc[[1,3]]

Out[36]:


city

sex

year

name




李四

上海

female

2001

小明

北京

male

2002

In [41]:

# .ix was deprecated in pandas 0.20 and removed in 1.0. Pick the rows by label
# with .loc, then the first two columns by position with .iloc.
df2.loc[['张三','王五']].iloc[:, 0:2]

Out[41]:


city

sex

name



张三

北京

female

王五

广州

male

In [75]:

# Set pandas' 'mode.chained_assignment' option to None, which turns off the
# chained-assignment (SettingWithCopy) check from this point on.
# NOTE(review): the surrounding cells only read from df2, so this looks
# unnecessary here — confirm before keeping it.
pd.set_option('mode.chained_assignment',None)

In [43]:

df2.loc[:, ['sex','year']]  # select columns by label (.ix was removed in pandas 1.0)

Out[43]:


sex

year

name



张三

female

2001

李四

female

2001

王五

male

2003

小明

male

2002

In [44]:

df2.iloc[[1,3], :]  # select rows by position (.ix was removed in pandas 1.0)

Out[44]:


city

sex

year

name




李四

上海

female

2001

小明

北京

male

2002

In [45]:

df2['sex'] == 'female'

Out[45]:

name
张三     True
李四     True
王五    False
小明    False
Name: sex, dtype: bool

In [46]:

df2[df2['sex'] == 'female']

Out[46]:


city

sex

year

name




张三

北京

female

2001

李四

上海

female

2001

In [48]:

df2[(df2['sex'] == 'female') & (df2['city'] == '北京')]

Out[48]:


city

sex

year

name




张三

北京

female

2001

行和列的操作

In [54]:

df

Out[54]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

In [57]:

new_data = {
    'city':'武汉',
    'name':'小李',
    'sex':'male',
    'year':2002
}

In [59]:

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Wrap the
# new row in a one-row DataFrame and concatenate it instead.
# ignore_index=True discards the existing row labels and renumbers from 0.
df = pd.concat([df, DataFrame([new_data])], ignore_index=True)
df

Out[59]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

4

武汉

小李

male

2002

In [60]:

df['class'] = 2018
df

Out[60]:


city

name

sex

year

class

0

北京

张三

female

2001

2018

1

上海

李四

female

2001

2018

2

广州

王五

male

2003

2018

3

北京

小明

male

2002

2018

4

武汉

小李

male

2002

2018

In [61]:

df['math'] = [92,78,58,69,82]
df

Out[61]:


city

name

sex

year

class

math

0

北京

张三

female

2001

2018

92

1

上海

李四

female

2001

2018

78

2

广州

王五

male

2003

2018

58

3

北京

小明

male

2002

2018

69

4

武汉

小李

male

2002

2018

82

In [63]:

# Remove the row labelled 2; the result is bound to new_df.
new_df = df.drop([2], axis=0)
new_df

Out[63]:


city

name

sex

year

class

math

0

北京

张三

female

2001

2018

92

1

上海

李四

female

2001

2018

78

3

北京

小明

male

2002

2018

69

4

武汉

小李

male

2002

2018

82

In [64]:

# Remove the 'class' column (the axis argument targets columns instead of rows).
new_df = new_df.drop(['class'], axis='columns')
new_df

Out[64]:


city

name

sex

year

math

0

北京

张三

female

2001

92

1

上海

李四

female

2001

78

3

北京

小明

male

2002

69

4

武汉

小李

male

2002

82

In [65]:

# Relabel rows 3 -> 2 and 4 -> 3, and rename column 'math' to 'Math'.
# inplace=True applies the change to new_df itself rather than returning a
# renamed copy.
new_df.rename(index={3:2,4:3},columns={'math':'Math'},inplace=True)
new_df

Out[65]:


city

name

sex

year

Math

0

北京

张三

female

2001

92

1

上海

李四

female

2001

78

2

北京

小明

male

2002

69

3

武汉

小李

male

2002

82

In [67]:

obj1 = Series([3.2,5.3,-4.4,-3.7],index=['a','c','g','f'])
obj1

Out[67]:

a    3.2
c    5.3
g   -4.4
f   -3.7
dtype: float64

In [68]:

obj2 = Series([5.0,-2,4.4,3.4],index=['a','b','c','d'])
obj2

Out[68]:

a    5.0
b   -2.0
c    4.4
d    3.4
dtype: float64

In [69]:

obj1 + obj2

Out[69]:

a    8.2
b    NaN
c    9.7
d    NaN
f    NaN
g    NaN
dtype: float64

In [70]:

df1 = DataFrame(np.arange(9).reshape(3,3),columns=['a','b','c'], index=['apple','tea','banana'])
df1

Out[70]:


a

b

c

apple

0

1

2

tea

3

4

5

banana

6

7

8

In [71]:

df2 = DataFrame(np.arange(9).reshape(3,3),columns=['a','b','d'], index=['apple','tea','coco'])
df2

Out[71]:


a

b

d

apple

0

1

2

tea

3

4

5

coco

6

7

8

In [72]:

df1 + df2

Out[72]:


a

b

c

d

apple

0.0

2.0

NaN

NaN

banana

NaN

NaN

NaN

NaN

coco

NaN

NaN

NaN

NaN

tea

6.0

8.0

NaN

NaN

In [73]:

df1

Out[73]:


a

b

c

apple

0

1

2

tea

3

4

5

banana

6

7

8

In [76]:

# Pull the 'apple' row out as a Series using label-based .loc
# (.ix was deprecated in pandas 0.20 and removed in 1.0).
s = df1.loc['apple']
s

Out[76]:

a    0
b    1
c    2
Name: apple, dtype: int32

In [77]:

df1 - s

Out[77]:


a

b

c

apple

0

0

0

tea

3

3

3

banana

6

6

6

In [78]:

data = {
    'fruit':['apple', 'orange', 'grape', 'banana'],
    'price':['25元', '42元', '35元', '14元']
}
df1 = DataFrame(data)
df1

Out[78]:


fruit

price

0

apple

25元

1

orange

42元

2

grape

35元

3

banana

14元

In [79]:

def f(x):
    """Return the part of ``x`` before the first '元' (all of ``x`` if absent)."""
    head, _sep, _rest = x.partition('元')
    return head
df1['price'] = df1['price'].map(f)
df1

Out[79]:


fruit

price

0

apple

25

1

orange

42

2

grape

35

3

banana

14

In [80]:

df2 = DataFrame(np.random.randn(3,3),columns=['a','b','c'],index=['app','win','mac'])
df2

Out[80]:


a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

In [81]:

def f(x):
    """Return the spread (maximum minus minimum) of ``x``.

    Written as a named function rather than a lambda bound to a name, so it
    carries a docstring and a real ``__name__`` in tracebacks.

    Parameters
    ----------
    x : object exposing ``max()`` and ``min()``, e.g. a pandas Series.

    Returns
    -------
    The value of ``x.max() - x.min()``.
    """
    return x.max() - x.min()
df2.apply(f)

Out[81]:

a    1.699460
b    2.347079
c    0.664100
dtype: float64

In [82]:

df2

Out[82]:


a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

In [84]:

# Turn every cell into a string rendered with two decimal places.
df2.applymap('{:.2f}'.format)

Out[84]:


a

b

c

app

1.51

-2.14

0.05

win

0.73

0.21

0.40

mac

-0.19

-0.77

-0.27

In [86]:

obj1 = Series([-2,3,2,1],index=['b','a','d','c'])
obj1

Out[86]:

b   -2
a    3
d    2
c    1
dtype: int64

In [87]:

obj1.sort_index()    #升序

Out[87]:

a    3
b   -2
c    1
d    2
dtype: int64

In [88]:

obj1.sort_index(ascending=False)  #降序

Out[88]:

d    2
c    1
b   -2
a    3
dtype: int64

In [91]:

obj1.sort_values()

Out[91]:

b   -2
c    1
d    2
a    3
dtype: int64

In [92]:

df2

Out[92]:


a

b

c

app

1.507962

-2.140018

0.053571

win

0.729671

0.207060

0.397773

mac

-0.191497

-0.765726

-0.266327

In [93]:

df2.sort_values(by='b')

Out[93]:


a

b

c

app

1.507962

-2.140018

0.053571

mac

-0.191497

-0.765726

-0.266327

win

0.729671

0.207060

0.397773

In [2]:

df = DataFrame(np.random.randn(9).reshape(3,3),columns=['a','b','c'])
df

Out[2]:


a

b

c

0

0.660215

-1.137716

-0.302954

1

1.496589

-0.768645

-2.091506

2

0.170316

-2.682284

-0.041099

In [3]:

df.sum()

Out[3]:

a    2.327120
b   -4.588645
c   -2.435558
dtype: float64

In [4]:

df.sum(axis=1)

Out[4]:

0   -0.780455
1   -1.363562
2   -2.553067
dtype: float64

In [5]:

data = {
    'name':['张三', '李四', '王五', '小明'],
    'sex':['female', 'female', 'male', 'male'],
    'math':[78, 79, 83, 92],
    'city':['北京', '上海', '广州', '北京']
}
df = DataFrame(data)
df

Out[5]:


city

math

name

sex

0

北京

78

张三

female

1

上海

79

李四

female

2

广州

83

王五

male

3

北京

92

小明

male

In [6]:

df.describe()

Out[6]:


math

count

4.000000

mean

83.000000

std

6.377042

min

78.000000

25%

78.750000

50%

81.000000

75%

85.250000

max

92.000000

In [7]:

obj = Series(['a','b','a','c','b'])
obj

Out[7]:

0    a
1    b
2    a
3    c
4    b
dtype: object

In [8]:

obj.unique()

Out[8]:

array(['a', 'b', 'c'], dtype=object)

In [9]:

obj.value_counts()

Out[9]:

a    2
b    2
c    1
dtype: int64

In [11]:

obj = Series(np.random.randn(9),
            index=[['one','one','one','two','two','two','three','three','three'],
                  ['a','b','c','a','b','c','a','b','c']])
obj

Out[11]:

one    a    0.697195
       b   -0.887408
       c    0.451851
two    a    0.390779
       b   -2.058070
       c    0.760594
three  a   -0.305534
       b   -0.720491
       c   -0.259225
dtype: float64

In [12]:

obj.index

Out[12]:

MultiIndex(levels=[['one', 'three', 'two'], ['a', 'b', 'c']],
           labels=[[0, 0, 0, 2, 2, 2, 1, 1, 1], [0, 1, 2, 0, 1, 2, 0, 1, 2]])

In [13]:

obj['two']

Out[13]:

a    0.390779
b   -2.058070
c    0.760594
dtype: float64

In [15]:

obj[:,'a']  #内层选取

Out[15]:

one      0.697195
two      0.390779
three   -0.305534
dtype: float64

In [16]:

df = DataFrame(np.arange(16).reshape(4,4),
              index=[['one','one','two','two'],['a','b','a','b']],
              columns=[['apple','apple','orange','orange'],['red','green','red','green']])
df

Out[16]:



apple

orange



red

green

red

green

one

a

0

1

2

3

b

4

5

6

7

two

a

8

9

10

11

b

12

13

14

15

In [17]:

df['apple']

Out[17]:



red

green

one

a

0

1

b

4

5

two

a

8

9

b

12

13

In [18]:

df.swaplevel(0,1)

Out[18]:



apple

orange



red

green

red

green

a

one

0

1

2

3

b

one

4

5

6

7

a

two

8

9

10

11

b

two

12

13

14

15

In [19]:

# The level= argument of sum() was deprecated in pandas 1.3 and removed in 2.0.
# Group on the outer row-index level and sum within each group instead.
df.groupby(level=0).sum()

Out[19]:


apple

orange


red

green

red

green

one

4

6

8

10

two

20

22

24

26

In [20]:

# sum(level=..., axis=1) was removed in pandas 2.0 and groupby(axis=1) is
# deprecated, so transpose, group on the inner column level, sum, and
# transpose back.
df.T.groupby(level=1).sum().T

Out[20]:



green

red

one

a

4

2

b

12

10

two

a

20

18

b

28

26

pandas数据可视化

In [6]:

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt  #导入matplotlib库
%matplotlib inline    #魔法函数

In [7]:

s = Series(np.random.normal(size=10))
s

Out[7]:

0   -0.468142
1   -1.408927
2   -0.182548
3   -0.043023
4    0.121437
5    0.539194
6    0.011423
7   -0.938207
8    1.589460
9    0.460753
dtype: float64

In [8]:

s.plot()

Out[8]:

In [10]:

df = DataFrame({'normal': np.random.normal(size=100), 
                       'gamma': np.random.gamma(1, size=100), 
                       'poisson': np.random.poisson(size=100)})
df.cumsum()

Out[10]:


gamma

normal

poisson

0

1.804045

1.788000

0.0

1

1.835715

0.089426

0.0

2

3.850210

0.870177

0.0

3

6.082898

0.902761

0.0

4

8.837446

0.959945

1.0

5

9.307126

1.658268

3.0

6

9.518029

3.118419

6.0

7

9.758011

3.861418

6.0

8

10.481856

3.405625

6.0

9

12.405202

4.892910

7.0

10

13.086167

4.776206

7.0

11

13.457807

3.217277

8.0

12

13.574663

1.821368

9.0

13

13.695523

2.829581

10.0

14

13.819044

3.015490

11.0

15

15.801080

2.629254

13.0

16

17.043867

2.052196

14.0

17

17.089774

3.687834

15.0

18

17.499338

2.635491

16.0

19

18.257891

2.636466

18.0

20

19.101743

2.272298

19.0

21

24.158020

-0.113947

20.0

22

25.112218

-0.594266

23.0

23

25.986628

-1.326405

23.0

24

28.383365

-1.349211

23.0

25

28.753694

-1.527589

23.0

26

28.908734

-1.312111

25.0

27

30.607696

0.228251

26.0

28

31.081009

1.067429

27.0

29

31.330353

1.098605

28.0

...

...

...

...

70

72.302929

14.123995

66.0

71

72.794689

14.860449

67.0

72

73.629651

14.828726

67.0

73

74.610837

14.168664

68.0

74

78.773897

13.334949

70.0

75

80.916582

13.722037

71.0

76

81.994526

14.717187

72.0

77

83.927355

13.784763

72.0

78

86.004903

13.343261

75.0

79

86.609627

12.151334

75.0

80

87.199249

13.345584

77.0

81

87.213180

12.311815

77.0

82

87.553190

13.864232

77.0

83

89.157662

14.439016

78.0

84

89.213456

14.401503

80.0

85

89.471336

15.838362

81.0

86

89.552332

14.406933

81.0

87

91.565291

14.520602

82.0

88

94.179919

12.017739

82.0

89

95.075841

13.279973

83.0

90

95.192719

13.089789

83.0

91

96.148316

12.268122

84.0

92

97.146898

11.830559

84.0

93

97.456375

13.035484

86.0

94

99.877122

11.966609

87.0

95

103.015620

12.313341

88.0

96

103.116648

12.715195

88.0

97

103.490265

12.168645

89.0

98

103.925893

11.502630

89.0

99

105.008619

11.193637

89.0

100 rows × 3 columns

In [11]:

df.cumsum().plot()

Out[11]:

In [12]:

data = {
    'name':['张三', '李四', '王五', '小明', 'Peter'],
    'sex':['female', 'female', 'male', 'male','male'],
    'year':[2001, 2001, 2003, 2002, 2002],
    'city':['北京', '上海', '广州', '北京', '北京']
}
df = DataFrame(data)
df

Out[12]:


city

name

sex

year

0

北京

张三

female

2001

1

上海

李四

female

2001

2

广州

王五

male

2003

3

北京

小明

male

2002

4

北京

Peter

male

2002

In [14]:

df['sex'].value_counts()

Out[14]:

male      3
female    2
Name: sex, dtype: int64

In [16]:

df['sex'].value_counts().plot(kind='bar')

Out[16]:

In [18]:

df2 = DataFrame(np.random.randint(0,100,size=(3,3)),
               index=('one','two','three'),
               columns = ['A','B','C'])
df2

Out[18]:


A

B

C

one

29

5

88

two

35

42

43

three

87

85

76

In [19]:

df2.plot(kind='barh')

Out[19]:

In [20]:

df2.plot(kind='barh',stacked=True,alpha=0.5)

Out[20]:

In [28]:

s = Series(np.random.normal(size=100))
s.hist(bins=20,grid=False)

Out[28]:

In [29]:

s.plot(kind='kde')

Out[29]:

In [31]:

df3 = DataFrame(np.arange(10),columns=['X'])
df3['Y'] = 2 * df3['X'] + 5
df3

Out[31]:


X

Y

0

0

5

1

1

7

2

2

9

3

3

11

4

4

13

5

5

15

6

6

17

7

7

19

8

8

21

9

9

23

In [34]:

 df3.plot(kind='scatter',x='X',y='Y')

Out[34]:

In [51]:

import numpy as np
from pandas import Series,DataFrame
import pandas as pd
import seaborn as sns   #导入seaborn库

In [52]:

tips=sns.load_dataset('tips')
tips.head()

Out[52]:


total_bill

tip

sex

smoker

day

time

size

0

16.99

1.01

Female

No

Sun

Dinner

2

1

10.34

1.66

Male

No

Sun

Dinner

3

2

21.01

3.50

Male

No

Sun

Dinner

3

3

23.68

3.31

Male

No

Sun

Dinner

2

4

24.59

3.61

Female

No

Sun

Dinner

4

In [54]:

tips.shape

Out[54]:

(244, 7)

In [55]:

tips.describe()

Out[55]:


total_bill

tip

size

count

244.000000

244.000000

244.000000

mean

19.785943

2.998279

2.569672

std

8.902412

1.383638

0.951100

min

3.070000

1.000000

1.000000

25%

13.347500

2.000000

2.000000

50%

17.795000

2.900000

2.000000

75%

24.127500

3.562500

3.000000

max

50.810000

10.000000

6.000000

In [56]:

tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
dtypes: category(4), float64(2), int64(1)
memory usage: 7.2 KB

In [57]:

 tips.plot(kind='scatter',x='total_bill',y='tip')

Out[57]:

In [62]:

male_tip = tips[tips['sex'] == 'Male']['tip'].mean()
male_tip

Out[62]:

3.0896178343949052

In [63]:

female_tip = tips[tips['sex'] == 'Female']['tip'].mean()
female_tip

Out[63]:

2.833448275862069

In [66]:

s = Series([male_tip,female_tip],index=['male','female'])
s

Out[66]:

male      3.089618
female    2.833448
dtype: float64

In [67]:

s.plot(kind='bar')

Out[67]:

In [68]:

tips['day'].unique()

Out[68]:

[Sun, Sat, Thur, Fri]
Categories (4, object): [Sun, Sat, Thur, Fri]

In [71]:

# Mean tip for each day, computed with one boolean-mask selection per day
# and unpacked into the four per-day names.
sun_tip, sat_tip, thur_tip, fri_tip = (
    tips.loc[tips['day'] == day, 'tip'].mean()
    for day in ('Sun', 'Sat', 'Thur', 'Fri')
)

In [72]:

s = Series([thur_tip,fri_tip,sat_tip,sun_tip],index=['Thur','Fri','Sat','Sun'])
s

Out[72]:

Thur    2.771452
Fri     2.734737
Sat     2.993103
Sun     3.255132
dtype: float64

In [73]:

s.plot(kind='bar')

Out[73]:

In [74]:

tips['percent_tip'] = tips['tip']/(tips['total_bill']+tips['tip'])
tips.head(10)

Out[74]:


total_bill

tip

sex

smoker

day

time

size

percent_tip

0

16.99

1.01

Female

No

Sun

Dinner

2

0.056111

1

10.34

1.66

Male

No

Sun

Dinner

3

0.138333

2

21.01

3.50

Male

No

Sun

Dinner

3

0.142799

3

23.68

3.31

Male

No

Sun

Dinner

2

0.122638

4

24.59

3.61

Female

No

Sun

Dinner

4

0.128014

5

25.29

4.71

Male

No

Sun

Dinner

4

0.157000

6

8.77

2.00

Male

No

Sun

Dinner

2

0.185701

7

26.88

3.12

Male

No

Sun

Dinner

4

0.104000

8

15.04

1.96

Male

No

Sun

Dinner

2

0.115294

9

14.78

3.23

Male

No

Sun

Dinner

2

0.179345

In [76]:

tips['percent_tip'].hist(bins=50)

Out[76]:

展开阅读全文

页面更新:2024-04-19

标签:升序   数据   武汉   广州   上海   北京   函数   索引   操作   基础   降序

1 2 3 4 5

上滑加载更多 ↓
推荐阅读:
友情链接:
更多:

本站资料均由网友自行发布提供,仅用于学习交流。如有版权问题,请与我联系,QQ:4156828  

© CopyRight 2020-2024 All Rights Reserved. Powered By 71396.com 闽ICP备11008920号-4
闽公网安备35020302034903号

Top