ch07-数据规整化

合并数据集

1
2
3
4
5
6
7
import numpy as np
import pandas as pd
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

层次化索引

1
2
3
4
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a  1   -0.204708
   2    0.478943
   3   -0.519439
b  1   -0.555730
   3    1.965781
c  1    1.393406
   2    0.092908
d  2    0.281746
   3    0.769023
dtype: float64
1
data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])
1
data['b']
1   -0.555730
3    1.965781
dtype: float64
1
data['b':'c']
b  1   -0.555730
   3    1.965781
c  1    1.393406
   2    0.092908
dtype: float64
1
data.loc[['b', 'd']]
b  1   -0.555730
   3    1.965781
d  2    0.281746
   3    0.769023
dtype: float64
1
data.loc[:, 2]
a    0.478943
c    0.092908
d    0.281746
dtype: float64
1
2
3
4
5
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
frame
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
2
3
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
frame['Ohio']
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10

数据库风格的DataFrame合并

1
2
3
4
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2': range(3)})
1
df1
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
1
df2
key data2
0 a 0
1 b 1
2 d 2
1
pd.merge(df1,df2)  #如果没有指定建,则会默认将重叠列名做键
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
1
2
3
4
5
6
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
'data2': range(3)})
print(df3)
print(df4)
  lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    a      5
6    b      6
  rkey  data2
0    a      0
1    b      1
2    d      2
1
pd.merge(df3,df4,left_on='lkey',right_on='rkey') #默认Inner连接,how=''参数决定怎么连接
lkey data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
1
2
#左连接
pd.merge(df3,df4,left_on='lkey',right_on='rkey',how='left')
lkey data1 rkey data2
0 b 0 b 1.0
1 b 1 b 1.0
2 a 2 a 0.0
3 c 3 NaN NaN
4 a 4 a 0.0
5 a 5 a 0.0
6 b 6 b 1.0
1
2
3
4
5
6
7
8
## 多个键进行连接时,传入列表就好
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
pd.merge(left,right,on=['key1','key2'],how='left')
key1 key2 lval rval
0 foo one 1 4.0
1 foo one 1 5.0
2 foo two 2 NaN
3 bar one 3 6.0
1
pd.merge(left, right, on='key1')
key1 key2_x lval key2_y rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7
1
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))
key1 key2_left lval key2_right rval
0 foo one 1 one 4
1 foo one 1 one 5
2 foo two 2 one 4
3 foo two 2 one 5
4 bar one 3 one 6
5 bar one 3 two 7

索引上的合并

1
2
3
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
1
left1
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
1
right1
group_val
a 3.5
b 7.0
1
pd.merge(left1,right1,left_on='key',right_index=True)
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
1
2
3
4
5
6
7
8
9
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
'Nevada', 'Nevada'],
'key2': [2000, 2001, 2002, 2001, 2002],
'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
'Ohio', 'Ohio'],
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
1
lefth
key1 key2 data
0 Ohio 2000 0.0
1 Ohio 2001 1.0
2 Ohio 2002 2.0
3 Nevada 2001 3.0
4 Nevada 2002 4.0
1
righth
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
1
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
key1 key2 data event1 event2
0 Ohio 2000 0.0 4 5
0 Ohio 2000 0.0 6 7
1 Ohio 2001 1.0 8 9
2 Ohio 2002 2.0 10 11
3 Nevada 2001 3.0 0 1
1
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
key1 key2 data event1 event2
0 Ohio 2000 0.0 4.0 5.0
0 Ohio 2000 0.0 6.0 7.0
1 Ohio 2001 1.0 8.0 9.0
2 Ohio 2002 2.0 10.0 11.0
3 Nevada 2001 3.0 0.0 1.0
4 Nevada 2002 4.0 NaN NaN
4 Nevada 2000 NaN 2.0 3.0
1
2
3
4
5
6
7
8
9
10
##两边都开启索引也没有问题

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['Missouri', 'Alabama'])

pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
1
left2.join(right2)    ###更加快速地实现索引合并
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
c 3.0 4.0 9.0 10.0
e 5.0 6.0 13.0 14.0
1
left1.join(right1, on='key')
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN

轴向连接

1
2
arr=np.arange(12).reshape(3,4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
1
np.concatenate([arr, arr], axis=1)
array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])
1
2
3
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
1
pd.concat([s1,s2,s3]) #默认axis=0
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
1
2
s4=pd.concat([s1*5,s3])
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])
0 1
a 0.0 0.0
c NaN NaN
b 1.0 5.0
e NaN NaN
1
2
result = pd.concat([s1,s2,s3],keys=['one','two','three'])
result
one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    6
dtype: int64
1
s1
a    0
b    1
dtype: int64
1
result.unstack()
a b c d e f g
one 0.0 1.0 NaN NaN NaN NaN NaN
two NaN NaN 2.0 3.0 4.0 NaN NaN
three NaN NaN NaN NaN NaN 5.0 6.0
1
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'],sort=True)
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
1
2
3
4
5
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
columns=['three', 'four'])
df1
one two
a 0 1
b 2 3
c 4 5
1
df2
three four
a 5 6
c 7 8
1
pd.concat([df1, df2], axis=1,sort=True)
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
1
pd.concat({'level1': df1, 'level2': df2}, axis=1,sort=True) #字典的键会被当做keys选项的值
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
1
pd.concat([df1,df2],axis=1,keys=['l1','l2'],names=['zz1','zz2'],sort=True)
zz1 l1 l2
zz2 one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
1
2
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
1
df1
a b c d
0 1.246435 1.007189 -1.296221 0.274992
1 0.228913 1.352917 0.886429 -2.001637
2 -0.371843 1.669025 -0.438570 -0.539741
1
df2
b d a
0 0.476985 3.248944 -1.021228
1 -0.577087 0.124121 0.302614
1
pd.concat([df1,df2],sort=True)
a b c d
0 1.246435 1.007189 -1.296221 0.274992
1 0.228913 1.352917 0.886429 -2.001637
2 -0.371843 1.669025 -0.438570 -0.539741
0 -1.021228 0.476985 NaN 3.248944
1 0.302614 -0.577087 NaN 0.124121
1
pd.concat([df1,df2],ignore_index=True,sort=True)
a b c d
0 1.246435 1.007189 -1.296221 0.274992
1 0.228913 1.352917 0.886429 -2.001637
2 -0.371843 1.669025 -0.438570 -0.539741
3 -1.021228 0.476985 NaN 3.248944
4 0.302614 -0.577087 NaN 0.124121

合并重叠数据

1
np.where?
1
2
3
4
5
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1]=np.nan
1
a
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
1
b
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
1
np.where(pd.isnull(a),b,a) #用b对应索引的值来填充a的空值
array([0. , 2.5, 2. , 3.5, 4.5, nan])
1
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64
1
2
3
4
5
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
'b': [np.nan, 2., np.nan, 6.],
'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
'b': [np.nan, 3., 4., 6., 8.]})
1
df1
a b c
0 1.0 NaN 2
1 NaN 2.0 6
2 5.0 NaN 10
3 NaN 6.0 14
1
df2
a b
0 5.0 NaN
1 4.0 3.0
2 NaN 4.0
3 3.0 6.0
4 7.0 8.0
1
df1.combine_first(df2) #用参数对象的数据为调用者对象的缺失数据‘打补丁’
a b c
0 1.0 NaN 2.0
1 4.0 2.0 6.0
2 5.0 4.0 10.0
3 3.0 6.0 14.0
4 7.0 8.0 NaN

重塑和轴向旋转

重塑层次化索引

1
2
3
4
5
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'],
name='number'))
data
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
1
2
result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64
1
result.unstack()
number one two three
state
Ohio 0 1 2
Colorado 3 4 5

将“长格式”旋转为“宽格式”

1
2
3
4
5
6
7
8
data = pd.read_csv('examples/macrodata.csv')
data.head()
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter,
name='date')
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D', 'end')
ldata = data.stack().reset_index().rename(columns={0: 'value'})
1
ldata[:10]
date item value
0 1959-03-31 realgdp 2710.349
1 1959-03-31 infl 0.000
2 1959-03-31 unemp 5.800
3 1959-06-30 realgdp 2778.801
4 1959-06-30 infl 2.340
5 1959-06-30 unemp 5.100
6 1959-09-30 realgdp 2775.488
7 1959-09-30 infl 2.740
8 1959-09-30 unemp 5.300
9 1959-12-31 realgdp 2785.204
1
2
pivoted = ldata.pivot('date','item','value')  ## index , columns , value 
pivoted.head()
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2
1
ldata['value2'] = np.random.randn(len(ldata))
1
ldata[:10]
date item value value2
0 1959-03-31 realgdp 2710.349 -0.894813
1 1959-03-31 infl 0.000 -1.741494
2 1959-03-31 unemp 5.800 -1.052256
3 1959-06-30 realgdp 2778.801 1.436603
4 1959-06-30 infl 2.340 -0.576207
5 1959-06-30 unemp 5.100 -2.420294
6 1959-09-30 realgdp 2775.488 -1.062330
7 1959-09-30 infl 2.740 0.237372
8 1959-09-30 unemp 5.300 0.000957
9 1959-12-31 realgdp 2785.204 0.065253
1
2
pivoted = ldata.pivot('date','item')
pivoted.head() # 带有层次化索引的列
value value2
item infl realgdp unemp infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8 -1.741494 -0.894813 -1.052256
1959-06-30 2.34 2778.801 5.1 -0.576207 1.436603 -2.420294
1959-09-30 2.74 2775.488 5.3 0.237372 -1.062330 0.000957
1959-12-31 0.27 2785.204 5.6 -1.367524 0.065253 -0.030280
1960-03-31 2.31 2847.699 5.2 -0.642437 0.940489 1.040179
1
pivoted['value'].head()
item infl realgdp unemp
date
1959-03-31 0.00 2710.349 5.8
1959-06-30 2.34 2778.801 5.1
1959-09-30 2.74 2775.488 5.3
1959-12-31 0.27 2785.204 5.6
1960-03-31 2.31 2847.699 5.2

数据转化

移除重复数据

1
2
data=pd.DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]})
data
k1 k2
0 one 1
1 one 1
2 one 2
3 two 3
4 two 3
5 two 4
6 two 4
1
data.duplicated() #判断当前行是否是重复行
0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool
1
data.drop_duplicates() #移除重复行
k1 k2
0 one 1
2 one 2
3 two 3
5 two 4
1
2
data['v1']=np.arange(7)
data.drop_duplicates(['k1']) #只根据某一行来移除
k1 k2 v1
0 one 1 0
3 two 3 3
1
2


k1 k2 v1
0 one 1 0
1 one 1 1
2 one 2 2
3 two 3 3
4 two 3 4
5 two 4 5
6 two 4 6
1
2
data=pd.Series([1,2,3,-99])
data
0     1
1     2
2     3
3   -99
dtype: int64
1
data.replace(-99,np.nan)
0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64
1
data.replace({1:100,-99:np.nan})
0    100.0
1      2.0
2      3.0
3      NaN
dtype: float64

重命名索引

1
2
3
4
data = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['Missouri', 'Alabama'])
data
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0
1
data.index.map(str.upper)
Index(['B', 'C', 'D', 'E'], dtype='object')

离散化与面元划分

1
2
3
4
5
#离散化
age = np.arange(10)
bins = [2,5,8]
cats = pd.cut(age,bins)
cats
[NaN, NaN, NaN, (2, 5], (2, 5], (2, 5], (5, 8], (5, 8], (5, 8], NaN]
Categories (2, interval[int64]): [(2, 5] < (5, 8]]

检测与过滤异常值

1
2
3
from pandas import DataFrame
data=DataFrame(np.random.randn(1000,4))
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.002621 -0.023747 -0.003461 -0.002610
std 0.998586 0.962207 1.012928 0.996423
min -3.024110 -2.657202 -3.105636 -3.530912
25% -0.670724 -0.684972 -0.691494 -0.707701
50% 0.022038 0.023472 0.024927 0.020683
75% 0.649798 0.639806 0.693491 0.672463
max 3.897527 3.160760 3.144389 3.003284
1
2
3
#找出某列中绝对值超过3的值
col=data[3]
col[abs(col)>3]
208   -3.530912
969    3.003284
Name: 3, dtype: float64
1
data[(np.abs(data)>3).any(1)][:2]
0 1 2 3
136 -1.202724 -0.286215 -3.105636 -0.369009
148 -3.024110 -1.168413 -0.888664 0.111410
1
data[(np.abs(data)>3).any(1)]=np.sign(data)*3

计算指标/哑变量

1
2
df = DataFrame({'key':['b','b','a','c','a','b'],'value':np.arange(6)})
df
key value
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 b 5
1
pd.get_dummies(df['key'])
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0

字符串操作

字符串对象方法

1
#和Python编码一起学习下

正则表达式

1
2
import re
#参考之前的博客