合并数据集
1 | import numpy as np |
层次化索引
1 | data = pd.Series(np.random.randn(9), |
a 1 -0.204708
2 0.478943
3 -0.519439
b 1 -0.555730
3 1.965781
c 1 1.393406
2 0.092908
d 2 0.281746
3 0.769023
dtype: float64
1 | data.index |
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])
1 | data['b'] |
1 -0.555730
3 1.965781
dtype: float64
1 | data['b':'c'] |
b 1 -0.555730
3 1.965781
c 1 1.393406
2 0.092908
dtype: float64
1 | data.loc[['b', 'd']] |
b 1 -0.555730
3 1.965781
d 2 0.281746
3 0.769023
dtype: float64
1 | data.loc[:, 2] |
a 0.478943
c 0.092908
d 0.281746
dtype: float64
1 | frame = pd.DataFrame(np.arange(12).reshape((4, 3)), |
Ohio | Colorado | |||
---|---|---|---|---|
Green | Red | Green | ||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
1 | frame.index.names = ['key1', 'key2'] |
state | Ohio | Colorado | ||
---|---|---|---|---|
color | Green | Red | Green | |
key1 | key2 | |||
a | 1 | 0 | 1 | 2 |
2 | 3 | 4 | 5 | |
b | 1 | 6 | 7 | 8 |
2 | 9 | 10 | 11 |
1 | frame['Ohio'] |
color | Green | Red | |
---|---|---|---|
key1 | key2 | ||
a | 1 | 0 | 1 |
2 | 3 | 4 | |
b | 1 | 6 | 7 |
2 | 9 | 10 |
数据库风格的DataFrame合并
1 | df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], |
1 | df1 |
key | data1 | |
---|---|---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | a | 5 |
6 | b | 6 |
1 | df2 |
key | data2 | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | d | 2 |
1 | pd.merge(df1,df2) #如果没有指定建,则会默认将重叠列名做键 |
key | data1 | data2 | |
---|---|---|---|
0 | b | 0 | 1 |
1 | b | 1 | 1 |
2 | b | 6 | 1 |
3 | a | 2 | 0 |
4 | a | 4 | 0 |
5 | a | 5 | 0 |
1 | df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], |
lkey data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
rkey data2
0 a 0
1 b 1
2 d 2
1 | pd.merge(df3,df4,left_on='lkey',right_on='rkey') #默认Inner连接,how=''参数决定怎么连接 |
lkey | data1 | rkey | data2 | |
---|---|---|---|---|
0 | b | 0 | b | 1 |
1 | b | 1 | b | 1 |
2 | b | 6 | b | 1 |
3 | a | 2 | a | 0 |
4 | a | 4 | a | 0 |
5 | a | 5 | a | 0 |
1 | #左连接 |
lkey | data1 | rkey | data2 | |
---|---|---|---|---|
0 | b | 0 | b | 1.0 |
1 | b | 1 | b | 1.0 |
2 | a | 2 | a | 0.0 |
3 | c | 3 | NaN | NaN |
4 | a | 4 | a | 0.0 |
5 | a | 5 | a | 0.0 |
6 | b | 6 | b | 1.0 |
1 | ## 多个键进行连接时,传入列表就好 |
key1 | key2 | lval | rval | |
---|---|---|---|---|
0 | foo | one | 1 | 4.0 |
1 | foo | one | 1 | 5.0 |
2 | foo | two | 2 | NaN |
3 | bar | one | 3 | 6.0 |
1 | pd.merge(left, right, on='key1') |
key1 | key2_x | lval | key2_y | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
1 | pd.merge(left, right, on='key1', suffixes=('_left', '_right')) |
key1 | key2_left | lval | key2_right | rval | |
---|---|---|---|---|---|
0 | foo | one | 1 | one | 4 |
1 | foo | one | 1 | one | 5 |
2 | foo | two | 2 | one | 4 |
3 | foo | two | 2 | one | 5 |
4 | bar | one | 3 | one | 6 |
5 | bar | one | 3 | two | 7 |
索引上的合并
1 | left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'], |
1 | left1 |
key | value | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | a | 2 |
3 | a | 3 |
4 | b | 4 |
5 | c | 5 |
1 | right1 |
group_val | |
---|---|
a | 3.5 |
b | 7.0 |
1 | pd.merge(left1,right1,left_on='key',right_index=True) |
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
1 | b | 1 | 7.0 |
4 | b | 4 | 7.0 |
1 | lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', |
1 | lefth |
key1 | key2 | data | |
---|---|---|---|
0 | Ohio | 2000 | 0.0 |
1 | Ohio | 2001 | 1.0 |
2 | Ohio | 2002 | 2.0 |
3 | Nevada | 2001 | 3.0 |
4 | Nevada | 2002 | 4.0 |
1 | righth |
event1 | event2 | ||
---|---|---|---|
Nevada | 2001 | 0 | 1 |
2000 | 2 | 3 | |
Ohio | 2000 | 4 | 5 |
2000 | 6 | 7 | |
2001 | 8 | 9 | |
2002 | 10 | 11 |
1 | pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True) |
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Ohio | 2000 | 0.0 | 4 | 5 |
0 | Ohio | 2000 | 0.0 | 6 | 7 |
1 | Ohio | 2001 | 1.0 | 8 | 9 |
2 | Ohio | 2002 | 2.0 | 10 | 11 |
3 | Nevada | 2001 | 3.0 | 0 | 1 |
1 | pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer') |
key1 | key2 | data | event1 | event2 | |
---|---|---|---|---|---|
0 | Ohio | 2000 | 0.0 | 4.0 | 5.0 |
0 | Ohio | 2000 | 0.0 | 6.0 | 7.0 |
1 | Ohio | 2001 | 1.0 | 8.0 | 9.0 |
2 | Ohio | 2002 | 2.0 | 10.0 | 11.0 |
3 | Nevada | 2001 | 3.0 | 0.0 | 1.0 |
4 | Nevada | 2002 | 4.0 | NaN | NaN |
4 | Nevada | 2000 | NaN | 2.0 | 3.0 |
1 | ##两边都开启索引也没有问题 |
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
b | NaN | NaN | 7.0 | 8.0 |
c | 3.0 | 4.0 | 9.0 | 10.0 |
d | NaN | NaN | 11.0 | 12.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
1 | left2.join(right2) ###更加快速地实现索引合并 |
Ohio | Nevada | Missouri | Alabama | |
---|---|---|---|---|
a | 1.0 | 2.0 | NaN | NaN |
c | 3.0 | 4.0 | 9.0 | 10.0 |
e | 5.0 | 6.0 | 13.0 | 14.0 |
1 | left1.join(right1, on='key') |
key | value | group_val | |
---|---|---|---|
0 | a | 0 | 3.5 |
1 | b | 1 | 7.0 |
2 | a | 2 | 3.5 |
3 | a | 3 | 3.5 |
4 | b | 4 | 7.0 |
5 | c | 5 | NaN |
轴向连接
1 | arr=np.arange(12).reshape(3,4) |
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
1 | np.concatenate([arr, arr], axis=1) |
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
1 | s1 = pd.Series([0, 1], index=['a', 'b']) |
1 | pd.concat([s1,s2,s3]) #默认axis=0 |
a 0
b 1
c 2
d 3
e 4
f 5
g 6
dtype: int64
1 | s4=pd.concat([s1*5,s3]) |
0 | 1 | |
---|---|---|
a | 0.0 | 0.0 |
c | NaN | NaN |
b | 1.0 | 5.0 |
e | NaN | NaN |
1 | result = pd.concat([s1,s2,s3],keys=['one','two','three']) |
one a 0
b 1
two c 2
d 3
e 4
three f 5
g 6
dtype: int64
1 | s1 |
a 0
b 1
dtype: int64
1 | result.unstack() |
a | b | c | d | e | f | g | |
---|---|---|---|---|---|---|---|
one | 0.0 | 1.0 | NaN | NaN | NaN | NaN | NaN |
two | NaN | NaN | 2.0 | 3.0 | 4.0 | NaN | NaN |
three | NaN | NaN | NaN | NaN | NaN | 5.0 | 6.0 |
1 | pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'],sort=True) |
one | two | three | |
---|---|---|---|
a | 0.0 | NaN | NaN |
b | 1.0 | NaN | NaN |
c | NaN | 2.0 | NaN |
d | NaN | 3.0 | NaN |
e | NaN | 4.0 | NaN |
f | NaN | NaN | 5.0 |
g | NaN | NaN | 6.0 |
1 | df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], |
one | two | |
---|---|---|
a | 0 | 1 |
b | 2 | 3 |
c | 4 | 5 |
1 | df2 |
three | four | |
---|---|---|
a | 5 | 6 |
c | 7 | 8 |
1 | pd.concat([df1, df2], axis=1,sort=True) |
one | two | three | four | |
---|---|---|---|---|
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
1 | pd.concat({'level1': df1, 'level2': df2}, axis=1,sort=True) #字典的键会被当做keys选项的值 |
level1 | level2 | |||
---|---|---|---|---|
one | two | three | four | |
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
1 | pd.concat([df1,df2],axis=1,keys=['l1','l2'],names=['zz1','zz2'],sort=True) |
zz1 | l1 | l2 | ||
---|---|---|---|---|
zz2 | one | two | three | four |
a | 0 | 1 | 5.0 | 6.0 |
b | 2 | 3 | NaN | NaN |
c | 4 | 5 | 7.0 | 8.0 |
1 | df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd']) |
1 | df1 |
a | b | c | d | |
---|---|---|---|---|
0 | 1.246435 | 1.007189 | -1.296221 | 0.274992 |
1 | 0.228913 | 1.352917 | 0.886429 | -2.001637 |
2 | -0.371843 | 1.669025 | -0.438570 | -0.539741 |
1 | df2 |
b | d | a | |
---|---|---|---|
0 | 0.476985 | 3.248944 | -1.021228 |
1 | -0.577087 | 0.124121 | 0.302614 |
1 | pd.concat([df1,df2],sort=True) |
a | b | c | d | |
---|---|---|---|---|
0 | 1.246435 | 1.007189 | -1.296221 | 0.274992 |
1 | 0.228913 | 1.352917 | 0.886429 | -2.001637 |
2 | -0.371843 | 1.669025 | -0.438570 | -0.539741 |
0 | -1.021228 | 0.476985 | NaN | 3.248944 |
1 | 0.302614 | -0.577087 | NaN | 0.124121 |
1 | pd.concat([df1,df2],ignore_index=True,sort=True) |
a | b | c | d | |
---|---|---|---|---|
0 | 1.246435 | 1.007189 | -1.296221 | 0.274992 |
1 | 0.228913 | 1.352917 | 0.886429 | -2.001637 |
2 | -0.371843 | 1.669025 | -0.438570 | -0.539741 |
3 | -1.021228 | 0.476985 | NaN | 3.248944 |
4 | 0.302614 | -0.577087 | NaN | 0.124121 |
合并重叠数据
1 | np.where? |
1 | a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], |
1 | a |
f NaN
e 2.5
d NaN
c 3.5
b 4.5
a NaN
dtype: float64
1 | b |
f 0.0
e 1.0
d 2.0
c 3.0
b 4.0
a NaN
dtype: float64
1 | np.where(pd.isnull(a),b,a) #用b对应索引的值来填充a的空值 |
array([0. , 2.5, 2. , 3.5, 4.5, nan])
1 | b[:-2].combine_first(a[2:]) |
a NaN
b 4.5
c 3.0
d 2.0
e 1.0
f 0.0
dtype: float64
1 | df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan], |
1 | df1 |
a | b | c | |
---|---|---|---|
0 | 1.0 | NaN | 2 |
1 | NaN | 2.0 | 6 |
2 | 5.0 | NaN | 10 |
3 | NaN | 6.0 | 14 |
1 | df2 |
a | b | |
---|---|---|
0 | 5.0 | NaN |
1 | 4.0 | 3.0 |
2 | NaN | 4.0 |
3 | 3.0 | 6.0 |
4 | 7.0 | 8.0 |
1 | df1.combine_first(df2) #用参数对象的数据为调用者对象的缺失数据‘打补丁’ |
a | b | c | |
---|---|---|---|
0 | 1.0 | NaN | 2.0 |
1 | 4.0 | 2.0 | 6.0 |
2 | 5.0 | 4.0 | 10.0 |
3 | 3.0 | 6.0 | 14.0 |
4 | 7.0 | 8.0 | NaN |
重塑和轴向旋转
重塑层次化索引
1 | data = pd.DataFrame(np.arange(6).reshape((2, 3)), |
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
1 | result = data.stack() |
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int64
1 | result.unstack() |
number | one | two | three |
---|---|---|---|
state | |||
Ohio | 0 | 1 | 2 |
Colorado | 3 | 4 | 5 |
将“长格式”旋转为“宽格式”
1 | data = pd.read_csv('examples/macrodata.csv') |
1 | ldata[:10] |
date | item | value | |
---|---|---|---|
0 | 1959-03-31 | realgdp | 2710.349 |
1 | 1959-03-31 | infl | 0.000 |
2 | 1959-03-31 | unemp | 5.800 |
3 | 1959-06-30 | realgdp | 2778.801 |
4 | 1959-06-30 | infl | 2.340 |
5 | 1959-06-30 | unemp | 5.100 |
6 | 1959-09-30 | realgdp | 2775.488 |
7 | 1959-09-30 | infl | 2.740 |
8 | 1959-09-30 | unemp | 5.300 |
9 | 1959-12-31 | realgdp | 2785.204 |
1 | pivoted = ldata.pivot('date','item','value') ## index , columns , value |
item | infl | realgdp | unemp |
---|---|---|---|
date | |||
1959-03-31 | 0.00 | 2710.349 | 5.8 |
1959-06-30 | 2.34 | 2778.801 | 5.1 |
1959-09-30 | 2.74 | 2775.488 | 5.3 |
1959-12-31 | 0.27 | 2785.204 | 5.6 |
1960-03-31 | 2.31 | 2847.699 | 5.2 |
1 | ldata['value2'] = np.random.randn(len(ldata)) |
1 | ldata[:10] |
date | item | value | value2 | |
---|---|---|---|---|
0 | 1959-03-31 | realgdp | 2710.349 | -0.894813 |
1 | 1959-03-31 | infl | 0.000 | -1.741494 |
2 | 1959-03-31 | unemp | 5.800 | -1.052256 |
3 | 1959-06-30 | realgdp | 2778.801 | 1.436603 |
4 | 1959-06-30 | infl | 2.340 | -0.576207 |
5 | 1959-06-30 | unemp | 5.100 | -2.420294 |
6 | 1959-09-30 | realgdp | 2775.488 | -1.062330 |
7 | 1959-09-30 | infl | 2.740 | 0.237372 |
8 | 1959-09-30 | unemp | 5.300 | 0.000957 |
9 | 1959-12-31 | realgdp | 2785.204 | 0.065253 |
1 | pivoted = ldata.pivot('date','item') |
value | value2 | |||||
---|---|---|---|---|---|---|
item | infl | realgdp | unemp | infl | realgdp | unemp |
date | ||||||
1959-03-31 | 0.00 | 2710.349 | 5.8 | -1.741494 | -0.894813 | -1.052256 |
1959-06-30 | 2.34 | 2778.801 | 5.1 | -0.576207 | 1.436603 | -2.420294 |
1959-09-30 | 2.74 | 2775.488 | 5.3 | 0.237372 | -1.062330 | 0.000957 |
1959-12-31 | 0.27 | 2785.204 | 5.6 | -1.367524 | 0.065253 | -0.030280 |
1960-03-31 | 2.31 | 2847.699 | 5.2 | -0.642437 | 0.940489 | 1.040179 |
1 | pivoted['value'].head() |
item | infl | realgdp | unemp |
---|---|---|---|
date | |||
1959-03-31 | 0.00 | 2710.349 | 5.8 |
1959-06-30 | 2.34 | 2778.801 | 5.1 |
1959-09-30 | 2.74 | 2775.488 | 5.3 |
1959-12-31 | 0.27 | 2785.204 | 5.6 |
1960-03-31 | 2.31 | 2847.699 | 5.2 |
数据转化
移除重复数据
1 | data=pd.DataFrame({'k1':['one']*3+['two']*4,'k2':[1,1,2,3,3,4,4]}) |
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
1 | data.duplicated() #判断当前行是否是重复行 |
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
1 | data.drop_duplicates() #移除重复行 |
k1 | k2 | |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
1 | data['v1']=np.arange(7) |
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
3 | two | 3 | 3 |
1 |
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | one | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | two | 3 | 4 |
5 | two | 4 | 5 |
6 | two | 4 | 6 |
1 | data=pd.Series([1,2,3,-99]) |
0 1
1 2
2 3
3 -99
dtype: int64
1 | data.replace(-99,np.nan) |
0 1.0
1 2.0
2 3.0
3 NaN
dtype: float64
1 | data.replace({1:100,-99:np.nan}) |
0 100.0
1 2.0
2 3.0
3 NaN
dtype: float64
重命名索引
1 | data = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], |
Missouri | Alabama | |
---|---|---|
b | 7.0 | 8.0 |
c | 9.0 | 10.0 |
d | 11.0 | 12.0 |
e | 13.0 | 14.0 |
1 | data.index.map(str.upper) |
Index(['B', 'C', 'D', 'E'], dtype='object')
离散化与面元划分
1 | #离散化 |
[NaN, NaN, NaN, (2, 5], (2, 5], (2, 5], (5, 8], (5, 8], (5, 8], NaN]
Categories (2, interval[int64]): [(2, 5] < (5, 8]]
检测与过滤异常值
1 | from pandas import DataFrame |
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.002621 | -0.023747 | -0.003461 | -0.002610 |
std | 0.998586 | 0.962207 | 1.012928 | 0.996423 |
min | -3.024110 | -2.657202 | -3.105636 | -3.530912 |
25% | -0.670724 | -0.684972 | -0.691494 | -0.707701 |
50% | 0.022038 | 0.023472 | 0.024927 | 0.020683 |
75% | 0.649798 | 0.639806 | 0.693491 | 0.672463 |
max | 3.897527 | 3.160760 | 3.144389 | 3.003284 |
1 | #找出某列中绝对值超过3的值 |
208 -3.530912
969 3.003284
Name: 3, dtype: float64
1 | data[(np.abs(data)>3).any(1)][:2] |
0 | 1 | 2 | 3 | |
---|---|---|---|---|
136 | -1.202724 | -0.286215 | -3.105636 | -0.369009 |
148 | -3.024110 | -1.168413 | -0.888664 | 0.111410 |
1 | data[(np.abs(data)>3).any(1)]=np.sign(data)*3 |
计算指标/哑变量
1 | df = DataFrame({'key':['b','b','a','c','a','b'],'value':np.arange(6)}) |
key | value | |
---|---|---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | b | 5 |
1 | pd.get_dummies(df['key']) |
a | b | c | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 0 |
5 | 0 | 1 | 0 |
字符串操作
字符串对象方法
1 | #和Python编码一起学习下 |
正则表达式
1 | import re |