ch09-Example

1
2
3
4
5
6
7
8
9
# Environment setup shared by all examples below.
import numpy as np
import pandas as pd
from pandas import Series
PREVIOUS_MAX_ROWS = pd.options.display.max_rows  # saved so it can be restored later
pd.options.display.max_rows = 20
np.random.seed(12345)  # fixed seed so the random examples are reproducible
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

1. 用特定于分组的值填充缺失值

dropna()可以去除缺失值,但也可以用fillna()方法填充缺失值。

1
2
3
# Six random values with every other entry (positions 0, 2, 4) blanked out.
s = Series(np.random.randn(6))
s.iloc[::2] = np.nan
s
0         NaN
1    0.228913
2         NaN
3    0.886429
4         NaN
5   -0.371843
dtype: float64
1
# Fill every missing entry with the series' overall mean.
s.fillna(value=s.mean())
0    0.247833
1    0.228913
2    0.247833
3    0.886429
4    0.247833
5   -0.371843
dtype: float64
1
2
3
4
5
# Eight states: the first four are tagged 'East', the last four 'West'.
states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East' if i < 4 else 'West' for i in range(len(states))]
data = pd.Series(np.random.randn(8), index=states)
data
Ohio          1.669025
New York     -0.438570
Vermont      -0.539741
Florida       0.476985
Oregon        3.248944
Nevada       -1.021228
California   -0.577087
Idaho         0.124121
dtype: float64
1
# Per-group averages: one mean for 'East', one for 'West'.
data.groupby(group_key).mean()
East    0.291925
West    0.443688
dtype: float64
1
2
# PEP 8: use `def` instead of binding a lambda to a name.
def fill_mean(g):
    """Fill missing values in group *g* with that group's own mean."""
    return g.fillna(g.mean())
# Each group's NaNs are replaced with that group's mean, not the global mean.
data.groupby(group_key).apply(fill_mean)
Ohio          1.669025
New York     -0.438570
Vermont       0.569147
Florida       0.476985
Oregon        3.248944
Nevada        1.335928
California   -0.577087
Idaho         1.335928
dtype: float64
1
2
3
# Predefined fill constant per group, keyed by the group's name.
fill_values = {'East': 0.5, 'West': -1}

# PEP 8: use `def` instead of binding a lambda to a name.
def fill_func(g):
    """Fill missing values in *g* with the constant mapped to its group name."""
    return g.fillna(fill_values[g.name])
# East NaNs become 0.5, West NaNs become -1.
data.groupby(group_key).apply(fill_func)
Ohio          1.669025
New York     -0.438570
Vermont       0.500000
Florida       0.476985
Oregon        3.248944
Nevada       -1.000000
California   -0.577087
Idaho        -1.000000
dtype: float64

2. 随机采样和排列

构造一副扑克牌

1
2
3
4
5
6
7
8
9
10
11
# Build a 52-card deck as a Series: index = card label, value = point value.
# Hearts (H), Spades (S), Clubs (C), Diamonds (D)
suits = ['H', 'S', 'C', 'D']

# Point values per suit: A=1, 2-10 at face value, J/K/Q all worth 10.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:  # reuse `suits` rather than repeating the literal list
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)
deck[:13]
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64
1
2
3
def draw(deck, n=5):
    """Return a random sample of *n* cards from *deck*."""
    return deck.sample(n)
# Deal a random 5-card hand from the deck.
draw(deck)
8S     8
8C     8
4C     4
QD    10
3C     3
dtype: int64
1
2
# PEP 8: use `def` instead of binding a lambda to a name.
def get_suit(card):
    """Grouping key: the suit is the last character of the card label."""
    return card[-1]
# Draw 2 random cards from each suit.
deck.groupby(get_suit).apply(draw, n=2)
C  2C     2
   4C     4
D  7D     7
   9D     9
H  JH    10
   9H     9
S  3S     3
   JS    10
dtype: int64

3. 分组加权平均数和相关系数

1
2
3
4
5
# Two categories of four rows each, with random data and random weights.
df = pd.DataFrame({
    'category': ['a'] * 4 + ['b'] * 4,
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df
category data weights
0 a 0.388782 0.959661
1 a 0.940880 0.652225
2 a 1.660720 0.513206
3 a 0.642044 0.682356
4 b 0.418988 0.489540
5 b -0.259232 0.926490
6 b -0.369982 0.515880
7 b -0.044528 0.072160
1
2
3
# Group rows by their 'category' label ('a' or 'b').
grouped = df.groupby('category')
# PEP 8: use `def` instead of binding a lambda to a name.
def get_wavg(g):
    """Weighted average of *g*'s 'data' column, weighted by its 'weights' column."""
    return np.average(g['data'], weights=g['weights'])
# One weighted average per category.
grouped.apply(get_wavg)
category
a    0.811113
b   -0.114339
dtype: float64

Yahoo!Finance数据实操

1
2
3
# Load daily closing prices; the first CSV column is parsed into a DatetimeIndex.
close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,
index_col=0)
close_px.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
1
# Last four rows of prices.
close_px[-4:]
AAPL MSFT XOM SPX
2011-10-11 400.29 27.00 76.27 1195.54
2011-10-12 402.19 26.96 77.16 1207.25
2011-10-13 408.43 27.18 76.37 1203.66
2011-10-14 422.00 27.27 78.11 1224.58
1
# PEP 8: use `def` instead of binding a lambda to a name.
def spx_corr(group):
    """Correlation of every column of *group* with its 'SPX' column."""
    return group.corrwith(group['SPX'])
1
# Daily percent-change returns; dropna removes the first (all-NaN) row.
rets = close_px.pct_change().dropna()
1
2
3
# PEP 8: use `def` instead of binding a lambda to a name.
def get_year(ts):
    """Grouping key: the calendar year of a timestamp-like object."""
    return ts.year
# Group daily returns by calendar year, then compute each year's correlations with SPX.
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
AAPL MSFT XOM SPX
2003 0.541124 0.745174 0.661265 1.0
2004 0.374283 0.588531 0.557742 1.0
2005 0.467540 0.562374 0.631010 1.0
2006 0.428267 0.406126 0.518514 1.0
2007 0.508118 0.658770 0.786264 1.0
2008 0.681434 0.804626 0.828303 1.0
2009 0.707103 0.654902 0.797921 1.0
2010 0.710105 0.730118 0.839057 1.0
2011 0.691931 0.800996 0.859975 1.0

4. 面向分组的线性回归

1
2
3
4
5
6
7
8
import statsmodels.api as sm

def regress(data, yvar, xvars):
    """OLS-regress data[yvar] on data[xvars] plus a constant intercept.

    Returns the fitted coefficients as a Series indexed by xvars + 'intercept'.
    """
    Y = data[yvar]
    # Copy the slice so adding the intercept column doesn't mutate (or warn
    # about mutating) the caller's DataFrame.
    X = data[xvars].copy()
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params

# Per-year slope (beta) and intercept of AAPL daily returns against SPX.
by_year.apply(regress, 'AAPL', ['SPX'])
SPX intercept
2003 1.195406 0.000710
2004 1.363463 0.004201
2005 1.766415 0.003246
2006 1.645496 0.000080
2007 1.198761 0.003438
2008 0.968016 -0.001110
2009 0.879103 0.002954
2010 1.052608 0.001261
2011 0.806605 0.001514