1 | import numpy as np |
1. 用特定于分组的值填充缺失值
清理缺失数据时,可以用dropna()直接剔除缺失值,但也可以用fillna()方法进行填充。
1 | s = Series(np.random.randn(6)) |
0 NaN
1 0.228913
2 NaN
3 0.886429
4 NaN
5 -0.371843
dtype: float64
1 | s.fillna(s.mean()) |
0 0.247833
1 0.228913
2 0.247833
3 0.886429
4 0.247833
5 -0.371843
dtype: float64
1 | states = ['Ohio', 'New York', 'Vermont', 'Florida', |
Ohio 1.669025
New York -0.438570
Vermont -0.539741
Florida 0.476985
Oregon 3.248944
Nevada -1.021228
California -0.577087
Idaho 0.124121
dtype: float64
1 | data.groupby(group_key).mean() |
East 0.291925
West 0.443688
dtype: float64
1 | fill_mean = lambda g: g.fillna(g.mean()) |
Ohio 1.669025
New York -0.438570
Vermont 0.569147
Florida 0.476985
Oregon 3.248944
Nevada 1.335928
California -0.577087
Idaho 1.335928
dtype: float64
1 | fill_values = {'East': 0.5, 'West': -1} |
Ohio 1.669025
New York -0.438570
Vermont 0.500000
Florida 0.476985
Oregon 3.248944
Nevada -1.000000
California -0.577087
Idaho -1.000000
dtype: float64
2. 随机采样和排列
构造一副扑克牌
1 | # Hearts(红桃), Spades(黑桃), Clubs(梅花), Diamonds(方块) |
AH 1
2H 2
3H 3
4H 4
5H 5
6H 6
7H 7
8H 8
9H 9
10H 10
JH 10
KH 10
QH 10
dtype: int64
1 | def draw(deck, n=5): |
8S 8
8C 8
4C 4
QD 10
3C 3
dtype: int64
1 | get_suit = lambda x:x[-1] |
C 2C 2
4C 4
D 7D 7
9D 9
H JH 10
9H 9
S 3S 3
JS 10
dtype: int64
3. 分组加权平均数和相关系数
1 | df = pd.DataFrame({'category': ['a', 'a', 'a', 'a', |
| | category | data | weights |
---|---|---|---|
0 | a | 0.388782 | 0.959661 |
1 | a | 0.940880 | 0.652225 |
2 | a | 1.660720 | 0.513206 |
3 | a | 0.642044 | 0.682356 |
4 | b | 0.418988 | 0.489540 |
5 | b | -0.259232 | 0.926490 |
6 | b | -0.369982 | 0.515880 |
7 | b | -0.044528 | 0.072160 |
1 | grouped = df.groupby('category') |
category
a 0.811113
b -0.114339
dtype: float64
Yahoo! Finance 数据实操
1 | close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True, |
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL 2214 non-null float64
MSFT 2214 non-null float64
XOM 2214 non-null float64
SPX 2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
1 | close_px[-4:] |
| | AAPL | MSFT | XOM | SPX |
---|---|---|---|---|
2011-10-11 | 400.29 | 27.00 | 76.27 | 1195.54 |
2011-10-12 | 402.19 | 26.96 | 77.16 | 1207.25 |
2011-10-13 | 408.43 | 27.18 | 76.37 | 1203.66 |
2011-10-14 | 422.00 | 27.27 | 78.11 | 1224.58 |
1 | spx_corr = lambda x: x.corrwith(x['SPX']) |
1 | rets = close_px.pct_change().dropna() |
1 | get_year = lambda x: x.year |
| | AAPL | MSFT | XOM | SPX |
---|---|---|---|---|
2003 | 0.541124 | 0.745174 | 0.661265 | 1.0 |
2004 | 0.374283 | 0.588531 | 0.557742 | 1.0 |
2005 | 0.467540 | 0.562374 | 0.631010 | 1.0 |
2006 | 0.428267 | 0.406126 | 0.518514 | 1.0 |
2007 | 0.508118 | 0.658770 | 0.786264 | 1.0 |
2008 | 0.681434 | 0.804626 | 0.828303 | 1.0 |
2009 | 0.707103 | 0.654902 | 0.797921 | 1.0 |
2010 | 0.710105 | 0.730118 | 0.839057 | 1.0 |
2011 | 0.691931 | 0.800996 | 0.859975 | 1.0 |
4. 面向分组的线性回归
1 | import statsmodels.api as sm |
| | SPX | intercept |
---|---|---|
2003 | 1.195406 | 0.000710 |
2004 | 1.363463 | 0.004201 |
2005 | 1.766415 | 0.003246 |
2006 | 1.645496 | 0.000080 |
2007 | 1.198761 | 0.003438 |
2008 | 0.968016 | -0.001110 |
2009 | 0.879103 | 0.002954 |
2010 | 1.052608 | 0.001261 |
2011 | 0.806605 | 0.001514 |