1 | import numpy as np |
日期和时间数据类型及工具
1 | from datetime import datetime |
datetime.datetime(2019, 1, 15, 0, 30, 21, 654285)
1 | now.year,now.month,now.day |
(2019, 1, 15)
datetime以微秒(而非毫秒)精度存储日期和时间,类似于我们常用的timestamp
1 | delta = datetime(2011,1,7) - datetime(2008,6,24,8,15) |
datetime.timedelta(926, 56700)
1 | delta.days |
926
可以给datetime对象加上或减去一个或者多个timedelta
1 | from datetime import timedelta |
datetime.datetime(2011, 1, 8, 0, 0)
字符串和datetime的相互转换
1 | stamp = datetime(2011,1,3) |
datetime.datetime(2011, 1, 3, 0, 0)
1 | str(stamp) |
'2011-01-03 00:00:00'
1 | stamp.strftime('%Y-%m-%d') #跟hive类似 |
'2011-01-03'
dateutil很强大,可以自动解析多种常见格式的日期字符串
1 | from dateutil.parser import parse |
datetime.datetime(2011, 1, 13, 0, 0)
1 | parse('6/12/2015',dayfirst=True) |
datetime.datetime(2015, 12, 6, 0, 0)
1 | parse('Jan 31, 1997 10:45 PM') |
datetime.datetime(1997, 1, 31, 22, 45)
1 | datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00'] |
DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)
1 | idx = pd.to_datetime(datestrs + [None]) |
DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)
1 | idx.isnull() |
array([False, False, True])
时间序列基础
1 | dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), |
2011-01-02 -0.204708
2011-01-05 0.478943
2011-01-07 -0.519439
2011-01-08 -0.555730
2011-01-10 1.965781
2011-01-12 1.393406
dtype: float64
1 | type(ts) |
pandas.core.series.Series
1 | ts.index |
DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
'2011-01-10', '2011-01-12'],
dtype='datetime64[ns]', freq=None)
1 | ts+ts[::2] |
2011-01-02 -0.409415
2011-01-05 NaN
2011-01-07 -1.038877
2011-01-08 NaN
2011-01-10 3.931561
2011-01-12 NaN
dtype: float64
1 | ts.index.dtype |
dtype('<M8[ns]')
1 | stamp = ts.index[0] |
Timestamp('2011-01-02 00:00:00')
1 | #传入一个被解释为日期的字符串 |
-0.55573030434749
对于较长的时间序列,可以只传入年或年月等来选取数据切片
1 | long_ts = pd.Series(np.random.randn(1000),index = pd.date_range('1/1/2000',periods=1000)) |
2000-01-01 -1.613474
2000-01-02 -0.573966
2000-01-03 0.424894
2000-01-04 1.257544
2000-01-05 -1.065343
Freq: D, dtype: float64
1 | long_ts['2001'] |
2001-01-01 0.590119
2001-01-02 -1.219580
2001-01-03 0.272788
2001-01-04 -2.691584
2001-01-05 1.567780
2001-01-06 0.265030
2001-01-07 -0.929412
2001-01-08 0.885587
2001-01-09 -1.557180
2001-01-10 -2.252237
...
2001-12-22 -0.586346
2001-12-23 0.910428
2001-12-24 -0.573878
2001-12-25 -0.387236
2001-12-26 -0.857498
2001-12-27 -0.875737
2001-12-28 -0.075322
2001-12-29 0.325148
2001-12-30 1.699576
2001-12-31 -0.027604
Freq: D, Length: 365, dtype: float64
日期的范围,频率以及移动
1 | ts.resample('D') |
DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]
生成日期范围
pd.date_range()
1 | pd.date_range('1/1/2011',periods=10) |
DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
'2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08',
'2011-01-09', '2011-01-10'],
dtype='datetime64[ns]', freq='D')
频率和日期偏移量
1 | from pandas.tseries.offsets import Hour, Minute |
<Hour>
1 | pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h') |
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
'2000-01-01 08:00:00', '2000-01-01 12:00:00',
'2000-01-01 16:00:00', '2000-01-01 20:00:00',
'2000-01-02 00:00:00', '2000-01-02 04:00:00',
'2000-01-02 08:00:00', '2000-01-02 12:00:00',
'2000-01-02 16:00:00', '2000-01-02 20:00:00',
'2000-01-03 00:00:00', '2000-01-03 04:00:00',
'2000-01-03 08:00:00', '2000-01-03 12:00:00',
'2000-01-03 16:00:00', '2000-01-03 20:00:00'],
dtype='datetime64[ns]', freq='4H')
1 | Hour(2) + Minute(30) |
<150 * Minutes>
1 | pd.date_range('2000-01-01', periods=10, freq='1h30min') |
DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
'2000-01-01 03:00:00', '2000-01-01 04:30:00',
'2000-01-01 06:00:00', '2000-01-01 07:30:00',
'2000-01-01 09:00:00', '2000-01-01 10:30:00',
'2000-01-01 12:00:00', '2000-01-01 13:30:00'],
dtype='datetime64[ns]', freq='90T')
WOM日期
WOM(Week of Month),获得诸如‘每月第三个星期五’之类的日期
1 | rng = pd.date_range('1/1/2012','9/1/2012',freq='WOM-3FRI') |
DatetimeIndex(['2012-01-20', '2012-02-17', '2012-03-16', '2012-04-20',
'2012-05-18', '2012-06-15', '2012-07-20', '2012-08-17'],
dtype='datetime64[ns]', freq='WOM-3FRI')
移动数据
1 | ts = pd.Series(np.random.randn(4), |
2000-01-31 0.597205
2000-02-29 0.039901
2000-03-31 -0.757430
2000-04-30 1.698482
Freq: M, dtype: float64
1 | ts.shift(2) |
2000-01-31 NaN
2000-02-29 NaN
2000-03-31 0.597205
2000-04-30 0.039901
Freq: M, dtype: float64
1 | ts.shift(-2) |
2000-01-31 -0.757430
2000-02-29 1.698482
2000-03-31 NaN
2000-04-30 NaN
Freq: M, dtype: float64
时区处理
1 | import pytz |
1 | pytz.common_timezones[-5:] |
['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']
1 | tz=pytz.timezone('UTC') |
<UTC>
1 | rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D') |
2012-03-09 09:30:00 -0.434694
2012-03-10 09:30:00 0.516461
2012-03-11 09:30:00 -0.153220
2012-03-12 09:30:00 -0.452038
2012-03-13 09:30:00 0.777409
2012-03-14 09:30:00 -0.163869
Freq: D, dtype: float64
1 | pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC') |
DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
'2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
'2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
'2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
'2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
dtype='datetime64[ns, UTC]', freq='D')
时期及其算术运算
to be added