pandas基本数据结构
1 | import pandas as pd |
Series
1 | obj=Series([4,7,-1,3]) |
1 | obj |
0 4
1 7
2 -1
3 3
dtype: int64
1 | obj.values |
array([ 4, 7, -1, 3])
1 | obj.index |
RangeIndex(start=0, stop=4, step=1)
1 | obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c']) |
d 4
b 7
a -5
c 3
dtype: int64
1 | obj2.index |
Index(['d', 'b', 'a', 'c'], dtype='object')
1 | obj2['a'] |
-5
1 | 'b' in obj2 |
True
1 | obj2>0 |
d True
b True
a False
c True
dtype: bool
1 | obj2[obj2>0] |
d 4
b 7
c 3
dtype: int64
1 | sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000} |
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
1 | type(sdata) |
dict
1 | obj3=Series(sdata) |
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
1 | type(obj3) |
pandas.core.series.Series
1 | states = ['California', 'Ohio', 'Oregon', 'Texas'] #California没有对应的键值 |
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
1 | obj4.isnull() |
California True
Ohio False
Oregon False
Texas False
dtype: bool
1 | print(obj3,obj4) |
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64 California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
1 | obj3+obj4 |
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
1 | obj4.name = 'zhangyang' |
1 | obj4.index.name='pk' |
1 | obj4 |
pk
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: zhangyang, dtype: float64
DataFrame
1 | #传入等长列表或者Numpy数组组成的字典 |
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
1 | type(data) |
dict
1 | frame = DataFrame(data) |
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
1 | #如果指定了列序列,则DataFrame的列会按照指定的顺序排列 |
year | state | pop | |
---|---|---|---|
0 | 2000 | Ohio | 1.5 |
1 | 2001 | Ohio | 1.7 |
2 | 2002 | Ohio | 3.6 |
3 | 2001 | Nevada | 2.4 |
4 | 2002 | Nevada | 2.9 |
5 | 2003 | Nevada | 3.2 |
1 | #若传入的列在数据中找不到对应的值,则该列会被填充为NaN |
year | state | pop | debt | |
---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN |
1 | 2001 | Ohio | 1.7 | NaN |
2 | 2002 | Ohio | 3.6 | NaN |
3 | 2001 | Nevada | 2.4 | NaN |
4 | 2002 | Nevada | 2.9 | NaN |
5 | 2003 | Nevada | 3.2 | NaN |
1 | frame.year |
0 2000
1 2001
2 2002
3 2001
4 2002
5 2003
Name: year, dtype: int64
1 | frame['year'] |
0 2000
1 2001
2 2002
3 2001
4 2002
5 2003
Name: year, dtype: int64
1 | frame.loc[1] |
state Ohio
year 2001
pop 1.7
Name: 1, dtype: object
1 | frame2.debt=10 |
1 | frame2.debt |
0 10
1 10
2 10
3 10
4 10
5 10
Name: debt, dtype: int64
1 | #这里frame2.debt可以看做是一个Series |
[10 10 10 10 10 10] and RangeIndex(start=0, stop=6, step=1)
1 | frame3 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five', 'six']) |
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | NaN |
two | 2001 | Ohio | 1.7 | NaN |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | NaN |
six | 2003 | Nevada | 3.2 | NaN |
1 | val=Series([1.2,3.1,-1],index=['two','five','one']) |
one -1.0
two 1.2
three NaN
four NaN
five 3.1
six NaN
Name: debt, dtype: float64
1 | #del 关键词可用于删除列 |
year | state | pop | debt | tmp | |
---|---|---|---|---|---|
one | 2000 | Ohio | 1.5 | -1.0 | True |
two | 2001 | Ohio | 1.7 | 1.2 | True |
three | 2002 | Ohio | 3.6 | NaN | True |
four | 2001 | Nevada | 2.4 | NaN | False |
five | 2002 | Nevada | 2.9 | 3.1 | False |
six | 2003 | Nevada | 3.2 | NaN | False |
1 | del frame3['tmp'] |
1 | frame3 |
year | state | pop | debt | |
---|---|---|---|---|
one | 2000 | Ohio | 1.5 | -1.0 |
two | 2001 | Ohio | 1.7 | 1.2 |
three | 2002 | Ohio | 3.6 | NaN |
four | 2001 | Nevada | 2.4 | NaN |
five | 2002 | Nevada | 2.9 | 3.1 |
six | 2003 | Nevada | 3.2 | NaN |
1 | pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}} |
{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
1 | frame4=DataFrame(pop) |
Nevada | Ohio | |
---|---|---|
2000 | NaN | 1.5 |
2001 | 2.4 | 1.7 |
2002 | 2.9 | 3.6 |
1 | frame4.T |
2000 | 2001 | 2002 | |
---|---|---|---|
Nevada | NaN | 2.4 | 2.9 |
Ohio | 1.5 | 1.7 | 3.6 |
1 | frame4.values |
array([[nan, 1.5],
[2.4, 1.7],
[2.9, 3.6]])
索引对象
1 | ###index对象是不可修改的 |
a 0
b 1
c 2
dtype: int64
1 | index = obj.index |
Index(['a', 'b', 'c'], dtype='object')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-f2a2752a2674> in <module>()
1 index = obj.index
2 print(index)
----> 3 index[1]='d'
/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
2048
2049 def __setitem__(self, key, value):
-> 2050 raise TypeError("Index does not support mutable operations")
2051
2052 def __getitem__(self, key):
TypeError: Index does not support mutable operations
1 | ## 不可修改性保证了index对象在多个数据结构之间实现共享的安全 |
基本功能
重新索引
1 | obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c']) |
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
1 | obj2 = obj.reindex(['a','b','c','d','e']) |
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
1 | obj.reindex(['a','b','c','d','e'],fill_value=0) |
a -5.3
b 7.2
c 3.6
d 4.5
e 0.0
dtype: float64
1 | frame=DataFrame(np.arange(9).reshape((3,3)), index=['a','c','e'], columns=['Ohio','Texas','California']) |
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
e | 6 | 7 | 8 |
1 | frame.reindex(['a','b','c','e']) |
Ohio | Texas | California | |
---|---|---|---|
a | 0.0 | 1.0 | 2.0 |
b | NaN | NaN | NaN |
c | 3.0 | 4.0 | 5.0 |
e | 6.0 | 7.0 | 8.0 |
1 | frame.reindex(columns=[ 'Texas','California','Ohio']) #这里创建了一个新对象 |
Texas | California | Ohio | |
---|---|---|---|
a | 1 | 2 | 0 |
c | 4 | 5 | 3 |
e | 7 | 8 | 6 |
1 | states=['Texas','California','Ohio'] |
丢弃指定轴上的项
1 | obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e']) |
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
1 | new_obj = obj.drop('c') |
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
1 | data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four']) |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data.drop(['one','two'],axis=1) |
three | four | |
---|---|---|
Ohio | 2 | 3 |
Colorado | 6 | 7 |
Utah | 10 | 11 |
New York | 14 | 15 |
索引、选取和过滤
1 | obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd']) |
1 | print(obj['b']) |
1.0
1.0
c 2.0
d 3.0
dtype: float64
1 | data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four']) |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data['two'] |
Ohio 1
Colorado 5
Utah 9
New York 13
Name: two, dtype: int64
1 | data[['two','three']] |
two | three | |
---|---|---|
Ohio | 1 | 2 |
Colorado | 5 | 6 |
Utah | 9 | 10 |
New York | 13 | 14 |
1 | data[:2] |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
1 | data[data['three']>5] |
one | two | three | four | |
---|---|---|---|---|
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data[data<5]=0 |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
Selection with loc and iloc
1 | data.loc['Colorado',['one','two']] |
one 0
two 5
Name: Colorado, dtype: int64
1 | data |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data.iloc[2] #可以直接使用数字索引 |
one 8
two 9
three 10
four 11
Name: Utah, dtype: int64
Integer Indexes
1 | ser = pd.Series(np.arange(3.)) |
0 0.0
1 1.0
2 2.0
dtype: float64
1 | ser2 = pd.Series(np.arange(3.),index=['a','b','c']) |
a 0.0
b 1.0
c 2.0
dtype: float64
1 | ser2[-1] |
2.0
1 | ser.loc[:1] |
0 0.0
1 1.0
dtype: float64
1 | ser.iloc[:1] |
0 0.0
dtype: float64
Arithmetic and Data Alignment
1 | s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e']) |
a 7.3
c -2.5
d 3.4
e 1.5
dtype: float64 a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
1 | s1+s2 |
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
1 | df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado']) |
1 | df1 |
b | c | d | |
---|---|---|---|
Ohio | 0.0 | 1.0 | 2.0 |
Texas | 3.0 | 4.0 | 5.0 |
Colorado | 6.0 | 7.0 | 8.0 |
1 | df2 |
b | d | e | |
---|---|---|---|
Utah | 0.0 | 1.0 | 2.0 |
Ohio | 3.0 | 4.0 | 5.0 |
Texas | 6.0 | 7.0 | 8.0 |
Oregon | 9.0 | 10.0 | 11.0 |
1 | df1+df2 ## 行列索引同时匹配才进行计算,否则为NaN |
b | c | d | e | |
---|---|---|---|---|
Colorado | NaN | NaN | NaN | NaN |
Ohio | 3.0 | NaN | 6.0 | NaN |
Oregon | NaN | NaN | NaN | NaN |
Texas | 9.0 | NaN | 12.0 | NaN |
Utah | NaN | NaN | NaN | NaN |
1 | df1 = pd.DataFrame({'A': [1, 2]}) |
1 | df1 |
A | |
---|---|
0 | 1 |
1 | 2 |
1 | df2 |
B | |
---|---|
0 | 3 |
1 | 4 |
1 | df2-df1 |
A | B | |
---|---|---|
0 | NaN | NaN |
1 | NaN | NaN |
Arithmetic methods with fill values
1 | df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd')) |
1 | df1 |
a | b | c | d | |
---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | 5.0 | 6.0 | 7.0 |
2 | 8.0 | 9.0 | 10.0 | 11.0 |
1 | df1.loc[1,'b']=np.nan |
1 | df1 |
a | b | c | d | |
---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 |
1 | 4.0 | NaN | 6.0 | 7.0 |
2 | 8.0 | 9.0 | 10.0 | 11.0 |
1 | df2 |
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 | 4.0 |
1 | 5.0 | 6.0 | 7.0 | 8.0 | 9.0 |
2 | 10.0 | 11.0 | 12.0 | 13.0 | 14.0 |
3 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 |
1 | df1+df2 |
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | NaN |
1 | 9.0 | NaN | 13.0 | 15.0 | NaN |
2 | 18.0 | 20.0 | 22.0 | 24.0 | NaN |
3 | NaN | NaN | NaN | NaN | NaN |
1 | df1.add(df2, fill_value=0) |
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 2.0 | 4.0 | 6.0 | 4.0 |
1 | 9.0 | 6.0 | 13.0 | 15.0 | 9.0 |
2 | 18.0 | 20.0 | 22.0 | 24.0 | 14.0 |
3 | 15.0 | 16.0 | 17.0 | 18.0 | 19.0 |
1 | 1 / df1 |
a | b | c | d | |
---|---|---|---|---|
0 | inf | 1.000000 | 0.500000 | 0.333333 |
1 | 0.250000 | NaN | 0.166667 | 0.142857 |
2 | 0.125000 | 0.111111 | 0.100000 | 0.090909 |
1 | df1.rdiv(1) |
a | b | c | d | |
---|---|---|---|---|
0 | inf | 1.000000 | 0.500000 | 0.333333 |
1 | 0.250000 | NaN | 0.166667 | 0.142857 |
2 | 0.125000 | 0.111111 | 0.100000 | 0.090909 |
1 | df1.reindex(columns=df2.columns,fill_value=0) |
a | b | c | d | e | |
---|---|---|---|---|---|
0 | 0.0 | 1.0 | 2.0 | 3.0 | 0 |
1 | 4.0 | NaN | 6.0 | 7.0 | 0 |
2 | 8.0 | 9.0 | 10.0 | 11.0 | 0 |
Operations between DataFrame and Series
1 | arr = np.arange(12.).reshape((3, 4)) |
array([[ 0., 1., 2., 3.],
[ 4., 5., 6., 7.],
[ 8., 9., 10., 11.]])
1 | arr[0] |
array([0., 1., 2., 3.])
1 | arr-arr[0] #广播(broadcasting):每一行都减去第一行 |
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
1 | frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) |
b | d | e | |
---|---|---|---|
Utah | 0.0 | 1.0 | 2.0 |
Ohio | 3.0 | 4.0 | 5.0 |
Texas | 6.0 | 7.0 | 8.0 |
Oregon | 9.0 | 10.0 | 11.0 |
1 | series = frame.loc['Utah'] |
b 0.0
d 1.0
e 2.0
Name: Utah, dtype: float64
1 | frame - series #对于DataFrame数据结构同理 |
b | d | e | |
---|---|---|---|
Utah | 0.0 | 0.0 | 0.0 |
Ohio | 3.0 | 3.0 | 3.0 |
Texas | 6.0 | 6.0 | 6.0 |
Oregon | 9.0 | 9.0 | 9.0 |
1 | series2 = pd.Series(range(3), index=['b', 'e', 'f']) |
b 0
e 1
f 2
dtype: int64
1 | frame+series2 |
b | d | e | f | |
---|---|---|---|---|
Utah | 0.0 | NaN | 3.0 | NaN |
Ohio | 3.0 | NaN | 6.0 | NaN |
Texas | 6.0 | NaN | 9.0 | NaN |
Oregon | 9.0 | NaN | 12.0 | NaN |
1 | series3 = frame['d'] |
1 | frame.sub(series3,axis='index') |
b | d | e | |
---|---|---|---|
Utah | -1.0 | 0.0 | 1.0 |
Ohio | -1.0 | 0.0 | 1.0 |
Texas | -1.0 | 0.0 | 1.0 |
Oregon | -1.0 | 0.0 | 1.0 |
Function Application and Mapping
1 | frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) |
b | d | e | |
---|---|---|---|
Utah | 2.006862 | -1.308834 | 1.440639 |
Ohio | 0.001529 | 0.026818 | 0.706586 |
Texas | -0.461218 | 0.365081 | -0.898180 |
Oregon | -0.280978 | 0.402707 | -1.396092 |
1 | np.abs(frame) |
b | d | e | |
---|---|---|---|
Utah | 2.006862 | 1.308834 | 1.440639 |
Ohio | 0.001529 | 0.026818 | 0.706586 |
Texas | 0.461218 | 0.365081 | 0.898180 |
Oregon | 0.280978 | 0.402707 | 1.396092 |
1 | f = lambda x:x.max()-x.min() |
Utah 3.315696
Ohio 0.705058
Texas 1.263261
Oregon 1.798799
dtype: float64
1 | def f(x): |
b | d | e | |
---|---|---|---|
min | -0.461218 | -1.308834 | -1.396092 |
max | 2.006862 | 0.402707 | 1.440639 |
1 | format = lambda x: '%.2f' % x |
b | d | e | |
---|---|---|---|
Utah | 2.01 | -1.31 | 1.44 |
Ohio | 0.00 | 0.03 | 0.71 |
Texas | -0.46 | 0.37 | -0.90 |
Oregon | -0.28 | 0.40 | -1.40 |
1 | frame['e'].map(format) |
Utah 1.44
Ohio 0.71
Texas -0.90
Oregon -1.40
Name: e, dtype: object
Sorting and Ranking
1 | obj = pd.Series(np.random.randn(4), index=['d', 'a', 'b', 'c']) |
d -1.066429
a 0.005021
b -0.257605
c -1.705094
dtype: float64
1 | obj.sort_values() |
c -1.705094
d -1.066429
b -0.257605
a 0.005021
dtype: float64
1 | obj.sort_index() |
a 0.005021
b -0.257605
c -1.705094
d -1.066429
dtype: float64
1 | frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c']) |
d | a | b | c | |
---|---|---|---|---|
three | 0 | 1 | 2 | 3 |
one | 4 | 5 | 6 | 7 |
1 | frame.sort_index(axis=0) |
d | a | b | c | |
---|---|---|---|---|
one | 4 | 5 | 6 | 7 |
three | 0 | 1 | 2 | 3 |
1 | frame.sort_index(axis=1) |
a | b | c | d | |
---|---|---|---|---|
three | 1 | 2 | 3 | 0 |
one | 5 | 6 | 7 | 4 |
1 | obj = pd.Series([4, np.nan, 7, np.nan, -3, 2]) |
2 7.0
0 4.0
5 2.0
4 -3.0
1 NaN
3 NaN
dtype: float64
1 | frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}) |
b | a | |
---|---|---|
0 | 4 | 0 |
1 | 7 | 1 |
2 | -3 | 0 |
3 | 2 | 1 |
1 | frame.sort_values(by='b') # 以某一列值为准 |
b | a | |
---|---|---|
2 | -3 | 0 |
3 | 2 | 1 |
0 | 4 | 0 |
1 | 7 | 1 |
1 | frame.sort_values(by=['a', 'b']) |
b | a | |
---|---|---|
2 | -3 | 0 |
0 | 4 | 0 |
3 | 2 | 1 |
1 | 7 | 1 |
1 | obj = pd.Series([7, -5, 7, 4, 2, 0, 4]) |
0 6.5
1 1.0
2 6.5
3 4.5
4 3.0
5 2.0
6 4.5
dtype: float64
1 | obj.rank(method='first') |
0 6.0
1 1.0
2 7.0
3 4.0
4 3.0
5 2.0
6 5.0
dtype: float64
1 | # Assign tie values the maximum rank in the group |
0 2.0
1 7.0
2 2.0
3 4.0
4 5.0
5 6.0
6 4.0
dtype: float64
1 | frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]}) |
b | a | c | |
---|---|---|---|
0 | 4.3 | 0 | -2.0 |
1 | 7.0 | 1 | 5.0 |
2 | -3.0 | 0 | 8.0 |
3 | 2.0 | 1 | -2.5 |
1 | frame.rank(axis=1) |
b | a | c | |
---|---|---|---|
0 | 3.0 | 2.0 | 1.0 |
1 | 3.0 | 1.0 | 2.0 |
2 | 1.0 | 2.0 | 3.0 |
3 | 3.0 | 2.0 | 1.0 |
Axis Indexes with Duplicate Labels
1 | obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c']) |
a 0
a 1
b 2
b 3
c 4
dtype: int64
1 | obj.index.is_unique |
False
1 | df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b']) |
0 | 1 | 2 | |
---|---|---|---|
a | -1.513555 | 0.286993 | 0.982033 |
a | 1.211395 | -1.512109 | 1.007934 |
b | -0.609349 | 0.729770 | 1.106319 |
b | -0.427720 | 0.354752 | 0.286622 |
1 | df.loc['b'] #选出所有行标签为'b'的行 |
0 | 1 | 2 | |
---|---|---|---|
b | -0.609349 | 0.729770 | 1.106319 |
b | -0.427720 | 0.354752 | 0.286622 |
Summarizing and Computing Descriptive Statistics
1 | df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two']) |
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | -4.5 |
c | NaN | NaN |
d | 0.75 | -1.3 |
1 | df.sum() |
one 9.25
two -5.80
dtype: float64
1 | df.sum(axis='columns') |
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
1 | df.mean(axis='columns', skipna=False) |
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
1 | df.idxmax() |
one b
two d
dtype: object
1 | df.cumsum() ##累计求和,默认axis=0,即对每一列自上而下累加 |
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 8.50 | -4.5 |
c | NaN | NaN |
d | 9.25 | -5.8 |
1 | df.cumsum(axis=1) |
one | two | |
---|---|---|
a | 1.40 | NaN |
b | 7.10 | 2.60 |
c | NaN | NaN |
d | 0.75 | -0.55 |
1 | df.describe() ###一次性给出多个汇总统计量,非常实用 |
one | two | |
---|---|---|
count | 3.000000 | 2.000000 |
mean | 3.083333 | -2.900000 |
std | 3.493685 | 2.262742 |
min | 0.750000 | -4.500000 |
25% | 1.075000 | -3.700000 |
50% | 1.400000 | -2.900000 |
75% | 4.250000 | -2.100000 |
max | 7.100000 | -1.300000 |
1 | obj = pd.Series(['a', 'a', 'b', 'c'] * 4) |
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
1 | obj.describe() |
count 16
unique 3
top a
freq 8
dtype: object
Correlation and Covariance
1 | ## 暂时略过这一部分 |
Unique Values, Value Counts, and Membership
1 | obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c']) |
0 c
1 a
2 d
3 a
4 a
5 b
6 b
7 c
8 c
dtype: object
1 | uniques = obj.unique() |
array(['c', 'a', 'd', 'b'], dtype=object)
1 | obj.value_counts() |
a 3
c 3
b 2
d 1
dtype: int64
1 | pd.value_counts(obj.values, sort=False) |
c 3
b 2
d 1
a 3
dtype: int64
1 | obj.isin(['b', 'c']) |
0 True
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
dtype: bool
1 | obj[obj.isin(['b', 'c'])] |
0 c
5 b
6 b
7 c
8 c
dtype: object
1 | to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a']) |
array([0, 2, 1, 1, 0, 2])
1 | data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4], 'Qu2': [2, 3, 1, 2, 3], 'Qu3': [1, 5, 2, 4, 4]}) |
Qu1 | Qu2 | Qu3 | |
---|---|---|---|
0 | 1 | 2 | 1 |
1 | 3 | 3 | 5 |
2 | 4 | 1 | 2 |
3 | 3 | 2 | 4 |
4 | 4 | 3 | 4 |
1 | res=data.apply(pd.value_counts).fillna(0) ##统计每一列中各个取值出现的次数,缺失的计数用0填充 |
Qu1 | Qu2 | Qu3 | |
---|---|---|---|
1 | 1.0 | 1.0 | 1.0 |
2 | 0.0 | 2.0 | 1.0 |
3 | 2.0 | 2.0 | 0.0 |
4 | 2.0 | 0.0 | 2.0 |
5 | 0.0 | 0.0 | 1.0 |
final
该去实战了