ch05-pandas基础

pandas基本数据结构

1
2
3
4
import pandas as pd
from pandas import Series,DataFrame

import numpy as np

Series

1
obj=Series([4,7,-1,3])
1
obj
0    4
1    7
2   -1
3    3
dtype: int64
1
obj.values
array([ 4,  7, -1,  3])
1
obj.index
RangeIndex(start=0, stop=4, step=1)
1
2
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2
d    4
b    7
a   -5
c    3
dtype: int64
1
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
1
obj2['a']
-5
1
'b' in obj2
True
1
obj2>0
d     True
b     True
a    False
c     True
dtype: bool
1
obj2[obj2>0]
d    4
b    7
c    3
dtype: int64
1
2
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
sdata
{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
1
type(sdata)
dict
1
2
obj3=Series(sdata)
obj3
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
1
type(obj3)
pandas.core.series.Series
1
2
3
4
5
states = ['California', 'Ohio', 'Oregon', 'Texas'] #California没有对应的键值
obj4=Series(sdata,index=states)
obj4

#NaN:not a number
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
1
obj4.isnull()
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
1
print(obj3,obj4)
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64 California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
1
obj3+obj4
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
1
obj4.name = 'zhangyang'
1
obj4.index.name='pk'
1
obj4
pk
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: zhangyang, dtype: float64

DataFrame

1
2
3
4
5
#传入等长列表或者Numpy数组组成的字典
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
data
{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
1
type(data)
dict
1
2
frame = DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
1
2
3
#如果指定列,则DataFrame会按照指定列排序
frame1=DataFrame(data,columns=['year','state','pop'])
frame1
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
1
2
3
#若传入的列在数据中找不到,则该列的值全部为NaN
frame2=DataFrame(data,columns=['year','state','pop','debt'])
frame2
year state pop debt
0 2000 Ohio 1.5 NaN
1 2001 Ohio 1.7 NaN
2 2002 Ohio 3.6 NaN
3 2001 Nevada 2.4 NaN
4 2002 Nevada 2.9 NaN
5 2003 Nevada 3.2 NaN
1
frame.year
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
1
frame['year']
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64
1
frame.loc[1]
state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object
1
frame2.debt=10
1
frame2.debt
0    10
1    10
2    10
3    10
4    10
5    10
Name: debt, dtype: int64
1
2
#这里frame2.debt可以看做是一个Series
print(frame2.debt.values,'and',frame2.debt.index)
[10 10 10 10 10 10] and RangeIndex(start=0, stop=6, step=1)
1
2
3
4
frame3 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four',
'five', 'six'])
frame3
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
1
2
3
val=Series([1.2,3.1,-1],index=['two','five','one'])
frame3.debt=val
print(frame3.debt)
one     -1.0
two      1.2
three    NaN
four     NaN
five     3.1
six      NaN
Name: debt, dtype: float64
1
2
3
#del 关键词可用于删除列
frame3['tmp'] = frame3.state == 'Ohio' ##这里存在运算符的计算优先级,先判断是否相等,返回布尔型值
frame3
year state pop debt tmp
one 2000 Ohio 1.5 -1.0 True
two 2001 Ohio 1.7 1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 NaN False
five 2002 Nevada 2.9 3.1 False
six 2003 Nevada 3.2 NaN False
1
del frame3['tmp']
1
frame3
year state pop debt
one 2000 Ohio 1.5 -1.0
two 2001 Ohio 1.7 1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 3.1
six 2003 Nevada 3.2 NaN
1
2
3
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
pop
{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
1
2
frame4=DataFrame(pop)
frame4
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
1
frame4.T
2000 2001 2002
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
1
frame4.values
array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

索引对象

1
2
3
4
###index对象是不可修改的

obj = Series(range(3),index=['a','b','c'])
obj
a    0
b    1
c    2
dtype: int64
1
2
3
index = obj.index
print(index)
index[1]='d'
Index(['a', 'b', 'c'], dtype='object')



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-84-f2a2752a2674> in <module>()
      1 index = obj.index
      2 print(index)
----> 3 index[1]='d'


/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   2048 
   2049     def __setitem__(self, key, value):
-> 2050         raise TypeError("Index does not support mutable operations")
   2051 
   2052     def __getitem__(self, key):


TypeError: Index does not support mutable operations
1
2
## 不可修改性保证了index对象在多个数据结构之间实现共享的安全
## index除了长得像数组,也类似一个固定大小的集合

基本功能

重新索引

1
2
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
1
2
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
1
obj.reindex(['a','b','c','d','e'],fill_value=0)
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
1
2
3
4
frame=DataFrame(np.arange(9).reshape((3,3))
,index=['a','c','e']
,columns=['Ohio', 'Texas', 'California'])
frame
Ohio Texas California
a 0 1 2
c 3 4 5
e 6 7 8
1
frame.reindex(['a','b','c','e'])
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
e 6.0 7.0 8.0
1
frame.reindex(columns=[ 'Texas','California','Ohio'])  #这里创建了一个新对象
Texas California Ohio
a 1 2 0
c 4 5 3
e 7 8 6
1
states=['Texas','California','Ohio']

丢弃指定轴上的项

1
2
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
1
2
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
1
2
3
4
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
1
data.drop(['one','two'],axis=1)
three four
Ohio 2 3
Colorado 6 7
Utah 10 11
New York 14 15

索引、选取和过滤

1
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
1
2
3
print(obj['b'])
print(obj[1])
print(obj[2:4])
1.0
1.0
c    2.0
d    3.0
dtype: float64
1
2
3
4
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
1
data['two']
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64
1
data[['two','three']]
two three
Ohio 1 2
Colorado 5 6
Utah 9 10
New York 13 14
1
data[:2]
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
1
data[data['three']>5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
1
2
data[data<5]=0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15

Selection with loc and iloc

1
data.loc['Colorado',['one','two']]
one    0
two    5
Name: Colorado, dtype: int64
1
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
1
data.iloc[2]    #可以直接使用数字索引
one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

Integer Indexes

1
2
ser = pd.Series(np.arange(3.))
ser
0    0.0
1    1.0
2    2.0
dtype: float64
1
2
ser2 = pd.Series(np.arange(3.),index=['a','b','c'])
ser2
a    0.0
b    1.0
c    2.0
dtype: float64
1
ser2[-1]
2.0
1
ser.loc[:1]
0    0.0
1    1.0
dtype: float64
1
ser.iloc[:1]
0    0.0
dtype: float64

Arithmetic and Data Alignment

1
2
3
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])
print(s1,s2)
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64 a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
1
s1+s2
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
1
2
3
4
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
1
df1
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0
1
df2
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
1
df1+df2   ## 行列索引同时匹配才进行计算,否则为NaN
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
1
2
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
1
df1
A
0 1
1 2
1
df2
B
0 3
1 4
1
df2-df1
A B
0 NaN NaN
1 NaN NaN

Arithmetic methods with fill values

1
2
3
4
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
columns=list('abcde'))
1
df1
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0
1
df1.loc[1,'b']=np.nan
1
df1
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 NaN 6.0 7.0
2 8.0 9.0 10.0 11.0
1
df2
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 6.0 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1
df1+df2
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
1
df1.add(df2, fill_value=0)
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 6.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1
1 / df1
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250000 NaN 0.166667 0.142857
2 0.125000 0.111111 0.100000 0.090909
1
df1.rdiv(1)
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250000 NaN 0.166667 0.142857
2 0.125000 0.111111 0.100000 0.090909
1
df1.reindex(columns=df2.columns,fill_value=0)
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 NaN 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0

Operations between DataFrame and Series

1
2
arr = np.arange(12.).reshape((3, 4))
arr
array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])
1
arr[0]
array([0., 1., 2., 3.])
1
arr-arr[0] #广播(broadcasting):每一行都减去第一行
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])
1
2
3
4
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
1
2
series = frame.loc['Utah']
series
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
1
frame - series   #对于DataFrame数据结构同理
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
1
2
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2
b    0
e    1
f    2
dtype: int64
1
frame+series2
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN
1
series3 = frame['d']
1
frame.sub(series3,axis='index')
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0

Function Application and Mapping

1
2
3
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
b d e
Utah 2.006862 -1.308834 1.440639
Ohio 0.001529 0.026818 0.706586
Texas -0.461218 0.365081 -0.898180
Oregon -0.280978 0.402707 -1.396092
1
np.abs(frame)
b d e
Utah 2.006862 1.308834 1.440639
Ohio 0.001529 0.026818 0.706586
Texas 0.461218 0.365081 0.898180
Oregon 0.280978 0.402707 1.396092
1
2
f = lambda x:x.max()-x.min()
frame.apply(f,axis='columns') # 传入DataFrame中的一行或者一列数据(Series),在自定义函数中进行计算
Utah      3.315696
Ohio      0.705058
Texas     1.263261
Oregon    1.798799
dtype: float64
1
2
3
def f(x):
    """Return the minimum and maximum of *x* as a two-element Series.

    The result is indexed by the labels 'min' and 'max', so when this
    function is passed to ``DataFrame.apply`` each result becomes one
    column of a frame with rows 'min' and 'max'.
    """
    return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f,axis='index')
b d e
min -0.461218 -1.308834 -1.396092
max 2.006862 0.402707 1.440639
1
2
format = lambda x: '%.2f' % x
frame.applymap(format)
b d e
Utah 2.01 -1.31 1.44
Ohio 0.00 0.03 0.71
Texas -0.46 0.37 -0.90
Oregon -0.28 0.40 -1.40
1
frame['e'].map(format)
Utah       1.44
Ohio       0.71
Texas     -0.90
Oregon    -1.40
Name: e, dtype: object

Sorting and Ranking

1
2
obj = pd.Series(np.random.randn(4), index=['d', 'a', 'b', 'c'])
obj
d   -1.066429
a    0.005021
b   -0.257605
c   -1.705094
dtype: float64
1
obj.sort_values()
c   -1.705094
d   -1.066429
b   -0.257605
a    0.005021
dtype: float64
1
obj.sort_index()
a    0.005021
b   -0.257605
c   -1.705094
d   -1.066429
dtype: float64
1
2
3
4
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
index=['three', 'one'],
columns=['d', 'a', 'b', 'c'])
frame
d a b c
three 0 1 2 3
one 4 5 6 7
1
frame.sort_index(axis=0)
d a b c
one 4 5 6 7
three 0 1 2 3
1
frame.sort_index(axis=1)
a b c d
three 1 2 3 0
one 5 6 7 4
1
2
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(ascending=False) ## ascending参数决定升序还是降序
2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64
1
2
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
b a
0 4 0
1 7 1
2 -3 0
3 2 1
1
frame.sort_values(by='b')   # 以某一列值为准
b a
2 -3 0
3 2 1
0 4 0
1 7 1
1
frame.sort_values(by=['a', 'b'])
b a
2 -3 0
0 4 0
3 2 1
1 7 1
1
2
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()#默认method='average':并列的值取其排名的平均值
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
1
obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
1
2
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
1
2
3
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
'c': [-2, 5, 8, -2.5]})
frame
b a c
0 4.3 0 -2.0
1 7.0 1 5.0
2 -3.0 0 8.0
3 2.0 1 -2.5
1
frame.rank(axis=1)
b a c
0 3.0 2.0 1.0
1 3.0 1.0 2.0
2 1.0 2.0 3.0
3 3.0 2.0 1.0

Axis Indexes with Duplicate Labels

1
2
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj
a    0
a    1
b    2
b    3
c    4
dtype: int64
1
obj.index.is_unique
False
1
2
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
0 1 2
a -1.513555 0.286993 0.982033
a 1.211395 -1.512109 1.007934
b -0.609349 0.729770 1.106319
b -0.427720 0.354752 0.286622
1
df.loc['b']   #选出所有标签为'b'的行
0 1 2
b -0.609349 0.729770 1.106319
b -0.427720 0.354752 0.286622

Summarizing and Computing Descriptive Statistics

1
2
3
4
5
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
1
df.sum()
one    9.25
two   -5.80
dtype: float64
1
df.sum(axis='columns')
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
1
df.mean(axis='columns', skipna=False)
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
1
df.idxmax()
one    b
two    d
dtype: object
1
df.cumsum() ##累计求和,默认axis=0,即对每一列自上而下累计
one two
a 1.40 NaN
b 8.50 -4.5
c NaN NaN
d 9.25 -5.8
1
df.cumsum(axis=1)
one two
a 1.40 NaN
b 7.10 2.60
c NaN NaN
d 0.75 -0.55
1
df.describe()   ###一次性给出多个汇总统计量(count、mean、std、分位数等)
one two
count 3.000000 2.000000
mean 3.083333 -2.900000
std 3.493685 2.262742
min 0.750000 -4.500000
25% 1.075000 -3.700000
50% 1.400000 -2.900000
75% 4.250000 -2.100000
max 7.100000 -1.300000
1
2
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object
1
obj.describe()
count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

1
## 暂时略过这一部分

Unique Values, Value Counts, and Membership

1
2
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
1
2
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
1
obj.value_counts()
a    3
c    3
b    2
d    1
dtype: int64
1
pd.value_counts(obj.values, sort=False)
c    3
b    2
d    1
a    3
dtype: int64
1
obj.isin(['b', 'c'])
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
1
obj[obj.isin(['b', 'c'])]
0    c
5    b
6    b
7    c
8    c
dtype: object
1
2
3
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)
array([0, 2, 1, 1, 0, 2])
1
2
3
4
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
Qu1 Qu2 Qu3
0 1 2 1
1 3 3 5
2 4 1 2
3 3 2 4
4 4 3 4
1
2
res=data.apply(pd.value_counts).fillna(0)   ##统计每一列中各个取值出现的次数
res
Qu1 Qu2 Qu3
1 1.0 1.0 1.0
2 0.0 2.0 1.0
3 2.0 2.0 0.0
4 2.0 0.0 2.0
5 0.0 0.0 1.0

final

该去实战了