pandas基本数据结构

import pandas as pd
from pandas import Series,DataFrame

import numpy as np

Series

1	obj=Series([4,7,-1,3])

obj

0    4
1    7
2   -1
3    3
dtype: int64

1	obj.values

array([ 4,  7, -1,  3])

obj.index

RangeIndex(start=0, stop=4, step=1)

obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

1	obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

obj2['a']

-5

1	'b' in obj2

True

obj2>0

d     True
b     True
a    False
c     True
dtype: bool

1	obj2[obj2>0]

d    4
b    7
c    3
dtype: int64

sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

1	type(sdata)

dict

obj3=Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

1	type(obj3)

pandas.core.series.Series

states = ['California', 'Ohio', 'Oregon', 'Texas'] #California没有对应的键值
obj4=Series(sdata,index=states)
obj4

#NaN:not a number

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

1	obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

1	print(obj3,obj4)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64 California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

1	obj4.name = 'zhangyang'

1	obj4.index.name='pk'

obj4

pk
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: zhangyang, dtype: float64

DataFrame

#传入等长列表或者Numpy数组组成的字典
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

1	type(data)

dict

frame = DataFrame(data)
frame

	state	year	pop
0	Ohio	2000	1.5
1	Ohio	2001	1.7
2	Ohio	2002	3.6
3	Nevada	2001	2.4
4	Nevada	2002	2.9
5	Nevada	2003	3.2

1
2
3

#如果指定了columns，则DataFrame的列会按照指定的顺序排列
frame1=DataFrame(data,columns=['year','state','pop'])
frame1

	year	state	pop
0	2000	Ohio	1.5
1	2001	Ohio	1.7
2	2002	Ohio	3.6
3	2001	Nevada	2.4
4	2002	Nevada	2.9
5	2003	Nevada	3.2

1
2
3

#若传入的列在数据中不存在（如这里的debt），则该列的值会被填充为NaN
frame2=DataFrame(data,columns=['year','state','pop','debt'])
frame2

	year	state	pop	debt
0	2000	Ohio	1.5	NaN
1	2001	Ohio	1.7	NaN
2	2002	Ohio	3.6	NaN
3	2001	Nevada	2.4	NaN
4	2002	Nevada	2.9	NaN
5	2003	Nevada	3.2	NaN

1	frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

1	frame['year']

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

1	frame.loc[1]

state    Ohio
year     2001
pop       1.7
Name: 1, dtype: object

1	frame2.debt=10

1	frame2.debt

0    10
1    10
2    10
3    10
4    10
5    10
Name: debt, dtype: int64

#这里frame2.debt可以看做是一个Series
print(frame2.debt.values,'and',frame2.debt.index)

[10 10 10 10 10 10] and RangeIndex(start=0, stop=6, step=1)

frame3 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame3

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	NaN
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN
six	2003	Nevada	3.2	NaN

1
2
3

val=Series([1.2,3.1,-1],index=['two','five','one'])
frame3.debt=val
print(frame3.debt)

one     -1.0
two      1.2
three    NaN
four     NaN
five     3.1
six      NaN
Name: debt, dtype: float64

1
2
3

#del 关键词可用于删除列
frame3['tmp'] = frame3.state == 'Ohio'  ##这里存在运算符的计算优先级，先判断是否相等，返回布尔型值
frame3

	year	state	pop	debt	tmp
one	2000	Ohio	1.5	-1.0	True
two	2001	Ohio	1.7	1.2	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	NaN	False
five	2002	Nevada	2.9	3.1	False
six	2003	Nevada	3.2	NaN	False

1	del frame3['tmp']

frame3

	year	state	pop	debt
one	2000	Ohio	1.5	-1.0
two	2001	Ohio	1.7	1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	3.1
six	2003	Nevada	3.2	NaN

1
2
3

pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
pop

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

frame4=DataFrame(pop)
frame4

	Nevada	Ohio
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

frame4.T

	2000	2001	2002
Nevada	NaN	2.4	2.9
Ohio	1.5	1.7	3.6

1	frame4.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

索引对象

###index对象是不可修改的

obj = Series(range(3),index=['a','b','c'])
obj

a    0
b    1
c    2
dtype: int64

1
2
3

index = obj.index
print(index)
index[1]='d'

Index(['a', 'b', 'c'], dtype='object')



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-84-f2a2752a2674> in <module>()
      1 index = obj.index
      2 print(index)
----> 3 index[1]='d'


/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   2048 
   2049     def __setitem__(self, key, value):
-> 2050         raise TypeError("Index does not support mutable operations")
   2051 
   2052     def __getitem__(self, key):


TypeError: Index does not support mutable operations

## 不可修改性保证了index对象在多个数据结构之间实现共享的安全
## index除了长得像数组，也类似一个固定大小的集合

基本功能

重新索引

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

1	obj.reindex(['a','b','c','d','e'],fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

frame=DataFrame(np.arange(9).reshape((3,3))
                ,index=['a','c','e']
                ,columns=['Ohio', 'Texas', 'California'])
frame

	Ohio	Texas	California
a	0	1	2
c	3	4	5
e	6	7	8

1	frame.reindex(['a','b','c','e'])

	Ohio	Texas	California
a	0.0	1.0	2.0
b	NaN	NaN	NaN
c	3.0	4.0	5.0
e	6.0	7.0	8.0

1	frame.reindex(columns=[ 'Texas','California','Ohio']) #这里创建了一个新对象

	Texas	California	Ohio
a	1	2	0
c	4	5	3
e	7	8	6

1	states=['Texas','California','Ohio']

丢弃指定轴上的项

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

1	data.drop(['one','two'],axis=1)

	three	four
Ohio	2	3
Colorado	6	7
Utah	10	11
New York	14	15

索引、选取和过滤

1	obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

1
2
3

print(obj['b'])
print(obj[1])
print(obj[2:4])

1.0
1.0
c    2.0
d    3.0
dtype: float64

data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

1	data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

1	data[['two','three']]

	two	three
Ohio	1	2
Colorado	5	6
Utah	9	10
New York	13	14

data[:2]

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7

1	data[data['three']>5]

	one	two	three	four
Colorado	4	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

data[data<5]=0
data

	one	two	three	four
Ohio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

Selection with loc and iloc

1	data.loc['Colorado',['one','two']]

one    0
two    5
Name: Colorado, dtype: int64

data

	one	two	three	four
Ohio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
New York	12	13	14	15

1	data.iloc[2] #可以直接使用数字索引

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

Integer Indexes

ser = pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

ser2 = pd.Series(np.arange(3.),index=['a','b','c'])
ser2

a    0.0
b    1.0
c    2.0
dtype: float64

ser2[-1]

2.0

1	ser.loc[:1]

0    0.0
1    1.0
dtype: float64

1	ser.iloc[:1]

0    0.0
dtype: float64

Arithmetic and Data Alignment

1
2
3

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],index=['a', 'c', 'e', 'f', 'g'])
print(s1,s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64 a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

df1

	b	c	d
Ohio	0.0	1.0	2.0
Texas	3.0	4.0	5.0
Colorado	6.0	7.0	8.0

df2

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

1	df1+df2 ## 行列索引同时匹配才进行计算，否则为NaN

	b	c	d	e
Colorado	NaN	NaN	NaN	NaN
Ohio	3.0	NaN	6.0	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9.0	NaN	12.0	NaN
Utah	NaN	NaN	NaN	NaN

df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

df1

	A
0	1
1	2

df2

	B
0	3
1	4

df2-df1

	A	B
0	NaN	NaN
1	NaN	NaN

Arithmetic methods with fill values

df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))

df1

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

1	df1.loc[1,'b']=np.nan

df1

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	NaN	6.0	7.0
2	8.0	9.0	10.0	11.0

df2

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	6.0	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	19.0

df1+df2

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	NaN	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

1	df1.add(df2, fill_value=0)

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	6.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	19.0

1 / df1

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250000	NaN	0.166667	0.142857
2	0.125000	0.111111	0.100000	0.090909

1	df1.rdiv(1)

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250000	NaN	0.166667	0.142857
2	0.125000	0.111111	0.100000	0.090909

1	df1.reindex(columns=df2.columns,fill_value=0)

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	0
1	4.0	NaN	6.0	7.0	0
2	8.0	9.0	10.0	11.0	0

Operations between DataFrame and Series

arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

arr[0]

array([0., 1., 2., 3.])

arr-arr[0] #广播：arr的每一行都减去arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series = frame.loc['Utah']
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

1	frame - series #对于DataFrame数据结构同理

	b	d	e
Utah	0.0	0.0	0.0
Ohio	3.0	3.0	3.0
Texas	6.0	6.0	6.0
Oregon	9.0	9.0	9.0

series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

1	frame+series2

	b	d	e	f
Utah	0.0	NaN	3.0	NaN
Ohio	3.0	NaN	6.0	NaN
Texas	6.0	NaN	9.0	NaN
Oregon	9.0	NaN	12.0	NaN

1	series3 = frame['d']

1	frame.sub(series3,axis='index')

	b	d	e
Utah	-1.0	0.0	1.0
Ohio	-1.0	0.0	1.0
Texas	-1.0	0.0	1.0
Oregon	-1.0	0.0	1.0

Function Application and Mapping

1
2
3

frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

	b	d	e
Utah	2.006862	-1.308834	1.440639
Ohio	0.001529	0.026818	0.706586
Texas	-0.461218	0.365081	-0.898180
Oregon	-0.280978	0.402707	-1.396092

1	np.abs(frame)

	b	d	e
Utah	2.006862	1.308834	1.440639
Ohio	0.001529	0.026818	0.706586
Texas	0.461218	0.365081	0.898180
Oregon	0.280978	0.402707	1.396092

f = lambda x:x.max()-x.min()
frame.apply(f,axis='columns') # 传入DataFrame中的一行或者一列数据（Series）,在自定义函数中进行计算

Utah      3.315696
Ohio      0.705058
Texas     1.263261
Oregon    1.798799
dtype: float64

1
2
3

def f(x):
    """Summarise *x* by its extremes.

    Calls ``x.min()`` and ``x.max()`` and returns the two results in a new
    Series whose index labels are 'min' and 'max', in that order.
    """
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=['min', 'max'])
frame.apply(f,axis='index')

	b	d	e
min	-0.461218	-1.308834	-1.396092
max	2.006862	0.402707	1.440639

format = lambda x: '%.2f' % x
frame.applymap(format)

	b	d	e
Utah	2.01	-1.31	1.44
Ohio	0.00	0.03	0.71
Texas	-0.46	0.37	-0.90
Oregon	-0.28	0.40	-1.40

1	frame['e'].map(format)

Utah       1.44
Ohio       0.71
Texas     -0.90
Oregon    -1.40
Name: e, dtype: object

Sorting and Ranking

obj = pd.Series(np.random.randn(4), index=['d', 'a', 'b', 'c'])
obj

d   -1.066429
a    0.005021
b   -0.257605
c   -1.705094
dtype: float64

1	obj.sort_values()

c   -1.705094
d   -1.066429
b   -0.257605
a    0.005021
dtype: float64

1	obj.sort_index()

a    0.005021
b   -0.257605
c   -1.705094
d   -1.066429
dtype: float64

frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame

	d	a	b	c
three	0	1	2	3
one	4	5	6	7

1	frame.sort_index(axis=0)

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

1	frame.sort_index(axis=1)

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values(ascending=False) ## ascending参数决定升序还是降序

2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

	b	a
0	4	0
1	7	1
2	-3	0
3	2	1

1	frame.sort_values(by='b') # 以某一列值为准

	b	a
2	-3	0
3	2	1
0	4	0
1	7	1

1	frame.sort_values(by=['a', 'b'])

	b	a
2	-3	0
0	4	0
3	2	1
1	7	1

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank() #默认method='average'：并列（同级）的值取平均排名

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

1	obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

1
2
3

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

	b	a	c
0	4.3	0	-2.0
1	7.0	1	5.0
2	-3.0	0	8.0
3	2.0	1	-2.5

1	frame.rank(axis=1)

	b	a	c
0	3.0	2.0	1.0
1	3.0	1.0	2.0
2	1.0	2.0	3.0
3	3.0	2.0	1.0

Axis Indexes with Duplicate Labels

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

1	obj.index.is_unique

False

df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

	0	1	2
a	-1.513555	0.286993	0.982033
a	1.211395	-1.512109	1.007934
b	-0.609349	0.729770	1.106319
b	-0.427720	0.354752	0.286622

df.loc['b'] #选出所有索引标签为'b'的行

	0	1	2
b	-0.609349	0.729770	1.106319
b	-0.427720	0.354752	0.286622

Summarizing and Computing Descriptive Statistics

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

df.sum()

one    9.25
two   -5.80
dtype: float64

1	df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

1	df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

1	df.idxmax()

one    b
two    d
dtype: object

df.cumsum() ##累计求和，默认axis=0，即自上而下对每一列分别累计

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

1	df.cumsum(axis=1)

	one	two
a	1.40	NaN
b	7.10	2.60
c	NaN	NaN
d	0.75	-0.55

1	df.describe() ###牛逼牛逼

	one	two
count	3.000000	2.000000
mean	3.083333	-2.900000
std	3.493685	2.262742
min	0.750000	-4.500000
25%	1.075000	-3.700000
50%	1.400000	-2.900000
75%	4.250000	-2.100000
max	7.100000	-1.300000

obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

1	obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

Correlation and Covariance

1	## 暂时略过这一部分

Unique Values, Value Counts, and Membership

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

1	obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

1	pd.value_counts(obj.values, sort=False)

c    3
b    2
d    1
a    3
dtype: int64

1	obj.isin(['b', 'c'])

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

1	obj[obj.isin(['b', 'c'])]

0    c
5    b
6    b
7    c
8    c
dtype: object

1
2
3

to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

	Qu1	Qu2	Qu3
0	1	2	1
1	3	3	5
2	4	1	2
3	3	2	4
4	4	3	4

res=data.apply(pd.value_counts).fillna(0) ##统计每一列中各个值出现的次数
res

	Qu1	Qu2	Qu3
1	1.0	1.0	1.0
2	0.0	2.0	1.0
3	2.0	2.0	0.0
4	2.0	0.0	2.0
5	0.0	0.0	1.0

final

该去实战了