Fork me on GitHub

pandas系列9-数据规整

层次化索引hierarchical indexing

  • 数据分散在不同的文件或者数据库中
  • 层次化索引在一个轴上拥有多个(两个以上)索引级别
  • 低维度形式处理高维度数据
1
2
3
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
1
2
3
4
5
# 创建S: 索引index是一个数组组成的列表
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64
1
data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])
1
data['b']
1   -0.731398
3   -0.707528
dtype: float64
1
2
3
# 部分索引选取数据子集
# 切片形式
data['b':'c']
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
dtype: float64
1
2
# 列表形式
data.loc[['b', 'c']]
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
dtype: float64
1
data.loc[['b', 'd']]
b  1   -0.731398
   3   -0.707528
d  2   -0.826364
   3   -1.874992
dtype: float64
1
data.loc[:, 2]
a   -0.501033
c   -0.177199
d   -0.826364
dtype: float64
1
2
# 2表示选取内层索引为2的所有数据
data.loc[:, 2]
a   -0.501033
c   -0.177199
d   -0.826364
dtype: float64
1
data
a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64
1
data.unstack()    # 将层次化索引的数据变成DF形式
1 2 3
a 1.832067 -0.501033 -0.602755
b -0.731398 NaN -0.707528
c -0.382131 -0.177199 NaN
d NaN -0.826364 -1.874992
1
data.unstack().stack()
a  1    1.832067
   2   -0.501033
   3   -0.602755
b  1   -0.731398
   3   -0.707528
c  1   -0.382131
   2   -0.177199
d  2   -0.826364
   3   -1.874992
dtype: float64
1
2
3
4
5
6
# 对于DF类型数据
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
frame
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
2
3
4
5
# 索引设置名字
frame.index.names = ['key1', 'key2']
# 列索引设置名字
frame.columns.names = ['state', 'color']
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
frame['Ohio']
color Green Red
key1 key2
a 1 0 1
2 3 4
b 1 6 7
2 9 10
1
2
3
from pandas import MultiIndex
MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
names=['state', 'color'])
MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           codes=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

重排与分级排序

  • 重新调整某条轴上的各级别的顺序
  • 指定级别上的值对数据进行排序
  • swaplevel()接受两个级别编号或名称
1
2
# 交换位置
frame.swaplevel('key1', 'key2')
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
1
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
2
# level=0 通过第一层索引key1排序
frame.sort_index(level=0)
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
2
# level=1 通过第二层索引key2排序
frame.sort_index(level=1)
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
1
2
# swaplevel 既可以通过级别名称、也可以通过级别编号(如0和1)来交换两个索引层级
frame.swaplevel(0, 1)
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
1
frame.swaplevel(0, 1).sort_index(level=0)
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11

根据级别统计求和

  • 通过level指定某条轴
  • 指定行或者列
1
frame.sum(level='key2')
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
1
2
# axis=1表示沿列方向,按列索引级别color求和
frame.sum(level='color', axis=1)
color Green Red
key1 key2
a 1 2 1
2 8 4
b 1 14 7
2 20 10
1
frame
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
1
frame.sum(level='key2',axis=0)
state Ohio Colorado
color Green Red Green
key2
1 6 8 10
2 12 14 16
1
2
# axis=0指定行方向,按行索引级别key1求和
frame.sum(level='key1',axis=0)
state Ohio Colorado
color Green Red Green
key1
a 3 5 7
b 15 17 19
1
2
3
4
5
# 使用DataFrame的列进行索引
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
frame
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
1
2
3
# set_index函数将列转换成行索引,默认删除原来的列
frame2 = frame.set_index(['c', 'd'])
frame2
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
1
2
# 将原来的索引保留
frame.set_index(['c', 'd'], drop=False)
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
1
frame2.reset_index()
c d a b
0 one 0 0 7
1 one 1 1 6
2 one 2 2 5
3 two 0 3 4
4 two 1 4 3
5 two 2 5 2
6 two 3 6 1

合并数据集

  • pandas.merge:根据键将不同DF中的行连接起来,类似于数据库的join操作
  • pandas.concat:沿着轴将对象叠在一起
  • 方法combine_first可以将重复数据拼接在一起,用一个对象中的值填充另一个的缺失值
1
2
3
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})
1
df1
key data1
0 b 0
1 b 1
2 a 2
3 c 3
4 a 4
5 a 5
6 b 6
1
df2
key data2
0 a 0
1 b 1
2 d 2
1
2
# 默认根据重叠列名key进行合并
pd.merge(df1,df2)
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
1
pd.merge(df1, df2, on='key')
key data1 data2
0 b 0 1
1 b 1 1
2 b 6 1
3 a 2 0
4 a 4 0
5 a 5 0
1
2
3
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

merge

  • 默认是内连接
  • 结果中的键是交集:只有a、b在两个DF中同时存在
1
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
lkey data1 rkey data2
0 b 0 b 1
1 b 1 b 1
2 b 6 b 1
3 a 2 a 0
4 a 4 a 0
5 a 5 a 0
1
2
3
4
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
'value': [5, 6, 7, 8]})
1
df1.merge(df2, left_on='lkey', right_on='rkey')
lkey value_x rkey value_y
0 foo 1 foo 5
1 foo 1 foo 8
2 foo 5 foo 5
3 foo 5 foo 8
4 bar 2 bar 6
5 baz 3 baz 7
1
2
3
# suffixes解决两个DF中重复列名的问题
df1.merge(df2, left_on='lkey', right_on='rkey',
suffixes=('_left', '_right'))
lkey value_left rkey value_right
0 foo 1 foo 5
1 foo 1 foo 8
2 foo 5 foo 5
3 foo 5 foo 8
4 bar 2 bar 6
5 baz 3 baz 7
1
2
# 相同的values进行合并
df1.merge(df2, how='inner')
lkey value rkey
0 foo 5 foo
1
2
# 右边的值为标准,左边如果有,直接显示;没有则显示NaN
df1.merge(df2, how='right')
lkey value rkey
0 foo 5 foo
1 NaN 6 bar
2 NaN 7 baz
3 NaN 8 foo
1
pd.merge(df1, df2, how='left')
lkey value rkey
0 foo 1 NaN
1 bar 2 NaN
2 baz 3 NaN
3 foo 5 foo
1
2
# outer取两边键的并集,相当于left和right的结果合并
pd.merge(df1, df2, how='outer')
lkey value rkey
0 foo 1 NaN
1 bar 2 NaN
2 baz 3 NaN
3 foo 5 foo
4 NaN 6 bar
5 NaN 7 baz
6 NaN 8 foo
1
2
3
4
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
'data2': range(5)})
1
2
# 以right为标准:如果右有左无,标记为NaN;左有右无,直接丢弃
pd.merge(df1, df2, on='key', how='right')
key data1 data2
0 b 0.0 1
1 b 1.0 1
2 b 5.0 1
3 b 0.0 3
4 b 1.0 3
5 b 5.0 3
6 a 2.0 0
7 a 4.0 0
8 a 2.0 2
9 a 4.0 2
10 d NaN 4
1
pd.merge(df1, df2, on='key', how='left')
key data1 data2
0 b 0 1.0
1 b 0 3.0
2 b 1 1.0
3 b 1 3.0
4 a 2 0.0
5 a 2 2.0
6 c 3 NaN
7 a 4 0.0
8 a 4 2.0
9 b 5 1.0
10 b 5 3.0
1
pd.merge(df1, df2, how='inner')
key data1 data2
0 b 0 1
1 b 0 3
2 b 1 1
3 b 1 3
4 b 5 1
5 b 5 3
6 a 2 0
7 a 2 2
8 a 4 0
9 a 4 2
1
2
3
4
5
6
7
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
'key2': ['one', 'two', 'one'],
'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
'key2': ['one', 'one', 'one', 'two'],
'rval': [4, 5, 6, 7]})
right
key1 key2 rval
0 foo one 4
1 foo one 5
2 bar one 6
3 bar two 7
1
pd.merge(left, right, on=['key1', 'key2'], how='outer')
key1 key2 lval rval
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0
1
pd.merge(left, right, on=['key1', 'key2'], how='inner')
key1 key2 lval rval
0 foo one 1 4
1 foo one 1 5
2 bar one 3 6

索引行的合并

  • DF的连接键有时位于索引
  • 传入left_index=True或right_index=True
1
2
3
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
1
pd.merge(left1, right1, left_on='key', right_index=True)
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
1
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
5 c 5 NaN
1
2
3
4
5
6
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['Missouri', 'Alabama'])
1
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0

join()

  • 按照索引合并
  • 合并多个DF对象,要求没有重复的列
  • 默认使用左连接,保留左边的行索引
  • 简单的合并参数可以是一组DF
1
left2.join(right2, how='outer')
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
1
left1.join(right1, on='key')
key value group_val
0 a 0 3.5
1 b 1 7.0
2 a 2 3.5
3 a 3 3.5
4 b 4 7.0
5 c 5 NaN
1
2
3
4
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'],
columns=['New York', 'Oregon'])
another
New York Oregon
a 7.0 8.0
c 9.0 10.0
e 11.0 12.0
f 16.0 17.0
1
left2.join([right2, another])
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
1
left2.join([right2, another], how='outer', sort='True')
c:\users\admin\venv\lib\site-packages\pandas\core\frame.py:6848: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  verify_integrity=True)
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
b NaN NaN 7.0 8.0 NaN NaN
c 3.0 4.0 9.0 10.0 9.0 10.0
d NaN NaN 11.0 12.0 NaN NaN
e 5.0 6.0 13.0 14.0 11.0 12.0
f NaN NaN NaN NaN 16.0 17.0

轴向索引

  • 连接concatenation、绑定binding、堆叠stacking
  • Numpy的concatenate()函数实现
  • pandas的concat()函数实现
1
2
3
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
1
2
3
# 传入的是列表形式
# concat默认是在axis=0上工作的
pd.concat([s1, s2, s3])
a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64
1
2
s4 = pd.concat([s1, s2, s3], axis=1,sort=True)
s4
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
1
pd.concat([s1, s4])
0 1 2
a 0.0 NaN NaN
b 1.0 NaN NaN
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
1
pd.concat([s1, s4], axis=1, sort=True)
0 0 1 2
a 0.0 0.0 NaN NaN
b 1.0 1.0 NaN NaN
c NaN NaN 2.0 NaN
d NaN NaN 3.0 NaN
e NaN NaN 4.0 NaN
f NaN NaN NaN 5.0
g NaN NaN NaN 6.0
1
2
3
# 传入join求交集
# join_axes指定要在其它轴上使用的索引
pd.concat([s1, s4], axis=1, join='inner')
0 0 1 2
a 0 0.0 NaN NaN
b 1 1.0 NaN NaN
1
2
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result
one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64
1
result.unstack()
a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0
1
pd.concat([s1, s2, s3], axis=1, sort=True, keys=['one', 'two', 'three'])
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
1
2
3
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1
a b c d
0 -1.017946 -0.127939 0.556561 3.037912
1 -0.282973 -1.015500 -0.846612 -0.393027
2 -0.022687 -1.878993 0.607246 0.365823
1
pd.concat([df1, df2], ignore_index=True,sort=True)
a b c d
0 -1.017946 -0.127939 0.556561 3.037912
1 -0.282973 -1.015500 -0.846612 -0.393027
2 -0.022687 -1.878993 0.607246 0.365823
3 -0.212958 0.053333 NaN -0.371492
4 -0.237029 -0.239806 NaN 0.623274

合并与重叠

索引全部或者部分重叠的两个数据

  • Numpy的where函数:类似if-else
  • Series有一个combine_first方法
1
2
3
4
5
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
index=['f', 'e', 'd', 'c', 'b', 'a'])
a
f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64
1
2
b[-1] = np.nan
b
f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    NaN
dtype: float64
1
np.where(pd.isnull(a), b, a)
array([0. , 2.5, 2. , 3.5, 4.5, nan])
1
pd.isnull(a)
f     True
e    False
d     True
c    False
b    False
a     True
dtype: bool
1
2
# Series有一个combine_first方法
b[:-2].combine_first(a[2:])
a    NaN
b    4.5
c    3.0
d    2.0
e    1.0
f    0.0
dtype: float64

重塑和轴向旋转

  • reshape
  • pivot:用set_index创建层次化索引,再用unstack重塑;长格式转化为宽格式
  • pandas.melt:将宽格式转化为长格式,合并多列
  • stack:列旋转为行:DF------>S;
    • 默认会滤除缺失数据
    • 修改:dropna=False,不滤除
  • unstack:行旋转为列:S---->DF
1
2
3
4
5
6
# 创建一个DF,同时指定行列索引的name
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio', 'Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'],
name='number'))
data
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
1
2
3
# 将列转换为行,得到一个Series
result = data.stack()
result
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
1
2
# 将S变成DF数据
result.unstack()
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
1
2
#  unstack():默认操作的是最内层
result.unstack('state')
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
1
2
3
4
df = pd.DataFrame(np.random.rand(6, 4),
index=['one', 'two', 'three', 'four', 'five', 'six'],
columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))
df
Genus A B C D
one 0.862586 0.440328 0.172111 0.852715
two 0.303660 0.322660 0.232518 0.327529
three 0.128695 0.309510 0.224989 0.750412
four 0.658148 0.768613 0.234445 0.184719
five 0.502111 0.406510 0.310178 0.355971
six 0.002387 0.996109 0.207558 0.033299
1
2
# Genus自动用作图例
df.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x1ad366bafd0>

本文标题:pandas系列9-数据规整

发布时间:2019年10月21日 - 14:10

原始链接:http://www.renpeter.cn/2019/10/21/pandas%E7%B3%BB%E5%88%979-%E6%95%B0%E6%8D%AE%E8%A7%84%E6%95%B4.html

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

Coffee or Tea