Fork me on GitHub

pandas技巧1

Series

  • 一种类似一维数组的对象,由一组数据(Numpy数据类型)和与数据相关的数据标签组成
  • 索引在左边,值在右边。如果没有指定索引,自动创建数字的索引
1
2
import pandas as pd
import numpy as np
1
2
obj = pd.Series([4,7,-8,3])
obj
0    4
1    7
2   -8
3    3
dtype: int64
1
obj.values
array([ 4,  7, -8,  3])
1
obj.index
RangeIndex(start=0, stop=4, step=1)
1
2
obj1 = pd.Series([7,2,-6,9], index=['a','d','c','b'])
obj1
a    7
d    2
c   -6
b    9
dtype: int64
1
obj1.index
Index(['a', 'd', 'c', 'b'], dtype='object')
1
obj1['d']    # 通过自建的索引获取数据
2
1
obj1[1]     # 通过整数位置(而非标签)获取,等价于 obj1.iloc[1]
2
1
obj1[obj1 > 0]
a    7
d    2
b    9
dtype: int64
1
np.exp(obj1)
a    1096.633158
d       7.389056
c       0.002479
b    8103.083928
dtype: float64
1
2
3
# 通过字典创建Series
data = {'xiaoming': 2000, 'xiaohong': 1000, 'zhangsan': 1500}
obj2 = pd.Series(data)
1
obj2
xiaoming    2000
xiaohong    1000
zhangsan    1500
dtype: int64
1
2
3
4
# lisi这个键不存在,默认为NaN,表示缺失值
dataIndex = ['xiaoming', 'xiaohong', 'zhangsan', 'lisi']
obj3 = pd.Series(data, index=dataIndex)
obj3
xiaoming    2000.0
xiaohong    1000.0
zhangsan    1500.0
lisi           NaN
dtype: float64
1
pd.isnull(obj3)
xiaoming    False
xiaohong    False
zhangsan    False
lisi         True
dtype: bool
1
obj3.isnull()
xiaoming    False
xiaohong    False
zhangsan    False
lisi         True
dtype: bool
1
obj2 + obj3
lisi           NaN
xiaohong    2000.0
xiaoming    4000.0
zhangsan    3000.0
dtype: float64
1
obj3.name = 'person'
1
obj3.index.name = 'personName'
1
obj3
personName
xiaoming    2000.0
xiaohong    1000.0
zhangsan    1500.0
lisi           NaN
Name: person, dtype: float64
1
2
# 通过赋值就地修改索引
obj
0    4
1    7
2   -8
3    3
dtype: int64
1
2
obj.index = ['xiaoming', 'xiaohong', 'zhangsan', 'lisi']  #  直接修改
obj
xiaoming    4
xiaohong    7
zhangsan   -8
lisi        3
dtype: int64

DataFrame

  • 表格型的数据结构
  • 含有一组有序的列,每列可以是不同的值类型(数值,字符串,布尔值等)
  • DF既有行索引又有列索引,可以看作是由Series组成的字典
  • DF中的数据是以一个或者多个二维块存放的
1
2
3
4
# 创建
data = {'city': ['深圳', '上海', '长沙', '广州', '北京'],
'year': [2004, 2007, 2009, 2005, 2002],
'pop': [4.4, 2.8, 3.9, 4.2, 3.6]}
1
2
df = pd.DataFrame(data)
df
city year pop
0 深圳 2004 4.4
1 上海 2007 2.8
2 长沙 2009 3.9
3 广州 2005 4.2
4 北京 2002 3.6
1
df.head(3)
city year pop
0 深圳 2004 4.4
1 上海 2007 2.8
2 长沙 2009 3.9
1
2
# 改变列属性的顺序
pd.DataFrame(data, columns=['year', 'city', 'pop'])
year city pop
0 2004 深圳 4.4
1 2007 上海 2.8
2 2009 长沙 3.9
3 2005 广州 4.2
4 2002 北京 3.6
1
2
3
df1 = pd.DataFrame(data, columns=['year', 'city', 'pop', 'debt'],
index=['one', 'two', 'three','four', 'five'])
df1 # 传入的列debt不在原数据中,引起缺失值
year city pop debt
one 2004 深圳 4.4 NaN
two 2007 上海 2.8 NaN
three 2009 长沙 3.9 NaN
four 2005 广州 4.2 NaN
five 2002 北京 3.6 NaN
1
df1.columns
Index(['year', 'city', 'pop', 'debt'], dtype='object')
1
df1['city']     # 这种方式适合任何列名的访问
one      深圳
two      上海
three    长沙
four     广州
five     北京
Name: city, dtype: object
1
df1.city   # 属性方式仅当列名是合法的Python标识符、且不与DataFrame已有的属性/方法重名时才可用(例如 df1.pop 得到的是方法而不是 pop 列)
one      深圳
two      上海
three    长沙
four     广州
five     北京
Name: city, dtype: object
1
2
# 访问一行数据
df1.loc['three'] # 通过创建的索引
year    2009
city      长沙
pop      3.9
debt     NaN
Name: three, dtype: object
1
df1.iloc[2]     #  通过数字索引
year    2009
city      长沙
pop      3.9
debt     NaN
Name: three, dtype: object
1
2
3
# 列赋值
df1['debt'] = 2
df1
year city pop debt
one 2004 深圳 4.4 2
two 2007 上海 2.8 2
three 2009 长沙 3.9 2
four 2005 广州 4.2 2
five 2002 北京 3.6 2
1
2
df1['debt'] = np.arange(5.0)
df1
year city pop debt
one 2004 深圳 4.4 0.0
two 2007 上海 2.8 1.0
three 2009 长沙 3.9 2.0
four 2005 广州 4.2 3.0
five 2002 北京 3.6 4.0
1
2
3
4
# 填充某个Series,精确匹配到DF的索引,空位都将补上缺失值
val = pd.Series([-1.2, -1.7, 2], index=['three', 'one', 'five'])
df1['debt'] = val
df1
year city pop debt
one 2004 深圳 4.4 -1.7
two 2007 上海 2.8 NaN
three 2009 长沙 3.9 -1.2
four 2005 广州 4.2 NaN
five 2002 北京 3.6 2.0
1
2
3
# 创建一个 bool 列
df1['south'] = (df1.city == '深圳')
df1
year city pop debt south
one 2004 深圳 4.4 -1.7 True
two 2007 上海 2.8 NaN False
three 2009 长沙 3.9 -1.2 False
four 2005 广州 4.2 NaN False
five 2002 北京 3.6 2.0 False
1
del df1['south']   # 删除某列
1
df1
year city pop debt
one 2004 深圳 4.4 -1.7
two 2007 上海 2.8 NaN
three 2009 长沙 3.9 -1.2
four 2005 广州 4.2 NaN
five 2002 北京 3.6 2.0
1
2
3
4
5
# 嵌套字典:外层字典的键当作列,内层的键当作行索引
pop = {'Nevada':{2001: 2.2, 2002: 2.9},
'Ohio': {2000:1.5, 2001:1.7, 2002: 3.6}}
df2 = pd.DataFrame(pop)
df2
Nevada Ohio
2000 NaN 1.5
2001 2.2 1.7
2002 2.9 3.6
1
df2.T
2000 2001 2002
Nevada NaN 2.2 2.9
Ohio 1.5 1.7 3.6
1
2
# 改变行索引
pd.DataFrame(pop, index=[2001, 2002, 2003])
Nevada Ohio
2001 2.2 1.7
2002 2.9 3.6
2003 NaN NaN
1
2
3
# 指定行和列索引的名字
df2.index.name = 'year';df2.columns.name = 'state'
df2
state Nevada Ohio
year
2000 NaN 1.5
2001 2.2 1.7
2002 2.9 3.6
1
df2.values
array([[nan, 1.5],
       [2.2, 1.7],
       [2.9, 3.6]])

Index

  • pandas的索引对象负责管理轴标签和其他元数据
  • 在构建Series或者DF型数据时,用到的任何数组和其他序列的标签都会被转成一个Index
  • Index对象不可变,用户不能对其进行修改
  • 正因为不可变,Index对象才可以在多个数据结构之间安全共享
1
2
3
obj = pd.Series(range(3), index=['a','b','c'])
index = obj.index
index
Index(['a', 'b', 'c'], dtype='object')
1
obj
a    0
b    1
c    2
dtype: int64
1
index[1:]
Index(['b', 'c'], dtype='object')
1
2
labels = pd.Index(np.arange(3))
labels
Int64Index([0, 1, 2], dtype='int64')
1
2
obj1 = pd.Series([1.2, 1.8, 4] ,index=labels)
obj1
0    1.2
1    1.8
2    4.0
dtype: float64

pandas 基本功能

  • 重新索引reindex
  • 丢弃某条轴上的数据drop
1
2
3
obj = pd.Series([4.5, 7.2, -5.3, 3.6],
index=['a', 'b', 'c', 'd'])
obj
a    4.5
b    7.2
c   -5.3
d    3.6
dtype: float64
1
2
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])  # 重新索引reindex
obj2
a    4.5
b    7.2
c   -5.3
d    3.6
e    NaN
dtype: float64
1
2
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0,2,4])
obj3
0      blue
2    purple
4    yellow
dtype: object
1
obj3.reindex(range(6), method='ffill')
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
1
2
3
4
5
df = pd.DataFrame(np.arange(9).reshape(3,3),   # 数据value值
index=['a', 'c', 'd'], # 行索引
columns=['Ohio','Texas','California'] # 列索引
)
df
Ohio Texas California
a 0 1 2
c 3 4 5
d 6 7 8
1
df.reindex(['a', 'b', 'c', 'd'])
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
1
2
states = ['Texas', 'Utah', 'California']  # 用 columns 关键字对列重新索引,原数据中不存在的列(Utah)填充为 NaN
df.reindex(columns=states)
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
1
df.loc[['a','b','c','d'], states]
/Applications/downloads/anaconda/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py:1494: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)
Texas Utah California
a 1.0 NaN 2.0
b NaN NaN NaN
c 4.0 NaN 5.0
d 7.0 NaN 8.0
1
2
3
# 丢弃drop
obj = pd.Series(np.arange(5.0), index=list("abcde"))
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
1
2
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
1
obj   # 原来的数据是不变的
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
1
obj.drop(['d', 'c'])  # 丢弃多个值用列表的形式
a    0.0
b    1.0
e    4.0
dtype: float64
1
2
3
4
# drop 删除任意轴上的数据
data = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio', 'Colorado','Utah','NY'],
columns=['one','two', 'three','four'])
1
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
NY 12 13 14 15
1
data.drop(['Colorado', 'Ohio'])   # 删除行记录
one two three four
Utah 8 9 10 11
NY 12 13 14 15
1
data.drop('two', axis=1)  # 指定列 axis=1
one three four
Ohio 0 2 3
Colorado 4 6 7
Utah 8 10 11
NY 12 14 15
1
data.drop('three', axis='columns')    # 通过columns
one two four
Ohio 0 1 3
Colorado 4 5 7
Utah 8 9 11
NY 12 13 15
1
data    # 原来的数据不会变
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
NY 12 13 14 15
1
2
# 原地修改对象:inplace=True 会直接修改原数据且不返回新对象(此例删除的是行;配合 axis=1 同样可以原地删除列)
data.drop('NY', inplace=True) # 通过 inplace 参数来改变原数据
1
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11

索引、过滤和选取

1
2
3
obj = pd.Series(np.arange(4.0),
index=['a','b','c','d'])
obj
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1
obj['c']
2.0
1
obj[2]
2.0
1
obj[1:3]
b    1.0
c    2.0
dtype: float64
1
obj[['a','d','c']]
a    0.0
d    3.0
c    2.0
dtype: float64
1
obj['b':'c']   # 标签切片的末端是包含的
b    1.0
c    2.0
dtype: float64
1
2
3
# 利用切片进行赋值
obj['b':'c'] = 6
obj
a    0.0
b    6.0
c    6.0
d    3.0
dtype: float64
1
2
3
data = pd.DataFrame(np.arange(16).reshape((4,4)),
index=['Ohio', 'Colorado','Utah','NY'],
columns=['one','two', 'three','four'])
1
data[['three', 'two']]
three two
Ohio 2 1
Colorado 6 5
Utah 10 9
NY 14 13
1
data[data['three'] > 5]  # 布尔值选取
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
NY 12 13 14 15
1
2
data[data < 5] = 0   # 小于5的数赋值为0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
NY 12 13 14 15

loc和iloc

- loc 通过轴的标签进行索引
- iloc 通过整数位置进行索引
1
data.loc['Utah', ['two' ,'three']]
two       9
three    10
Name: Utah, dtype: int64
1
data.iloc[[1,2],[3,0,1]]
four one two
Colorado 7 0 5
Utah 11 8 9
1
data.loc[:'Utah', 'two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64
1
data.iloc[:, :3][data.three > 5]
one two three
Colorado 0 5 6
Utah 8 9 10
NY 12 13 14
1
data.iloc[:, :3]
one two three
Ohio 0 0 0
Colorado 0 5 6
Utah 8 9 10
NY 12 13 14

算术运算和数据对齐

1
2
3
4
s1 = pd.Series([7.3, -2.6, 3.4, 1.4],
index=['a', 'c', 'd', 'e'])
s2 = pd.Series([4.3, -9.6, 1.2, 2.9, 3.1],
index=['a', 'c', 'e', 'f', 'g'])
1
s1
a    7.3
c   -2.6
d    3.4
e    1.4
dtype: float64
1
s2
a    4.3
c   -9.6
e    1.2
f    2.9
g    3.1
dtype: float64
1
s1 + s2   # 自动对齐功能:在不重叠的索引处引入 NaN 值
a    11.6
c   -12.2
d     NaN
e     2.6
f     NaN
g     NaN
dtype: float64
1
2
df1 = pd.DataFrame({'A': [1,2]})
df2 = pd.DataFrame({'B': [3,4]})
1
df1
A
0 1
1 2
1
df2
B
0 3
1 4
1
df1 - df2
A B
0 NaN NaN
1 NaN NaN

算术方法中使用填充值

1
2
3
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),
columns=list('abcd'))
df1
a b c d
0 0.0 1.0 2.0 3.0
1 4.0 5.0 6.0 7.0
2 8.0 9.0 10.0 11.0
1
2
3
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),
columns=list('abcde'))
df2
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 6.0 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1
df1 + df2  # 没有重叠的位置产生 NaN
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
1
df1.add(df2, fill_value=0)   # 仅一方缺失的位置先用 fill_value(0) 填充再相加;两边都缺失的位置结果仍为 NaN
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 11.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1
2
# 每个算术方法都有一个以 r 开头的副本,其参数顺序相反:df1.rdiv(1) 等价于 1 / df1
1 / df1
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250 0.200000 0.166667 0.142857
2 0.125 0.111111 0.100000 0.090909
1
df1.rdiv(1)
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250 0.200000 0.166667 0.142857
2 0.125 0.111111 0.100000 0.090909

DataFrame和Series之间的运算

二者之间的运算是通过广播机制来实现的

1
2
arr = np.arange(12.).reshape((3, 4))
arr
array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])
1
arr[0]
array([0., 1., 2., 3.])
1
arr - arr[0]   # 每行都去减掉arr[0]
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])
1
2
3
df = pd.DataFrame(np.arange(12.).reshape(4,3),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas','Oregon'])
1
df
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
1
2
series = df.iloc[0]
series
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
1
df - series   # 默认将Series的索引匹配到 DF 的列,然后沿着行向下广播
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
1
2
series1 = df['d']
series1
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
1
df.sub(series1, axis='index')   # 在行索引上匹配,沿着列进行广播
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0
1
2


本文标题:pandas技巧1

发布时间:2019年11月13日 - 23:11

原始链接:http://www.renpeter.cn/2019/11/13/pandas%E6%8A%80%E5%B7%A71.html

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

Coffee or Tea