1 | import numpy as np |
重新索引
重新索引不会改变原数据
- 行索引
Series.reindex
DF.reindex()
- 列索引
- 通过 `columns` 关键字指定
- 通过
1 | obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c']) |
1 | d 4.5 |
1 | # S型数据重新排序索引 |
1 | a -5.3 |
1 | obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4]) |
1 | 0 blue |
1 | # ffill前项填充:填充的是前一个数值 |
1 | 0 blue |
1 | # DF重新索引 |
Ohio | Texas | California | |
---|---|---|---|
a | 0 | 1 | 2 |
c | 3 | 4 | 5 |
d | 6 | 7 | 8 |
1 | # DF重新索引 |
Ohio | Texas | California | |
---|---|---|---|
a | 0.0 | 1.0 | 2.0 |
b | NaN | NaN | NaN |
c | 3.0 | 4.0 | 5.0 |
d | 6.0 | 7.0 | 8.0 |
1 | # 重新索引列 |
Ohio | Utah | California | |
---|---|---|---|
a | 0 | NaN | 2 |
c | 3 | NaN | 5 |
d | 6 | NaN | 8 |
1 | # drop等函数默认不是就地修改(inplace=False),而是返回新对象,不改变原有数据 |
1 | d 4.5 |
舍弃指定轴上的数据
- drop(index)
- drop([index1, index2])
1 | obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e']) |
1 | a 0.0 |
1 | # 舍弃一行数据 |
1 | a 0.0 |
删除数据
- 行:axis=0,默认
- 列:axis=1,或者 axis='columns'
- 删除一个通过标签形式
- 删除多个是传入列表形式
1 | data = pd.DataFrame(np.arange(16).reshape((4, 4)), |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | # 默认是删除行数据 |
one | two | three | four | |
---|---|---|---|---|
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | # axis=1:删除列数据 |
one | three | four | |
---|---|---|---|
Ohio | 0 | 2 | 3 |
Colorado | 4 | 6 | 7 |
Utah | 8 | 10 | 11 |
New York | 12 | 14 | 15 |
1 | # 删除多列数据 |
one | three | |
---|---|---|
Ohio | 0 | 2 |
Colorado | 4 | 6 |
Utah | 8 | 10 |
New York | 12 | 14 |
选取行数据
- loc:轴标签
- iloc:整数索引
1 | data |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | # 标签索引 |
1 | two 5 |
1 | # 切片形式:逗号前面表示行,后面表示列 |
1 | Ohio 1 |
1 | # 整数数值索引 |
1 | four 11 |
1 | data.iloc[[1, 2], [3, 0, 1]] |
four | one | two | |
---|---|---|---|
Colorado | 7 | 4 | 5 |
Utah | 11 | 8 | 9 |
整数索引
1 | ser = pd.Series(np.arange(3.)) |
1 | 0 0.0 |
1 | ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c']) |
1 | 2.0 |
1 | # 索引不包含末尾 |
1 | 0 0.0 |
1 | ser.loc[:2] |
1 | 0 0.0 |
1 | ser.iloc[:2] |
1 | 0 0.0 |
1 | data = pd.DataFrame(np.arange(16).reshape((4, 4)), |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data[['three', 'one']] |
three | one | |
---|---|---|
Ohio | 2 | 0 |
Colorado | 6 | 4 |
Utah | 10 | 8 |
New York | 14 | 12 |
1 | data[data['three'] > 5] |
one | two | three | four | |
---|---|---|---|---|
Colorado | 4 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data < 5 |
one | two | three | four | |
---|---|---|---|---|
Ohio | True | True | True | True |
Colorado | True | False | False | False |
Utah | False | False | False | False |
New York | False | False | False | False |
1 | data[data < 5] = 0 |
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 0 | 0 | 0 |
Colorado | 0 | 5 | 6 | 7 |
Utah | 8 | 9 | 10 | 11 |
New York | 12 | 13 | 14 | 15 |
1 | data.loc['Colorado', ['two', 'three']] |
1 | two 5 |
1 | # 先选取所有行的前三列,再筛选出 three 列大于5的行 |
one | two | three | |
---|---|---|---|
Colorado | 0 | 5 | 6 |
Utah | 8 | 9 | 10 |
New York | 12 | 13 | 14 |