data = pd.DataFrame({'k1':['one','two']*3 + ['two'], 'k2':[1,1,2,3,3,4,4]})
print(data)
==>
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
#这里duplicated函数的作用是返回布尔值,False表示前面没有出现过,True表示前面出现过
print(data.duplicated())
==>
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
#这里drop_duplicates的作用是将上面显示为True的行删去,也即将与前面重复的行删去
print(data.drop_duplicates())
==>
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
data['v1'] = range(7)
print(data)
==>
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
5  two   4   5
6  two   4   6
#对drop_duplicates函数传入列名列表可以使其基于指定列进行重复值删除;同时传入keep='last'参数会保留每组重复值中的最后一个,而非第一个
print(data.drop_duplicates(['k1','k2'], keep='last'))
==>
    k1  k2  v1
0  one   1   0
1  two   1   1
2  one   2   2
3  two   3   3
4  one   3   4
6  two   4   6
data = pd.Series([1., -999,2.,-999.,-1000.,3.])
print(data)
==>
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
print(data.replace(-999, np.nan))
==>
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64
print(data.replace([-999,-1000],np.nan))
==>
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64
print(data.replace({-999:np.nan, -1000:0}))
==>
0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64
重命名轴索引
可以对轴索引(例如DataFrame的index)使用map方法,对其中的每个标签进行转换
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
data = pd.DataFrame(np.arange(12).reshape((3,4)), index=['Ohio', 'Colorado', 'NewYork'], columns=['one', 'two', 'three', 'four'])
print(data)
==>
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
NewYork     8    9     10    11
transform = lambda x: x[:4].upper()
data.index = data.index.map(transform)
print(data)
==>
      one  two  three  four
OHIO    0    1      2     3
COLO    4    5      6     7
NEWY    8    9     10    11
print(data.rename(index=str.title, columns=str.upper))
==>
          ONE  TWO  THREE  FOUR
Ohio        0    1      2     3
Colorado    4    5      6     7
Newyork     8    9     10    11
print(data.rename(index={'Ohio':'INDIANA'}, columns={'three':'peekaboo'}))
==>
          one  two  peekaboo  four
INDIANA     0    1         2     3
Colorado    4    5         6     7
NewYork     8    9        10    11
df = pd.DataFrame({'key':['b','b','a','c','a','b'], 'data1':range(6)})
print(df)
==>
  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5
print(pd.get_dummies(df['key']))
==>
   a  b  c
0  0  1  0
1  0  1  0
2  1  0  0
3  0  0  1
4  1  0  0
5  0  1  0