Python数据分析（3）pandas库的使用：索引，统计函数，排序和NaN数据值处理

xiaoxiao2021-02-28 29

# pandas tutorial script: other features of index objects, statistics,
# sorting/ranking, correlation/covariance and NaN handling.
#
# NOTE: the original "#coding=gbk" declaration is dropped. Python only honours
# an encoding declaration on line 1 or 2 of a file, and the default source
# encoding (UTF-8) is what this text is stored in.
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# 1. Changing the index with reindex()
# ---------------------------------------------------------------------------
frame = pd.Series([1, 5, 6, 3], index=[0, 3, 5, 6])
print(frame)
# 0    1
# 3    5
# 5    6
# 6    3
# dtype: int64

# method='ffill': a missing label takes the value of the previous label.
print(frame.reindex(range(6), method='ffill'))
# 0    1
# 1    1
# 2    1
# 3    5
# 4    5
# 5    6
# dtype: int64

# method='bfill': a missing label takes the value of the next label.
print(frame.reindex(range(6), method='bfill'))
# 0    1
# 1    5
# 2    5
# 3    5
# 4    6
# 5    6
# dtype: int64

# ---------------------------------------------------------------------------
# 2. Dropping entries
# ---------------------------------------------------------------------------
print(frame.drop(3))  # drop the entry whose index label is 3
# Columns of a DataFrame can be dropped as well, e.g.:
#   print(frame2.drop(['color', 'object'], axis=1))

frame2 = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['blue', 'green', 'black', 'yellow'],
    columns=['color', 'price', 'object', 'real'],
)
print(frame2)
#         color  price  object  real
# blue        0      1       2     3
# green       4      5       6     7
# black       8      9      10    11
# yellow     12     13      14    15

# ---------------------------------------------------------------------------
# Statistical functions
# ---------------------------------------------------------------------------
print(frame2.sum())  # sum of every column
# color     24
# price     28
# object    32
# real      36

print(frame2.mean())  # mean of every column

print(frame2.describe())  # several summary statistics at once
#            color      price     object       real
# count   4.000000   4.000000   4.000000   4.000000
# mean    6.000000   7.000000   8.000000   9.000000
# std     5.163978   5.163978   5.163978   5.163978   (standard deviation)
# min     0.000000   1.000000   2.000000   3.000000
# 25%     3.000000   4.000000   5.000000   6.000000   (quartiles, as in a box plot)
# 50%     6.000000   7.000000   8.000000   9.000000
# 75%     9.000000  10.000000  11.000000  12.000000
# max    12.000000  13.000000  14.000000  15.000000

# ---------------------------------------------------------------------------
# Sorting and ranking
# ---------------------------------------------------------------------------
print(frame2.sort_index())  # sort rows by index label, ascending (alphabetical)
#         color  price  object  real
# black       8      9      10    11
# blue        0      1       2     3
# green       4      5       6     7
# yellow     12     13      14    15

print(frame2.sort_index(ascending=False))  # descending
#         color  price  object  real
# yellow     12     13      14    15
# green       4      5       6     7
# blue        0      1       2     3
# black       8      9      10    11

print(frame2.sort_index(axis=1))  # sort the columns by their labels
#         color  object  price  real
# blue        0       2      1     3
# green       4       6      5     7
# black       8      10      9    11
# yellow     12      14     13    15

ser = pd.Series([1, 5, 2, 8, 0],
                index=['blue', 'green', 'black', 'yellow', 'white'])
print(ser)
# blue      1
# green     5
# black     2
# yellow    8
# white     0
# dtype: int64

# Sorting by value. Series.order() and sort_index(by=...) no longer exist in
# pandas; sort_values() is the replacement for both.
print(ser.sort_values())
# white     0
# blue      1
# black     2
# green     5
# yellow    8
# dtype: int64
print(frame2.sort_values(by='color'))
#         color  price  object  real
# blue        0      1       2     3
# green       4      5       6     7
# black       8      9      10    11
# yellow     12     13      14    15

# Ranking: position of each element when ordered by value.
print(ser.rank())
# blue      2.0
# green     4.0
# black     3.0
# yellow    5.0
# white     1.0
# dtype: float64

# ---------------------------------------------------------------------------
# Correlation and covariance
# ---------------------------------------------------------------------------
print(frame2.corr())  # correlation
#         color  price  object  real
# color     1.0    1.0     1.0   1.0
# price     1.0    1.0     1.0   1.0
# object    1.0    1.0     1.0   1.0
# real      1.0    1.0     1.0   1.0

print(frame2.cov())  # covariance
#             color      price     object       real
# color   26.666667  26.666667  26.666667  26.666667
# price   26.666667  26.666667  26.666667  26.666667
# object  26.666667  26.666667  26.666667  26.666667
# real    26.666667  26.666667  26.666667  26.666667

# ---------------------------------------------------------------------------
# Handling NaN values
# ---------------------------------------------------------------------------
# 1. Assigning NaN to elements.
#    np.nan is the only spelling that works on every NumPy version; the
#    np.NaN alias was removed in NumPy 2.0.
ser1 = pd.Series([1, 5, np.nan, np.nan, 8],
                 index=['blue', 'green', 'black', 'yellow', 'white'])
print(ser1)
# blue      1.0
# green     5.0
# black     NaN
# yellow    NaN
# white     8.0
# dtype: float64

# 2. Removing NaN elements.
print(ser1.dropna())  # entries holding NaN are removed
# blue     1.0
# green    5.0
# white    8.0
# dtype: float64

# The same on a DataFrame.
frame3 = pd.DataFrame(
    [[1, 2, np.nan, np.nan],
     [2, 4, 6, np.nan],
     [8, 2, np.nan, np.nan],
     [np.nan, 2, 10, np.nan]],
    index=['blue', 'green', 'black', 'yellow'],
    columns=['color', 'price', 'object', 'real'],
)
print(frame3)
#         color  price  object  real
# blue      1.0      2     NaN   NaN
# green     2.0      4     6.0   NaN
# black     8.0      2     NaN   NaN
# yellow    NaN      2    10.0   NaN

# Every row contains at least one NaN (column 'real'), so nothing is left.
print(frame3.dropna())
# Empty DataFrame
# Columns: [color, price, object, real]
# Index: []

# how='all' only drops rows that are entirely NaN, so the data is unchanged.
print(frame3.dropna(how='all'))

# 3. Filling NaN values.
print(frame3.fillna(0))  # replace every NaN with 0
#         color  price  object  real
# blue      1.0      2     0.0   0.0
# green     2.0      4     6.0   0.0
# black     8.0      2     0.0   0.0
# yellow    0.0      2    10.0   0.0

print(frame3.fillna({'real': 3}))  # fill only column 'real', with 3
#         color  price  object  real
# blue      1.0      2     NaN   3.0
# green     2.0      4     6.0   3.0
# black     8.0      2     NaN   3.0
# yellow    NaN      2    10.0   3.0

#pandas 中(loc, iloc, 以及 ix 的区别)

# pandas tutorial script: the difference between loc, iloc and ix, followed by
# renaming columns, unique/value_counts/isin, value replacement, boolean
# selection and deriving a new column.
#
#   loc  : select rows by label
#   iloc : select rows by integer position
#   ix   : mixed label/position lookup. It was removed in pandas 1.0, so the
#          ix examples below are written with the equivalent loc / iloc call.
#
# NOTE: the original "#coding=gbk" declaration is dropped. Python only honours
# an encoding declaration on line 1 or 2 of a file.
#
# Both imports sit at the top because np is needed from the rename example on.
import numpy as np
import pandas as pd

labelled = pd.DataFrame([[1, 2, 3], [4, 5, 6]],
                        index=['a', 'b'],
                        columns=['c', 'd', 'e'])
print(labelled)
#    c  d  e
# a  1  2  3
# b  4  5  6

print(labelled.loc['a'])  # row selected by its label
# c    1
# d    2
# e    3
# Name: a, dtype: int64

# print(labelled.iloc['a']) fails: iloc needs an integer position, not a label.
print(labelled.iloc[1])
# c    4
# d    5
# e    6
# Name: b, dtype: int64

print(labelled['c'])  # plain [] selects a column
# a    1
# b    4
# Name: c, dtype: int64

# The heading below is kept as originally printed. The two lookups are what
# data.ix['a'] and data.ix[0] used to resolve to; both return the same row.
print('使用ix 方法')
print(labelled.loc['a'])   # formerly data.ix['a']
# c    1
# d    2
# e    3
# Name: a, dtype: int64
print(labelled.iloc[0])    # formerly data.ix[0]
# c    1
# d    2
# e    3
# Name: a, dtype: int64
print('-----')
print('-----')

# ---------------------------------------------------------------------------
# 1. Renaming columns
# ---------------------------------------------------------------------------
renamed = pd.DataFrame(np.arange(12).reshape(6, 2), columns=['a', 'b'])
print(renamed)
#     a   b
# 0   0   1
# 1   2   3
# 2   4   5
# 3   6   7
# 4   8   9
# 5  10  11

# inplace=True modifies the existing frame instead of returning a new one.
renamed.rename(columns={'a': 'key1', 'b': 'key2'}, inplace=True)
print(renamed)
#    key1  key2
# 0     0     1
# 1     2     3
# 2     4     5
# 3     6     7
# 4     8     9
# 5    10    11

d = pd.Series([1, 2, 3, 4, 2, 4, 1])
print(d)
print(d.unique())
print(d.value_counts())

# Boolean Series: is each value of d contained in the given list?
i = d.isin([1, 2, 5, 7])
print(i)
# 0     True
# 1     True
# 2    False
# 3    False
# 4     True
# 5    False
# 6     True
# dtype: bool

# ---------------------------------------------------------------------------
# 2. Replacing values with replace()
# ---------------------------------------------------------------------------
replaced = pd.DataFrame(np.arange(6).reshape(3, 2), columns=['a', 'b'])
print(replaced)
#    a  b
# 0  0  1
# 1  2  3
# 2  4  5

replaced['a'] = replaced['a'].replace({2: 100, 4: 200})  # 2 -> 100, 4 -> 200
print(replaced)
#      a  b
# 0    0  1
# 1  100  3
# 2  200  5

# Changing a single value in a table of records.
columns = ['age', 'name', 'height', 'sex']
people = pd.DataFrame([[12, 'zhangsan', 170, 'male'],
                       [18, 'lisi', 178, 'female']],
                      columns=columns)
print(people)
#    age      name  height     sex
# 0   12  zhangsan     170    male
# 1   18      lisi     178  female

# Change zhangsan's height to 190.
people['height'] = people['height'].replace({170: 190})
print(people)

# ---------------------------------------------------------------------------
# 3. Selecting data
# ---------------------------------------------------------------------------
# The selection runs on the a/b frame from the replace example: the people
# table has no column 'b', so selecting on it would raise KeyError.
print(replaced[replaced['b'] > 4])  # rows whose column 'b' is greater than 4
#      a  b
# 2  200  5

# Creating a new column 'test' derived from an existing one.
flagged = pd.DataFrame(np.arange(12).reshape(3, 4),
                       columns=['a', 'b', 'c', 'd'])
print(flagged)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

s = flagged['b'] > 4
flagged['test'] = np.where(s, 1, 0)  # 1 where b > 4, otherwise 0
print(flagged)
#    a  b   c   d  test
# 0  0  1   2   3     0
# 1  4  5   6   7     1
# 2  8  9  10  11     1

# ---------------------------------------------------------------------------
# Scalar and slice access
# ---------------------------------------------------------------------------
data = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
print(data)
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

print(data.loc[1, 'a'])       # column 'a' at row label 1 (the second row) -> 4
print(data.iloc[0, 1])        # first row, second column -> 1 (formerly data.ix[0, 1])
print(data.iloc[0:2, 0:2])    # the given row and column ranges
#    a  b
# 0  0  1
# 1  4  5

转载请注明原文地址: https://www.6miu.com/read-2613780.html

技术

最新回复(0)