pandas 主要用于数据处理,在数据预处理方面应用尤其广泛。以下是一些基本操作:
import pandas as pd

# Load the nutrition table; the result is a DataFrame.
# (The module is imported as `pd`, so the bare name `pandas` is not defined.)
food_info = pd.read_csv("food_info.csv")
print(type(food_info))
# Per-column dtypes of the DataFrame.
print(food_info.dtypes)

# First three rows:
# print(food_info.head(3))
# Last four rows:
# print(food_info.tail(4))

# Column labels.
print(food_info.columns)
# (number of rows, number of columns).
print(food_info.shape)

# .loc selects rows by index label.
print(food_info.loc[0])  # the row labelled 0

# Row selection. These bare expressions only display something in an
# interactive session / notebook; in a script the result is discarded.
# 1) Label slice: .loc slices include BOTH endpoints, so this is rows 3, 4, 5, 6.
food_info.loc[3:6]
# 2) A list of labels held in a variable (rows 2, 5 and 10).
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten]
# 3) The same list written inline.
food_info.loc[[2, 5, 10]]

# Select the single column "NDB_No".
ndb_col = food_info["NDB_No"]

# Select two columns at once by passing a list of names.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]

# Keep only the columns whose name ends with "(g)".
col_names = food_info.columns.tolist()
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))

# Column arithmetic is element-wise: divide every value in the column by 1000.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)

# Sorting by "Sodium_(mg)". inplace=True reorders food_info itself and
# returns None; inplace=False (the default) leaves food_info untouched and
# returns a new, sorted DataFrame. The sort is ascending by default.
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
# Same column, descending order instead of ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])

# Example that follows: processing the Titanic data set
# (original heading: "实例: 对泰坦尼克号数据的处理").
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
# titanic_survival.head()

# Count the passengers whose age is missing.
age = titanic_survival["Age"]  # the "Age" column
age_is_null = pd.isnull(age)  # boolean mask, True where the age is missing
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print(age_null_count)

# Naive mean without any preprocessing: NaN propagates through the built-in
# sum(), so the result is nan whenever a value is missing.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

# Manual fix: drop the missing entries first (~ inverts the boolean mask).
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Series.mean() skips missing values by itself.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)

# 1. Average fare for each passenger class, computed by hand.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)

# 2. pivot_table does the same kind of grouping in one call: mean of
#    "Survived" per "Pclass". The aggregation is named by string because
#    passing np.mean / np.sum is deprecated in recent pandas releases.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)

# With no aggfunc given, pivot_table uses the mean: average age per class.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_survival)

# Totals of "Fare" and "Survived" per port of embarkation.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)

# dropna removes missing data: axis=1 drops every COLUMN containing a missing
# value; axis=0 with subset= drops ROWS missing any of the listed columns.
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])

# Look up single cells by (row label, column name).
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the variable name says 1000 but the row looked up is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)


def hundredth_item(column):
    """Return the entry of *column* stored under index label 99.

    With a default 0-based row index this is the 100th row.
    """
    return column.loc[99]


# DataFrame.apply calls the function once per column, so this collects the
# value at label 99 from every column. The helper has its own name so that the
# result does not overwrite the function it was computed with.
hundredth_row = titanic_survival.apply(hundredth_item)
print(hundredth_row)


def null_count(column):
    """Return how many entries of *column* are missing."""
    return int(pd.isnull(column).sum())


# Historical name kept for compatibility; despite the name it counts the
# missing (null) entries, not the non-null ones.
not_null_count = null_count

column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
Series 结构:在 pandas 中,DataFrame 是一种二维表格(矩阵)形式的数据结构;Series 则是一维的带索引数据,从 DataFrame 中取出的某一行或某一列就是一个 Series(同时取出多行或多列得到的仍然是 DataFrame)。
import pandas as pd
import numpy as np
from pandas import Series

fandango = pd.read_csv("fandango_score_comparison.csv")
series_film = fandango["FILM"]
series_rt = fandango["RottenTomatoes"]

# A Series can be indexed by strings: use the film names as the index and the
# RottenTomatoes scores as the values.
film_names = series_film.values
rt_scores = series_rt.values
series_custom = Series(rt_scores, index=film_names)

# Look up several films by label. A label that is not in the index raises
# KeyError, so the spelling must match the FILM column exactly.
# NOTE(review): the earlier version of this snippet also tried the labels
# without a space ("Minions(2015)"); the spaced form is presumably the one
# present in the CSV -- confirm against the data.
series_custom[["Minions (2015)", "Leviathan (2014)"]]

# Integer positions are also available on a string-indexed Series; .iloc makes
# it explicit that 5:10 means positions 5..9 rather than labels.
fiveten = series_custom.iloc[5:10]
print(fiveten)

# Reorder the Series alphabetically by film name.
original_index = series_custom.index.tolist()
# print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)