pandas入门《利用Python进行数据分析》读书笔记第5章

xiaoxiao2021-02-27 691

pandas入门

源码点这里

#引入pandas from pandas import Series,DataFrame import pandas as pd

pandas的两个主要数据结构：Series和DataFrame

Series 是一种类似于一维数组的对象，它由一组数据以及一组与之相关的数据标签（即索引）组成。

obj=Series([4,7,-5,3]) obj 0 4 1 7 2 -5 3 3 dtype: int64 obj.values obj.index RangeIndex(start=0, stop=4, step=1) #对各个数据点进行标记的索引 obj2=Series([4,7,-5,3],index=['d','b','a','c']) obj2 d 4 b 7 a -5 c 3 dtype: int64 obj2['a'] -5 obj2['d'] 4 obj2[['c','a','d']] c 3 a -5 d 4 dtype: int64 #Numpy数运算（如根据布尔型数组进行过滤、标量乘法、应用数学函数等）都会保留索引和值之间的连接 obj2[obj2>0] d 4 b 7 c 3 dtype: int64 obj2*2 d 8 b 14 a -10 c 6 dtype: int64 import numpy as np np.exp(obj2) d 54.598150 b 1096.633158 a 0.006738 c 20.085537 dtype: float64 #可以将Series看成是一个定长的有序字典，因为它是索引值到数据值的一个映射 #可用通过字典来创建Series sdata={'0hio':35000,'Texta':79300,'Ohjsodf':16000,'Jsdf':5000} obj3=Series(sdata) obj3 0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64 states=['California','0hio','Texta','Ohjsodf'] obj4=Series(sdata,index=states) obj4 California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64 #NaN在pandas中表示缺失或NA值 pd.isnull(obj4)#检测缺失值 California True 0hio False Texta False Ohjsodf False dtype: bool pd.notnull(obj4) California False 0hio True Texta True Ohjsodf True dtype: bool #Series也有类似的实例方法 obj4.isnull() California True 0hio False Texta False Ohjsodf False dtype: bool **下面重点关注如何处理缺失数据** #Series一个重要的功能：它在算数运算中会自动对齐不同索引的数据 obj3 0hio 35000 Jsdf 5000 Ohjsodf 16000 Texta 79300 dtype: int64 obj4 California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 dtype: float64 obj3+obj4 0hio 70000.0 California NaN Jsdf NaN Ohjsodf 32000.0 Texta 158600.0 dtype: float64 #Series对象本身及其索引都有一个name属性，该属性跟pandas其他的关键功能关系非常密切 obj4.name='population' obj4.index.name='state' obj4 state California NaN 0hio 35000.0 Texta 79300.0 Ohjsodf 16000.0 Name: population, dtype: float64 obj 0 4 1 7 2 -5 3 3 dtype: int64 obj.index=['Bob','Steve','Jeff','Ryan']#通过直接赋值方式就地修改Series的索引 obj Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64

DataFrame

####DataFrame是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型（数值、字符串、布尔值等） ####DataFrame既有行索引也有列索引，它可以被看做Series组成的字典（共同用一个索引） #DataFrame的构造 data={'state':['0hi0','0hio','0hio','Nevada','Nevada'], 'year':[2000,2001,2002,2001,2002], 'pop':[1.5,1.7,3.6,2.4,2.9] } frame=DataFrame(data)#DataFrame会自动加上索引，且全部序列会被有序排列 frame pop state year 0 1.5 0hi0 2000 1 1.7 0hio 2001 2 3.6 0hio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002 #如果指定了序列，则DataFrame的列就会按照指定顺序进行排列 DataFrame(data,columns=['year','state','pop']) year state pop 0 2000 0hi0 1.5 1 2001 0hio 1.7 2 2002 0hio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9 #如果传入的列在数据中找不到，就会产生NA值 frame2=DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five']) frame2 year state pop debt one 2000 0hi0 1.5 NaN two 2001 0hio 1.7 NaN three 2002 0hio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN frame2.columns Index(['year', 'state', 'pop', 'debt'], dtype='object') #可以将DataFrame的列获取为一个Series： frame2['state'] one 0hi0 two 0hio three 0hio four Nevada five Nevada Name: state, dtype: object frame2.year one 2000 two 2001 three 2002 four 2001 five 2002 Name: year, dtype: int64 #行可以通过位置或名称的方式进行获取，比如用索引字段ix frame2.ix['three'] year 2002 state 0hio pop 3.6 debt NaN Name: three, dtype: object #列可以通过赋值的方式进行修改 frame2['debt']=16. frame2 frame2['debt']=np.arange(5.)
frame2 year state pop debt one 2000 0hi0 1.5 0.0 two 2001 0hio 1.7 1.0 three 2002 0hio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0 #将列表或数组赋值给某个列时，其长度必须跟DataFrame的长度相匹配。 # 如果赋值的是一个Series，就会精确匹配DataFrame的索引，所有的空位将会被填上缺失值 val=Series([-1.2,-1.5,-1.7],index=['two','four','five']) frame2['debt']=val frame2 year state pop debt one 2000 0hi0 1.5 NaN two 2001 0hio 1.7 -1.2 three 2002 0hio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7 #为不存在的列赋值会创建出一个新列。关键字del用于删除列 frame2['eastern']=frame2.state=='0hi0' frame2 year state pop debt eastern one 2000 0hi0 1.5 NaN True two 2001 0hio 1.7 -1.2 False three 2002 0hio 3.6 NaN False four 2001 Nevada 2.4 -1.5 False five 2002 Nevada 2.9 -1.7 False del frame2['eastern'] frame2.columns Index(['year', 'state', 'pop', 'debt'], dtype='object') ###通过索引方式返回的列只是相应数据的视图而已，并不是副本。因此对返回的Series所作的任何修改 ###全都会反映到源DataFrame上。通过Series的copy方法即可以显式地复制列 #另一种常见的数据形式是嵌套字典（也就是字典的字典） pop={'Nevada':{2001:2.4,2002:2.9},'0hio':{2000:1.5,2001:1.7,2002:3.6}} #外层字典的键作为列，内层字典的键作为行索引 frame3=DataFrame(pop) frame3 0hio Nevada 2000 1.5 NaN 2001 1.7 2.4 2002 3.6 2.9 frame3.T 2000 2001 2002 0hio 1.5 1.7 3.6 Nevada NaN 2.4 2.9 DataFrame(pop,index=[2001,2002,2003]) 0hio Nevada 2001 1.7 2.4 2002 3.6 2.9 2003 NaN NaN #如果设置了DataFrame的index和columns的name属性，则这些信息也会被显示出来 frame3.index.name='year' frame3.columns.name='state' frame3 state 0hio Nevada year 2000 1.5 NaN 2001 1.7 2.4 2002 3.6 2.9 #跟Series一样，values属性也会以二维ndarray的形式返回DataFrame中的数据 frame3.values array([[ 1.5, nan], [ 1.7, 2.4], [ 3.6, 2.9]]) #如果DataFrame各列的数据类型不同，则值数组的数据类型就会选用能兼容所有列的数据类型 frame2.values array([[2000, '0hi0', 1.5, nan], [2001, '0hio', 1.7, -1.2], [2002, '0hio', 3.6, nan], [2001, 'Nevada', 2.4, -1.5], [2002, 'Nevada', 2.9, -1.7]], dtype=object)

索引对象

obj=Series(range(3),index=['a','b','c']) index=obj.index index Index([‘a’, ‘b’, ‘c’], dtype=’object’) index[1:] Index([‘b’, ‘c’], dtype=’object’) #Index对象是不能修改的（immutable），因此用户不能对其进行修改 index[1]='d' ————————————————————————— TypeError Traceback (most recent call last) in () 1 #Index对象是不能修改的（immutable），因此用户不能对其进行修改 —-> 2 index[1]=’d’ C:\Users\ZJL\AppData\Local\Programs\Python\Python35\lib\site-packages\pandas\indexes\base.py in __setitem__(self, key, value) 1402 1403 def __setitem__(self, key, value): -> 1404 raise TypeError(“Index does not support mutable operations”) 1405 1406 def __getitem__(self, key): TypeError: Index does not support mutable operations #不可修改行保证了Index对象在多个数据结构之间安全共享 index=pd.Index(np.arange(3)) obj2=Series([1.5,-2.5,0],index=index) obj2.index is index True frame3 '0hio' in frame3.columns True 2002in frame3.index True ##重新索引 #pandas对象的一个重要方法是reindex,其作用是创建一个适应新索引的新对象。 obj=Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c']) obj d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64 obj2=obj.reindex(['a','b','c','d','e'])#索引值不存在引入缺失值 obj2 a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64 obj.reindex(['a','b','c','d','e'],fill_value=0) a -5.3 b 7.2 c 3.6 d 4.5 e 0.0 dtype: float64 #对于时间序列这样的有序数据，重新索引时可能需要做一些插值处理。method选项可达到此目的 obj3=Series(['blue','purple','yellow'],index=[0,2,4]) obj3.reindex(range(6),method='ffill') 0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object #reindex的（插值） method选项 ffill或pad 前向填充（或搬运）值 #bfill或backfill 后向填充（或搬运）值 #如果仅传入一个序列，则会重新索引行 frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','b','c'],columns=['0hio','Texas','California']) frame 0hioTexasCaliforniaa012b345c678 frame2=frame.reindex(['a','b','c','d']) frame2 0hioTexasCaliforniaa0.01.02.0b3.04.05.0c6.07.08.0dNaNNaNNaN states=['Texas','Utah','California'] frame.reindex(columns=states) TexasUtahCaliforniaa1NaN2b4NaN5c7NaN8 #也可以同时对行和列进行重新索引，而插值只能按行应用（即轴0） frame.reindex(index=['a','b','c','d'],method='ffill',columns=states) TexasUtahCaliforniaa1NaN2b4NaN5c7NaN8d7NaN8 
#利用ix的标签索引功能，重新索引任务可以变得更简洁 frame.ix[['a','b','c','d'],states] TexasUtahCaliforniaa1.0NaN2.0b4.0NaN5.0c7.0NaN8.0dNaNNaNNaN

丢弃指定轴上的项

drop方法返回一个在指定轴上删除了指定值的新对象

obj=Series(np.arange(5.0),index=['a','b','c','d','e']) new_obj=obj.drop('c') new_obj a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 obj.drop(['d','c']) a 0.0 b 1.0 e 4.0 dtype: float64 #对于DataFrame，可以删除任意轴上的索引值 data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four']) data onetwothreefour0hio0123Colorado4567Utah891011New York12131415 data.drop(['Colorado','0hio']) data.drop('two',axis=1) onethreefour0hio023Colorado467Utah81011New York121415 data.drop(['two','four'],axis=1) onethree0hio02Colorado46Utah810New York1214

索引、选取和过滤

obj=Series(np.arange(4.),index=['a','b','c','d']) obj['b'] 1.0 obj[1] 1.0 obj[2:4] c 2.0 d 3.0 dtype: float64 obj[['b','a','d']] b 1.0 a 0.0 d 3.0 dtype: float64 obj[[1,3]] b 1.0 d 3.0 dtype: float64 obj[obj<2] a 0.0 b 1.0 dtype: float64 #利用标签的切片运算与普通的Python切片运算不同，其末端是包含的 obj['b':'c'] b 1.0 c 2.0 dtype: float64 obj['b':'c']=5 obj a 0.0 b 5.0 c 5.0 d 3.0 dtype: float64 #对DataFrame进行索引其实就是获取一个或多个列 data=DataFrame(np.arange(16).reshape((4,4)),index=['0hio','Colorado','Utah','New York'],columns=['one','two','three','four']) data onetwothreefour0hio0123Colorado4567Utah891011New York12131415 data['two'] 0hio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32 data[['two','three']] twothree0hio12Colorado56Utah910New York1314 #这种索引有几个特殊的情况。首先通过切片或布尔型数组选取行 data[:2] onetwothreefour0hio0123Colorado4567 data[data['three']>5] onetwothreefourColorado4567Utah891011New York12131415 data<5 onetwothreefour0hioTrueTrueTrueTrueColoradoTrueFalseFalseFalseUtahFalseFalseFalseFalseNew YorkFalseFalseFalseFalse data[data<5]=0 data onetwothreefour0hio0000Colorado0567Utah891011New York12131415 #引入专门的索引字段ix选取行和列的子集 data.ix['Colorado',['two','three']] two 5 three 6 Name: Colorado, dtype: int32 data.ix[['Colorado','Utah'],[3,0,1]] fouronetwoColorado705Utah1189 data.ix[data.three>5,:3] onetwothreeColorado056Utah8910New York121314

算术运算和数据对齐

#将对象相加时，如果存在不同的索引，则结果的索引就是该索引对的并集 s1=Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e']) s2=Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g']) s1 a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64 s2 a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64 s1+s2 a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64 #对于DataFrame,对齐操作会同时发生在行和列上： df1=DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['0hio','Texas','Colorado']) df2=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon']) df1 bcd0hio0.01.02.0Texas3.04.05.0Colorado6.07.08.0 df2 bdeUtah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 df1+df2 #没有重叠的部分会产生NA值 bcde0hio3.0NaN6.0NaNColoradoNaNNaNNaNNaNOregonNaNNaNNaNNaNTexas9.0NaN12.0NaNUtahNaNNaNNaNNaN df1=DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd')) df2=DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde')) df1 abcd00.01.02.03.014.05.06.07.028.09.010.011.0 df2 abcde00.01.02.03.04.015.06.07.08.09.0210.011.012.013.014.0315.016.017.018.019.0 df1+df2 abcde00.02.04.06.0NaN19.011.013.015.0NaN218.020.022.024.0NaN3NaNNaNNaNNaNNaN df1.add(df2,fill_value=0)#空缺填充值 add sub div mul abcde00.02.04.06.04.019.011.013.015.09.0218.020.022.024.014.0315.016.017.018.019.0 #与此类似，在对Series或DataFrame重新索引时，也可以指定一个填充值 df1.reindex(columns=df2.columns,fill_value=0) abcde00.01.02.03.0014.05.06.07.0028.09.010.011.00

DataFrame和Series之间的运算

#计算一个二维数组与其某行之间的差： arr=np.arange(12.).reshape((3,4)) arr array([[ 0., 1., 2., 3.], [ 4., 5., 6., 7.], [ 8., 9., 10., 11.]]) arr[0] array([ 0., 1., 2., 3.]) arr-arr[0] array([[ 0., 0., 0., 0.], [ 4., 4., 4., 4.], [ 8., 8., 8., 8.]]) #这就叫做广播 DataFrame和Series之间的运算差不多也是如此 frame=DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','0hio','Texas','Oregon']) frame bdeUtah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 series=frame.ix[0] series b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64 frame-series bdeUtah0.00.00.00hio3.03.03.0Texas6.06.06.0Oregon9.09.09.0 #如果某个索引值在DataFrame的列或Series的索引中找不到，则参与运算的两个对象就会被重新索引以形成并集 series2=Series(range(3),index=['b','e','f']) frame+series2 bdefUtah0.0NaN3.0NaN0hio3.0NaN6.0NaNTexas6.0NaN9.0NaNOregon9.0NaN12.0NaN #如果希望匹配行且在列上广播，则必须使用算术运算方法 series3=frame['d'] frame bdeUtah0.01.02.00hio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 series3 Utah 1.0 0hio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64 frame.sub(series3,axis=0)#传入的轴号就是希望匹配的轴，这里目的是匹配DataFrame的行索引进行广播 bdeUtah-1.00.01.00hio-1.00.01.0Texas-1.00.01.0Oregon-1.00.01.0

函数应用和映射

#Numpy的ufuncs(元素级数组方法）也可以用于操作pandas对象 frame=DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','0hio','Texas','Oregon']) frame bdeUtah-0.191031-0.004688-0.3299700hio0.7082490.265398-2.346897Texas1.064349-1.811846-0.899921Oregon0.334061-1.0585060.655632 np.abs(frame) bdeUtah0.1910310.0046880.3299700hio0.7082490.2653982.346897Texas1.0643491.8118460.899921Oregon0.3340611.0585060.655632 #将函数应用到由各列或行所形成的一维数组上。DataFrame的apply方法即可实现此功能： f=lambda x:x.max()-x.min() frame.apply(f) b 1.255380 d 2.077245 e 3.002529 dtype: float64 frame.apply(f,axis=1)#按列运算 Utah 0.325281 0hio 3.055145 Texas 2.876195 Oregon 1.714138 dtype: float64 #传给apply的函数还可以返回由多个值组成的Series: def f(x): return Series([x.min(),x.max()],index=['min','max']) frame.apply(f) bdemin-0.191031-1.811846-2.346897max1.0643490.2653980.655632 #frame中各个浮点值IDE格式化字符串，使用applymap(format) format=lambda x:'%.2f' %x frame.applymap(format) bdeUtah-0.19-0.00-0.330hio0.710.27-2.35Texas1.06-1.81-0.90Oregon0.33-1.060.66 #之所以叫applymap,是因为Series有一个用于应用元素级函数的map方法 frame['e'].map(format) Utah -0.33 0hio -2.35 Texas -0.90 Oregon 0.66 Name: e, dtype: object ###排序和排名 #sort_index方法返回一个已排序的新对象 obj=Series(range(4),index=['d','a','b','c']) obj.sort_index() a 1 b 2 c 3 d 0 dtype: int32 obj=Series(range(4),index=list('bacd')) obj.sort_index() a 1 b 0 c 2 d 3 dtype: int32 #对于DataFrame,则可以根据任意一个轴上的索引进行排序 frame=DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c']) frame.sort_index() dabcone4567three0123 frame.sort_index(axis=1) abcdthree1230one5674 frame.sort_index(axis=1,ascending=False)#数据默认是按升序排列， dcbathree0321one4765 #按值对Series进行排序，可以使用其sort_value方法 obj=Series([4,7,-3,2]) obj.sort_values() 2 -3 3 2 0 4 1 7 dtype: int64 #在排序时，任何缺失值默认都会被放到Series的末尾 obj=Series([4,np.nan,7,np.nan,-3,2]) obj.sort_values() 4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64 #DataFrame，可以根据一个或多个列的值进行排序 frame=DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]}) frame ab00411720-3312 frame.sort_values(by='b')#sort_index方法过时 
ab20-3312004117 #根据多个列进行排序，传入名称的列表 frame.sort_values(by=['a','b']) ab20-3004312117 #排名（ranking) 增设一个排名值默认为各组分配一个平均排名 obj=Series([7,-5,7,4,2,0,4]) obj.rank() 0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64 #根据值在原数据中出现的顺序给出排名 obj.rank(method='first') 0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64 #按降序进行排名 obj.rank(ascending=False,method='max') 0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64 #DataFrame可以在行或列上计算排名 frame=DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]}) frame abc004.3-2.0117.05.020-3.08.0312.0-2.5 frame.rank(axis=1) abc02.03.01.011.03.02.022.01.03.032.03.01.0

带有重复值的轴索引

obj=Series(range(5),index=['a','a','b','b','c']) obj a 0 a 1 b 2 b 3 c 4 dtype: int32 #索引的is_unique值会告诉你它的值是否是唯一的 obj.index.is_unique False obj['a'] a 0 a 1 dtype: int32 obj['c'] 4 df=DataFrame(np.random.randn(4,3),index=['a','a','b','b']) df 012a-0.524361-0.145395-1.322196a-0.666326-0.4966121.486401b-0.395841-0.9211940.260437b-0.187285-0.4560141.434571 df.ix['b'] 012b-0.395841-0.9211940.260437b-0.187285-0.4560141.434571

汇总和计算描述统计

df=DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two']) df onetwoa1.40NaNb7.10-4.5cNaNNaNd0.75-1.3 #调用DataFrame的sum方法将会返回一个含有列小计的Series df.sum() one 9.25 two -5.80 dtype: float64 #传入axis=1将会按行进行求和运算 df.sum(axis=1) a 1.40 b 2.60 c NaN d -0.55 dtype: float64 #NA值会被自动排除，通过skipna选项可以禁用该功能 df.mean(axis=1,skipna=False) a NaN b 1.300 c NaN d -0.275 dtype: float64 #有些方法（如idxmin和idxmax返回的是间接统计（比如达到最小值或最大值的索引） df.idxmax() one b two d dtype: object #另一些方法则是累计型的 df.cumsum() onetwoa1.40NaNb8.50-4.5cNaNNaNd9.25-5.8 #还有方法既不是约简型也不是累计型，describe就是一个例子，它用于一次产生多个汇总统计 df.describe() onetwocount3.0000002.000000mean3.083333-2.900000std3.4936852.262742min0.750000-4.50000025%1.075000-3.70000050%1.400000-2.90000075%4.250000-2.100000max7.100000-1.300000 #对于非数值型数据，describe会产生另外一种汇总统计 obj=Series(['a','a','b','c']*4) obj.describe() count 16 unique 3 top a freq 8 dtype: object ##相关系数与协方差 #数据来自Yahoo！Finance的股票价格和成交量 # import pandas.io.data as web #版本更新改动了 import pandas_datareader.data as web all_data={} for ticker in ['AAPL','IBM','MSFT','GOOG']: all_data[ticker]=web.get_data_yahoo(ticker,'1/1/2000','1/1/2010') price=DataFrame({tic:data['Adj Close'] for tic,data in all_data.items()}) volume=DataFrame({tic:data['Volume'] for tic,data in all_data.items()}) #计算价格的百分数编号 returns=price.pct_change() returns.tail() AAPLGOOGIBMMSFTDate2009-12-240.0343390.0111170.0043850.0025872009-12-280.0122940.0070980.0133260.0054842009-12-29-0.011861-0.005571-0.0034770.0070582009-12-300.0121470.0053760.005461-0.0136992009-12-31-0.004300-0.004416-0.012597-0.015504 #Series的corr方法用于计算两个Series中重叠的，非NA的，按索引对其的相关系数。与此类似，cov用于计算协方差 returns.MSFT.corr(returns.IBM) 0.49597963862836764 #DataFrame的corr和cov方法将以DataFrame的形式返回完整的相关系数或协方差矩阵 returns.corr() AAPLGOOGIBMMSFTAAPL1.0000000.4706760.4100110.424305GOOG0.4706761.0000000.3906890.443587IBM0.4100110.3906891.0000000.495980MSFT0.4243050.4435870.4959801.000000 returns.cov() 
AAPLGOOGIBMMSFTAAPL0.0010270.0003030.0002520.000309GOOG0.0003030.0005800.0001420.000205IBM0.0002520.0001420.0003670.000216MSFT0.0003090.0002050.0002160.000516 #DataFrame的corrwith方法，可以计算其列或行跟另一个Series或DataFrame之间的相关系数。传入一个Series将会返回一个相关系数值Series #（针对各列进行计算） returns.corrwith(returns.IBM) AAPL 0.410011 GOOG 0.390689 IBM 1.000000 MSFT 0.495980 dtype: float64 #传入一个DataFrame则会计算按列名配对的相关系数。这里，计算百分比变化与成交量的相关系数 returns.corrwith(volume) ————————————————————————— NameError Traceback (most recent call last) in () 1 #传入一个DataFrame则会计算按列名配对的相关系数。这里，计算百分比变化与成交量的相关系数 —-> 2 returns.corrwith(volume) NameError: name ‘returns’ is not defined #传入axis=1可按行进行计算，无论如何，在计算相关系数之前，所有的数据项都会按标签对齐

唯一值、值计数以及成员资格

obj=Series(['c','a','d','a','a','b','b','c','c']) uniques=obj.unique() uniques array([‘c’, ‘a’, ‘d’, ‘b’], dtype=object) obj.value_counts() c 3 a 3 b 2 d 1 dtype: int64 #可以用于任何数组或序列 pd.value_counts(obj.values,sort=False) d 1 b 2 a 3 c 3 dtype: int64 #isin用于判断矢量化集合的成员资格，选取子集 mask=obj.isin(['b','c']) mask 0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool obj[mask] 0 c 5 b 6 b 7 c 8 c dtype: object #得到DataFrame中多个相关列的一张柱状图 data=DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]}) data Qu1Qu2Qu301211335241233244434 result=data.apply(pd.value_counts).fillna(0) result Qu1Qu2Qu311.01.01.020.02.01.032.02.00.042.00.02.050.00.01.0

处理缺失数据

string_data=Series(['aaedvark','artichoke',np.nan,'avocado']) string_data 0 aaedvark 1 artichoke 2 NaN 3 avocado dtype: object string_data.isnull() 0 False 1 False 2 True 3 False dtype: bool string_data[0]=None string_data.isnull() 0 True 1 False 2 True 3 False dtype: bool ###滤除缺失数据 #对于一个Series,dropna返回一个仅含非空数据和索引值的Series from numpy import nan as NA import numpy as np import pandas as pd from pandas import Series,DataFrame data=Series([1,NA,3.5,NA,7]) data.dropna() 0 1.0 2 3.5 4 7.0 dtype: float64 data[data.notnull()] 0 1.0 2 3.5 4 7.0 dtype: float64 #对于DataFrame对象，dropna默认丢弃任何含有缺失值的行 data=DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]]) clearned=data.dropna() data 01201.06.53.011.0NaNNaN2NaNNaNNaN3NaN6.53.0 clearned 01201.06.53.0 data.dropna(how='all')#丢弃全为NA的行 01201.06.53.011.0NaNNaN3NaN6.53.0 data[4]=NA data 012401.06.53.0NaN11.0NaNNaNNaN2NaNNaNNaNNaN3NaN6.53.0NaN data.dropna(axis=1,how='all') 01201.06.53.011.0NaNNaN2NaNNaNNaN3NaN6.53.0 df=DataFrame(np.random.randn(7,3)) df.ix[:4,1]=NA df.ix[:2,2]=NA df 0120-1.637463NaNNaN1-1.259674NaNNaN2-0.284635NaNNaN30.818905NaN-1.8782444-2.402401NaN-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312 df.dropna(thresh=3) 0125-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312

填充缺失数据

df.fillna(0) 0120-1.6374630.0000000.0000001-1.2596740.0000000.0000002-0.2846350.0000000.00000030.8189050.000000-1.8782444-2.4024010.000000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312 df.fillna({1:0.5,2:-1}) 0120-1.6374630.500000-1.0000001-1.2596740.500000-1.0000002-0.2846350.500000-1.00000030.8189050.500000-1.8782444-2.4024010.500000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312 #fillna默认会返回新对象，但也可以对现有对象进行就地修改 _=df.fillna(0,inplace=True) df 0120-1.6374630.0000000.0000001-1.2596740.0000000.0000002-0.2846350.0000000.00000030.8189050.000000-1.8782444-2.4024010.000000-0.5339425-0.623351-1.472599-0.8606146-0.194565-1.757851-1.251312 df=DataFrame(np.random.randn(6,3)) df.ix[2:,1]=NA df.ix[4:,2]=NA df 01200.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613NaN0.87806231.188953NaN-0.1255614-0.512800NaNNaN5-0.383978NaNNaN df.fillna(method='ffill') 01200.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613-0.4297560.87806231.188953-0.429756-0.1255614-0.512800-0.429756-0.1255615-0.383978-0.429756-0.125561 df.fillna(method='ffill',limit=2) 01200.1737990.2674220.48014111.303258-0.429756-0.7906612-0.110613-0.4297560.87806231.188953-0.429756-0.1255614-0.512800NaN-0.1255615-0.383978NaN-0.125561 data=Series([1.,NA,3.5,NA,7]) data.fillna(data.mean()) 0 1.000000 1 3.833333 2 3.500000 3 3.833333 4 7.000000 dtype: float64

层次化索引

#能在一个轴上拥有多个（两个以上）索引级别。能以低维度形式处理高维度数据 data=Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]]) data a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64 data.index MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]]) data['b'] 1 0.501535 2 1.547647 3 -0.038850 dtype: float64 data['b':'c'] b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64 data.ix[['b','c']] b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 dtype: float64 data[:,2] a 0.276982 b 1.547647 c -0.905470 d -1.697117 dtype: float64 data.unstack() 1 2 3 a -2.059265 0.276982 -1.771092 b 0.501535 1.547647 -0.038850 c 1.963156 -0.905470 NaN d NaN -1.697117 -0.659792 data.unstack().stack() a 1 -2.059265 2 0.276982 3 -1.771092 b 1 0.501535 2 1.547647 3 -0.038850 c 1 1.963156 2 -0.905470 d 2 -1.697117 3 -0.659792 dtype: float64 #对于一个DataFrame，每条轴都可以有分层索引： frame=DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]], columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']]) frame Ohio Colorado Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 frame.index.names=['key1','key2'] frame.columns.names=['state','color'] frame state Ohio Colorado color Green Red Green key1 key2 a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11 frame['Ohio'] color Green Red key1 key2 a 1 0 1 2 3 4 b 1 6 7 2 9 10

重排分级顺序

frame.swaplevel('key1','key2') stateOhioColoradocolorGreenRedGreenkey2key11a0122a3451b6782b91011 frame.sortlevel(1) stateOhioColoradocolorGreenRedGreenkey1key2a1012b1678a2345b291011 frame.swaplevel(0,1).sortlevel(0) stateOhioColoradocolorGreenRedGreenkey2key11a012b6782a345b91011

根据级别汇总统计

frame.sum(level='key2') state Ohio Colorado color Green Red Green key2 1 6 8 10 2 12 14 16 frame.sum(level='color',axis=1) color Green Red key1 key2 a 1 2 1 2 8 4 b 1 14 7 2 20 10

使用DataFrame的列

#DataFrame中的列与行索引之间的变化 frame=DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'], 'd':[0,1,2,0,1,2,3]}) frame abcd007one0116one1225one2334two0443two1552two2661two3 frame2=frame.set_index(['c','d']) frame2 abcdone007116225two034143252361 frame.set_index(['c','d'],drop=False) abcdcdone007one0116one1225one2two034two0143two1252two2361two3 frame2.reset_index() cdab0one0071one1162one2253two0344two1435two2526two361

其他有关pandas的话题

整数索引

ser=Series(np.arange(3.)) ser 0 0.0 1 1.0 2 2.0 dtype: float64 ser[1] 1.0 ser[-1]#这里会报错 ser2=Series(np.arange(3.),index=['a','b','c'])#非整数索引没有歧义 ser2[-1] 2.0 ser.ix[:1] 0 0.0 1 1.0 dtype: float64 ser3=Series(range(3),index=[-5,1,3]) ser3.iloc[2]#基于位置的索引 iget_value方法过时 2 frame=DataFrame(np.arange(6).reshape(3,2),index=[2,0,1]) frame.iloc[0]#irow和icol方法过时 0 0 1 1 Name: 2, dtype: int32

面板数据

#pandas中有一个Panel数据结构，可以用一个DataFrame对象组成的字典或一个三维ndarray来创建Panel对象 from pandas_datareader import data as web pdata=pd.Panel(dict((stk,web.get_data_yahoo(stk,'1/1/2009','6/1/2012') ) for stk in ['AAPL','GOOG','MSFT','DELL'])) pdata#Panel中的每一项（类似于DataFrame中的列）都是一个DataFrame

转载请注明原文地址: https://www.6miu.com/read-139.html

2012-2014专利技术

最新回复(0)

pandas入门 《利用Python进行数据分析》读书笔记 第5章