pandas

xiaoxiao2025-09-24 291

#pandas import pandas as pd import numpy as np from pandas import Series,DataFrame #Series obj = pd.Series([4,7,-5,3]) obj 0 4 1 7 2 -5 3 3 dtype: int64 obj.values array([ 4, 7, -5, 3], dtype=int64) obj.index RangeIndex(start=0, stop=4, step=1) obj2 = pd.Series([4,7,-5,3],index = ['d','b','a','c']) obj2 d 4 b 7 a -5 c 3 dtype: int64 obj2.index Index(['d', 'b', 'a', 'c'], dtype='object') #索引取值 obj2['a'] -5 obj2['d'] = 6 obj2[['c','a','d']] c 3 a -5 d 6 dtype: int64 obj2[obj2>0] d 6 b 7 c 3 dtype: int64 obj2*2 d 12 b 14 a -10 c 6 dtype: int64 np.exp(obj2) d 403.428793 b 1096.633158 a 0.006738 c 20.085537 dtype: float64 'b'in obj2 True 'r' in obj2 False sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000} obj3 = pd.Series(sdata) obj3 Ohio 35000 Texas 71000 Oregon 16000 Utah 5000 dtype: int64 states = ['California', 'Ohio', 'Oregon', 'Texas'] obj4 = pd.Series(sdata,index=states) obj4 California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64 #缺失值 pd.isnull(obj4) California True Ohio False Oregon False Texas False dtype: bool pd.notnull(obj4) California False Ohio True Oregon True Texas True dtype: bool obj4.isnull() California True Ohio False Oregon False Texas False dtype: bool #根据运算的索引标签自动对齐数据： obj3 Ohio 35000 Texas 71000 Oregon 16000 Utah 5000 dtype: int64 obj4 California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 dtype: float64 obj3 + obj4 California NaN Ohio 70000.0 Oregon 32000.0 Texas 142000.0 Utah NaN dtype: float64 obj4.name = 'population' obj4.index.name = 'state' obj4 state California NaN Ohio 35000.0 Oregon 16000.0 Texas 71000.0 Name: population, dtype: float64 obj 0 4 1 7 2 -5 3 3 dtype: int64 obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan'] obj Bob 4 Steve 7 Jeff -5 Ryan 3 dtype: int64 #DataFrame 表格型数据结构 data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 'year': [2000, 2001, 2002, 2001, 2002, 2003], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]} frame = pd.DataFrame(data) frame
stateyearpop0Ohio20001.51Ohio20011.72Ohio20023.63Nevada20012.44Nevada20022.95Nevada20033.2 frame.head() #前五行 stateyearpop0Ohio20001.51Ohio20011.72Ohio20023.63Nevada20012.44Nevada20022.9 pd.DataFrame(data,columns=['year','state','pop']) yearstatepop02000Ohio1.512001Ohio1.722002Ohio3.632001Nevada2.442002Nevada2.952003Nevada3.2 frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'], index=['one', 'two', 'three', 'four', 'five', 'six']) frame2 yearstatepopdebtone2000Ohio1.5NaNtwo2001Ohio1.7NaNthree2002Ohio3.6NaNfour2001Nevada2.4NaNfive2002Nevada2.9NaNsix2003Nevada3.2NaN frame2.columns Index(['year', 'state', 'pop', 'debt'], dtype='object') frame2['state'] #返回Series one Ohio two Ohio three Ohio four Nevada five Nevada six Nevada Name: state, dtype: object frame2.year one 2000 two 2001 three 2002 four 2001 five 2002 six 2003 Name: year, dtype: int64 frame2.loc['three'] #loc返回行数据 year 2002 state Ohio pop 3.6 debt NaN Name: three, dtype: object frame2['debt'] = 16.5 frame2 yearstatepopdebtone2000Ohio1.516.5two2001Ohio1.716.5three2002Ohio3.616.5four2001Nevada2.416.5five2002Nevada2.916.5six2003Nevada3.216.5 frame2['debt'] = np.arange(6.) 
frame2 yearstatepopdebtone2000Ohio1.50.0two2001Ohio1.71.0three2002Ohio3.62.0four2001Nevada2.43.0five2002Nevada2.94.0six2003Nevada3.25.0 val = pd.Series([-1.2,-1.5,-1.7],index = ['two','four','five']) frame2['debt'] = val frame2 yearstatepopdebtone2000Ohio1.5NaNtwo2001Ohio1.7-1.2three2002Ohio3.6NaNfour2001Nevada2.4-1.5five2002Nevada2.9-1.7six2003Nevada3.2NaN frame2['eastern'] = frame2.state == 'Ohio' frame2 yearstatepopdebteasternone2000Ohio1.5NaNTruetwo2001Ohio1.7-1.2Truethree2002Ohio3.6NaNTruefour2001Nevada2.4-1.5Falsefive2002Nevada2.9-1.7Falsesix2003Nevada3.2NaNFalse del frame2['eastern'] frame2 yearstatepopdebtone2000Ohio1.5NaNtwo2001Ohio1.7-1.2three2002Ohio3.6NaNfour2001Nevada2.4-1.5five2002Nevada2.9-1.7six2003Nevada3.2NaN #嵌套字典 pop = {'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}} #外层字典的键作为列，内层键则作为行索引 frame3 = pd.DataFrame(pop) frame3 NevadaOhio2000NaN1.520012.41.720022.93.6 #转置 frame3.T 200020012002NevadaNaN2.42.9Ohio1.51.73.6 frame3.index.name = 'year';frame3.columns.name = 'state' frame3 stateNevadaOhioyear2000NaN1.520012.41.720022.93.6 frame3.values array([[nan, 1.5], [2.4, 1.7], [2.9, 3.6]]) frame2.values array([[2000, 'Ohio', 1.5, nan], [2001, 'Ohio', 1.7, -1.2], [2002, 'Ohio', 3.6, nan], [2001, 'Nevada', 2.4, -1.5], [2002, 'Nevada', 2.9, -1.7], [2003, 'Nevada', 3.2, nan]], dtype=object)

#索引对象 Index obj = pd.Series(range(3),index=['a','b','c']) index = obj.index index Index(['a', 'b', 'c'], dtype='object') index[1:] Index(['b', 'c'], dtype='object') labels = pd.Index(np.arange(3)) labels Int64Index([0, 1, 2], dtype='int64') obj2 = pd.Series([1.5,-2.5,0],index=labels) obj2 0 1.5 1 -2.5 2 0.0 dtype: float64 obj2.index is labels True frame3 Nevada Ohio 2000 NaN 1.5 2001 2.4 1.7 2002 2.9 3.6 frame3.columns Index(['Nevada', 'Ohio'], dtype='object') 'Ohio'in frame3.columns True dup_labels = pd.Index(['foo','foo', 'bar', 'bar']) dup_labels Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

#基本功能 #重新索引 obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c']) obj d 4.5 b 7.2 a -5.3 c 3.6 dtype: float64 obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e']) obj2 a -5.3 b 7.2 c 3.6 d 4.5 e NaN dtype: float64 obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4]) obj3 0 blue 2 purple 4 yellow dtype: object obj3.reindex(range(6),method='ffill')#插值处理 0 blue 1 blue 2 purple 3 purple 4 yellow 5 yellow dtype: object frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'], columns=['Ohio', 'Texas', 'California']) frame Ohio Texas California a 0 1 2 c 3 4 5 d 6 7 8 frame2 = frame.reindex(['a','b','c','d']) frame2 Ohio Texas California a 0.0 1.0 2.0 b NaN NaN NaN c 3.0 4.0 5.0 d 6.0 7.0 8.0 states = ['Texas','Utah','California'] frame.reindex(columns=states) Texas Utah California a 1 NaN 2 c 4 NaN 5 d 7 NaN 8

#丢弃指定轴上的项 drop obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e']) obj a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64 new_obj = obj.drop('c') new_obj a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 obj.drop(['d','c']) a 0.0 b 1.0 e 4.0 dtype: float64 data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four']) data onetwothreefourOhio0123Colorado4567Utah891011New York12131415 data.drop(['Colorado','Ohio']) onetwothreefourUtah891011New York12131415 data.drop('two',axis=1) #axis指定行或列 onethreefourOhio023Colorado467Utah81011New York121415 data.drop(['two','four'],axis = 'columns') onethreeOhio02Colorado46Utah810New York1214 obj a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64 obj.drop('c',inplace=True) #inplace销毁被删除数据 obj a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 obj a 0.0 b 1.0 d 3.0 e 4.0 dtype: float64 #索引，选取和过滤 obj = pd.Series(np.arange(4),index=['a', 'b', 'c', 'd']) obj a 0 b 1 c 2 d 3 dtype: int32 obj['b'] 1 obj[1] 1 obj[2:4] c 2 d 3 dtype: int32 obj[['b','a','d']] b 1 a 0 d 3 dtype: int32 obj[[1,3]] b 1 d 3 dtype: int32 obj[obj<2] a 0 b 1 dtype: int32 #利用标签的切片运算与普通的Python切片运算不同，其末端是包含的 obj['b':'c'] b 1 c 2 dtype: int32 obj['b':'c'] = 5 obj a 0 b 5 c 5 d 3 dtype: int32 data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four']) data onetwothreefourOhio0123Colorado4567Utah891011New York12131415 data['two'] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int32 data[['three','one']] threeoneOhio20Colorado64Utah108New York1412 data[:2] onetwothreefourOhio0123Colorado4567 data[data['three']>5] onetwothreefourColorado4567Utah891011New York12131415 data<5 onetwothreefourOhioTrueTrueTrueTrueColoradoTrueFalseFalseFalseUtahFalseFalseFalseFalseNew YorkFalseFalseFalseFalse data[data<5] = 0 data onetwothreefourOhio0000Colorado0567Utah891011New York12131415 #利用loc和iloc进行选取；标签运算符 data.loc['Colorado',['two','three']] two 
5 three 6 Name: Colorado, dtype: int32 data.iloc[2,[3,0,1]] four 11 one 8 two 9 Name: Utah, dtype: int32 data.iloc[2] one 8 two 9 three 10 four 11 Name: Utah, dtype: int32 data.iloc[[1,2],[3,0,1]] fouronetwoColorado705Utah1189 data.loc[:'Utah','two'] Ohio 0 Colorado 5 Utah 9 Name: two, dtype: int32 data.iloc[:,:3][data.three>5] onetwothreeColorado056Utah8910New York121314

#整数索引 ser = pd.Series(np.arange(3)) ser 0 0 1 1 2 2 dtype: int32 ser[-1] --------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-73-44969a759c20> in <module>() ----> 1 ser[-1] C:\Anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key) 765 key = com._apply_if_callable(key, self) 766 try: --> 767 result = self.index.get_value(self, key) 768 769 if not is_scalar(result): C:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key) 3116 try: 3117 return self._engine.get_value(s, k, -> 3118 tz=getattr(series.dtype, 'tz', None)) 3119 except KeyError as e1: 3120 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']: pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value() pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() KeyError: -1 ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c']) #非整数索引 ser2[-1] 2.0 ser[:1] 0 0 dtype: int32 ser.loc[:1] #注意区别 0 0 1 1 dtype: int32 #算术运算和数据对齐 s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e']) s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g']) s1 a 7.3 c -2.5 d 3.4 e 1.5 dtype: float64 s2 a -2.1 c 3.6 e -1.5 f 4.0 g 3.1 dtype: float64 s1 + s2 #对齐操作 a 5.2 c 1.1 d NaN e 0.0 f NaN g NaN dtype: float64 df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado']) df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) df1 bcdOhio0.01.02.0Texas3.04.05.0Colorado6.07.08.0 df2 bdeUtah0.01.02.0Ohio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 df1 + df2 #DataFrame对象相加，没有共用的列或行标签，结果都会是空 
bcdeColoradoNaNNaNNaNNaNOhio3.0NaN6.0NaNOregonNaNNaNNaNNaNTexas9.0NaN12.0NaNUtahNaNNaNNaNNaN #在算术方法中填充值 df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd')) df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde')) df2.loc[1,'b'] = np.nan df1 abcd00.01.02.03.014.05.06.07.028.09.010.011.0 df2 abcde00.01.02.03.04.015.0NaN7.08.09.0210.011.012.013.014.0315.016.017.018.019.0 df1 + df2 abcde00.02.04.06.0NaN19.0NaN13.015.0NaN218.020.022.024.0NaN3NaNNaNNaNNaNNaN df1.add(df2,fill_value=0) #指定填充值 abcde00.02.04.06.04.019.05.013.015.09.0218.020.022.024.014.0315.016.017.018.019.0

#DataFrame和Series之间的运算 arr = np.arange(12).reshape(3,4) arr array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]]) arr[0] array([0, 1, 2, 3]) arr - arr[0] #广播 array([[0, 0, 0, 0], [4, 4, 4, 4], [8, 8, 8, 8]]) frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) series =frame.iloc[0] frame bdeUtah0.01.02.0Ohio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 series b 0.0 d 1.0 e 2.0 Name: Utah, dtype: float64 frame - series bdeUtah0.00.00.0Ohio3.03.03.0Texas6.06.06.0Oregon9.09.09.0 series2 = pd.Series(range(3), index=['b', 'e', 'f']) frame + series2 bdefUtah0.0NaN3.0NaNOhio3.0NaN6.0NaNTexas6.0NaN9.0NaNOregon9.0NaN12.0NaN series3 = frame['d'] frame bdeUtah0.01.02.0Ohio3.04.05.0Texas6.07.08.0Oregon9.010.011.0 series3 Utah 1.0 Ohio 4.0 Texas 7.0 Oregon 10.0 Name: d, dtype: float64 frame.sub(series3,axis='index') bdeUtah-1.00.01.0Ohio-1.00.01.0Texas-1.00.01.0Oregon-1.00.01.0 #函数应用和映射 ufuncs（元素级数组方法） frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) frame bdeUtah-0.951265-0.498273-0.388690Ohio1.9885460.370789-0.488038Texas0.692938-0.1609440.654771Oregon-1.3142371.163286-1.687210 np.abs(frame) bdeUtah0.9512650.4982730.388690Ohio1.9885460.3707890.488038Texas0.6929380.1609440.654771Oregon1.3142371.1632861.687210 f = lambda x:x.max()-x.min() frame.apply(f) #默认列执行f b 3.302783 d 1.661559 e 2.341980 dtype: float64 frame.apply(f,axis='columns')#行执行f Utah 0.562574 Ohio 2.476585 Texas 0.853882 Oregon 2.850495 dtype: float64 def f(x): return pd.Series([x.min(), x.max()], index=['min', 'max']) frame.apply(f) #接受多值的series函数 bdemin-1.314237-0.498273-1.687210max1.9885461.1632860.654771 format = lambda x: '%.2f' % x frame.applymap(format) #元素级函数 bdeUtah-0.95-0.50-0.39Ohio1.990.37-0.49Texas0.69-0.160.65Oregon-1.311.16-1.69 frame['e'].map(format) #区分map与applymap Utah -0.39 Ohio -0.49 Texas 0.65 Oregon -1.69 Name: e, dtype: object # 排序和排名 obj = pd.Series(range(4), index=['d', 
'a', 'b', 'c']) obj.sort_index() a 1 b 2 c 3 d 0 dtype: int64 frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c']) frame.sort_index() dabcone4567three0123 frame.sort_index(axis=1,ascending=False) dcbathree0321one4765 obj = pd.Series([4,7,-3,2]) obj.sort_values() 2 -3 3 2 0 4 1 7 dtype: int64 obj = pd.Series([4, np.nan, 7, np.nan, -3, 2]) obj.sort_values() 4 -3.0 5 2.0 0 4.0 2 7.0 1 NaN 3 NaN dtype: float64 frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]}) frame ba0401712-30321 frame.sort_values(by='b') ba2-30321040171 frame.sort_values(by=['a','b']) ba2-30040321171 obj=pd.Series([7, -5, 7, 4, 2, 0, 4])#rank是通过“为各组分配一个平均排名”的方式破坏平级关系的 obj.rank() 0 6.5 1 1.0 2 6.5 3 4.5 4 3.0 5 2.0 6 4.5 dtype: float64 obj.rank(method='first') 0 6.0 1 1.0 2 7.0 3 4.0 4 3.0 5 2.0 6 5.0 dtype: float64 obj.rank(ascending=False, method='max') 0 2.0 1 7.0 2 2.0 3 4.0 4 5.0 5 6.0 6 4.0 dtype: float64 frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]}) frame bac04.30-2.017.015.02-3.008.032.01-2.5 frame.rank(axis='columns') bac03.02.01.013.01.02.021.02.03.033.02.01.0

#带有重复标签的轴索引 obj = pd.Series(range(5),index=['a','a','b','b','c']) obj a 0 a 1 b 2 b 3 c 4 dtype: int64 obj.index.is_unique False obj['a'] a 0 a 1 dtype: int64 obj['b'] b 2 b 3 dtype: int64 df = pd.DataFrame(np.random.randn(4,3),index= ['a','a','b','b']) df 0 1 2 a 1.265240 0.407293 -0.652129 a 0.268019 -1.423912 1.297783 b 0.797760 -0.353663 1.323543 b 0.961888 0.227132 1.843558 df.loc['b'] 0 1 2 b 0.797760 -0.353663 1.323543 b 0.961888 0.227132 1.843558 #汇总和计算描述统计 df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two']) df one two a 1.40 NaN b 7.10 -4.5 c NaN NaN d 0.75 -1.3 df.sum() #列和 one 9.25 two -5.80 dtype: float64 df.sum(axis=1) a 1.40 b 2.60 c 0.00 d -0.55 dtype: float64 df.mean(axis='columns',skipna=False) #不忽略NaN a NaN b 1.300 c NaN d -0.275 dtype: float64 df.idxmax() #达到最大的索引 one b two d dtype: object df.cumsum() one two a 1.40 NaN b 8.50 -4.5 c NaN NaN d 9.25 -5.8 df.describe() one two count 3.000000 2.000000 mean 3.083333 -2.900000 std 3.493685 2.262742 min 0.750000 -4.500000 25% 1.075000 -3.700000 50% 1.400000 -2.900000 75% 4.250000 -2.100000 max 7.100000 -1.300000 obj = pd.Series(['a', 'a', 'b', 'c'] * 4)#非数值型 obj.describe() count 16 unique 3 top a freq 8 dtype: object

#相关系数与协方差 import pandas_datareader.data as web all_data = {ticker:web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']} price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()}) volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()}) returns = price.pct_change() returns.tail() AAPL IBM MSFT GOOG Date 2018-10-18 -0.023374 -0.026110 -0.019962 -0.024846 2018-10-19 0.015230 -0.011107 0.001475 0.007804 2018-10-22 0.006110 0.007126 0.008927 0.004287 2018-10-23 0.009427 0.009152 -0.013956 0.002297 2018-10-24 -0.034302 -0.030486 -0.053469 -0.048003 returns['MSFT'].corr(returns['IBM']) #相关系数 0.4746674318628231 returns["MSFT"].cov(returns["IBM"])#协方差 8.150193655338736e-05 returns.MSFT.corr(returns.IBM) 0.4746674318628231 returns.corr() #相关系数矩阵 AAPL IBM MSFT GOOG AAPL 1.000000 0.364434 0.421984 0.438015 IBM 0.364434 1.000000 0.474667 0.398449 MSFT 0.421984 0.474667 1.000000 0.516364 GOOG 0.438015 0.398449 0.516364 1.000000 returns.cov() #协方差矩阵 AAPL IBM MSFT GOOG AAPL 0.000252 0.000070 0.000095 0.000106 IBM 0.000070 0.000146 0.000082 0.000073 MSFT 0.000095 0.000082 0.000202 0.000112 GOOG 0.000106 0.000073 0.000112 0.000232 returns.corrwith(returns.IBM)#与某一列或行的相关系数 AAPL 0.364434 IBM 1.000000 MSFT 0.474667 GOOG 0.398449 dtype: float64 returns.corrwith(volume)#传入一个DataFrame则会计算按列名配对的相关系数 AAPL -0.065065 IBM -0.173822 MSFT -0.088563 GOOG -0.016396 dtype: float64 #唯一值，值计数以及成员资格 obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c']) uniques = obj.unique() uniques array(['c', 'a', 'd', 'b'], dtype=object) obj.value_counts()#计算出现频率 c 3 a 3 b 2 d 1 dtype: int64 obj 0 c 1 a 2 d 3 a 4 a 5 b 6 b 7 c 8 c dtype: object mask = obj.isin(['b','c'])#用于判断矢量化集合的成员资格 mask 0 True 1 False 2 False 3 False 4 False 5 True 6 True 7 True 8 True dtype: bool obj[mask] 0 c 5 b 6 b 7 c 8 c dtype: object to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a']) unique_values = pd.Series(['c','b','a']) pd.Index(unique_values).get_indexer(to_match)
#与isin类似的是Index.get_indexer方法，它可以给你一个索引数组，从可能包含重复值的数组到另一个不同值的数组 array([0, 2, 1, 1, 0, 2], dtype=int64)

转载请注明原文地址: https://www.6miu.com/read-5036812.html

Java

最新回复(0)