# pandas 10-minute tutorial (pandas10分钟教程): http://pandas.pydata.org/pandas-docs/stable/10min.html
# -*- coding: utf-8 -*-
"""
Walkthrough script following the "10 Minutes to pandas" tutorial.

Each section builds or modifies a small Series/DataFrame and prints the
result, separated by a dashed line, so the output can be read top to bottom.

Created on Fri May 5 23:23:03 2017

@author: Administrator
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Object creation ---------------------------------------------------
# Create a Series from a list of values; pandas supplies a default
# integer index.
s = pd.Series([1, 3, 5, np.nan, 6, 8, 10])
print(s)
print('----------------------------------')

# Create a DataFrame from a NumPy array, with a datetime index and
# labeled columns.
dates = pd.date_range('20130101', periods=7)
print(dates)
df = pd.DataFrame(np.random.randn(7, 5), index=dates, columns=list('ABCDE'))
print(df)
print('----------------------------------')

# Create a DataFrame from a dict of objects that can each be converted to
# something series-like (scalars are broadcast to the common length).
# NOTE: np.array is used directly; the `pd.np` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130103'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)
print('----------------------------------')

# The columns of the resulting DataFrame have different dtypes.
print(df2.dtypes)
print('----------------------------------')
print('++++++++++++++++++++++++++++++++++')

# --- Viewing data ------------------------------------------------------
# head() must be *called*; printing `df.head` would only show the bound
# method's repr instead of the first rows.
print(df.head())
print('----------------------------------')
print(df.head(3))
print('----------------------------------')
print(df.tail(3))
print('----------------------------------')
print(df.index)
print('----------------------------------')
print(df.columns)
print('----------------------------------')
print(df.values)
print('----------------------------------')
print(df.describe())
print('----------------------------------')
print(df.T)
print('----------------------------------')
# Sort by column labels (axis=1), descending.
print(df.sort_index(axis=1, ascending=False))
print('----------------------------------')
# Sort rows by the values in column 'B'.
print(df.sort_values(by='B'))
print('----------------------------------')

# --- Selection ---------------------------------------------------------
# Select a single column; this yields a Series.
print(df['A'])
print('----------------------------------')
# Slicing with [] selects rows, by position ...
print(df[0:3])
print('----------------------------------')
# ... or by index label (both endpoints are included for label slices).
print(df['20130102':'20130104'])
print('----------------------------------')

# Selection by label
print(df.loc[dates[0]])
print('----------------------------------')
print(df.loc[:, ['A', 'B']])
print('----------------------------------')
print(df.loc['20130102':'20130104', ['A', 'B']])
print('----------------------------------')
print(df.loc['20130102', ['A', 'B']])
print('----------------------------------')
# Scalar access by label.
print(df.loc[dates[0], 'A'])
print('----------------------------------')

# Selection by position
print(df.iloc[3])
print('----------------------------------')
print(df.iloc[3:5, 0:2])
print('----------------------------------')
print(df.iloc[[1, 2, 4], [0, 2]])
print('----------------------------------')
# Same positional selection again (kept from the original walkthrough).
print(df.iloc[[1, 2, 4], [0, 2]])
print('----------------------------------')
print(df)
print(df.iloc[:, 1:3])
print('----------------------------------')
print(df.iloc[1:3, :])
print('----------------------------------')
# Scalar access by position.
print(df.iat[1, 1])
print('----------------------------------')

# Boolean indexing
# Use a single column's values to select rows.
print(df[df.A > 0])
print('----------------------------------')
# Element-wise condition: entries that fail it become NaN.
print(df[df > 0])
print('----------------------------------')
df2 = df.copy()
df2['F'] = ['one', 'one', 'two', 'three', 'four', 'three', 'three']
print(df2)
print('----------------------------------')
# isin() filters rows whose 'F' value is one of the listed labels.
print(df2[df2['F'].isin(['two', 'four'])])
print('----------------------------------')

# --- Setting -----------------------------------------------------------
# Assigning a Series as a new column aligns it on the index; s1 starts on
# 2013-01-02, so the 2013-01-01 row of 'G' has no matching value.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
print(s1)
print('----------------------------------')
df['G'] = s1
print(df)
print('----------------------------------')
# Set a value by label.
df.at[dates[0], 'A'] = 0
print(df)
print('----------------------------------')
# Set a value by position.
df.iat[0, 1] = 0
print(df)
print('----------------------------------')
# Overwrite a whole column from a NumPy array.
df.loc[:, 'D'] = np.array([5] * len(df))
print(df)
print('----------------------------------')
# "where"-style setting: negate every positive entry of the copy.
df2 = df.copy()
df2[df2 > 0] = -df2
print(df2)
print('----------------------------------')

# --- Missing data ------------------------------------------------------
# Reindex to the first four dates and append a new column 'H'; entries
# that have no source value come out as NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['H'])
df1.loc[dates[0]:dates[1], 'H'] = 1
print(df1)
print('----------------------------------')
# Drop any rows that have missing data.
print(df1.dropna(how='any'))
print('----------------------------------')
# Fill missing data with a constant.
print(df1.fillna(value=5))
print('----------------------------------')
# Boolean mask marking where values are NaN.
print(pd.isnull(df1))
print('----------------------------------')

# --- Operations --------------------------------------------------------
# Operations in general exclude missing data.
# Descriptive statistics: column means, then row means (axis=1).
print(df)
print(df.mean())
print(df.mean(axis=1))
print('----------------------------------')

# Apply: run a function over the data (here a cumulative sum per column).
print(df.apply(np.cumsum))
print('----------------------------------')

# Histogramming: count occurrences of each value in a random integer Series.
s = pd.Series(np.random.randint(0, 7, size=10))
print(s)
print('----------------------------------')
print(s.value_counts())
print('----------------------------------')

# --- Plotting ----------------------------------------------------------
# Random walk over 1000 consecutive days starting 2000-01-01.
ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
print('----------------------------------')

# Four random walks sharing the same date index, plotted with a legend.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
plt.figure()
df.plot()
plt.legend(loc='best')
print(df)

# --- Getting data out --------------------------------------------------
# Write to CSV with the default comma separator. The original passed 'w'
# as a second positional argument, which to_csv binds to `sep` (the field
# delimiter), not to a file mode, producing a "w"-delimited file.
df.to_csv('foo.csv')