优达学城Numpy与Pandas笔记

xiaoxiao2021-02-28 96

此篇为优达学城（Udacity）“数据分析入门”第二课（NumPy 与 Pandas）的学习笔记（原文此处附有课程网址）。

基本操作

运算

import numpy as np

# Change False to True for each block of code to see what it does

# Arithmetic operations between 2 NumPy arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Logical operations with NumPy arrays
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])

    print(a & b)
    print(a | b)
    print(~a)

    print(a & True)
    print(a & False)

    print(a | True)
    print(a | False)

# Comparison operations between 2 NumPy Arrays
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Comparison operations between a NumPy array and a single number
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# First 20 countries with school completion data
countries = np.array([
    'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria', 'Azerbaijan',
    'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
    'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
    'Cambodia', 'Cameroon', 'Cape Verde'
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958
])


def overall_completion_rate(female_completion, male_completion):
    """Return the overall school completion rate for each country.

    The arguments are NumPy arrays giving the female and male completion
    rates of each country in the same order. The overall rate is taken as
    the element-wise mean of the two rates.

    Returns a NumPy array with one value per country.
    """
    # Averaging (not summing) keeps the result on the same percentage scale
    # as the inputs; a plain sum would report rates of roughly 200%.
    return (female_completion + male_completion) / 2.0


# Variable name kept from the original notes; it now holds the per-country
# average of the female and male rates.
sum_completion = overall_completion_rate(female_completion, male_completion)
print(sum_completion)

# Expected output (values shown unrounded; NumPy's print formatting may differ):
# [  96.416025  102.644275  101.41129    93.316285  103.455575
#    98.148215  102.35113    91.77855    92.835475   89.655755
#    99.218715   98.484275   94.172835  117.335125   98.275645
#    33.04039    41.905225   90.962965   57.08404    93.06015 ]

标准化（Standardization，即 z-score：减去均值后除以标准差）

import numpy as np

# ---------------------------------------------------------------------------
# Standardizing data
# ---------------------------------------------------------------------------

# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076
])

# Change this country name to change what country will be printed when you
# click "Test Run". Your function will be called to determine the standardized
# score for this country for each of the given 5 Gapminder variables in 2007.
# The possible country names are available in the Downloadables section.
# (Only used by the course grader; nothing in this script reads it.)
country_name = 'United States'


def standardize_data(values):
    """Return a standardized version of the given NumPy array.

    Each value is translated into the number of standard deviations that
    value is away from the mean of the data. A positive number indicates a
    value higher than the mean, and a negative number indicates a value
    lower than the mean.
    """
    # values.std() is called with NumPy's default arguments.
    return (values - values.mean()) / values.std()


print(standardize_data(employment))

# Output recorded in the original notes:
# array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
#        -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
#         0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
#        -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

# ---------------------------------------------------------------------------
# Index arrays
# ---------------------------------------------------------------------------

# Change False to True for each block of code to see what it does

# Using index arrays
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

    print(a[b])
    print(a[np.array([True, False, True, False])])

# Creating the index array using vectorized operations
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)

    print(a[b])
    print(a[a >= 2])

# Creating the index array using vectorized operations on another array
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])

    print(b == 2)
    print(a[b == 2])


def mean_time_for_paid_students(time_spent, days_to_cancel):
    """Return the mean classroom time of students enrolled at least 7 days.

    The arguments are NumPy arrays given in the same order: time_spent
    contains the amount of time spent in the classroom for each student,
    and days_to_cancel contains the number of days until each student
    cancels. Unlike in Lesson 1, days_to_cancel is assumed to contain only
    integers (there are no students who have not canceled yet).
    """
    # Boolean index array: keep only students with days_to_cancel >= 7.
    paid_time = time_spent[days_to_cancel >= 7]
    return paid_time.mean()


# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
    12.89697233, 0., 64.55043217, 0.,
    24.2315615, 39.991625, 0., 0.,
    147.20683783, 0., 0., 0.,
    45.18261617, 157.60454283, 133.2434615, 52.85000767,
    0., 54.9204785, 26.78142417, 0.
])

# Days to cancel for 20 students
days_to_cancel = np.array([
    4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
    38, 98, 2, 249, 2, 127, 35
])

print(mean_time_for_paid_students(time_spent, days_to_cancel))

# Output recorded in the original notes:
# 41.054003485454537

NumPy 的 `+=` 与切片需要注意：`a += b` 是原地修改，所有引用同一数组的变量都会看到变化（而 `a = a + b` 会创建一个新数组）；切片返回的是原数组的视图而不是副本，修改切片同样会改动原数组。可以类比指针来理解。

Pandas Series

import pandas as pd

countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

life_expectancy_values = [74.7, 75., 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
                          70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
                          67.3, 70.6]

gdp_values = [1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
              13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
              27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
              483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
              3680.91642923, 366.04496652, 1175.92638695, 1132.21387981]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Change False to True for each block of code to see what it does

# Accessing elements and slicing
if False:
    print(life_expectancy[0])
    print(gdp[3:6])

# Looping
if False:
    for country_life_expectancy in life_expectancy:
        print('Examining life expectancy {}'.format(country_life_expectancy))

# Pandas functions
if False:
    print(life_expectancy.mean())
    print(life_expectancy.std())
    print(gdp.max())
    print(gdp.sum())

# Vectorized operations and index arrays
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print(a + b)
    print(a * 2)
    print(a >= 3)
    print(a[a >= 3])


def variable_correlation(life_expectancy, gdp):
    """Count data points whose two variables lie on the same side of their means.

    Each argument is a Pandas Series. "Direction" means whether a value is
    above its own series' mean. A value exactly equal to the mean is
    classified as "not above".

    Returns a tuple (num_same_direction, num_different_direction).

    For example, if the inputs were pd.Series([1, 2, 3, 4]) and
    pd.Series([4, 5, 6, 7]), then the output would be (4, 0). This is
    because 1 and 4 are both below their means, 2 and 5 are both below,
    3 and 6 are both above, and 4 and 7 are both above.

    On the other hand, if the inputs were pd.Series([1, 2, 3, 4]) and
    pd.Series([7, 6, 5, 4]), then the output would be (0, 4). This is
    because 1 is below its mean but 7 is above its mean, and so on.
    """
    # Boolean Series: True where the value is strictly above the mean.
    life_above_mean = (life_expectancy - life_expectancy.mean()) > 0
    gdp_above_mean = (gdp - gdp.mean()) > 0

    # True where both variables fall on the same side of their means.
    same_direction = life_above_mean == gdp_above_mean

    # Summing a boolean Series counts its True entries.
    num_same_direction = same_direction.sum()
    num_different_direction = (~same_direction).sum()

    return (num_same_direction, num_different_direction)


print(variable_correlation(life_expectancy, gdp))

# Output recorded in the original notes:
# (17, 3)

带索引的 Pandas Series

import pandas as pd

# ---------------------------------------------------------------------------
# Series with a label index
# ---------------------------------------------------------------------------

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]

employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)


def max_employment(employment):
    """Return the country with the highest employment and that employment.

    The input is a Pandas Series where the values are employment and the
    index is country names. Returns a tuple (max_country, max_value).

    Documentation for idxmax() can be found here:
    http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.idxmax.html
    """
    # idxmax() returns the index *label* of the maximum. argmax() returns an
    # integer position in current pandas, which cannot be passed to .loc on
    # a string index.
    max_country = employment.idxmax()
    max_value = employment.loc[max_country]

    return (max_country, max_value)


print(max_employment(employment))

# Output recorded in the original notes:
# ('Angola', 75.699996949999999)

# ---------------------------------------------------------------------------
# Vectorized addition and index alignment
# ---------------------------------------------------------------------------

# Change False to True for each block of code to see what it does

# Addition when indexes are the same
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

# Indexes have same elements in a different order
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print(s1 + s2)

# Indexes overlap, but do not have exactly the same elements
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print(s1 + s2)
    # Named "total" rather than "sum" so the builtin sum() is not shadowed.
    total = s1 + s2
    print(total.dropna())

# Indexes do not overlap
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print(s1 + s2)
    print(s1.add(s2, fill_value=0))

# Output recorded in the original notes:
# a    11
# b    22
# c    33
# d    44
# dtype: int64
# a    31
# b    12
# c    43
# d    24
# dtype: int64
# a     NaN
# b     NaN
# c    13.0
# d    24.0
# e     NaN
# f     NaN
# dtype: float64
# c    13.0
# d    24.0
# dtype: float64
# a   NaN
# b   NaN
# c   NaN
# d   NaN
# e   NaN
# f   NaN
# g   NaN
# h   NaN
# dtype: float64
# a     1.0
# b     2.0
# c     3.0
# d     4.0
# e    10.0
# f    20.0
# g    30.0
# h    40.0
# dtype: float64

# ---------------------------------------------------------------------------
# Series.apply()
# ---------------------------------------------------------------------------

# Change False to True to see what the following block of code does

# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
])


def reverse_name(name):
    """Convert one "Firstname Lastname" string to "Lastname, Firstname".

    The name is split on single spaces; the first piece is used as the
    first name and the second piece as the last name.
    """
    split_name = name.split(" ")
    first_name = split_name[0]
    last_name = split_name[1]
    # Last name goes first, separated by a comma and a space.
    return last_name + ", " + first_name


def reverse_names(names):
    """Return a new Series with every name reversed.

    Each name in the input Series is transformed from the format
    "Firstname Lastname" to "Lastname, Firstname" using Series.apply()
    rather than an explicit loop.
    """
    return names.apply(reverse_name)


print(reverse_names(names))

# Expected output:
# 0             Agassi, Andre
# 1              Bonds, Barry
# 2     Columbus, Christopher
# 3             Defoe, Daniel
# 4           Estevez, Emilio
# 5          Flintstone, Fred
# 6              Garbo, Greta
# 7          Humbert, Humbert
# 8               Ilych, Ivan
# 9              Joyce, James
# 10         Knightley, Keira
# 11               Lane, Lois
# 12              Myers, Mike
# 13              Nolte, Nick
# 14           Osbourne, Ozzy
# 15           Picasso, Pablo
# 16       Quirrell, Quirinus
# 17             Ray, Rachael
# 18          Sarandon, Susan
# 19             Turner, Tina
# 20           Urbina, Ugueth
# 21            Vaughn, Vince
# 22          Wilson, Woodrow
# 23             Yamada, Yoji
# 24         Zidane, Zinedine
# dtype: object

# ---------------------------------------------------------------------------
# Plotting a Series
# ---------------------------------------------------------------------------

# This cell needs the course file "employment-above-15.csv" and matplotlib,
# so it only runs when the script is executed directly (or as a notebook
# cell), not when the definitions above are imported.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    employment = pd.read_csv("employment-above-15.csv", index_col='Country')
    employment_us = employment.loc['United States']

    # In a notebook, run `%pylab inline` (or `%matplotlib inline`) first to
    # render the figure inline. Series has no .plt() method; .plot() is the
    # plotting entry point.
    employment_us.plot()
    plt.show()

转载请注明原文地址: https://www.6miu.com/read-28304.html

技术

最新回复(0)