此篇为优达学城(Udacity)《数据分析入门》第二课的学习笔记(课程网址略)。
基本操作
import numpy
as np
# Names of the 20 countries used in the examples below.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
])

# Employment value for each country, in the same order as `countries`.
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695, 58.40000153,
    40.09999847, 61.5, 57.09999847, 60.90000153, 66.59999847,
    60.40000153, 68.09999847, 66.90000153, 53.40000153, 48.59999847,
    56.79999924, 71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# Each demonstration below is guarded by a literal True/False toggle;
# flip a guard to True to run that demonstration.

# Accessing single elements by position.
if True:
    print(countries[0])
    print(countries[3])

# Slicing.
if False:
    print(countries[0:3])
    print(countries[:3])
    print(countries[17:])
    print(countries[:])

# Element types chosen by NumPy for different inputs.
if False:
    print(countries.dtype)
    print(employment.dtype)
    print(np.array([0, 1, 2, 3]).dtype)
    print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
    print(np.array([True, False, True]).dtype)
    print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

# Looping over one array, then over two parallel arrays by position.
if False:
    for country in countries:
        print('Examining country {}'.format(country))

    for i, country in enumerate(countries):
        country_employment = employment[i]
        print('Country {} has employment {}'.format(country,
                                                    country_employment))

# Built-in reductions.
if False:
    print(employment.mean())
    print(employment.std())
    print(employment.max())
    print(employment.sum())
def max_employment(countries, employment):
    '''
    Return the country with the highest employment and that employment.

    countries and employment are indexed in parallel: position i of
    employment is the value for position i of countries. employment must
    provide argmax() (e.g. a NumPy array).

    Returns a (country_name, employment_value) tuple.
    '''
    # argmax() yields the position of the largest value; reuse it for both arrays.
    top = employment.argmax()
    return (countries[top], employment[top])
Afghanistan
Angola
运算
import numpy
as np
# Each demonstration below is guarded by a literal False toggle;
# flip a guard to True to run that demonstration.

# Arithmetic between two arrays (element by element).
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([1, 2, 1, 2])

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Arithmetic between an array and a single number.
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a + b)
    print(a - b)
    print(a * b)
    print(a / b)
    print(a ** b)

# Logical operators on boolean arrays.
if False:
    a = np.array([True, True, False, False])
    b = np.array([True, False, True, False])

    print(a & b)
    print(a | b)
    print(~a)

    print(a & True)
    print(a & False)

    print(a | True)
    print(a | False)

# Comparisons between two arrays.
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([5, 4, 3, 2, 1])

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Comparisons between an array and a single number.
if False:
    a = np.array([1, 2, 3, 4])
    b = 2

    print(a > b)
    print(a >= b)
    print(a < b)
    print(a <= b)
    print(a == b)
    print(a != b)

# Names of the 20 countries the completion figures below refer to.
countries = np.array([
    'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria',
    'Azerbaijan', 'Bahamas', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Bolivia', 'Botswana', 'Brunei', 'Bulgaria',
    'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Cape Verde',
])

# Female completion value per country, in the same order as `countries`.
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

# Male completion value per country, in the same order as `countries`.
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])
def overall_completion_rate(female_completion, male_completion):
    '''
    Return a NumPy array containing the overall school completion rate
    for each country.

    The arguments are NumPy arrays giving the female and male completion
    of each country in the same order.

    The overall rate is the element-wise mean of the two inputs, so the
    result stays on the same scale as the inputs (adding the two rates
    would roughly double them and would no longer be a rate).
    NOTE(review): a plain mean weights both groups equally, i.e. it
    assumes similar female and male population sizes - confirm.
    '''
    # Divide by a float so integer inputs also produce a true mean.
    return (female_completion + male_completion) / 2.0
# Combine the female and male arrays defined above and print the per-country result.
sum_completion= overall_completion_rate(female_completion, male_completion)
print(sum_completion)
[ 192.83205 205.28855 202.82258 186.63257 206.91115 196.29643
204.70226 183.5571 185.67095 179.31151 198.43743 196.96855
188.34567 234.67025 196.55129 66.08078 83.81045 181.92593
114.16808 186.1203 ]
归一化
import numpy
as np
# Names of the 20 countries used in the standardization example.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
])

# Employment value for each country, in the same order as `countries`.
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695, 58.40000153,
    40.09999847, 61.5, 57.09999847, 60.90000153, 66.59999847,
    60.40000153, 68.09999847, 66.90000153, 53.40000153, 48.59999847,
    56.79999924, 71.59999847, 58.40000153, 70.40000153, 41.20000076,
])

# A country name kept for later use; note it is not one of `countries`.
country_name = 'United States'
def standardize_data(values):
    '''
    Return a standardized copy of values.

    Each element is replaced by its distance from the mean of values,
    measured in standard deviations of values: positive for elements
    above the mean, negative for elements below it. values must provide
    mean() and std() and support element-wise arithmetic (e.g. a NumPy
    array).
    '''
    center = values.mean()
    spread = values.std()
    return (values - center) / spread
# Standardize the employment array defined above. This is a bare expression,
# so its value is only displayed when run in an interactive session.
standardize_data(employment)
array([-0.31965231, -0.780123 , -0.87650077, 1.82207181, -0.03051941,
-1.99019768, 0.30144772, -0.16973184, 0.23719615, 0.84758731,
0.18365304, 1.00821665, 0.87971351, -0.56595055, -1.07996476,
-0.20185762, 1.38301845, -0.03051941, 1.2545153 , -1.87240259])
import numpy
as np
# Each demonstration below is guarded by a literal False toggle;
# flip a guard to True to run that demonstration.

# A value array paired with an explicit boolean array of the same length.
if False:
    a = np.array([1, 2, 3, 4])
    b = np.array([True, True, False, False])

# A boolean array produced by comparing the value array with a number.
if False:
    a = np.array([1, 2, 3, 2, 1])
    b = (a >= 2)

# Two value arrays of the same length.
if False:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, 2, 3, 2, 1])
def mean_time_for_paid_students(time_spent, days_to_cancel):
    '''
    Return the mean classroom time of students who stayed enrolled for
    at least (greater than or equal to) 7 days.

    Both arguments are NumPy arrays in the same student order:
    time_spent holds the time each student spent in the classroom and
    days_to_cancel holds the number of days until that student canceled.
    '''
    # Boolean mask marking the students who stayed 7 days or more.
    stayed_a_week = days_to_cancel >= 7
    return time_spent[stayed_a_week].mean()
# Classroom time for each of 20 students.
time_spent = np.array([
    12.89697233, 0.0, 64.55043217, 0.0, 24.2315615,
    39.991625, 0.0, 0.0, 147.20683783, 0.0,
    0.0, 0.0, 45.18261617, 157.60454283, 133.2434615,
    52.85000767, 0.0, 54.9204785, 26.78142417, 0.0,
])

# Days until each student canceled, in the same order as `time_spent`.
days_to_cancel = np.array([
    4, 5, 37, 3, 12,
    4, 35, 38, 5, 37,
    3, 3, 68, 38, 98,
    2, 249, 2, 127, 35,
])
# Evaluate the function on the two arrays defined above. This is a bare
# expression, so its value is only displayed when run in an interactive session.
mean_time_for_paid_students(time_spent, days_to_cancel)
41.054003485454537
注意:NumPy 的 `+=` 是原地修改数组,而切片返回的是原数组的视图(view)而不是副本;可以类比指针/引用来理解——通过其中一个名字修改数据,会影响所有共享同一块数据的变量。
Pandas Series
import pandas
as pd
# Names of the 20 countries the two value lists below refer to.
countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

# Life expectancy value per country, in the same order as `countries`.
life_expectancy_values = [
    74.7, 75.0, 83.4, 57.6, 74.6,
    75.4, 72.3, 81.5, 80.2, 70.3,
    72.1, 76.4, 68.1, 75.2, 69.8,
    79.4, 70.8, 62.7, 67.3, 70.6,
]

# GDP value per country, in the same order as `countries`.
gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Wrap the plain lists in Series (default integer index).
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Each demonstration below is guarded by a literal False toggle;
# flip a guard to True to run that demonstration.

# Looping over the values of a Series.
if False:
    for country_life_expectancy in life_expectancy:
        print('Examining life expectancy {}'.format(country_life_expectancy))

# Element-wise arithmetic, comparison and boolean selection on Series.
if False:
    a = pd.Series([1, 2, 3, 4])
    b = pd.Series([1, 2, 1, 2])

    print(a + b)
    print(a * 2)
    print(a >= 3)
    print(a[a >= 3])
def variable_correlation(life_expectancy, gdp):
    '''
    Count the data points whose two variables lie on the same side of
    their respective means, and the data points where they do not.

    Each argument is a Pandas series. A value counts as "above" when it
    is strictly greater than the mean of its own series; a value equal
    to the mean is therefore grouped with the values below it.

    Returns (num_same_direction, num_different_direction).

    For example, pd.Series([1, 2, 3, 4]) and pd.Series([4, 5, 6, 7])
    give (4, 0): 1 and 4 are both below their means, 2 and 5 are both
    below, 3 and 6 are both above, and 4 and 7 are both above.
    With pd.Series([1, 2, 3, 4]) and pd.Series([7, 6, 5, 4]) the result
    is (0, 4), because 1 is below its mean but 7 is above its mean, and
    so on.
    '''
    life_above_mean = (life_expectancy - life_expectancy.mean()) > 0
    gdp_above_mean = (gdp - gdp.mean()) > 0

    # True wherever both variables fall on the same side of their means.
    agree = life_above_mean == gdp_above_mean
    return (agree.sum(), (~agree).sum())
# Evaluate the function on the two Series defined above. This is a bare
# expression, so its value is only displayed when run in an interactive session.
variable_correlation(life_expectancy, gdp)
(17, 3)
带索引的pandas
import pandas
as pd
# Names of the 20 countries; used as the index of `employment` below.
countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]

# Employment value per country, in the same order as `countries`.
employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695, 58.40000153,
    40.09999847, 61.5, 57.09999847, 60.90000153, 66.59999847,
    60.40000153, 68.09999847, 66.90000153, 53.40000153, 48.59999847,
    56.79999924, 71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Series of employment values labelled by country name.
employment = pd.Series(employment_values, index=countries)
def max_employment(employment):
    '''
    Return the name of the country with the highest employment in the
    given employment data, and the employment in that country.

    The input is a Pandas series where the values are employment and
    the index is country names.

    Returns a (country_name, employment_value) tuple.

    Documentation for idxmax() can be found here:
    https://pandas.pydata.org/docs/reference/api/pandas.Series.idxmax.html
    '''
    # idxmax() returns the index *label* of the largest value. argmax()
    # returns an integer position in current pandas, which cannot be
    # used as a label with .loc on a name-indexed series.
    max_country = employment.idxmax()
    # max() gives the value directly and stays a scalar even when index
    # labels repeat.
    max_value = employment.max()
    return (max_country, max_value)
# Print the (country, value) tuple for the indexed employment Series defined above.
print(max_employment(employment) )
('Angola', 75.699996949999999)
import pandas
as pd
# Adding two Series whose index labels are identical and in the same order.
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
    print(s1 + s2)

# Same labels in a different order: values are paired by label, not position.
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
    print(s1 + s2)

# Partially overlapping labels: labels present in only one Series give NaN.
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
    print(s1 + s2)
    # Named aligned_sum rather than `sum` so the builtin sum() is not
    # shadowed for the rest of the module.
    aligned_sum = s1 + s2
    # dropna() removes the NaN rows, leaving only the shared labels.
    print(aligned_sum.dropna())

# No labels in common: plain addition is all NaN, while add() with
# fill_value=0 treats the missing side as 0.
if True:
    s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
    print(s1 + s2)
    print(s1.add(s2, fill_value=0))
a 11
b 22
c 33
d 44
dtype: int64
a 31
b 12
c 43
d 24
dtype: int64
a NaN
b NaN
c 13.0
d 24.0
e NaN
f NaN
dtype: float64
c 13.0
d 24.0
dtype: float64
a NaN
b NaN
c NaN
d NaN
e NaN
f NaN
g NaN
h NaN
dtype: float64
a 1.0
b 2.0
c 3.0
d 4.0
e 10.0
f 20.0
g 30.0
h 40.0
dtype: float64
import pandas
as pd
# Demonstration of Series.apply(); flip the guard to True to run it.
if False:
    s = pd.Series([1, 2, 3, 4, 5])

    def add_one(x):
        return x + 1

    print(s.apply(add_one))

# 25 two-word names, each written as "Firstname Lastname".
names = pd.Series([
    'Andre Agassi', 'Barry Bonds', 'Christopher Columbus',
    'Daniel Defoe', 'Emilio Estevez', 'Fred Flintstone',
    'Greta Garbo', 'Humbert Humbert', 'Ivan Ilych',
    'James Joyce', 'Keira Knightley', 'Lois Lane',
    'Mike Myers', 'Nick Nolte', 'Ozzy Osbourne',
    'Pablo Picasso', 'Quirinus Quirrell', 'Rachael Ray',
    'Susan Sarandon', 'Tina Turner', 'Ugueth Urbina',
    'Vince Vaughn', 'Woodrow Wilson', 'Yoji Yamada',
    'Zinedine Zidane',
])
def reverse_name(name):
    '''
    Convert a name from "Firstname Lastname" to "Lastname, Firstname".

    name is split on single spaces; the first piece is taken as the
    first name and the second piece as the last name. Raises IndexError
    if name contains no space.
    '''
    parts = name.split(" ")
    first_name = parts[0]
    last_name = parts[1]
    # Last name first, separated by a comma and a space.
    return "{}, {}".format(last_name, first_name)
def reverse_names(names):
    '''
    Return a new series built by passing every element of names through
    reverse_name().

    names is expected to hold strings in the format
    "Firstname Lastname"; the intended result format is
    "Lastname, FirstName", and the exact text of each element is
    whatever reverse_name() returns for it. Uses the Pandas apply()
    function rather than an explicit loop.
    '''
    converted = names.apply(reverse_name)
    return converted
# Print the converted Series for the names defined above.
print(reverse_names(names))
0 Andre,Agassi
1 Barry,Bonds
2 Christopher,Columbus
3 Daniel,Defoe
4 Emilio,Estevez
5 Fred,Flintstone
6 Greta,Garbo
7 Humbert,Humbert
8 Ivan,Ilych
9 James,Joyce
10 Keira,Knightley
11 Lois,Lane
12 Mike,Myers
13 Nick,Nolte
14 Ozzy,Osbourne
15 Pablo,Picasso
16 Quirinus,Quirrell
17 Rachael,Ray
18 Susan,Sarandon
19 Tina,Turner
20 Ugueth,Urbina
21 Vince,Vaughn
22 Woodrow,Wilson
23 Yoji,Yamada
24 Zinedine,Zidane
dtype: object
import pandas
as pd
import matplotlib.pyplot
as plt
# Read the employment table, using its 'Country' column as the row index.
employment = pd.read_csv("employment-above-15.csv", index_col='Country')

# Select the entry labelled 'United States'.
employment_us = employment.loc['United States']