导入支持库

In [1020]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from pandas import Series, DataFrame


#在GUI中显示
# %matplotlib tk 
#在行内显示(默认)
%matplotlib inline

mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

载入数据 合并数据 data_obj

In [226]:
users_df = pd.read_csv('./data/users.dat', sep='::', engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies_df = pd.read_csv('./data/movies.dat', sep='::', engine='python', names=['MovieID', 'Title', 'Genres'])
ratings_df = pd.read_csv('./data/ratings.dat', sep='::', engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
# print(users_df)
# print(movies_df)
# print(ratings_df)

combine_movies_ratings_df = pd.merge(movies_df, ratings_df, on='MovieID')
combine_movies_ratings_users_df = pd.merge(combine_movies_ratings_df, users_df, on='UserID')
data_obj = combine_movies_ratings_users_df
data_obj
Out[226]:
MovieID Title Genres UserID Rating Timestamp Gender Age Occupation Zip-code
0 1 Toy Story (1995) Animation|Children's|Comedy 1 5 978824268 F 1 10 48067
1 48 Pocahontas (1995) Animation|Children's|Musical|Romance 1 5 978824351 F 1 10 48067
2 150 Apollo 13 (1995) Drama 1 5 978301777 F 1 10 48067
3 260 Star Wars: Episode IV - A New Hope (1977) Action|Adventure|Fantasy|Sci-Fi 1 4 978300760 F 1 10 48067
4 527 Schindler's List (1993) Drama|War 1 5 978824195 F 1 10 48067
5 531 Secret Garden, The (1993) Children's|Drama 1 4 978302149 F 1 10 48067
6 588 Aladdin (1992) Animation|Children's|Comedy|Musical 1 4 978824268 F 1 10 48067
7 594 Snow White and the Seven Dwarfs (1937) Animation|Children's|Musical 1 4 978302268 F 1 10 48067
8 595 Beauty and the Beast (1991) Animation|Children's|Musical 1 5 978824268 F 1 10 48067
9 608 Fargo (1996) Crime|Drama|Thriller 1 4 978301398 F 1 10 48067
10 661 James and the Giant Peach (1996) Animation|Children's|Musical 1 3 978302109 F 1 10 48067
11 720 Wallace & Gromit: The Best of Aardman Animatio... Animation 1 3 978300760 F 1 10 48067
12 745 Close Shave, A (1995) Animation|Comedy|Thriller 1 3 978824268 F 1 10 48067
13 783 Hunchback of Notre Dame, The (1996) Animation|Children's|Musical 1 4 978824291 F 1 10 48067
14 914 My Fair Lady (1964) Musical|Romance 1 3 978301968 F 1 10 48067
15 919 Wizard of Oz, The (1939) Adventure|Children's|Drama|Musical 1 4 978301368 F 1 10 48067
16 938 Gigi (1958) Musical 1 4 978301752 F 1 10 48067
17 1022 Cinderella (1950) Animation|Children's|Musical 1 5 978300055 F 1 10 48067
18 1028 Mary Poppins (1964) Children's|Comedy|Musical 1 5 978301777 F 1 10 48067
19 1029 Dumbo (1941) Animation|Children's|Musical 1 5 978302205 F 1 10 48067
20 1035 Sound of Music, The (1965) Musical 1 5 978301753 F 1 10 48067
21 1097 E.T. the Extra-Terrestrial (1982) Children's|Drama|Fantasy|Sci-Fi 1 4 978301953 F 1 10 48067
22 1193 One Flew Over the Cuckoo's Nest (1975) Drama 1 5 978300760 F 1 10 48067
23 1197 Princess Bride, The (1987) Action|Adventure|Comedy|Romance 1 3 978302268 F 1 10 48067
24 1207 To Kill a Mockingbird (1962) Drama 1 4 978300719 F 1 10 48067
25 1246 Dead Poets Society (1989) Drama 1 4 978302091 F 1 10 48067
26 1270 Back to the Future (1985) Comedy|Sci-Fi 1 5 978300055 F 1 10 48067
27 1287 Ben-Hur (1959) Action|Adventure|Drama 1 5 978302039 F 1 10 48067
28 1545 Ponette (1996) Drama 1 4 978824139 F 1 10 48067
29 1566 Hercules (1997) Adventure|Animation|Children's|Comedy|Musical 1 4 978824330 F 1 10 48067
... ... ... ... ... ... ... ... ... ... ...
1000179 2763 Thomas Crown Affair, The (1999) Action|Thriller 5727 5 958492554 M 25 4 92843
1000180 2770 Bowfinger (1999) Comedy 5727 4 958490923 M 25 4 92843
1000181 2805 Mickey Blue Eyes (1999) Comedy|Romance 5727 4 958492371 M 25 4 92843
1000182 2827 Astronaut's Wife, The (1999) Sci-Fi|Thriller 5727 2 958490886 M 25 4 92843
1000183 2840 Stigmata (1999) Thriller 5727 3 958492512 M 25 4 92843
1000184 2841 Stir of Echoes (1999) Thriller 5727 2 958492512 M 25 4 92843
1000185 2858 American Beauty (1999) Comedy|Drama 5727 5 958490857 M 25 4 92843
1000186 2881 Double Jeopardy (1999) Action|Thriller 5727 3 958490981 M 25 4 92843
1000187 2907 Superstar (1999) Comedy 5727 1 958492554 M 25 4 92843
1000188 3005 Bone Collector, The (1999) Thriller 5727 2 958490923 M 25 4 92843
1000189 3053 Messenger: The Story of Joan of Arc, The (1999) Drama|War 5727 3 958492345 M 25 4 92843
1000190 3081 Sleepy Hollow (1999) Horror|Romance 5727 4 958492512 M 25 4 92843
1000191 3113 End of Days (1999) Action|Thriller 5727 3 958490981 M 25 4 92843
1000192 3114 Toy Story 2 (1999) Animation|Children's|Comedy 5727 5 958492554 M 25 4 92843
1000193 3176 Talented Mr. Ripley, The (1999) Drama|Mystery|Thriller 5727 3 958492554 M 25 4 92843
1000194 3177 Next Friday (1999) Comedy 5727 1 958489748 M 25 4 92843
1000195 3273 Scream 3 (2000) Horror|Mystery|Thriller 5727 4 958492479 M 25 4 92843
1000196 3301 Whole Nine Yards, The (2000) Comedy|Crime 5727 5 958490142 M 25 4 92843
1000197 3316 Reindeer Games (2000) Action|Thriller 5727 3 958490699 M 25 4 92843
1000198 3317 Wonder Boys (2000) Comedy|Drama 5727 4 958489879 M 25 4 92843
1000199 3408 Erin Brockovich (2000) Drama 5727 5 958489879 M 25 4 92843
1000200 3409 Final Destination (2000) Drama|Thriller 5727 4 958490143 M 25 4 92843
1000201 3481 High Fidelity (2000) Comedy 5727 4 958489879 M 25 4 92843
1000202 3483 Road to El Dorado, The (2000) Animation|Children's 5727 3 958490143 M 25 4 92843
1000203 3484 Skulls, The (2000) Thriller 5727 1 958489902 M 25 4 92843
1000204 3513 Rules of Engagement (2000) Drama|Thriller 5727 4 958489970 M 25 4 92843
1000205 3535 American Psycho (2000) Comedy|Horror|Thriller 5727 2 958489970 M 25 4 92843
1000206 3536 Keeping the Faith (2000) Comedy|Romance 5727 5 958489902 M 25 4 92843
1000207 3555 U-571 (2000) Action|Thriller 5727 3 958490699 M 25 4 92843
1000208 3578 Gladiator (2000) Action|Drama 5727 5 958490171 M 25 4 92843

1000209 rows × 10 columns

1.查看每部电影中两类性别的平均分

In [79]:
# a_movie = data_obj.groupby(['MovieID', 'Gender']).mean()
# out = DataFrame(a_movie['Rating'])
# out
out = DataFrame.pivot_table(data_obj, values='Rating', index='MovieID', columns='Gender', aggfunc='mean')
out
Out[79]:
Gender F M
MovieID
1 4.444444 4.285714
2 3.666667 3.500000
3 2.500000 2.000000
4 NaN 3.000000
5 NaN 4.000000
6 3.000000 4.000000
7 2.750000 3.500000
9 NaN 5.000000
10 5.000000 3.714286
11 4.000000 3.000000
13 3.000000 4.000000
14 5.000000 4.000000
15 NaN 2.000000
16 4.000000 3.777778
17 4.285714 3.900000
18 NaN 4.000000
19 NaN 2.000000
21 4.000000 3.166667
22 NaN 4.000000
23 3.000000 NaN
24 3.000000 3.333333
25 3.833333 4.500000
26 4.000000 NaN
28 3.000000 3.000000
29 3.000000 4.500000
30 NaN 4.500000
32 4.000000 3.916667
34 3.666667 3.866667
35 2.000000 5.000000
36 3.750000 3.714286
... ... ...
953 4.750000 4.800000
954 4.750000 4.000000
955 4.000000 5.000000
956 3.000000 NaN
957 NaN 4.000000
959 4.500000 NaN
960 4.000000 NaN
961 4.000000 4.000000
963 5.000000 5.000000
965 3.500000 4.000000
968 NaN 3.714286
969 4.200000 4.181818
971 4.500000 4.333333
973 5.000000 NaN
974 3.000000 NaN
976 3.000000 4.000000
978 3.000000 NaN
982 NaN 4.000000
986 1.000000 5.000000
991 NaN 3.000000
994 5.000000 4.000000
996 NaN 3.000000
999 NaN 4.000000
1003 NaN 2.000000
1005 1.000000 2.000000
1007 3.000000 2.000000
1009 5.000000 2.750000
1010 NaN 3.000000
1011 NaN 2.000000
1012 3.000000 3.000000

554 rows × 2 columns

2.查看每部电影中两类性别的均分差异最大的前3的电影

In [113]:
noNaN = out.dropna()
noNaN = noNaN.copy()
noNaN['|F-M|'] = abs(noNaN['F'] - noNaN['M'])
noNaN.sort_values("|F-M|",ascending=False)
noNaN.head(3)
Out[113]:
Gender F M |F-M|
MovieID
1 4.444444 4.285714 0.158730
2 3.666667 3.500000 0.166667
3 2.500000 2.000000 0.500000

3.查找出热门电影热度榜单 (计算表中电影出现的次数,取前3)

In [275]:
ratings_df.groupby('MovieID').count().sort_values('UserID', ascending=False).head(3)
Out[275]:
UserID Rating Timestamp
MovieID
2858 3428 3428 3428
260 2991 2991 2991
1196 2990 2990 2990

4.每部电影的评分(平均分) 获取排名前10的高分榜单

In [141]:
DataFrame(ratings_df.groupby('MovieID').mean().sort_values('Rating', ascending=False)['Rating'].head(10))
Out[141]:
Rating
MovieID
1978 5.0
1611 5.0
1963 5.0
1950 5.0
1939 5.0
1937 5.0
1827 5.0
1821 5.0
1769 5.0
1563 5.0

5.前十大热门电影__的评分

In [274]:
hot = ratings_df.groupby('MovieID').count().sort_values('UserID', ascending=False)['Rating']
DataFrame(hot.head(10))
Out[274]:
Rating
MovieID
2858 3428
260 2991
1196 2990
1210 2883
480 2672
2028 2653
589 2649
2571 2590
1270 2583
593 2578

6.前20名高分电影的__热度 (使用第3.热度表)

In [276]:
height20 = ratings_df.groupby('MovieID').mean().sort_values('Rating', ascending=False)['Rating']
DataFrame(height20.head(20))
Out[276]:
Rating
MovieID
989 5.000000
3881 5.000000
1830 5.000000
3382 5.000000
787 5.000000
3280 5.000000
3607 5.000000
3233 5.000000
3172 5.000000
3656 5.000000
3245 4.800000
53 4.750000
2503 4.666667
2905 4.608696
2019 4.560510
318 4.554558
858 4.524966
745 4.520548
50 4.517106
527 4.510417

7.查询出前十好电影

好电影:

1.看的人多(点击在1000以上)

2.分数还高(按Rating降序)

In [277]:
DataFrame(height20[hot[hot>1000].head(20).index].sort_values(ascending=False))
Out[277]:
Rating
MovieID
527 4.510417
1198 4.477725
260 4.453694
2762 4.406263
593 4.351823
2028 4.337354
2858 4.317386
2571 4.315830
1197 4.303710
1196 4.292977
608 4.254676
110 4.234957
1617 4.219406
2396 4.127480
589 4.058513
1210 4.022893
1270 3.990321
1265 3.953029
480 3.763847
1580 3.739953

- - - - - - - - - - - - - - - - - - - - - - - - -

使用的数据集是美国人口普查的数据

In [286]:
census_df = pd.read_csv('./data/census.csv', engine='python')
census_df
Out[286]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
0 40 3 6 1 0 Alabama Alabama 4779736 4780127 4785161 ... 0.002295 -0.193196 0.381066 0.582002 -0.467369 1.030015 0.826644 1.383282 1.724718 0.712594
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411
6 50 3 6 1 11 Alabama Bullock County 10914 10915 10887 ... -30.953709 -5.180127 -1.130263 14.354290 -16.167247 -29.001673 -2.825524 1.507017 17.243790 -13.193961
7 50 3 6 1 13 Alabama Butler County 20947 20946 20944 ... -14.032727 -11.684234 -5.655413 1.085428 -6.529805 -13.936612 -11.586865 -5.557058 1.184103 -6.430868
8 50 3 6 1 15 Alabama Calhoun County 118572 118586 118437 ... -6.155670 -4.611706 -5.524649 -4.463211 -3.376322 -5.791579 -4.092677 -5.062836 -3.912834 -2.806406
9 50 3 6 1 17 Alabama Chambers County 34215 34170 34098 ... -2.731639 3.849092 2.872721 -2.287222 1.349468 -1.821092 4.701181 3.781439 -1.290228 2.346901
10 50 3 6 1 19 Alabama Cherokee County 25989 25986 25976 ... 6.339327 1.113180 5.488706 -0.076806 -3.239866 6.416167 1.420264 5.757384 0.230419 -2.931307
11 50 3 6 1 21 Alabama Chilton County 43643 43631 43665 ... -1.372935 -2.653369 0.480044 0.456017 -2.253483 -0.823761 -2.447504 0.868651 0.957636 -1.752709
12 50 3 6 1 23 Alabama Choctaw County 13859 13858 13841 ... -15.455274 -0.737028 -8.766391 -1.274984 -5.291205 -15.528177 -0.737028 -8.766391 -1.274984 -5.291205
13 50 3 6 1 25 Alabama Clarke County 25833 25840 25767 ... -6.194363 -17.667705 -0.318345 -8.686428 -5.613667 -6.077488 -17.509958 -0.159172 -8.486280 -5.411736
14 50 3 6 1 27 Alabama Clay County 13932 13932 13880 ... -10.744102 -13.345130 4.902871 5.702648 3.912450 -10.816697 -13.345130 4.977157 5.776708 3.986270
15 50 3 6 1 29 Alabama Cleburne County 14972 14972 14973 ... -3.673524 -5.151880 7.345821 3.654485 -3.123961 -3.673524 -5.151880 7.345821 3.654485 -3.123961
16 50 3 6 1 31 Alabama Coffee County 49948 49948 50177 ... 0.377640 7.675579 -13.146535 -3.602859 2.214774 2.166460 11.513368 -10.438741 -0.767822 5.350738
17 50 3 6 1 33 Alabama Colbert County 54428 54428 54514 ... -0.073423 1.065051 1.762390 1.835688 -0.110260 0.513964 1.469035 2.276420 2.533249 0.588052
18 50 3 6 1 35 Alabama Conecuh County 13228 13228 13208 ... -4.861559 -7.504690 -6.107224 -14.645416 2.684140 -4.861559 -7.504690 -6.107224 -14.645416 2.684140
19 50 3 6 1 37 Alabama Coosa County 11539 11758 11758 ... -33.930581 -10.291443 -4.313831 -22.958017 -5.387581 -34.017138 -10.380162 -4.403703 -23.049483 -5.387581
20 50 3 6 1 39 Alabama Covington County 37765 37765 37796 ... 6.696899 -4.612668 0.740271 3.697932 -0.316945 6.881460 -4.559952 0.793147 3.750759 -0.264121
21 50 3 6 1 41 Alabama Crenshaw County 13906 13906 13853 ... 1.729792 3.950156 -1.864936 3.084648 3.439504 2.666763 5.099293 -0.502098 4.734577 5.087600
22 50 3 6 1 43 Alabama Cullman County 80406 80410 80473 ... -1.404233 -1.019628 4.071247 5.087142 7.915406 -1.031427 -0.634159 4.542916 5.593387 8.417777
23 50 3 6 1 45 Alabama Dale County 50251 50251 50358 ... -10.749798 -5.277150 -15.236079 -11.979785 -5.107706 -9.575283 -0.776637 -12.640155 -9.503292 -1.998668
24 50 3 6 1 47 Alabama Dallas County 43820 43820 43803 ... -15.635599 -11.308243 -16.745678 -9.344789 -14.687232 -15.727573 -11.378047 -16.792849 -9.368689 -14.711389
25 50 3 6 1 49 Alabama DeKalb County 71109 71115 71142 ... 0.294677 -9.302391 -1.748807 0.267830 0.028141 1.375159 -8.656001 -1.029539 1.198187 0.956790
26 50 3 6 1 51 Alabama Elmore County 79303 79296 79465 ... 3.235576 0.822717 1.760531 -1.507057 2.067820 3.674511 1.558176 2.306047 -0.951175 2.757093
27 50 3 6 1 53 Alabama Escambia County 38319 38319 38309 ... -3.449988 -3.855889 -4.822706 -1.189831 1.190902 -3.397716 -3.803428 -4.769999 -1.136950 1.243830
28 50 3 6 1 55 Alabama Etowah County 104430 104427 104442 ... -1.015919 2.062637 -1.931884 -1.726932 -2.082234 -0.632554 2.446383 -1.518596 -1.234901 -1.588308
29 50 3 6 1 57 Alabama Fayette County 17241 17241 17231 ... -5.015601 -0.646640 -3.725937 0.296745 -2.797536 -5.132243 -0.705426 -3.785079 0.237396 -2.857058
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3163 50 2 3 55 131 Wisconsin Washington County 131887 131885 131967 ... -0.794876 0.785279 -2.215465 1.601149 -0.434498 -0.431504 1.162817 -1.763330 2.104796 0.059931
3164 50 2 3 55 133 Wisconsin Waukesha County 389891 389938 390076 ... -0.765799 2.128860 0.038132 0.760109 -0.719858 0.102448 3.180527 1.189727 2.077633 0.593567
3165 50 2 3 55 135 Wisconsin Waupaca County 52410 52410 52422 ... 3.111756 -2.241873 6.292687 -0.441031 -0.480617 3.359933 -2.011937 6.561277 -0.134227 -0.173022
3166 50 2 3 55 137 Wisconsin Waushara County 24496 24496 24506 ... 4.930022 -2.404973 -4.097017 -4.906711 -4.397793 5.174486 -2.160399 -3.810226 -4.535615 -4.024395
3167 50 2 3 55 139 Wisconsin Winnebago County 166994 166994 167059 ... 0.316712 2.889873 0.833819 -2.406192 -4.557985 0.842573 3.502335 1.531624 -1.545153 -3.685304
3168 50 2 3 55 141 Wisconsin Wood County 74749 74749 74807 ... -4.081523 -5.019090 -6.901200 -5.596471 -3.958322 -3.733590 -4.562809 -6.442917 -5.040889 -3.414223
3169 40 4 8 56 0 Wyoming Wyoming 563626 563767 564516 ... -0.381530 9.636214 4.487115 -4.788275 -3.221091 0.289680 10.694870 5.440390 -3.727831 -2.091573
3170 50 4 8 56 1 Wyoming Albany County 36299 36299 36428 ... 3.708956 2.637812 -3.544634 -3.334877 -9.911169 6.736119 6.433032 0.719587 1.429233 -5.166460
3171 50 4 8 56 3 Wyoming Big Horn County 11668 11668 11672 ... 4.868258 2.804930 16.815908 -8.026420 5.095861 4.868258 3.144921 17.236306 -7.608378 5.513554
3172 50 4 8 56 5 Wyoming Campbell County 46133 46133 46244 ... -2.843479 15.601020 -5.895711 -8.550911 10.916963 -2.649606 15.558684 -5.916543 -8.509402 10.978525
3173 50 4 8 56 7 Wyoming Carbon County 15885 15885 15837 ... -7.581980 -13.081441 3.178134 -2.970641 -23.300971 -7.392431 -12.636926 3.623073 -2.338590 -22.600668
3174 50 4 8 56 9 Wyoming Converse County 13833 13833 13826 ... -12.847499 15.493820 19.035533 -20.550587 -0.070403 -12.774915 16.502720 20.093063 -19.358233 1.126443
3175 50 4 8 56 11 Wyoming Crook County 7083 7083 7114 ... -1.544618 -4.202564 1.397819 6.378258 18.629317 -0.982939 -3.642222 2.096729 7.071547 19.309219
3176 50 4 8 56 13 Wyoming Fremont County 40123 40123 40222 ... 2.747083 7.782673 -4.990688 -12.331633 -13.673610 3.093562 8.027411 -4.747240 -12.013555 -13.352750
3177 50 4 8 56 15 Wyoming Goshen County 13249 13247 13408 ... 14.293649 3.961413 -8.079028 -7.017803 -11.899450 14.886132 4.841727 -6.903896 -5.761986 -10.635133
3178 50 4 8 56 17 Wyoming Hot Springs County 4812 4812 4813 ... 3.322604 6.208609 3.095336 -6.017222 -5.454164 5.191569 6.001656 2.888981 -6.224712 -5.663940
3179 50 4 8 56 19 Wyoming Johnson County 8569 8569 8581 ... 4.995063 -4.058912 -0.812583 -10.715742 0.933652 5.227392 -4.058912 -0.812583 -10.715742 0.933652
3180 50 4 8 56 21 Wyoming Laramie County 91738 91881 92271 ... -1.200428 15.547274 4.787847 -1.226133 0.278940 -0.973320 17.914554 6.003143 -0.207819 1.673640
3181 50 4 8 56 23 Wyoming Lincoln County 18106 18106 18091 ... -9.802564 -11.566801 13.564556 6.125989 1.555544 -9.691801 -11.566801 13.619696 6.234414 1.662823
3182 50 4 8 56 25 Wyoming Natrona County 75450 75450 75472 ... 7.189319 23.066162 24.322042 -0.958472 -0.061057 7.689674 23.749508 25.085233 -0.110593 0.793743
3183 50 4 8 56 27 Wyoming Niobrara County 2484 2484 2492 ... -0.401849 0.806452 29.066295 -12.603387 7.492114 -0.401849 0.806452 29.066295 -12.603387 7.492114
3184 50 4 8 56 29 Wyoming Park County 28205 28205 28259 ... 4.582951 8.057765 7.641997 -9.252437 -2.878980 6.486639 11.127389 10.877797 -5.585731 0.856839
3185 50 4 8 56 31 Wyoming Platte County 8667 8667 8678 ... 4.373094 5.392073 2.634593 6.055759 4.662270 4.373094 4.933173 2.176403 5.598720 4.207414
3186 50 4 8 56 33 Wyoming Sheridan County 29116 29116 29146 ... 0.958559 8.425487 4.546373 3.678069 -3.298406 2.122524 9.342778 5.523001 4.781489 -2.198937
3187 50 4 8 56 35 Wyoming Sublette County 10247 10247 10244 ... -23.741784 15.272374 -40.870074 -16.596273 -22.870900 -21.092907 16.828794 -39.211861 -14.409938 -20.664059
3188 50 4 8 56 37 Wyoming Sweetwater County 43806 43806 43593 ... 1.072643 16.243199 -5.339774 -14.252889 -14.248864 1.255221 16.243199 -5.295460 -14.075283 -14.070195
3189 50 4 8 56 39 Wyoming Teton County 21294 21294 21297 ... -1.589565 0.972695 19.525929 14.143021 -0.564849 0.654527 2.408578 21.160658 16.308671 1.520747
3190 50 4 8 56 41 Wyoming Uinta County 21118 21118 21102 ... -17.755986 -4.916350 -6.902954 -14.215862 -12.127022 -18.136812 -5.536861 -7.521840 -14.740608 -12.606351
3191 50 4 8 56 43 Wyoming Washakie County 8533 8533 8545 ... -11.637475 -0.827815 -2.013502 -17.781491 1.682288 -11.990126 -1.182592 -2.250385 -18.020168 1.441961
3192 50 4 8 56 45 Wyoming Weston County 7208 7208 7181 ... -11.752361 -8.040059 12.372583 1.533635 6.935294 -12.032179 -8.040059 12.372583 1.533635 6.935294

3193 rows × 100 columns

2010年中 以每个州(STNAME)人口中 最多的 3 个县的人口总和为这个州人口的衡量标准,问哪 3 个州对应的人口最多?

In [329]:
DataFrame(census_df.sort_values(['STNAME', 'POPESTIMATE2010'],ascending=[True,False]).groupby(['STNAME']).head(3).groupby('STNAME')['POPESTIMATE2010'].sum().sort_values(ascending=False).head(3))
Out[329]:
POPESTIMATE2010
STNAME
California 50264474
Texas 31726014
New York 24148828

各个州、县 在 2010 年至 2012 年间人口平均量是怎么样的?

In [351]:
temp = census_df[["POPESTIMATE2010","POPESTIMATE2011","POPESTIMATE2012"]]
# census_df["three-year-total"] = temp.sum(axis=1) #axis 0为列,1为行
census_df["three-year-avg"] = temp.mean(axis=1)
# census_df[['STNAME', 'CTYNAME', 'three-year-total', 'three-year-avg']]
census_df[['STNAME', 'CTYNAME', 'three-year-avg']].head(10)
Out[351]:
STNAME CTYNAME three-year-avg
0 Alabama Alabama 4.800786e+06
1 Alabama Autauga County 5.502933e+04
2 Alabama Baldwin County 1.867493e+05
3 Alabama Barbour County 2.724200e+04
4 Alabama Bibb County 2.274533e+04
5 Alabama Blount County 5.762000e+04
6 Alabama Bullock County 1.070733e+04
7 Alabama Butler County 2.067500e+04
8 Alabama Calhoun County 1.178303e+05
9 Alabama Chambers County 3.405533e+04

在 2010 年至 2012 年间人口增加数量最多的是哪3个县?

In [352]:
census_df['Δ/➖'] = census_df['POPESTIMATE2012'] - census_df['POPESTIMATE2010']
DataFrame(census_df[['STNAME', 'CTYNAME', 'Δ/➖']].sort_values('Δ/➖', ascending=False).head(3))
Out[352]:
STNAME CTYNAME Δ/➖
2566 Texas Texas 845378
190 California California 721976
329 Florida Florida 502131

yuehui.163.com 数据分析

In [1041]:
girls_df = pd.read_csv('./data/girls', encoding='utf-8', sep='::', engine='python')
girls_df.head(10)
Out[1041]:
会员ID 昵称 年龄 省市 市区县 星座 学历 学历转分数 工资 婚否 身高 体重 职业 性别 头像
0 30135985 做个小女人 47 北京 朝阳区 狮子座 本科 60 9000 1 162 70 其他行业 0 http://yuehui1.nos.netease.com/images/portrait...
1 37827353 云海 35 北京 西城区 摩羯座 硕士 70 0 1 169 55 主管/经理 0 http://yuehui1.nos.netease.com/images/portrait...
2 47845164 紫色柔情 47 北京 海淀区 摩羯座 本科 60 6500 1 163 55 教师 0 http://yuehui1.nos.netease.com/images/portrait...
3 615430496 回眸一笑 48 北京 海淀区 摩羯座 本科 60 6500 1 163 54 教师 0 http://yuehui1.nos.netease.com/images/portrait...
4 615995687 空谷幽兰 47 北京 海淀区 摩羯座 本科 60 6500 1 163 55 教师 0 http://yuehui1.nos.netease.com/images/portrait...
5 32714149 风中百合 65 北京 西城区 射手座 本科 60 6500 1 163 50 其他行业 0 http://yuehui1.nos.netease.com/images/portrait...
6 15735842 小草 55 北京 水瓶座 本科 60 2500 1 165 58 工程师 0 http://yuehui1.nos.netease.com/images/portrait...
7 700172079 回眸一笑 50 北京 朝阳区 天蝎座 专科 50 9000 1 163 60 传媒/影视 0 http://yuehui1.nos.netease.com/images/portrait...
8 19335879 雨后彩虹 56 北京 朝阳区 摩羯座 专科 50 6500 1 168 63 其他行业 0 http://yuehui1.nos.netease.com/images/portrait...
9 30493341 烟雨蒙蒙 46 北京 东城区 水瓶座 本科 60 0 1 160 59 其他行业 0 http://yuehui1.nos.netease.com/images/portrait...

-1. 得到男女生数据总和

In [1042]:
sex_count = DataFrame(girls_df.groupby('性别').count()['会员ID'])
sex_count.rename(columns={'会员ID': '人数'}, inplace=True)
sex_count
Out[1042]:
人数
性别
0 25484
1 19197

0. 分析每个地区的男女生比例

In [1043]:
temp_copy = girls_df
# 改名
temp_copy['性别'] = girls_df['性别'].replace([0,1],['女','男'])
# 总和
total = temp_copy.groupby('省市')[['会员ID']].count()
# 人数 / 总和
province_sex = temp_copy.groupby(['省市', '性别'])[['会员ID']].count()
evey_province_sex_percent = province_sex / total
evey_province_sex_percent.rename(columns={'会员ID': '人数所占百分比'}, inplace=True)
evey_province_sex_percent
Out[1043]:
人数所占百分比
省市 性别
上海 0.821717
0.178283
乌鲁木齐 0.413987
0.586013
兰州 0.473684
0.526316
北京 0.819867
0.180133
南京 0.571429
0.428571
南宁 0.583333
0.416667
南昌 0.363636
0.636364
台湾 0.615385
0.384615
合肥 0.461538
0.538462
呼和浩特 0.615385
0.384615
哈尔滨 0.552898
0.447102
天津 0.872491
0.127509
太原 0.600000
0.400000
广州 0.600000
0.400000
成都 0.388889
0.611111
... ... ...
沈阳 0.625578
0.374422
济南 0.700000
0.300000
海口 0.475000
0.525000
澳门 0.333333
0.666667
石家庄 0.700000
0.300000
福州 0.416667
0.583333
西宁 0.526316
0.473684
西安 0.546218
0.453782
贵阳 0.666667
0.333333
郑州 0.413043
0.586957
重庆 0.663551
0.336449
银川 0.459550
0.540450
长春 0.500000
0.500000
长沙 0.526316
0.473684
香港 0.410324
0.589676

68 rows × 1 columns

1. 查看那5个星座发布的交友信息最多

In [1044]:
# 计数
mostPostStart = DataFrame(girls_df.groupby('星座').count()['会员ID'])
# 改一下列名
mostPostStart.rename(columns={'会员ID': '发部次数'}, inplace=True)
# 输出前5
mostPostStart.sort_values(by='发部次数', ascending=False).head(5)
Out[1044]:
发部次数
星座
摩羯座 6053
双子座 5331
天秤座 3806
金牛座 3492
巨蟹座 3422

2. 查看薪资排名前10 的省份

薪资为保密,0,空值的不进行计算

薪资在区间内的取其均值

50000以上的视为50000 2000以下的视为2000

In [1045]:
# 过滤掉工资为 -1 的行 不统计
have_income = girls_df[~girls_df['工资'].isin([-1])]

# 省份人数总工资 / 计数
DataFrame(have_income.groupby('省市').sum()['工资'] / girls_df.groupby('省市').count()['会员ID'], columns=['平均薪资']).sort_values(by='平均薪资', ascending=False).head(10)
Out[1045]:
平均薪资
省市
香港 14634.017207
澳门 12677.777778
台湾 10237.891738
杭州 9697.530864
成都 9047.668038
北京 8617.494440
南京 8280.864198
福州 8011.316872
长沙 7850.227420
广州 7762.962963

3. 查看哪10个年龄的交友需求比较多 (过滤掉所有无效数据)

In [1046]:
# 计数
mostPostAge = DataFrame(girls_df.groupby('年龄').count()['会员ID'].sort_values(ascending=False))
mostPostAge.rename(columns={'会员ID': '发布次数'}, inplace=True)
mostPostAge.head(10)
Out[1046]:
发布次数
年龄
36 2134
37 2054
34 1925
32 1913
29 1898
33 1855
35 1852
31 1846
39 1839
30 1793

4. 得到各个年龄段的男女的人数

In [1047]:
# 创建副本
temp_copy = girls_df
temp_copy['性别']=temp_copy['性别'].replace([0,1],['女','男'])
# 按年龄 性别 计算数量
all_age_count = DataFrame(DataFrame(temp_copy.groupby(['年龄', '性别']).count())['会员ID'])
all_age_count.rename(columns={'会员ID': '人数'}, inplace=True)
all_age_count
Out[1047]:
人数
年龄 性别
18 99
68
19 135
88
20 132
108
21 216
116
22 456
257
23 326
132
24 472
154
25 495
179
26 824
206
27 903
252
28 1103
358
29 1344
554
30 1251
542
31 1255
591
32 1201
712
... ... ...
74 2
2
75 6
76 11
6
77 2
5
78 17
6
79 16
15
80 15
17
81 4
12
82 17
83 14
11
84 15
29
85 23
17
86 11
9
87 8
6
88 59
45
89 142
107

142 rows × 1 columns

5. 各个学历的占比

In [1076]:
# 合并 空 和 高中以下学历
# girls_df['学历']=girls_df['学历'].replace(['空'],['高中以下'])
# degree_percent.groupby(['学历']).count()
# temp_copy.groupby(['学历']).count()
# 计算 每个学历 对应的总数
degree_temp = DataFrame(DataFrame(temp_copy.groupby(['学历']).count())['会员ID'])
# degree_temp
# 换列名
degree_temp.rename(columns={'会员ID': '百分比'}, inplace=True)

# degree_temp

# degree_temp['学历']['高中以下'] + degree_temp['学历']['空'] 

# 按列计算每个元素所占百分比
DataFrame(degree_temp['百分比'].apply(lambda x :str((x/degree_temp['百分比'].sum()) * 100) + '%')).sort_values(by='百分比', ascending=False)
Out[1076]:
百分比
学历
高中 8.690494841207672%
高中以下 8.430876658982566%
6.049551263400551%
硕士 5.049126026722768%
本科 46.08894160828988%
专科 24.03706273360041%
博士 1.6539468677961549%

6. 得出前10个平均学历最高的城市

无学历的默认为高中学历

假设学历满分为100

高中以下30分,高中40分,专科50分,本科60分,硕士70分,博士80分

In [1049]:
degree_height = girls_df.groupby(['省市', '市区县']).mean().sort_values(by='学历转分数', ascending=False)[['学历转分数']]
# 换列名
degree_height.rename(columns={'学历转分数': '平均学历分数'}, inplace=True)
degree_height.head(10)
Out[1049]:
平均学历分数
省市 市区县
哈尔滨 宁安市 80.000000
抚远县 70.000000
南宁 隆安县 66.666667
石家庄 元氏县 60.000000
哈尔滨 东宁县 60.000000
木兰县 60.000000
石家庄 栾城县 60.000000
南京 58.252427
白下区 58.000000
哈尔滨 让胡路区 58.000000

7. 得到身材相对匀称 人数最多的10个省份

按照BMI指数得到结果

按照:70kg÷(1.75×1.75)=22.86)的比例

结果 如果在18.5-23.9之间的 属于标准身材

In [1067]:
body_thin = girls_df
# body_thin.copy()
# 过滤掉 身高 / 体重 为 0 的行 不统计
body_thin = body_thin[~body_thin['身高'].isin([0])]
body_thin = body_thin.copy()

body_thin['BMI'] = body_thin['体重'] / (body_thin['身高'] * body_thin['身高'] / 10000)

thin_filter = body_thin[(body_thin['BMI'] >= 18.5) & (body_thin['BMI'] <= 23.9)]
thin_temp = thin_filter.groupby('省市').count()[['会员ID']].sort_values(by='会员ID', ascending=False).head(10)
thin_temp
Out[1067]:
会员ID
省市
北京 1657
上海 1492
天津 1271
重庆 1037
成都 915
长沙 880
兰州 804
西宁 778
西安 771
香港 739

8. 三个图

身高 体重散点图

top10 薪资省份(图任选)

满足bmi指数的人数最多的10个省份(柱状图,类似效果)

In [1051]:
temp = girls_df
temp = temp[(girls_df['身高'] < 240) & (girls_df['体重'] < 400)]
plt.scatter(temp['身高'], temp['体重'])
Out[1051]:
<matplotlib.collections.PathCollection at 0x1d418b075f8>
In [1052]:
# 过滤掉工资为 -1 的行 不统计
# 省份人数总工资 / 计数
income_10 = DataFrame(girls_df[~girls_df['工资'].isin([-1])].groupby('省市').sum()['工资'] / girls_df.groupby('省市').count()['会员ID'], columns=['平均薪资']).sort_values(by='平均薪资', ascending=False).head(10)
plt.bar(Series(income_10.index), income_10['平均薪资'], width=0.5)
plt.show()
In [1074]:
plt.bar(Series(thin_temp.index), thin_temp['会员ID'], width=0.5)
plt.show()