Project Introduction
Project address: https://www.kaggle.com/fivethirtyeight/2016-election-polls
The dataset contains 27 columns of polling data for the 2016 U.S. general election, covering the period from November 2015 to November 2016.
Purpose of the project: to analyze the month-by-month trend of the poll results.
Knowledge points involved:
- The higher-order function `filter`
- NumPy reading a text file
- Working with date format data
- Slices and indexes of numpy
- Statistical methods of NumPy
- List comprehensions
- The built-in `zip` function
- Matplotlib for simple data visualization
project code
# -*- coding: utf-8 -*-
"""Monthly trend analysis of the 2016 U.S. presidential election polls.

Reads ``./presidential_polls.csv``, sums the raw and adjusted poll values of
Clinton and Trump for every calendar month, and shows the four monthly series
as line and bar charts.
"""
import datetime

import numpy as np

# Columns read from the CSV; the loaded array keeps them in this order.
_USE_COL_NAMES = ['enddate', 'rawpoll_clinton', 'rawpoll_trump',
                  'adjpoll_clinton', 'adjpoll_trump']

# Order of the numeric values inside each per-month result tuple.
_POLL_COL_NAMES = ('rawpoll_clinton', 'adjpoll_clinton',
                   'rawpoll_trump', 'adjpoll_trump')

# Accepted end-date layouts: four-digit year first, two-digit year as fallback.
_DATE_FORMATS = ('%m/%d/%Y', '%m/%d/%y')


def main():
    """Load the poll data, aggregate it by month and display the charts."""
    # Data file address
    filename = './presidential_polls.csv'

    month_array, columns = _load_polls(filename)
    results = _monthly_sums(month_array, columns)
    if not results:
        # Nothing to unpack or plot when the file holds no data rows.
        print('No poll rows found in %s' % filename)
        return
    _plot_results(results)


def _parse_date(date_str):
    """Convert a ``month/day/year`` string into a ``datetime`` object.

    Raises:
        ValueError: if the string matches none of the accepted formats.
    """
    for fmt in _DATE_FORMATS:
        try:
            return datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    raise ValueError('Unrecognised date string: %r' % (date_str,))


def _load_polls(filename):
    """Read the columns of interest and derive a 'YYYY-MM' label per row.

    Returns:
        A ``(month_array, columns)`` pair: ``month_array`` is a string array
        with one 'YYYY-MM' label per data row, and ``columns`` maps each name
        in ``_USE_COL_NAMES`` to its 1-D string array.
    """
    # Read the column names, i.e. the first row of the file.  rstrip() drops
    # the line terminator only, so a header without a trailing newline keeps
    # its last character and a CRLF file does not leave a stray '\r'.
    with open(filename, 'r') as f:
        col_names_str = f.readline().rstrip('\r\n')

    # Split the header and find the index of each column we need.
    col_name_lst = col_names_str.split(',')
    use_col_index_lst = [col_name_lst.index(use_col_name)
                         for use_col_name in _USE_COL_NAMES]

    data_array = np.loadtxt(filename,                   # file name
                            delimiter=',',              # delimiter
                            skiprows=1,                 # skip the header row
                            dtype=str,                  # keep raw strings
                            usecols=use_col_index_lst,  # columns to read
                            ndmin=2)                    # 2-D even for one row

    columns = {name: data_array[:, idx]
               for idx, name in enumerate(_USE_COL_NAMES)}

    # Unify the date separator, then build the year-month labels.
    enddate_lst = [enddate.replace('-', '/')
                   for enddate in columns['enddate'].tolist()]
    date_lst = [_parse_date(enddate) for enddate in enddate_lst]
    month_lst = ['%d-%02d' % (date_obj.year, date_obj.month)
                 for date_obj in date_lst]

    return np.array(month_lst), columns


def _monthly_sums(month_array, columns):
    """Sum every poll column per month.

    Returns:
        A list of ``(month, rawpoll_clinton, adjpoll_clinton, rawpoll_trump,
        adjpoll_trump)`` tuples, one per distinct month, in sorted order.
    """
    results = []
    for month in np.unique(month_array):
        in_month = month_array == month
        sums = tuple(get_sum(columns[name][in_month])
                     for name in _POLL_COL_NAMES)
        results.append((month,) + sums)
    return results


def _plot_results(results):
    """Draw line and bar charts of the raw (top) and adjusted (bottom) sums."""
    # Imported here so the numeric helpers above can be imported and used
    # without a plotting backend being installed.
    import matplotlib.pyplot as plt

    months, raw_clinton_sum, adj_clinton_sum, raw_trump_sum, adj_trump_sum = \
        zip(*results)

    _, subplot_arr = plt.subplots(2, 2, figsize=(15, 10))

    width = 0.25
    x = np.arange(len(months))
    series_per_row = ((raw_clinton_sum, raw_trump_sum),
                      (adj_clinton_sum, adj_trump_sum))

    for row, (clinton_sum, trump_sum) in enumerate(series_per_row):
        # Left column: trend lines.
        subplot_arr[row, 0].plot(clinton_sum, color='r')
        subplot_arr[row, 0].plot(trump_sum, color='g')

        # Right column: paired bars.  align='edge' makes the two bars meet at
        # x + width, which is exactly where the month tick is placed.
        subplot_arr[row, 1].bar(x, clinton_sum, width, color='r',
                                align='edge')
        subplot_arr[row, 1].bar(x + width, trump_sum, width, color='g',
                                align='edge')
        subplot_arr[row, 1].set_xticks(x + width)
        subplot_arr[row, 1].set_xticklabels(months, rotation='vertical')

    plt.subplots_adjust(wspace=0.2)
    plt.show()


def is_convert_float(s):
    """Return True if ``s`` can be converted to ``float``, else False."""
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True


def get_sum(str_array):
    """Return the sum of the numeric entries of a sequence of strings.

    Entries that cannot be converted to ``float`` are ignored; an empty
    input (or one without any numeric entry) sums to 0.0.
    """
    # Remove data that cannot be converted to numbers.  list() is required
    # because filter() returns a lazy iterator on Python 3.
    cleaned_data = list(filter(is_convert_float, str_array))
    # Convert with the same float() used for the check, so every entry that
    # passed the filter is guaranteed to convert.
    float_array = np.array([float(value) for value in cleaned_data],
                           dtype=float)
    return np.sum(float_array)


if __name__ == '__main__':
    main()
Python data analysis U.S. election Project Combat (iii)