Mason Meeks
Since the mid-2010s, it feels as though our world and society have changed drastically, especially in how we engage politically. Using data from the Crowd Counting Consortium, I will be investigating the ebb and flow of protests from 2020 to 2022. Specifically, I will be using protest data from July of each of these years. The primary reason for using a single month from each year is that July falls in the middle of the part of the year in which the majority of protests happen, so one month per year keeps the analysis simpler while still providing plenty of data.
I will also be using hate crime data from the FBI's Crime Data Explorer, which will allow me to find correlations and build models to investigate the connection between a particular state's protest-based political affiliation, protest activity, and hate crime frequency. Lastly, I will be using U.S. Census Bureau data to control for population.
The overall goal of this project is two-fold: 1) to determine which states see the most per capita hate crimes and protests, and whether those factors are connected with a state's political affiliation, and 2) to assess whether the increased media coverage of protests and violent hate crimes accurately reflects reality.
The GitHub page for this notebook can be found here
# clone the course repository, change to right directory, and import libraries.
%cd /content
!git clone https://github.com/mmeekstu/data_science.git
%cd /content/data_science/'Project Data'/'July Data'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
/content Cloning into 'data_science'... remote: Enumerating objects: 212, done. remote: Counting objects: 100% (78/78), done. remote: Compressing objects: 100% (54/54), done. remote: Total 212 (delta 30), reused 66 (delta 24), pack-reused 134 Receiving objects: 100% (212/212), 18.75 MiB | 18.34 MiB/s, done. Resolving deltas: 100% (89/89), done. /content/data_science/Project Data/July Data
This data comes from the Crowd Counting Consortium (found here), which tracks protests, demonstrations, rallies, vigils, and similar events throughout the U.S. The creators of this data have coded many aspects of each event, including its location, size, claim, groups, interactions among protesters, interactions with police, and many others. I chose this data because it should give me an abundance of information that will allow me to ask and potentially answer many types of questions. The initial question I hope to investigate with this data is how the frequency of protests and the number of protesters have changed since 2020. With major elections, COVID, and massive public responses to policies and governmental actions, I expect to find a lot of interesting statistics that will shape the overall goal of this project.
# Reading in csv files
df_2020 = pd.read_csv('Crowd Estimates July 2020 - Tally.csv')
df_2021 = pd.read_csv('Crowd Estimates July 2021 - Tally.csv')
df_2022 = pd.read_csv('Crowd Estimates July 2022 - Tally.csv')
After reading the csv files in, I am going to add a year column to each with their corresponding year. This will make grouping them quick and simple later on.
# Adding a year column to each
df_2020['Year'] = 2020
df_2021['Year'] = 2021
df_2022['Year'] = 2022
# Creating a list for easy access
df_list = [df_2020, df_2021, df_2022]
Now for a quick display of the first row of each file. This will show how messy or neat the data is, and allow me to understand where I need to start organizing and tidying.
for df in df_list:
display(df.head(1))
CityTown | Location | County | StateTerritory | Country | Date | EstimateText | EstimateLow | BestGuess | EstimateHigh | AdjustedLow | AdjustedHigh | Actor | Claim | Pro(2)/Anti(1) | EventType | ReportedArrests | ReportedParticipantInjuries | ReportedPoliceInjuries | ReportedPropertyDamage | TownsCities | Events | MacroEvent | Misc. | Source1 | Source2 | Source3 | Source4 | Source5 | Source6 | Unnamed: 30 | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ashland | Memorial Park; streets of Ashland; Highway 13 ... | NaN | WI | US | 2020-07-01 | NaN | NaN | NaN | NaN | NaN | NaN | water protectors | against recloation of Enbridge Line 5 Pipeline... | 1 | protest | 0 | 0 | 0.0 | 0.0 | 1 | 1 | NaN | NaN | https://kbjr6.com/2020/07/01/protesters-march-... | NaN | NaN | NaN | NaN | NaN | NaN | 2020 |
date | locality | state | location | title | size_text | size_low | size_high | organizations | participants | claims | valence | event_type | police_measures | participant_measures | police_injuries | participant_injuries | arrests | property_damage | police_deaths | participant_deaths | macroevent | notes | coder | source1 | source2 | source3 | source4 | source5 | source6 | source7 | source8 | source9 | source10 | source11 | source12 | source13 | source14 | source15 | source16 | source17 | source18 | source19 | source20 | source21 | source22 | source23 | source24 | source25 | source26 | source27 | source28 | source29 | source30 | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-07-01 | Addison | IL | Portillo's Food Service | NaN | 17 | 17.0 | 17.0 | Arise Chicago | workers | for better working conditions and higher pay f... | 0.0 | strike; picket | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | https://chicago.suntimes.com/2021/7/2/22561028... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2021 |
date | locality | state | location | title | size_text | size_low | size_high | organizations | participants | claims | valence | event_type | police_measures | participant_measures | police_injuries | participant_injuries | arrests | property_damage | police_deaths | participant_deaths | macroevent | notes | coder | source1 | source2 | source3 | source4 | source5 | source6 | source7 | source8 | source9 | source10 | source11 | source12 | source13 | source14 | source15 | source16 | source17 | source18 | source19 | source20 | source21 | source22 | source23 | source24 | source25 | source26 | source27 | source28 | source29 | source30 | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-07-01 | Akron | OH | Harold K. Stubbs Justice Center | NaN | about 20 | 20.0 | 20.0 | The Freedom BLOC | NaN | for justice for Jayland Walker, against police... | 1.0 | protest | NaN | conversation with counter-protester | NaN | NaN | NaN | NaN | NaN | NaN | 20220701-akron-jaylandwalker | NaN | NaN | https://twitter.com/sahrasulaiman/status/15431... | https://fox8.com/news/elected-officials-callin... | https://www.wkyc.com/article/news/local/akron/... | https://www.beaconjournal.com/story/news/2022/... | https://www.instagram.com/p/Cfcmqv6JPtD/ | https://www.instagram.com/p/CfepN98pt0G/ | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2022 |
There are a lot of useless columns that need to be dropped. Also, this data is quite messy and is not consistently titled across the years. I am going to iterate through the columns and drop the ones I don't want.
# Dropping a lot of data that will not be needed for the project
drop_keys = ['source', 'Source', 'macroevent', 'MacroEvent', 'Misc.', 'Unnamed', 'notes',
             'coder', 'TownsCities', 'Events', 'BestGuess', 'AdjustedLow', 'AdjustedHigh',
             'EstimateText', 'County', 'size_text', 'title']
df_2020 = df_2020.drop(columns=[c for c in df_2020.columns if any(k in c for k in drop_keys)])
df_2021 = df_2021.drop(columns=[c for c in df_2021.columns if any(k in c for k in drop_keys)])
df_2022 = df_2022.drop(columns=[c for c in df_2022.columns if any(k in c for k in drop_keys)])
# Re-establishing the list
df_list = [df_2020, df_2021, df_2022]
# Let's see what we have now
for df in df_list:
display(df.head(1))
CityTown | Location | StateTerritory | Country | Date | EstimateLow | EstimateHigh | Actor | Claim | Pro(2)/Anti(1) | EventType | ReportedArrests | ReportedParticipantInjuries | ReportedPoliceInjuries | ReportedPropertyDamage | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ashland | Memorial Park; streets of Ashland; Highway 13 ... | WI | US | 2020-07-01 | NaN | NaN | water protectors | against recloation of Enbridge Line 5 Pipeline... | 1 | protest | 0 | 0 | 0.0 | 0.0 | 2020 |
date | locality | state | location | size_low | size_high | organizations | participants | claims | valence | event_type | police_measures | participant_measures | police_injuries | participant_injuries | arrests | property_damage | police_deaths | participant_deaths | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-07-01 | Addison | IL | Portillo's Food Service | 17.0 | 17.0 | Arise Chicago | workers | for better working conditions and higher pay f... | 0.0 | strike; picket | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2021 |
date | locality | state | location | size_low | size_high | organizations | participants | claims | valence | event_type | police_measures | participant_measures | police_injuries | participant_injuries | arrests | property_damage | police_deaths | participant_deaths | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-07-01 | Akron | OH | Harold K. Stubbs Justice Center | 20.0 | 20.0 | The Freedom BLOC | NaN | for justice for Jayland Walker, against police... | 1.0 | protest | NaN | conversation with counter-protester | NaN | NaN | NaN | NaN | NaN | NaN | 2022 |
It is definitely already more manageable and organized. However, there are a lot of inconsistencies in the naming. There seems to have been a large change in 2021 in how the investigators coded the data. I will go ahead and make the names more consistent.
# Renaming inconsistent columns
df_2020 = df_2020.rename(columns={"CityTown": "City", "StateTerritory": "State", 'Actor': 'Actors', 'Pro(2)/Anti(1)': 'Political Affiliation'})
df_2021 = df_2021.rename(columns={'date': 'Date', 'locality': 'City', 'location': 'Location', 'state': 'State', 'size_low': 'EstimateLow', 'size_high': 'EstimateHigh', 'claims': 'Claim', 'event_type': 'EventType', 'participants': 'Actors', 'valence': 'Political Affiliation', 'police_injuries': 'ReportedPoliceInjuries', 'participant_injuries': 'ReportedParticipantInjuries', 'arrests': 'ReportedArrests', 'property_damage': 'ReportedPropertyDamage'})
df_2022 = df_2022.rename(columns={'date': 'Date', 'locality': 'City', 'location': 'Location', 'state': 'State', 'size_low': 'EstimateLow', 'size_high': 'EstimateHigh', 'claims': 'Claim', 'event_type': 'EventType', 'participants': 'Actors', 'valence': 'Political Affiliation', 'police_injuries': 'ReportedPoliceInjuries', 'participant_injuries': 'ReportedParticipantInjuries', 'arrests': 'ReportedArrests', 'property_damage': 'ReportedPropertyDamage'})
# Re-establishing the list
df_list = [df_2020, df_2021, df_2022]
# Much more consistent
for df in df_list:
display(df.head(1))
City | Location | State | Country | Date | EstimateLow | EstimateHigh | Actors | Claim | Political Affiliation | EventType | ReportedArrests | ReportedParticipantInjuries | ReportedPoliceInjuries | ReportedPropertyDamage | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ashland | Memorial Park; streets of Ashland; Highway 13 ... | WI | US | 2020-07-01 | NaN | NaN | water protectors | against recloation of Enbridge Line 5 Pipeline... | 1 | protest | 0 | 0 | 0.0 | 0.0 | 2020 |
Date | City | State | Location | EstimateLow | EstimateHigh | organizations | Actors | Claim | Political Affiliation | EventType | police_measures | participant_measures | ReportedPoliceInjuries | ReportedParticipantInjuries | ReportedArrests | ReportedPropertyDamage | police_deaths | participant_deaths | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2021-07-01 | Addison | IL | Portillo's Food Service | 17.0 | 17.0 | Arise Chicago | workers | for better working conditions and higher pay f... | 0.0 | strike; picket | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2021 |
Date | City | State | Location | EstimateLow | EstimateHigh | organizations | Actors | Claim | Political Affiliation | EventType | police_measures | participant_measures | ReportedPoliceInjuries | ReportedParticipantInjuries | ReportedArrests | ReportedPropertyDamage | police_deaths | participant_deaths | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2022-07-01 | Akron | OH | Harold K. Stubbs Justice Center | 20.0 | 20.0 | The Freedom BLOC | NaN | for justice for Jayland Walker, against police... | 1.0 | protest | NaN | conversation with counter-protester | NaN | NaN | NaN | NaN | NaN | NaN | 2022 |
I am going to go ahead and concatenate the tables now that naming is more consistent. This will allow me to more easily manage, transform, and analyze the data.
df_full = pd.concat([df_2020, df_2021, df_2022])
# Success. Here is what the new table looks like
df_full
City | Location | State | Country | Date | EstimateLow | EstimateHigh | Actors | Claim | Political Affiliation | EventType | ReportedArrests | ReportedParticipantInjuries | ReportedPoliceInjuries | ReportedPropertyDamage | Year | organizations | police_measures | participant_measures | police_deaths | participant_deaths | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Ashland | Memorial Park; streets of Ashland; Highway 13 ... | WI | US | 2020-07-01 | NaN | NaN | water protectors | against recloation of Enbridge Line 5 Pipeline... | 1.0 | protest | 0 | 0 | 0.0 | 0.0 | 2020 | NaN | NaN | NaN | NaN | NaN |
1 | Boulevard | NaN | CA | US | 2020-07-01 | 100.0 | 100.0 | members of the Kumeyaay Tribe | against construction of border wall, against d... | 1.0 | march | NaN | NaN | NaN | NaN | 2020 | NaN | NaN | NaN | NaN | NaN |
2 | Brentwood | Bank of America Financial Center | CA | US | 2020-07-01 | NaN | NaN | fired employees of Terranea Resort | against firing of staff and salary cuts at Ter... | 0.0 | caravan; protest | 0 | 0 | 0.0 | 0.0 | 2020 | NaN | NaN | NaN | NaN | NaN |
3 | Eugene | Lane County Jail | OR | US | 2020-07-01 | 150.0 | 150.0 | general protesters | show solidarity with inmates; seek release of ... | 0.0 | protest | 1 | 0 | 1.0 | 1.0 | 2020 | NaN | NaN | NaN | NaN | NaN |
4 | Holyoke | Providence Behavioral Health Hospital | MA | US | 2020-07-01 | NaN | NaN | Nurses Association | against closure of inpatient beds in psychiatr... | 0.0 | memorial; protest | 0 | 0 | 0.0 | 0.0 | 2020 | NaN | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2518 | Washington | National Mall | DC | NaN | 2022-07-31 | NaN | NaN | NaN | against 1776 Restoration Movement | 0.0 | counter-protest | NaN | NaN | NaN | NaN | 2022 | NaN | NaN | NaN | NaN | NaN |
2519 | Washington | U.S. Capitol | DC | NaN | 2022-07-31 | NaN | NaN | veterans | for the PACT Act, for passage of federal legis... | 0.0 | demonstration; rally | NaN | NaN | NaN | NaN | 2022 | Burn Pits 360 | NaN | round-the-clock sit-in on Capitol steps | NaN | NaN |
2520 | Washington | Union Station | DC | NaN | 2022-07-31 | NaN | NaN | NaN | for Medicare for all | 1.0 | demonstration | NaN | NaN | NaN | NaN | 2022 | March for Medicare for All | NaN | NaN | NaN | NaN |
2521 | West Hartford | Flatbush Ave and New Park Ave | CT | NaN | 2022-07-31 | NaN | NaN | NaN | against circumcision of male infants | 0.0 | protest | NaN | NaN | NaN | NaN | 2022 | Bloodstained Men | NaN | NaN | NaN | NaN |
2522 | Wooster | Wooster Square | OH | NaN | 2022-07-31 | NaN | NaN | NaN | against racism, against police brutality | 1.0 | demonstration | NaN | NaN | NaN | NaN | 2022 | Wayne County Racial Justice Coalition | NaN | NaN | NaN | NaN |
5254 rows × 21 columns
Since the scope of the project only involves the number of protests, the number of protesters, and their political affiliation, let's further tidy the table down to only the columns I will need: state, the low and high estimates, political affiliation, and year.
# Creating a new dataframe
df_new = pd.DataFrame(df_full[['State', 'EstimateLow', 'EstimateHigh', 'Political Affiliation', 'Year']])
df_new
Next, I am going to drop all rows with NA values, since they cannot be used in the prediction model.
# Finding and getting rid of NAs
df_new = df_new.dropna()
df_new
State | EstimateLow | EstimateHigh | Political Affiliation | Year | |
---|---|---|---|---|---|
1 | CA | 100.0 | 100.0 | 1.0 | 2020 |
3 | OR | 150.0 | 150.0 | 0.0 | 2020 |
5 | CA | 150.0 | 150.0 | 0.0 | 2020 |
6 | CA | 50.0 | 50.0 | 0.0 | 2020 |
7 | CA | 7.0 | 7.0 | 0.0 | 2020 |
... | ... | ... | ... | ... | ... |
2503 | CA | 75.0 | 100.0 | 0.0 | 2022 |
2506 | CA | 36.0 | 36.0 | 0.0 | 2022 |
2507 | CA | 2.0 | 2.0 | 0.0 | 2022 |
2508 | PR | 15.0 | 20.0 | 0.0 | 2022 |
2514 | WI | 600.0 | 600.0 | 0.0 | 2022 |
2198 rows × 5 columns
The Political Affiliation column is stored as a numeric code, so I am going to remap the values to more intuitive labels.
# Remapping Political Affiliation to be more intuitive
df_new['Political Affiliation'] = df_new['Political Affiliation'].map({
0.0: 'Neither',
1.0: 'Left/Anti-Trump',
2.0: 'Right/Pro-Trump'
})
<ipython-input-16-d7bce5010979>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_new['Political Affiliation'] = df_new['Political Affiliation'].map({
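The warning above appears because df_new was derived from a slice of df_full, so pandas cannot tell whether the assignment might affect the original frame. If desired, it can be avoided by taking an explicit copy when the subset is created; a minimal sketch of that alternative (same columns as above):
# Taking an explicit copy breaks the link back to df_full, so later column
# assignments modify df_new alone and raise no SettingWithCopyWarning.
df_new = df_full[['State', 'EstimateLow', 'EstimateHigh', 'Political Affiliation', 'Year']].copy()
df_new = df_new.dropna()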
# Great
df_new
State | EstimateLow | EstimateHigh | Political Affiliation | Year | |
---|---|---|---|---|---|
1 | CA | 100.0 | 100.0 | Left/Anti-Trump | 2020 |
3 | OR | 150.0 | 150.0 | Neither | 2020 |
5 | CA | 150.0 | 150.0 | Neither | 2020 |
6 | CA | 50.0 | 50.0 | Neither | 2020 |
7 | CA | 7.0 | 7.0 | Neither | 2020 |
... | ... | ... | ... | ... | ... |
2503 | CA | 75.0 | 100.0 | Neither | 2022 |
2506 | CA | 36.0 | 36.0 | Neither | 2022 |
2507 | CA | 2.0 | 2.0 | Neither | 2022 |
2508 | PR | 15.0 | 20.0 | Neither | 2022 |
2514 | WI | 600.0 | 600.0 | Neither | 2022 |
2198 rows × 5 columns
I also want to remap the state abbreviations to full state names so that the column is consistent with the second data set and easier to interpret.
# Creating dict
states_dict = {
'AK': 'Alaska',
'AL': 'Alabama',
'AR': 'Arkansas',
'AS': 'American Samoa',
'AZ': 'Arizona',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DC': 'District of Columbia',
'DE': 'Delaware',
'FL': 'Florida',
'GA': 'Georgia',
'GU': 'Guam',
'HI': 'Hawaii',
'IA': 'Iowa',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'MA': 'Massachusetts',
'MD': 'Maryland',
'ME': 'Maine',
'MI': 'Michigan',
'MN': 'Minnesota',
'MO': 'Missouri',
'MP': 'Northern Mariana Islands',
'MS': 'Mississippi',
'MT': 'Montana',
'NA': 'National',
'NC': 'North Carolina',
'ND': 'North Dakota',
'NE': 'Nebraska',
'NH': 'New Hampshire',
'NJ': 'New Jersey',
'NM': 'New Mexico',
'NV': 'Nevada',
'NY': 'New York',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto Rico',
'RI': 'Rhode Island',
'SC': 'South Carolina',
'SD': 'South Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VA': 'Virginia',
'VI': 'Virgin Islands',
'VT': 'Vermont',
'WA': 'Washington',
'WI': 'Wisconsin',
'WV': 'West Virginia',
'WY': 'Wyoming'
}
# Remapping
df_new['State'] = df_new['State'].map(states_dict)
<ipython-input-19-cd63c67008f0>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_new['State'] = df_new['State'].map(states_dict)
# Done
df_new
State | EstimateLow | EstimateHigh | Political Affiliation | Year | |
---|---|---|---|---|---|
1 | California | 100.0 | 100.0 | Left/Anti-Trump | 2020 |
3 | Oregon | 150.0 | 150.0 | Neither | 2020 |
5 | California | 150.0 | 150.0 | Neither | 2020 |
6 | California | 50.0 | 50.0 | Neither | 2020 |
7 | California | 7.0 | 7.0 | Neither | 2020 |
... | ... | ... | ... | ... | ... |
2503 | California | 75.0 | 100.0 | Neither | 2022 |
2506 | California | 36.0 | 36.0 | Neither | 2022 |
2507 | California | 2.0 | 2.0 | Neither | 2022 |
2508 | Puerto Rico | 15.0 | 20.0 | Neither | 2022 |
2514 | Wisconsin | 600.0 | 600.0 | Neither | 2022 |
2198 rows × 5 columns
Now that there is one, organized table, let's take a look at the dtypes to see what pandas thinks of the data.
df_new.dtypes
State object EstimateLow float64 EstimateHigh float64 Political Affiliation object Year int64 dtype: object
It's not too bad. Since counting people in decimals does not make much sense, I will change those columns to integers.
df_new = df_new.astype({'EstimateLow': 'Int64', 'EstimateHigh': 'Int64'})
# Better
df_new.dtypes
State object EstimateLow Int64 EstimateHigh Int64 Political Affiliation object Year int64 dtype: object
Now let's take a look at some summary statistics and graphs. I am going to group by the year column that I made earlier, tally the total number of observations, and take the sum and mean of the estimated low and high columns to get a more accurate representation of the data. This will give me some baseline information to begin forming questions and directing analyses.
# Grouping Years to find total number of observations
df_Counts = df_new.groupby('Year')['Year'].count()
display(df_Counts)
df_Counts.plot.bar(title='Total Protests', ylabel='Number of Protests')
Year 2020 437 2021 782 2022 979 Name: Year, dtype: int64
<Axes: title={'center': 'Total Protests'}, xlabel='Year', ylabel='Number of Protests'>
This is certainly the trend I was expecting to see. Since lockdown was in 2020, it makes sense that 2020 would have the fewest protests. However, the escalation into 2021 and 2022 was much quicker than I anticipated.
# Grouping 'EstimateLow' and 'EstimateHigh' and summing
df_estimate_sum = df_new.groupby('Year')[['EstimateLow', 'EstimateHigh']].sum()
display(df_estimate_sum)
df_estimate_sum.plot.bar(title='Total Attendance', ylabel='Number of Participants')
EstimateLow | EstimateHigh | |
---|---|---|
Year | ||
2020 | 60227 | 67773 |
2021 | 70483 | 126632 |
2022 | 145882 | 173236 |
<Axes: title={'center': 'Total Attendance'}, xlabel='Year', ylabel='Number of Participants'>
This trend is also not too interesting. People were very cautious of large gatherings in 2020 and were not quick to forget in 2021.
# Grouping 'EstimateLow' and 'EstimateHigh' and averaging
df_estimate_avg = df_new.groupby('Year')[['EstimateLow', 'EstimateHigh']].mean()
display(df_estimate_avg)
df_estimate_avg.plot.bar(title='Average Attendance', ylabel='Number of Participants')
EstimateLow | EstimateHigh | |
---|---|---|
Year | ||
2020 | 137.819222 | 155.086957 |
2021 | 90.131714 | 161.933504 |
2022 | 149.011236 | 176.951992 |
<Axes: title={'center': 'Average Attendance'}, xlabel='Year', ylabel='Number of Participants'>
This is actually really interesting. The average attendance per protest in 2020 is roughly equal to the attendance in the other years, if not higher than in 2021. Though lockdown seemed to affect the total number of protests and protesters, it did not seem to affect the average attendance.
Now I will be more closely investigating the relationship with overall protest numbers and their political affiliation over the years.
The first graph is a presentation of all of the protests within the full data table. That is, it shows the total number of protests by political affiliation across the three years without taking the state into consideration. This seems like a good place to start.
# Creating bar graph of total protests throughout the 3 years
df_new['Political Affiliation'].value_counts(sort=False).plot.bar(title='Protests by Political Affiliation',
ylabel='Number of Protests', xlabel='Political Affiliation')
<Axes: title={'center': 'Protests by Political Affiliation'}, xlabel='Political Affiliation', ylabel='Number of Protests'>
The number of right-leaning protests appears to be far lower than that of left-leaning or non-affiliated protests. I wonder if that trend will remain consistent across individual states and how it stacks up against the hate crime statistics.
To investigate that trend during each year, I will create a cross-tabulation of State and Political Affiliation for 2020-2022 and display the results in a heatmap.
# Creating three tables based on year
df_2020 = df_new.loc[df_new['Year'] == 2020]
df_2021 = df_new.loc[df_new['Year'] == 2021]
df_2022 = df_new.loc[df_new['Year'] == 2022]
# Creating three crosstab joint distributions
ct_2020 = pd.crosstab(df_2020.State, df_2020['Political Affiliation'], normalize=True)
ct_2021 = pd.crosstab(df_2021.State, df_2021['Political Affiliation'], normalize=True)
ct_2022 = pd.crosstab(df_2022.State, df_2022['Political Affiliation'], normalize=True)
# Displaying cross-tabulations
df_list = [ct_2020, ct_2021, ct_2022]
for df in df_list:
display(df)
Political Affiliation | Left/Anti-Trump | Neither | Right/Pro-Trump |
---|---|---|---|
State | |||
Alabama | 0.000000 | 0.000000 | 0.002294 |
Alaska | 0.000000 | 0.002294 | 0.002294 |
Arizona | 0.000000 | 0.009174 | 0.009174 |
Arkansas | 0.000000 | 0.000000 | 0.004587 |
California | 0.018349 | 0.064220 | 0.057339 |
Colorado | 0.000000 | 0.004587 | 0.018349 |
Connecticut | 0.002294 | 0.011468 | 0.013761 |
District of Columbia | 0.002294 | 0.000000 | 0.000000 |
Florida | 0.004587 | 0.027523 | 0.029817 |
Georgia | 0.000000 | 0.011468 | 0.002294 |
Hawaii | 0.002294 | 0.000000 | 0.000000 |
Idaho | 0.000000 | 0.000000 | 0.009174 |
Illinois | 0.000000 | 0.020642 | 0.011468 |
Indiana | 0.000000 | 0.004587 | 0.018349 |
Iowa | 0.000000 | 0.004587 | 0.004587 |
Kansas | 0.000000 | 0.006881 | 0.004587 |
Kentucky | 0.000000 | 0.000000 | 0.006881 |
Louisiana | 0.000000 | 0.011468 | 0.002294 |
Maine | 0.002294 | 0.006881 | 0.000000 |
Maryland | 0.004587 | 0.016055 | 0.006881 |
Massachusetts | 0.002294 | 0.016055 | 0.013761 |
Michigan | 0.002294 | 0.013761 | 0.013761 |
Minnesota | 0.000000 | 0.006881 | 0.006881 |
Mississippi | 0.000000 | 0.002294 | 0.000000 |
Missouri | 0.000000 | 0.016055 | 0.009174 |
Montana | 0.000000 | 0.000000 | 0.004587 |
Nebraska | 0.000000 | 0.000000 | 0.002294 |
Nevada | 0.000000 | 0.002294 | 0.004587 |
New Hampshire | 0.000000 | 0.002294 | 0.004587 |
New Jersey | 0.000000 | 0.009174 | 0.018349 |
New Mexico | 0.000000 | 0.000000 | 0.013761 |
New York | 0.000000 | 0.029817 | 0.043578 |
North Carolina | 0.004587 | 0.004587 | 0.011468 |
Ohio | 0.000000 | 0.004587 | 0.016055 |
Oklahoma | 0.002294 | 0.000000 | 0.000000 |
Oregon | 0.000000 | 0.011468 | 0.018349 |
Pennsylvania | 0.004587 | 0.011468 | 0.027523 |
Rhode Island | 0.000000 | 0.000000 | 0.002294 |
South Carolina | 0.000000 | 0.011468 | 0.000000 |
South Dakota | 0.000000 | 0.000000 | 0.002294 |
Tennessee | 0.000000 | 0.006881 | 0.006881 |
Texas | 0.004587 | 0.034404 | 0.027523 |
Utah | 0.000000 | 0.011468 | 0.009174 |
Vermont | 0.000000 | 0.000000 | 0.009174 |
Virginia | 0.004587 | 0.016055 | 0.013761 |
Washington | 0.002294 | 0.004587 | 0.018349 |
West Virginia | 0.000000 | 0.002294 | 0.004587 |
Wisconsin | 0.000000 | 0.009174 | 0.000000 |
Political Affiliation | Left/Anti-Trump | Neither | Right/Pro-Trump |
---|---|---|---|
State | |||
Alabama | 0.005115 | 0.003836 | 0.001279 |
Alaska | 0.000000 | 0.001279 | 0.000000 |
Arizona | 0.003836 | 0.002558 | 0.002558 |
Arkansas | 0.002558 | 0.001279 | 0.000000 |
California | 0.052430 | 0.040921 | 0.038363 |
Colorado | 0.007673 | 0.000000 | 0.005115 |
Connecticut | 0.005115 | 0.006394 | 0.006394 |
Delaware | 0.001279 | 0.001279 | 0.000000 |
District of Columbia | 0.015345 | 0.015345 | 0.002558 |
Florida | 0.010230 | 0.089514 | 0.010230 |
Georgia | 0.005115 | 0.005115 | 0.005115 |
Hawaii | 0.006394 | 0.006394 | 0.003836 |
Idaho | 0.002558 | 0.002558 | 0.005115 |
Illinois | 0.015345 | 0.016624 | 0.003836 |
Indiana | 0.010230 | 0.003836 | 0.002558 |
Iowa | 0.002558 | 0.000000 | 0.001279 |
Kansas | 0.001279 | 0.001279 | 0.000000 |
Kentucky | 0.005115 | 0.003836 | 0.000000 |
Louisiana | 0.001279 | 0.006394 | 0.001279 |
Maine | 0.007673 | 0.001279 | 0.002558 |
Maryland | 0.008951 | 0.007673 | 0.000000 |
Massachusetts | 0.017903 | 0.015345 | 0.003836 |
Michigan | 0.003836 | 0.006394 | 0.008951 |
Minnesota | 0.023018 | 0.003836 | 0.005115 |
Mississippi | 0.001279 | 0.001279 | 0.000000 |
Missouri | 0.003836 | 0.005115 | 0.003836 |
Montana | 0.001279 | 0.001279 | 0.003836 |
Nebraska | 0.001279 | 0.003836 | 0.000000 |
Nevada | 0.001279 | 0.002558 | 0.005115 |
New Hampshire | 0.005115 | 0.000000 | 0.001279 |
New Jersey | 0.008951 | 0.001279 | 0.001279 |
New Mexico | 0.000000 | 0.000000 | 0.005115 |
New York | 0.040921 | 0.052430 | 0.008951 |
North Carolina | 0.014066 | 0.006394 | 0.015345 |
North Dakota | 0.000000 | 0.000000 | 0.001279 |
Ohio | 0.014066 | 0.007673 | 0.005115 |
Oklahoma | 0.001279 | 0.002558 | 0.000000 |
Oregon | 0.005115 | 0.002558 | 0.007673 |
Pennsylvania | 0.024297 | 0.012788 | 0.010230 |
Puerto Rico | 0.001279 | 0.000000 | 0.000000 |
Rhode Island | 0.002558 | 0.001279 | 0.001279 |
South Carolina | 0.002558 | 0.002558 | 0.000000 |
South Dakota | 0.001279 | 0.001279 | 0.002558 |
Tennessee | 0.003836 | 0.007673 | 0.001279 |
Texas | 0.019182 | 0.026854 | 0.003836 |
Utah | 0.000000 | 0.002558 | 0.000000 |
Virginia | 0.014066 | 0.007673 | 0.010230 |
Washington | 0.010230 | 0.001279 | 0.002558 |
West Virginia | 0.002558 | 0.000000 | 0.001279 |
Wisconsin | 0.003836 | 0.005115 | 0.000000 |
Political Affiliation | Left/Anti-Trump | Neither | Right/Pro-Trump |
---|---|---|---|
State | |||
Alabama | 0.010215 | 0.000000 | 0.000000 |
Alaska | 0.006129 | 0.000000 | 0.002043 |
Arizona | 0.004086 | 0.001021 | 0.005107 |
Arkansas | 0.004086 | 0.002043 | 0.001021 |
California | 0.044944 | 0.081716 | 0.009193 |
Colorado | 0.009193 | 0.001021 | 0.001021 |
Connecticut | 0.004086 | 0.002043 | 0.000000 |
Delaware | 0.002043 | 0.000000 | 0.000000 |
District of Columbia | 0.025536 | 0.001021 | 0.006129 |
Florida | 0.022472 | 0.003064 | 0.004086 |
Georgia | 0.016343 | 0.003064 | 0.001021 |
Guam | 0.001021 | 0.000000 | 0.000000 |
Hawaii | 0.000000 | 0.003064 | 0.000000 |
Idaho | 0.006129 | 0.000000 | 0.001021 |
Illinois | 0.015322 | 0.043922 | 0.003064 |
Indiana | 0.011236 | 0.001021 | 0.004086 |
Iowa | 0.008172 | 0.031665 | 0.002043 |
Kansas | 0.004086 | 0.023493 | 0.001021 |
Kentucky | 0.006129 | 0.001021 | 0.000000 |
Louisiana | 0.006129 | 0.000000 | 0.000000 |
Maine | 0.004086 | 0.000000 | 0.000000 |
Maryland | 0.010215 | 0.032686 | 0.001021 |
Massachusetts | 0.013279 | 0.009193 | 0.004086 |
Michigan | 0.012257 | 0.004086 | 0.001021 |
Minnesota | 0.008172 | 0.003064 | 0.000000 |
Mississippi | 0.001021 | 0.000000 | 0.000000 |
Missouri | 0.011236 | 0.001021 | 0.000000 |
Montana | 0.006129 | 0.000000 | 0.004086 |
Nebraska | 0.004086 | 0.002043 | 0.002043 |
Nevada | 0.002043 | 0.000000 | 0.001021 |
New Hampshire | 0.001021 | 0.000000 | 0.002043 |
New Jersey | 0.005107 | 0.002043 | 0.001021 |
New Mexico | 0.006129 | 0.000000 | 0.001021 |
New York | 0.037794 | 0.023493 | 0.009193 |
North Carolina | 0.014300 | 0.005107 | 0.003064 |
North Dakota | 0.002043 | 0.000000 | 0.001021 |
Ohio | 0.034729 | 0.002043 | 0.005107 |
Oklahoma | 0.006129 | 0.000000 | 0.000000 |
Oregon | 0.006129 | 0.013279 | 0.001021 |
Pennsylvania | 0.025536 | 0.008172 | 0.002043 |
Puerto Rico | 0.001021 | 0.004086 | 0.000000 |
Rhode Island | 0.000000 | 0.032686 | 0.000000 |
South Carolina | 0.006129 | 0.001021 | 0.002043 |
South Dakota | 0.003064 | 0.000000 | 0.000000 |
Tennessee | 0.004086 | 0.002043 | 0.000000 |
Texas | 0.019408 | 0.006129 | 0.005107 |
Utah | 0.009193 | 0.000000 | 0.002043 |
Vermont | 0.004086 | 0.000000 | 0.000000 |
Virginia | 0.019408 | 0.003064 | 0.003064 |
Washington | 0.004086 | 0.002043 | 0.000000 |
West Virginia | 0.009193 | 0.000000 | 0.001021 |
Wisconsin | 0.014300 | 0.031665 | 0.001021 |
Wyoming | 0.003064 | 0.000000 | 0.002043 |
# Creating the heatmap for 2020
plt.subplots(figsize=(10,10))
sns.heatmap(ct_2020, xticklabels=True, yticklabels=True)
<Axes: xlabel='Political Affiliation', ylabel='State'>
For 2020, protest political affiliation actually seems to be focused on the right-leaning/pro-Trump side of the spectrum. This is rather strange to see, as the overall distribution was heavily weighted to the left. Let's see how this distribution changes.
# Creating the heatmap for 2021
plt.subplots(figsize=(10,10))
sns.heatmap(ct_2021, xticklabels=True, yticklabels=True)
<Axes: xlabel='Political Affiliation', ylabel='State'>
There certainly seems to be a more even spread across political affiliation, possibly with the left-leaning/anti-Trump side being more present. It seems the shift I was anticipating has already begun.
# Creating the heatmap for 2022
plt.subplots(figsize=(10,10))
sns.heatmap(ct_2022, xticklabels=True, yticklabels=True)
<Axes: xlabel='Political Affiliation', ylabel='State'>
And by 2022, protest political affiliation seems to be a mirror image of that in 2020. However, it is difficult to determine the intensity of the shift from a heatmap, so next I will create a conditional distribution of political affiliation given each state so that we can see which side is more dominant.
# Creating full cross-tabulation
ct_full = pd.crosstab(df_new.State, df_new['Political Affiliation'], normalize=True)
# Creating conditional distribution
state_counts = ct_full.sum(axis=1)
pa_given_state = ct_full.divide(state_counts, axis=0)
# Displaying distribution
pa_given_state
Political Affiliation | Left/Anti-Trump | Neither | Right/Pro-Trump |
---|---|---|---|
State | |||
Alabama | 0.736842 | 0.157895 | 0.105263 |
Alaska | 0.545455 | 0.181818 | 0.272727 |
Arizona | 0.280000 | 0.280000 | 0.440000 |
Arkansas | 0.500000 | 0.250000 | 0.250000 |
California | 0.313131 | 0.471380 | 0.215488 |
Colorado | 0.483871 | 0.096774 | 0.419355 |
Connecticut | 0.281250 | 0.375000 | 0.343750 |
Delaware | 0.750000 | 0.250000 | 0.000000 |
District of Columbia | 0.644068 | 0.220339 | 0.135593 |
Florida | 0.225352 | 0.598592 | 0.176056 |
Georgia | 0.526316 | 0.315789 | 0.157895 |
Guam | 1.000000 | 0.000000 | 0.000000 |
Hawaii | 0.352941 | 0.470588 | 0.176471 |
Idaho | 0.421053 | 0.105263 | 0.473684 |
Illinois | 0.262136 | 0.631068 | 0.106796 |
Indiana | 0.487179 | 0.153846 | 0.358974 |
Iowa | 0.208333 | 0.687500 | 0.104167 |
Kansas | 0.142857 | 0.771429 | 0.085714 |
Kentucky | 0.588235 | 0.235294 | 0.176471 |
Louisiana | 0.368421 | 0.526316 | 0.105263 |
Maine | 0.647059 | 0.235294 | 0.117647 |
Maryland | 0.279412 | 0.661765 | 0.058824 |
Massachusetts | 0.405797 | 0.405797 | 0.188406 |
Michigan | 0.355556 | 0.333333 | 0.311111 |
Minnesota | 0.619048 | 0.214286 | 0.166667 |
Mississippi | 0.500000 | 0.500000 | 0.000000 |
Missouri | 0.424242 | 0.363636 | 0.212121 |
Montana | 0.411765 | 0.058824 | 0.529412 |
Nebraska | 0.384615 | 0.384615 | 0.230769 |
Nevada | 0.230769 | 0.230769 | 0.538462 |
New Hampshire | 0.454545 | 0.090909 | 0.454545 |
New Jersey | 0.413793 | 0.241379 | 0.344828 |
New Mexico | 0.352941 | 0.000000 | 0.647059 |
New York | 0.381215 | 0.425414 | 0.193370 |
North Carolina | 0.457627 | 0.203390 | 0.338983 |
North Dakota | 0.500000 | 0.000000 | 0.500000 |
Ohio | 0.633803 | 0.140845 | 0.225352 |
Oklahoma | 0.800000 | 0.200000 | 0.000000 |
Oregon | 0.222222 | 0.444444 | 0.333333 |
Pennsylvania | 0.505495 | 0.252747 | 0.241758 |
Puerto Rico | 0.333333 | 0.666667 | 0.000000 |
Rhode Island | 0.054054 | 0.891892 | 0.054054 |
South Carolina | 0.444444 | 0.444444 | 0.111111 |
South Dakota | 0.500000 | 0.125000 | 0.375000 |
Tennessee | 0.318182 | 0.500000 | 0.181818 |
Texas | 0.367347 | 0.428571 | 0.204082 |
Utah | 0.409091 | 0.318182 | 0.272727 |
Vermont | 0.500000 | 0.000000 | 0.500000 |
Virginia | 0.492308 | 0.246154 | 0.261538 |
Washington | 0.464286 | 0.178571 | 0.357143 |
West Virginia | 0.687500 | 0.062500 | 0.250000 |
Wisconsin | 0.298246 | 0.684211 | 0.017544 |
Wyoming | 0.600000 | 0.000000 | 0.400000 |
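As a side note, the same conditional distribution can be built in one step by asking pd.crosstab to normalize within each row; a minimal sketch:
# normalize='index' divides each row by its total, giving P(affiliation | state) directly.
pa_given_state_alt = pd.crosstab(df_new.State, df_new['Political Affiliation'], normalize='index')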
To represent this data, I will use a stacked bar graph since it will display the proportions of political affiliation more clearly.
# Creating stacked bar graph
pa_given_state.plot.bar(stacked=True, figsize=(10,10), legend=True)
<Axes: xlabel='State'>
With this and the first bar graph of the political distribution over all three years, it is easy to see that there was a large political shift in protests over the years. I am now wondering whether such a shift is consistent with an increase in hate crimes, seeing as left-leaning protests tend to be connected to social justice movements. Therefore, I am currently hypothesizing that, since there has been a great increase in left-leaning/anti-Trump protests, there will also be a great increase in hate crimes across the U.S.
This data comes from the FBI Crime Data Explorer and depicts each U.S. state's reported hate crime incidents. The data I am using are the reports from 2020, 2021, and 2022. I would like to clarify that I am not reading in the original data found on the website, as that data was poorly formatted for reading into Pandas. Instead, I copied the data into a new Excel file and formatted it in a way that Pandas can read. I also did not copy over extraneous data that I will not be using in this project.
The data can be found under the "Documents and Downloads" section of the front page. Link to data: https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/downloads
# Changing directory
%cd ../'Hate Crimes'
/content/data_science/Project Data/Hate Crimes
As before, the first step is to read in the files and see what we have to work with. From there, I can tidy and format the data.
# Reading in files
hc_2020 = pd.read_csv('hate crimes by state 2020.csv')
hc_2021 = pd.read_csv('hate crimes by state 2021.csv')
hc_2022 = pd.read_csv('hate crimes by state 2022.csv')
# Let's see what we have
display(hc_2020.head(5))
display(hc_2021.head(5))
display(hc_2022.head(5))
Participating State | Total offenses | |
---|---|---|
0 | Alabama | 33 |
1 | Alaska | 10 |
2 | Arizona | 332 |
3 | Arkansas | 22 |
4 | California | 1537 |
Participating State | Total offenses | |
---|---|---|
0 | Alabama | 263 |
1 | Alaska | 14 |
2 | Arizona | 106 |
3 | Arkansas | 47 |
4 | California | 81 |
Participating State | Total offenses | |
---|---|---|
0 | Alabama | 254 |
1 | Alaska | 5 |
2 | Arizona | 225 |
3 | Arkansas | 38 |
4 | California | 2261 |
Everything looks good so far. Let's go ahead and add a year column for clarity and for future merging.
# Creating a year column with appropriate years
hc_2020['Year'] = 2020
hc_2021['Year'] = 2021
hc_2022['Year'] = 2022
# Displaying
display(hc_2020.head(5))
display(hc_2021.head(5))
display(hc_2022.head(5))
Participating State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 33 | 2020 |
1 | Alaska | 10 | 2020 |
2 | Arizona | 332 | 2020 |
3 | Arkansas | 22 | 2020 |
4 | California | 1537 | 2020 |
Participating State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 263 | 2021 |
1 | Alaska | 14 | 2021 |
2 | Arizona | 106 | 2021 |
3 | Arkansas | 47 | 2021 |
4 | California | 81 | 2021 |
Participating State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 254 | 2022 |
1 | Alaska | 5 | 2022 |
2 | Arizona | 225 | 2022 |
3 | Arkansas | 38 | 2022 |
4 | California | 2261 | 2022 |
I just want to change the name of the 'Participating State' column to 'State' for clarity and brevity.
# Creating list of dfs
df_list = [hc_2020, hc_2021, hc_2022]
# Renaming columns
for df in df_list:
df.rename(columns={'Participating State': 'State'}, inplace=True)
# Displaying dfs
for df in df_list:
display(df.head(1))
State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 33 | 2020 |
State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 263 | 2021 |
State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 254 | 2022 |
Now that we have the three tables, I will go ahead and concatenate them.
# Now we concatenate the data
hc_full = pd.concat([hc_2020, hc_2021, hc_2022])
hc_full
State | Total offenses | Year | |
---|---|---|---|
0 | Alabama | 33 | 2020 |
1 | Alaska | 10 | 2020 |
2 | Arizona | 332 | 2020 |
3 | Arkansas | 22 | 2020 |
4 | California | 1537 | 2020 |
... | ... | ... | ... |
46 | Virginia | 211 | 2022 |
47 | Washington | 652 | 2022 |
48 | West Virginia | 59 | 2022 |
49 | Wisconsin | 163 | 2022 |
50 | Wyoming | 29 | 2022 |
153 rows × 3 columns
Now that we have a full dataframe, let's see what the dtypes are.
# Dtypes look good
hc_full.dtypes
State object Total offenses int64 Year int64 dtype: object
Everything looks good.
Now that we have our full table, let's look at some initial descriptive statistics.
hc_full['Total offenses'].describe()
count 153.000000 mean 209.320261 std 285.948943 min 1.000000 25% 47.000000 50% 117.000000 75% 252.000000 max 2261.000000 Name: Total offenses, dtype: float64
That is a rather large jump from the 75th percentile to the max. I wonder if the data is messed up somehow.
# Investigating the issue
hc_full.loc[hc_full['Total offenses'] == 2261]
State | Total offenses | Year | |
---|---|---|---|
4 | California | 2261 | 2022 |
Even after looking at the original data, California just seems to be quite the outlier. I will need to keep this in mind for the analysis.
The next step is to look more closely at these offenses by state.
# Grouping data by state and year
hc_group = hc_full.groupby(['State', 'Year'])['Total offenses'].sum()
hc_group
State Year Alabama 2020 33 2021 263 2022 254 Alaska 2020 10 2021 14 ... Wisconsin 2021 118 2022 163 Wyoming 2020 21 2021 21 2022 29 Name: Total offenses, Length: 153, dtype: int64
That presentation doesn't help too much, so let's create a bar chart to help better understand what we are looking at.
# Creating numpy array
y_indexes = np.arange(51)
height = .25
# Setting figure size for the rather large plot
plt.figure(figsize=(8,13))
# Creating the plot
plt.barh(y_indexes - height, hc_2020['Total offenses'], height=height, label='2020')
plt.barh(y_indexes, hc_2021['Total offenses'], height=height, label='2021')
plt.barh(y_indexes + height, hc_2022['Total offenses'], height=height, label='2022')
# Matching the numpy array to correct labels
plt.yticks(ticks=y_indexes, labels=hc_2020['State'], fontsize=8)
# Adding title, axis labels, and legend
plt.title('Hate Crime Offenses per State', fontsize=15)
plt.xlabel('Total Offenses', fontsize=12)
plt.ylabel('State', fontsize=12)
plt.legend(prop={'size': 10})
plt.grid(axis='x')
# Displaying plot
plt.show()
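As a side note, a similar grouped chart could be drawn with less manual bar positioning by pivoting the combined table and letting pandas place the bars; a minimal sketch using the hc_full frame from above:
# Pivot to a State x Year table of offense counts, then let pandas draw grouped horizontal bars.
hc_wide = hc_full.pivot(index='State', columns='Year', values='Total offenses')
hc_wide.plot.barh(figsize=(8, 13), title='Hate Crime Offenses per State')
plt.xlabel('Total Offenses')
plt.show()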
Interesting. It doesn't seem like there is much of a pattern in the hate crime distribution over the three years. Some states see increases, others decreases, and others sporadic jumps. This is not at all what I was expecting after seeing the distribution of protest political affiliation. This might make the construction of a model much more challenging.
The first step in creating a model will be to put everything into one table. To do so, I will simply merge the hate crime data and protest data together on State and Year.
# Merging data
df_merged = df_new.merge(hc_full, on=['State', 'Year'])
df_merged
State | EstimateLow | EstimateHigh | Political Affiliation | Year | Total offenses | |
---|---|---|---|---|---|---|
0 | California | 100 | 100 | Left/Anti-Trump | 2020 | 1537 |
1 | California | 150 | 150 | Neither | 2020 | 1537 |
2 | California | 50 | 50 | Neither | 2020 | 1537 |
3 | California | 7 | 7 | Neither | 2020 | 1537 |
4 | California | 250 | 250 | Neither | 2020 | 1537 |
... | ... | ... | ... | ... | ... | ... |
2185 | North Dakota | 20 | 20 | Right/Pro-Trump | 2022 | 29 |
2186 | North Dakota | 50 | 50 | Left/Anti-Trump | 2022 | 29 |
2187 | Delaware | 30 | 50 | Left/Anti-Trump | 2022 | 16 |
2188 | Delaware | 50 | 50 | Left/Anti-Trump | 2022 | 16 |
2189 | Mississippi | 20 | 30 | Left/Anti-Trump | 2022 | 32 |
2190 rows × 6 columns
Next, to add a controlling variable, I will be using state population data from 2020, 2021, and 2022. This data comes from the U.S. Census and, much like the hate crime data, is poorly formatted for pandas. Therefore, I will not be using the originally formatted data, but a modified set. And again, the data itself was not modified, only the formatting.
The original data can be found here.
# Changing directory
%cd ..
# Reading in population csv file
pop_df = pd.read_csv('Population Data.csv')
pop_df.head(5)
/content/data_science/Project Data
State | 2020 | 2021 | 2022 | |
---|---|---|---|---|
0 | Alabama | 5,031,362 | 5,049,846 | 5,074,296 |
1 | Alaska | 732,923 | 734,182 | 733,583 |
2 | Arizona | 7,179,943 | 7,264,877 | 7,359,197 |
3 | Arkansas | 3,014,195 | 3,028,122 | 3,045,637 |
4 | California | 39,501,653 | 39,142,991 | 39,029,342 |
Currently, the population for each year sits under the column titled with that respective year. The first thing to do will be to create three separate tables from this one, change the column name to Population, add a year column to each, and concatenate the tables back into one so that I can merge it with the table above (a more concise pd.melt alternative is sketched after the output below).
# Creating three separate tables by year
pop_df_2020 = pop_df[['State', '2020']]
pop_df_2021 = pop_df[['State', '2021']]
pop_df_2022 = pop_df[['State', '2022']]
# Adding a year column to each
pop_df_2020['Year'] = 2020
pop_df_2021['Year'] = 2021
pop_df_2022['Year'] = 2022
# Changing column name to Population
pop_df_2020.rename(columns={'2020': 'Population'}, inplace=True)
pop_df_2021.rename(columns={'2021': 'Population'}, inplace=True)
pop_df_2022.rename(columns={'2022': 'Population'}, inplace=True)
# Concatenating the dfs
pop_half = pd.concat([pop_df_2020, pop_df_2021])
pop_df = pd.concat([pop_half, pop_df_2022])
pop_df
<ipython-input-46-ad03bc6c17c5>:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2020['Year'] = 2020 <ipython-input-46-ad03bc6c17c5>:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2021['Year'] = 2021 <ipython-input-46-ad03bc6c17c5>:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2022['Year'] = 2022 <ipython-input-46-ad03bc6c17c5>:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2020.rename(columns={'2020': 'Population'}, inplace=True) <ipython-input-46-ad03bc6c17c5>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2021.rename(columns={'2021': 'Population'}, inplace=True) <ipython-input-46-ad03bc6c17c5>:14: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy pop_df_2022.rename(columns={'2022': 'Population'}, inplace=True)
State | Population | Year | |
---|---|---|---|
0 | Alabama | 5,031,362 | 2020 |
1 | Alaska | 732,923 | 2020 |
2 | Arizona | 7,179,943 | 2020 |
3 | Arkansas | 3,014,195 | 2020 |
4 | California | 39,501,653 | 2020 |
... | ... | ... | ... |
46 | Virginia | 8,683,619 | 2022 |
47 | Washington | 7,785,786 | 2022 |
48 | West Virginia | 1,775,156 | 2022 |
49 | Wisconsin | 5,892,539 | 2022 |
50 | Wyoming | 581,381 | 2022 |
153 rows × 3 columns
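For reference, the same reshaping can be done in one step with pd.melt; a minimal sketch, assuming pop_wide holds the table exactly as read from 'Population Data.csv':
# Hypothetical alternative: melt the wide year columns into rows so the result matches the other tables.
pop_wide = pd.read_csv('Population Data.csv')
pop_long = pop_wide.melt(id_vars='State', var_name='Year', value_name='Population')
pop_long['Year'] = pop_long['Year'].astype(int)  # the year column headers arrive as strings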
Now that I have created my new table, I can merge this with the previous full df.
# Merging tables
final_df = df_merged.merge(pop_df, on=['State', 'Year'])
final_df
State | EstimateLow | EstimateHigh | Political Affiliation | Year | Total offenses | Population | |
---|---|---|---|---|---|---|---|
0 | California | 100 | 100 | Left/Anti-Trump | 2020 | 1537 | 39,501,653 |
1 | California | 150 | 150 | Neither | 2020 | 1537 | 39,501,653 |
2 | California | 50 | 50 | Neither | 2020 | 1537 | 39,501,653 |
3 | California | 7 | 7 | Neither | 2020 | 1537 | 39,501,653 |
4 | California | 250 | 250 | Neither | 2020 | 1537 | 39,501,653 |
... | ... | ... | ... | ... | ... | ... | ... |
2185 | North Dakota | 20 | 20 | Right/Pro-Trump | 2022 | 29 | 779,261 |
2186 | North Dakota | 50 | 50 | Left/Anti-Trump | 2022 | 29 | 779,261 |
2187 | Delaware | 30 | 50 | Left/Anti-Trump | 2022 | 16 | 1,018,396 |
2188 | Delaware | 50 | 50 | Left/Anti-Trump | 2022 | 16 | 1,018,396 |
2189 | Mississippi | 20 | 30 | Left/Anti-Trump | 2022 | 32 | 2,940,057 |
2190 rows × 7 columns
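One caveat before modeling: the Population values above still contain commas, so pandas stores them as strings, and DictVectorizer will treat each distinct population figure as its own category rather than as a number. The results below were produced with the column left as-is; if a numeric population feature is wanted instead, a minimal conversion sketch would be:
# Strip the thousands separators and cast to integers so Population can act as a numeric feature.
final_df['Population'] = final_df['Population'].str.replace(',', '').astype(int)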
With this final table, I can now begin creating my model. The first thing I will do is to simply train the model and test it on the training data alone. This should give me a baseline of how the model will look and perform.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
# Creating features to fit model
features = ['State', 'EstimateLow', 'EstimateHigh', 'Political Affiliation',
'Year', 'Population']
# Setting the training data
X_train_dict = final_df[features].to_dict(orient="records")
y_train = final_df['Total offenses']
# Dummy encoding
vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)
# Scaling data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
# K-Nearest Neighbors Model
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train_sc, y_train)
y_train_pred = model.predict(X_train_sc)
# Calculating prediction accuracy
mse = ((y_train - y_train_pred) ** 2).mean()
rmse = np.sqrt(mse)
rmse
147.59384928591015
It seems our current prediction is off by about 148 hate crime incidents in a year. Let's see what the average number of incidents is in a year.
final_df['Total offenses'].mean()
411.8073059360731
So the model's error is currently more than a third of the average number of incidents. Not a great starting point. Let's run some cross-validation tests to see if we can find a better K-value and tune the model a bit more.
# Creating a function to test the model based on features
def test_model(features):
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Setting the training data
X_dict = final_df[features].to_dict(orient="records")
y = final_df['Total offenses']
# Creating the pipeline for cv
vec = DictVectorizer(sparse=False)
scaler = StandardScaler()
def get_cv_error(k):
model = KNeighborsRegressor(n_neighbors=k)
pipeline = Pipeline([("vectorizer", vec), ("scaler", scaler), ("fit", model)])
mse = np.mean(-cross_val_score(
pipeline, X_dict, y,
cv=10, scoring="neg_mean_squared_error"
))
# Finding and returning RMSE
rmse = np.sqrt(mse)
return rmse
# Applying the function to a series of K-values to find the best match
ks = pd.Series(range(1, 51))
ks.index = range(1, 51)
test_errs = ks.apply(get_cv_error)
display(test_errs.sort_values())
First, I will just see what I get from running the model as-is.
# Creating features to fit model
features = ['State', 'EstimateLow', 'EstimateHigh', 'Political Affiliation',
'Year', 'Population']
test_model(features)
2 1308.478804 3 1309.808228 1 1311.358962 4 1313.637205 16 1313.680531 15 1313.694672 14 1314.657191 17 1315.000239 13 1317.226181 18 1317.403031 5 1318.106265 19 1319.245065 12 1320.801347 20 1321.432688 6 1323.723434 21 1325.386166 11 1326.215236 22 1329.176557 7 1330.737379 23 1331.920860 10 1333.047922 24 1334.661910 8 1336.983182 25 1337.892473 26 1341.183565 9 1342.392255 27 1344.632541 28 1348.806691 29 1352.362345 50 1354.907282 30 1356.499465 49 1358.947974 31 1359.422038 32 1362.550583 48 1363.353416 33 1364.619850 34 1366.992803 47 1367.710584 35 1368.563191 36 1369.582802 37 1370.782396 46 1372.144758 38 1372.210104 39 1373.925720 40 1376.124108 45 1376.956504 41 1378.418426 42 1380.637737 44 1382.330132 43 1383.112618 dtype: float64
So the model is currently off by around 1,300 hate crime offenses per prediction. That is certainly not good. However, I do remember California being quite the outlier in the data set. Though I thought I controlled for that by including both State and Population in my model, it must not have had as significant an effect as I hoped. Therefore, I will take California out of my data set to see if the error improves.
# Setting State to index and dropping California
final_df = final_df.set_index('State').drop('California')
# Resetting index
final_df = final_df.reset_index()
final_df
State | EstimateLow | EstimateHigh | Political Affiliation | Year | Total offenses | Population | |
---|---|---|---|---|---|---|---|
0 | Oregon | 150 | 150 | Neither | 2020 | 324 | 4,244,795 |
1 | Oregon | 36 | 36 | Right/Pro-Trump | 2020 | 324 | 4,244,795 |
2 | Oregon | 100 | 100 | Neither | 2020 | 324 | 4,244,795 |
3 | Oregon | 200 | 300 | Right/Pro-Trump | 2020 | 324 | 4,244,795 |
4 | Oregon | 36 | 36 | Right/Pro-Trump | 2020 | 324 | 4,244,795 |
... | ... | ... | ... | ... | ... | ... | ... |
1888 | North Dakota | 20 | 20 | Right/Pro-Trump | 2022 | 29 | 779,261 |
1889 | North Dakota | 50 | 50 | Left/Anti-Trump | 2022 | 29 | 779,261 |
1890 | Delaware | 30 | 50 | Left/Anti-Trump | 2022 | 16 | 1,018,396 |
1891 | Delaware | 50 | 50 | Left/Anti-Trump | 2022 | 16 | 1,018,396 |
1892 | Mississippi | 20 | 30 | Left/Anti-Trump | 2022 | 32 | 2,940,057 |
1893 rows × 7 columns
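For reference, the same rows could have been removed without touching the index; a minimal sketch:
# Boolean filter that keeps every state except California, then renumbers the rows.
final_df = final_df[final_df['State'] != 'California'].reset_index(drop=True)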
With California out of the data, let's see what the error is now.
# Creating features to fit model
features = ['State', 'EstimateLow', 'EstimateHigh', 'Political Affiliation',
'Year', 'Population']
test_model(features)
1 202.281462 2 220.490351 3 233.627711 4 240.701775 5 245.742813 6 249.179314 7 252.304116 8 253.506376 10 254.003275 9 254.087586 11 254.187763 12 255.005329 17 255.689985 18 255.772137 16 256.126104 13 256.130441 15 256.276410 19 256.411711 14 257.117632 20 257.460189 21 258.802064 22 260.284278 26 260.489019 27 260.544855 28 260.581936 25 260.611264 29 260.878950 24 260.953020 30 261.308943 23 261.535952 31 261.817612 32 262.278236 33 262.605370 34 263.095780 35 263.617776 36 264.196391 37 264.939978 40 265.088988 41 265.098131 38 265.213977 39 265.358719 42 265.380857 43 265.857152 44 266.078680 45 266.572152 46 267.195899 47 267.937241 48 268.831120 49 269.769102 50 270.919691 dtype: float64
# Finding new mean without California
final_df['Total offenses'].mean()
263.62651875330164
That seems to be significantly better, but still not great, as the prediction is still off by about 75% of the average number of offenses. Now I will start dropping some features to see if those are an issue.
# Dropping Estimate numbers
features = ['State', 'Political Affiliation',
'Year', 'Population']
test_model(features)
1 201.712462 2 219.637604 3 232.525221 4 239.885433 5 244.961057 6 248.779659 7 251.867976 10 253.138863 11 253.150450 8 253.254247 9 253.425539 12 253.941817 18 254.876959 13 255.005153 19 255.014155 17 255.341412 20 255.645817 14 256.184700 16 256.483802 21 256.650172 15 257.521982 22 257.907742 23 259.368045 26 259.579711 27 259.598468 25 259.784336 28 259.805735 24 260.098430 29 260.212482 30 260.718169 31 261.256101 32 261.907101 33 262.351444 34 262.969576 35 263.701343 36 264.356208 37 265.061600 38 265.412848 39 265.858757 42 265.901274 43 266.039150 41 266.073954 40 266.469497 44 266.513160 45 267.298049 46 267.645166 47 268.142926 48 268.859665 49 269.701741 50 270.749653 dtype: float64
That really did nothing. Let's try again.
# Dropping Political Affiliation
features = ['State', 'Year', 'Population']
test_model(features)
1 201.746776 2 218.804648 3 231.204409 4 238.419069 5 243.045434 6 246.303633 7 248.750718 8 250.660916 9 252.199084 10 253.468783 11 254.581780 12 255.592058 13 256.510428 14 257.343493 15 258.099415 16 258.786466 17 259.412386 18 259.984150 19 260.507925 20 260.989803 21 261.490521 22 262.033778 23 262.592530 24 263.162021 25 263.734628 26 264.269616 27 264.776092 28 265.353492 29 266.060545 30 266.825053 31 267.649860 32 268.531874 33 269.443800 34 270.391615 35 271.363956 36 272.260132 37 273.174829 38 274.155234 39 275.140242 40 276.153568 41 277.173154 42 278.214307 43 279.257721 44 280.343149 45 281.427668 46 282.507821 47 283.508811 48 284.508914 49 285.505341 50 286.495771 dtype: float64
That didn't seem to help either.
# Dropping Population
features = ['State', 'Political Affiliation',
'Year']
test_model(features)
1 176.719099 2 186.955735 3 194.149532 4 199.610703 5 205.435103 6 211.485052 7 217.205319 8 220.823619 9 225.671020 10 230.686885 11 236.434390 12 242.400573 13 248.186382 14 253.774471 15 259.098702 16 264.234499 17 268.586566 18 272.472398 19 276.302538 20 280.123386 21 284.046577 22 287.316948 23 290.572948 24 293.914099 25 297.149925 26 299.745101 27 302.163013 28 304.744103 50 306.586332 29 307.223699 49 308.718416 30 309.744480 31 310.653447 48 310.686657 32 311.755431 47 312.662292 33 312.909353 34 314.167509 46 314.707015 35 315.552996 45 316.012364 36 316.974352 44 317.465697 37 318.481275 43 318.768775 38 319.050999 39 319.606796 42 319.715064 40 319.888028 41 320.126783 dtype: float64
That might be the best I can get out of this model. I am not sure if I began by overfitting the model or if something was input incorrectly, but controlling for population seems to have an adverse effect on the model. One possibility is the comma-formatted Population column noted earlier, which would be encoded as a set of categories rather than as a single numeric control.
From the above descriptive statistics and the models, there are a few conclusions we can confidently draw. First, there was a very large increase in protests overall from 2020 to 2022. Second, not only did the number of protests increase, but their political affiliation also shifted quite drastically towards the left. Third, hate crimes did not seem to uniformly increase across the nation the way protest numbers and affiliation did. This is quite surprising, as it would make sense for an increase in social justice movements to accompany an increase in socially unacceptable crimes. And lastly, at least for this model, there seems to be no connection between hate crimes and protest numbers or affiliation.
From these findings, two major conclusions follow, in line with our two original questions: 1) There is no apparent connection between hate crime and protest data; something else is driving the increase in left-wing movements. 2) Since there is no apparent increase in the real number of hate crimes over the past few years, the media is not correctly portraying reality. In fact, these two conclusions might be linked: though violent crime overall might be decreasing (as seen in many studies over the past few years), the media portrays reality as though violent crime is increasing. With the perception of rising crime, left-wing movements might be increasing in number to combat this fictitious rise. Future research might find more promising results by delving deeper into the connections between protest data, media coverage, and citizens' perception of violent crime.