# Voters

In [1]:
import pandas as pd
import statsmodels.api as sm

## Replicate Figure 1

We replicate Figure 1 by rewriting `Dofiles/Analysis/analysis_votechoice.do` in Python. This draws data from `Data/Analysis/analysis_indiv.dta` to run the following regression:

> `reg int_act calsurv_dummy* calweekday* caldummy_pos caldistpos_dummy* dummy*, hascons cl(id_clust)`.

The coefficients of `dummy*` are then used to generate the figure.

### Load raw data and subset

In [2]:
analysis_indiv = pd.read_stata('../../datasets/voters/raw/Data/Analysis/analysis_indiv.dta')
analysis_indiv

Unnamed: 0,month_pre,day_pre,month_pos,day_pos,age,id_resp,day_elec,month_elec,year_elec,year_pre,...,new_int2,new_act2,winner_int,winner_act,watch,small_int,small_act,small_int2,small_act2,rcs
0,9.0,11.0,10.0,27.0,56.0,3876.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,,1.0,,,0.0,0.0,0.0,1.0
1,8.0,29.0,11.0,4.0,27.0,102.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,1.0,1.0,,0.0,0.0,0.0,0.0,1.0
2,8.0,19.0,10.0,15.0,69.0,3866.0,29.0,9.0,2013.0,2013.0,...,,0.0,,1.0,,,0.0,,0.0,1.0
3,9.0,13.0,10.0,19.0,27.0,1805.0,29.0,9.0,2013.0,2013.0,...,,0.0,,1.0,,,0.0,,0.0,1.0
4,9.0,11.0,10.0,11.0,50.0,2698.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,,1.0,,,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336249,9.0,2.0,,,29.0,18989.0,14.0,9.0,2014.0,2014.0,...,,,,,,,,,,0.0
336250,9.0,6.0,,,43.0,16682.0,14.0,9.0,2014.0,2014.0,...,,,,,,,,,,0.0
336251,9.0,5.0,,,29.0,5249.0,14.0,9.0,2014.0,2014.0,...,,,,,,,,,,0.0
336252,9.0,2.0,,,46.0,9241.0,14.0,9.0,2014.0,2014.0,...,0.0,,0.0,,,0.0,,0.0,,0.0


The relevant columns are:
- `survey`: name of survey
- `id_surv`: survey ID (1 to 65)
- `date_elec`: date of election
- `dist_pre`: days before election when survey was taken
- `weekday_pre`: day of the week when pre-election survey was taken
- `dist_pos`: days after election when survey was taken
- `int_act`: dummy indicating whether the respondent's vote intention in the pre-election survey matches their actual vote

In [3]:
# Build the regression sample in a single chain so that no filtered slice is
# mutated afterwards (assigning columns on a slice is what triggers pandas'
# SettingWithCopyWarning) and `analysis_indiv` itself is never modified.
df = (
    analysis_indiv
    .loc[:, ['survey', 'id_surv', 'date_elec', 'dist_pre', 'weekday_pre', 'dist_pos', 'int_act']]
    # keep pre-election interviews taken 1 to 60 days before the election
    .loc[lambda d: (d['dist_pre'] <= 60) & (d['dist_pre'] > 0)]
    # the outcome must be observed
    .dropna(subset=['int_act'])
    # identifiers, day counts and the 0/1 outcome are stored as integers
    .astype({'id_surv': int, 'dist_pre': int, 'weekday_pre': int, 'int_act': int})
)

df

Unnamed: 0,survey,id_surv,date_elec,dist_pre,weekday_pre,dist_pos,int_act
1,AUTNES,1,2013-09-29,31,4,36.0,1
2,AUTNES,1,2013-09-29,41,1,16.0,0
3,AUTNES,1,2013-09-29,16,5,20.0,0
5,AUTNES,1,2013-09-29,23,5,35.0,0
6,AUTNES,1,2013-09-29,33,2,17.0,1
...,...,...,...,...,...,...,...
334785,SNES,65,2014-09-14,12,2,1.0,1
334786,SNES,65,2014-09-14,10,4,1.0,1
334787,SNES,65,2014-09-14,11,3,1.0,1
334788,SNES,65,2014-09-14,12,2,1.0,0


In [4]:
df['date_elec'].min()

Timestamp('1952-11-04 00:00:00')

### Prepare for regression

Categorical variables must be converted to dummies and then centered by subtracting their means in the reference sample (respondents with `dist_pre == 1`).

In [5]:
sample_ref = df['dist_pre'] == 1
sample_ref

1         False
2         False
3         False
5         False
6         False
          ...  
334785    False
334786    False
334787    False
334788    False
334789    False
Name: dist_pre, Length: 200916, dtype: bool

In [6]:
dummy = pd.get_dummies(df['dist_pre'], prefix='dummy', dtype=int)

In [7]:
# Survey fixed effects: one-hot encode the survey id, subtract each column's
# mean among the reference rows, and drop the first survey as the base category.
surv_dummy = pd.get_dummies(df['id_surv'], prefix='surv', dtype=int)
surv_mean = surv_dummy[sample_ref].mean()
calsurv_dummy = (surv_dummy - surv_mean).iloc[:, 1:]

In [8]:
# Day-of-week controls: one-hot encode, subtract each column's mean among the
# reference rows, and drop the first weekday as the base category.
weekday_dummy = pd.get_dummies(df['weekday_pre'], prefix='weekday', dtype=int)
weekday_mean = weekday_dummy[sample_ref].mean()
calweekday_dummy = (weekday_dummy - weekday_mean).iloc[:, 1:]

In [9]:
# Post-election interview timing: `dummy_na=True` gives missing `dist_pos`
# values their own indicator column. Centre on the reference rows and drop the
# first column as the base category.
distpos_dummy = pd.get_dummies(df['dist_pos'], prefix='dist', dummy_na=True, dtype=int)
distpos_mean = distpos_dummy[sample_ref].mean()
caldistpos_dummy = (distpos_dummy - distpos_mean).iloc[:, 1:]

In [10]:
X = pd.concat([calsurv_dummy, calweekday_dummy, caldistpos_dummy, dummy], axis=1)
y = df['int_act']

In [11]:
X

Unnamed: 0,surv_2,surv_3,surv_4,surv_5,surv_6,surv_7,surv_8,surv_9,surv_10,surv_11,...,dummy_51,dummy_52,dummy_53,dummy_54,dummy_55,dummy_56,dummy_57,dummy_58,dummy_59,dummy_60
1,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
2,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
3,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
5,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
6,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334785,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
334786,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
334787,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0
334788,-0.016224,-0.13849,-0.155655,-0.141077,-0.014108,-0.01881,-0.013402,-0.014108,-0.015989,-0.012932,...,0,0,0,0,0,0,0,0,0,0


### Perform regression

In [12]:
# Replace survey names and election dates with integer codes; the regression
# cell uses these two columns as cluster labels.
# Index the factorize result rather than unpacking into `_`: assigning `_`
# in a notebook shadows IPython's "last output" variable.
df['survey'] = pd.factorize(df['survey'])[0]
df['date_elec'] = pd.factorize(df['date_elec'])[0]

In [13]:
df

Unnamed: 0,survey,id_surv,date_elec,dist_pre,weekday_pre,dist_pos,int_act
1,0,1,0,31,4,36.0,1
2,0,1,0,41,1,16.0,0
3,0,1,0,16,5,20.0,0
5,0,1,0,23,5,35.0,0
6,0,1,0,33,2,17.0,1
...,...,...,...,...,...,...,...
334785,10,65,45,12,2,1.0,1
334786,10,65,45,10,4,1.0,1
334787,10,65,45,11,3,1.0,1
334788,10,65,45,12,2,1.0,0


In [14]:
# No intercept is added: `X` holds the complete set of `dummy_*` indicators
# (none dropped), matching the `hascons` option of the Stata command.
# Standard errors are clustered on the survey and election-date codes.
cluster_groups = [df['survey'], df['date_elec']]
model = sm.OLS(y, X)
results = model.fit(cov_type='cluster', cov_kwds={'groups': cluster_groups})
results.summary()

0,1,2,3
Dep. Variable:,int_act,R-squared:,0.072
Model:,OLS,Adj. R-squared:,0.071
Method:,Least Squares,F-statistic:,
Date:,"Thu, 25 May 2023",Prob (F-statistic):,
Time:,12:14:17,Log-Likelihood:,-99100.0
No. Observations:,200916,AIC:,198700.0
Df Residuals:,200668,BIC:,201200.0
Df Model:,247,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
surv_2,0.1311,0.009,14.673,0.000,0.114,0.149
surv_3,0.1050,0.012,9.114,0.000,0.082,0.128
surv_4,0.1484,0.007,22.369,0.000,0.135,0.161
surv_5,0.1993,0.006,34.835,0.000,0.188,0.211
surv_6,0.0018,0.006,0.297,0.766,-0.010,0.014
surv_7,-0.0542,0.003,-18.761,0.000,-0.060,-0.049
surv_8,-0.0879,0.004,-23.885,0.000,-0.095,-0.081
surv_9,-0.0424,0.003,-12.378,0.000,-0.049,-0.036
surv_10,-0.0261,0.002,-14.318,0.000,-0.030,-0.023

0,1,2,3
Omnibus:,32414.573,Durbin-Watson:,1.884
Prob(Omnibus):,0.0,Jarque-Bera (JB):,51459.998
Skew:,-1.24,Prob(JB):,0.0
Kurtosis:,2.972,Cond. No.,182.0


In [42]:
# Collect point estimates and confidence bounds for the `dummy_<k>` terms only.
consistency = pd.concat([results.params, results.conf_int()], axis=1)
consistency.columns = ['est', 'conf_int_low', 'conf_int_high']
consistency = consistency[consistency.index.str.startswith('dummy_')]

# Read the day count from the coefficient name instead of the row position, so
# the labels stay correct even if some value of `dist_pre` has no observations
# (and therefore no dummy column).
days_before = consistency.index.str.replace('dummy_', '', regex=False).astype(int)

consistency = consistency.reset_index(drop=True)
# Days before the election are reported as negative numbers.
consistency['dist_pre'] = -days_before.to_numpy()

In [50]:
consistency[0:10]

Unnamed: 0,est,conf_int_low,conf_int_high,dist_pre
0,0.876793,0.853563,0.900023,-1
1,0.873853,0.835343,0.912364,-2
2,0.854707,0.830094,0.87932,-3
3,0.852763,0.827555,0.877972,-4
4,0.839538,0.821556,0.85752,-5
5,0.838909,0.814825,0.862993,-6
6,0.826957,0.80332,0.850594,-7
7,0.81933,0.806733,0.831926,-8
8,0.814097,0.797217,0.830977,-9
9,0.818631,0.801811,0.835452,-10


In [51]:
consistency.to_csv('consistency.csv', index=False) 

## Figure 1 over time

Re-estimate Figure 1 separately for each decade, grouping `dist_pre` into weekly bins (`dist_pre // 7`).

In [164]:
# Build the per-decade sample in one chain so that no filtered slice is
# mutated afterwards (assigning columns on a slice triggers pandas'
# SettingWithCopyWarning).
df = (
    pd.read_stata('../../datasets/voters/raw/Data/Analysis/analysis_indiv.dta')
    .loc[:, ['survey', 'id_surv', 'date_elec', 'dist_pre', 'weekday_pre', 'dist_pos', 'int_act']]
    # the outcome must be observed
    .dropna(subset=['int_act'])
    # keep pre-election interviews taken 1 to 60 days before the election
    .loc[lambda d: (d['dist_pre'] <= 60) & (d['dist_pre'] > 0)]
    .astype({'id_surv': int, 'dist_pre': int, 'weekday_pre': int, 'int_act': int})
    .assign(
        # decade of the election, e.g. 2013 -> 2010
        decade=lambda d: d['date_elec'].dt.year // 10 * 10,
        # NOTE(review): with dist_pre in 1..60, `// 7` makes week 0 cover days
        # 1-6 and week 8 cover days 56-60, while weeks 1-7 cover 7 days each.
        # Confirm this binning is intended (`(dist_pre - 1) // 7` would make
        # week 0 a full 7 days).
        dist_pre_week=lambda d: d['dist_pre'] // 7,
    )
    .loc[:, ['survey', 'id_surv', 'date_elec', 'decade', 'dist_pre', 'dist_pre_week', 'weekday_pre', 'dist_pos', 'int_act']]
)

df

Unnamed: 0,survey,id_surv,date_elec,decade,dist_pre,dist_pre_week,weekday_pre,dist_pos,int_act
1,AUTNES,1,2013-09-29,2010,31,4,4,36.0,1
2,AUTNES,1,2013-09-29,2010,41,5,1,16.0,0
3,AUTNES,1,2013-09-29,2010,16,2,5,20.0,0
5,AUTNES,1,2013-09-29,2010,23,3,5,35.0,0
6,AUTNES,1,2013-09-29,2010,33,4,2,17.0,1
...,...,...,...,...,...,...,...,...,...
334785,SNES,65,2014-09-14,2010,12,1,2,1.0,1
334786,SNES,65,2014-09-14,2010,10,1,4,1.0,1
334787,SNES,65,2014-09-14,2010,11,1,3,1.0,1
334788,SNES,65,2014-09-14,2010,12,1,2,1.0,0


In [166]:
df['dist_pre_week'].value_counts()

dist_pre_week
1    48916
0    28678
2    27412
3    26162
4    23076
5    18600
6    13465
7     9263
8     5344
Name: count, dtype: int64

### Helper function

In [171]:
def _centered_dummies(values, sample_ref, prefix, dummy_na=False):
    """One-hot encode `values`, centre on the reference rows, drop the first column.

    Each indicator column has its mean over the rows selected by the boolean
    mask `sample_ref` subtracted from it; the first column is then removed as
    the base category.
    """
    dummies = pd.get_dummies(values, prefix=prefix, dummy_na=dummy_na, dtype=int)
    centered = dummies.sub(dummies.loc[sample_ref, :].mean())
    return centered.iloc[:, 1:]


def get_params(df_all, decade):
    """Estimate the weekly `dummy_*` coefficients for a single decade.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain the columns `decade`, `date_elec`, `dist_pre_week`,
        `id_surv`, `dist_pos` and `int_act`.
    decade : int
        Value of `df_all['decade']` selecting the rows to use.

    Returns
    -------
    pandas.DataFrame
        One row per `dummy_<week>` coefficient with columns `decade`,
        `dist_pre_week` (the week number, negated), `est`, `conf_int_low`
        and `conf_int_high`.

    Notes
    -----
    `int_act` is regressed by OLS on survey and `dist_pos` indicators, both
    centred on the rows with `dist_pre_week == 1`, plus the full set of week
    indicators. No intercept is added and weekday controls are not included.
    Standard errors are clustered by election date.
    """
    df = df_all[df_all['decade'] == decade]

    # Reference rows used for centring the control dummies.
    sample_ref = df['dist_pre_week'] == 1

    dummy = pd.get_dummies(df['dist_pre_week'], prefix='dummy', dtype=int)
    calsurv_dummy = _centered_dummies(df['id_surv'], sample_ref, 'surv')
    caldistpos_dummy = _centered_dummies(df['dist_pos'], sample_ref, 'dist', dummy_na=True)

    X = pd.concat([calsurv_dummy, caldistpos_dummy, dummy], axis=1)
    y = df['int_act']

    # Integer cluster labels, one per distinct election date.
    clusters = pd.factorize(df['date_elec'])[0]

    results = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': clusters})

    params = pd.concat([results.params, results.conf_int()], axis=1)
    params.columns = ['est', 'conf_int_low', 'conf_int_high']
    params = params[params.index.str.startswith('dummy_')]

    # Read the week from the coefficient name rather than the row position, so
    # the label stays correct when a decade has no observations in some week.
    weeks = params.index.str.replace('dummy_', '', regex=False).astype(int)

    params = params.reset_index(drop=True)
    params['dist_pre_week'] = -weeks.to_numpy()
    params['decade'] = decade

    return params[['decade', 'dist_pre_week', 'est', 'conf_int_low', 'conf_int_high']]

In [172]:
decades = list(range(1950, 2020, 10))

# Estimate every decade, then stack the results with a single concat.
# Growing a frame inside the loop re-copies all earlier rows on each pass, and
# seeding it with an empty object-dtype frame relies on concat behaviour that
# pandas has deprecated.
params = pd.concat([get_params(df, decade) for decade in decades], axis=0)


In [173]:
params

Unnamed: 0,decade,dist_pre_week,est,conf_int_low,conf_int_high
0,1950,0,0.809965,0.740326,0.879603
1,1950,-1,0.772242,0.759982,0.784502
2,1950,-2,0.789133,0.780015,0.798252
3,1950,-3,0.735069,0.725930,0.744208
4,1950,-4,0.762495,0.750482,0.774507
...,...,...,...,...,...
4,2010,-4,0.740299,0.715450,0.765149
5,2010,-5,0.742977,0.715965,0.769990
6,2010,-6,0.728303,0.690194,0.766412
7,2010,-7,0.712137,0.679395,0.744880


In [174]:
params.to_csv('overtime.csv', index=False)

## Debates

In [175]:
analysis_debate_indiv = pd.read_stata('../../datasets/voters/raw/Data/Analysis/analysis_debate_indiv.dta')
analysis_debate_indiv

Unnamed: 0,month_pre,day_pre,month_pos,day_pos,age,id_resp,day_elec,month_elec,year_elec,year_pre,...,dummy_pos107,dummy_pos108,dummy_pos109,dummy_pos110,dummy_pos111,dummy_pos112,dummy_pos113,dummy_pos114,dummy_pos115,dummy_pos116
0,9.0,4.0,,,48.0,206.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.0,30.0,10.0,16.0,49.0,3239.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,30.0,,,50.0,494.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.0,20.0,10.0,21.0,66.0,2647.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9.0,6.0,10.0,16.0,38.0,3372.0,29.0,9.0,2013.0,2013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399502,9.0,1.0,10.0,2.0,37.0,3336.0,24.0,9.0,2017.0,2017.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399503,8.0,10.0,10.0,10.0,44.0,7206.0,24.0,9.0,2017.0,2017.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399504,8.0,3.0,9.0,29.0,38.0,7551.0,24.0,9.0,2017.0,2017.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399505,8.0,6.0,,,53.0,5519.0,24.0,9.0,2017.0,2017.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [207]:
analysis_debate_indiv.to_csv('analysis_debate_indiv.csv', index=False)

In [199]:
# Pick up the pre-built indicator columns of the debate dataset by name.
debate_columns = analysis_debate_indiv.columns
weekday_dummy = [name for name in debate_columns if 'weekday_dummy' in name]
dummy_pos = [name for name in debate_columns if 'dummy_pos' in name]

# Regressors, followed by the full list of columns kept for inspection.
X_cols = [
    *weekday_dummy,
    'pos',
    *dummy_pos,
    'dummya', 'dummyl3', 'dummyl2', 'dummyl1', 'dummyu1', 'dummyu2', 'dummyu3', 'dummyb',
]
all_cols = ['country', 'date_debate', 'id_date', 'int_act', *X_cols]

In [203]:
# Number each (country, debate date) pair and index the rows by that id and
# the country. `assign` returns a new frame, so the raw data is not modified.
df = (
    analysis_debate_indiv
    .assign(id_date=lambda d: d.groupby(['country', 'date_debate']).ngroup())
    .set_index(['id_date', 'country'])
)

In [206]:
pd.get_dummies(df.index.get_level_values('country'), drop_first=True)

Unnamed: 0,Canada,Germany,Netherlands,NewZealand,UK,US
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
399502,False,True,False,False,False,False
399503,False,True,False,False,False,False
399504,False,True,False,False,False,False
399505,False,True,False,False,False,False


In [200]:
# Number each (country, debate date) pair on the full data first, then drop
# respondents whose outcome is missing.
df = (
    analysis_debate_indiv
    .assign(id_date=lambda d: d.groupby(['country', 'date_debate']).ngroup())
    .dropna(subset=['int_act'])
)

# Design matrix and outcome for the regression.
X, y = df[X_cols], df['int_act']

# Keep only the identifying columns, the outcome and the regressors.
df = df[all_cols]

df

Unnamed: 0,country,date_debate,id_date,int_act,weekday_dummy1,weekday_dummy2,weekday_dummy3,weekday_dummy4,weekday_dummy5,weekday_dummy6,...,dummy_pos115,dummy_pos116,dummya,dummyl3,dummyl2,dummyl1,dummyu1,dummyu2,dummyu3,dummyb
1,Austria,2013-08-29,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,Austria,2013-08-29,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Austria,2013-08-29,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,Austria,2013-08-29,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,Austria,2013-08-29,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399492,Germany,2017-09-03,7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399493,Germany,2017-09-03,7,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
399495,Germany,2017-09-03,7,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
399502,Germany,2017-09-03,7,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [201]:
df['int_act'].values

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [None]:
# `df` was reduced to `all_cols`, which does not contain `date_elec`, so
# `df['date_elec']` would raise a KeyError. Take the election date from the
# full frame instead, aligned on the row labels that are still in `df`, and
# encode it as integer codes for use as a cluster label.
# NOTE(review): assumes `analysis_debate_indiv` has a `date_elec` column, as
# `analysis_indiv` does. Confirm.
date_elec_codes = pd.factorize(analysis_debate_indiv.loc[df.index, 'date_elec'])[0]

model = sm.OLS(y, X)
results = model.fit(cov_type='cluster', cov_kwds={'groups': [df['id_date'], date_elec_codes]})
results.summary()