For this assignment, we were to run a Chi Square test on our dataset, which in my case is the 2012 Outlook on Life Survey. I chose to compare respondents' indicated ethnicity to their willingness to date outside their race (one of the few questions with a simple yes/no answer). This one was fairly straightforward. Plotted on a bar graph, the results look like this:
where 1=Black, 2=White, 3=Latino, 5=Other, and 6=Mixed.
Right away, one group (blacks) appears to be much higher than the others. Running the Chi Square test on all groups gave a value of 40.7, with p<0.0001 and df=4.
However, I'm not done, because I have multiple groups being compared. I need to run pairwise comparisons for all groups, for a total of 10 comparisons. The Bonferroni Adjustment indicates I need to divide my significance level by 10, giving p<0.005. I must fail to reject the null hypothesis for anything higher than that.
As expected, Black vs White Chi Square gave a value of 38.5, with p<0.0001 and df=1. Black vs Latino gave CS=5.0, p=0.026, with df=1; this would have been significant as well, but under the Bonferroni Adjustment shown above, it is not. No other comparisons were even close to significant. Thus, the Chi Square test indicates blacks were significantly more willing to date outside their race than were whites, but not significantly more willing than the other ethnicities.
For your perusal, my code follows:
"""
Analysis of Outlook on Life 2012 dataset using Python
created Tuesday, November 10, 2020
@author Joel Caren
"""
import pandas
import numpy
import seaborn
import scipy.stats
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
# Load the Outlook on Life survey; low_memory=False avoids the mixed-dtype memory warning
data = pandas.read_csv('oolpds.csv', low_memory=False)
rows, cols = data.shape
print (rows)  # number of observations (rows)
print (cols)  # number of variables (columns)
# peek at the storage format of one variable (no-op in a script; useful interactively)
data['W1_M1'].dtype
# Convert every working variable to numeric; errors="coerce" turns unparseable/missing data into NaN.
# Variable key:
#   W1_D11/W1_D12: rate republicans/democrats (0-100)
#   W1_E4: date outside race? y=1
#   W1_E61_C..W1_E64_C: against biracial kids (black/latino/white/asian), 1=agree--4=disagree
#   W1_M1: which religion? none=13;  W1_M1A: born again? y=1,n=2,not asked=3,refuse=4
#   W1_M2: church race? B=1,W=2,L=3,A=4,other=5,don't know=6
#   W1_M4: allow women clergy? 1=agree--4=disagree
#   W1_M5: how often attend church? mult times/wk=1,never=6;  W1_M6: active church member? y=1
#   W1_N1B..W1_N1F: rate native americans/latinos/whites/blacks/asians (0-100)
#   W1_O1: blacks should teach kids vs racism, 1=no--5=yes
#   W1_O3: blacks should teach kids not all whites racist, 1=no--5=yes
#   W1_QA5D: blacks should segregate, 1=agree--4=disagree
#   W2_QL2B: gays as priests? 1=agree--4=disagree;  W2_QL3: homosex immoral? 1=agree--4=disagree
#   W2_QL4: laws vs gay discrimination? y=1--n=5
#   PPETHM: your race? w=1,b=2,other=3,L=4,mix=5
numeric_columns = [
    'W1_D11', 'W1_D12', 'W1_E4', 'W1_E61_C', 'W1_E62_C', 'W1_E63_C', 'W1_E64_C',
    'W1_M1', 'W1_M1A', 'W1_M2', 'W1_M4', 'W1_M5', 'W1_M6',
    'W1_N1B', 'W1_N1C', 'W1_N1D', 'W1_N1E', 'W1_N1F',
    'W1_O1', 'W1_O3', 'W1_QA5D', 'W2_QL2B', 'W2_QL3', 'W2_QL4', 'PPETHM',
]
for column in numeric_columns:
    data[column] = pandas.to_numeric(data[column], errors="coerce")
# Recode survey missing-value sentinels to python missing (NaN).
# 998 marks "missing" on the 0-100 feeling-thermometer items:
for column in ('W1_D11', 'W1_D12', 'W1_N1B', 'W1_N1C', 'W1_N1D', 'W1_N1E', 'W1_N1F'):
    data[column] = data[column].replace(998, numpy.nan)
# -1 marks "missing" on the categorical/scale items (and can also appear on the W1_N1* ratings):
for column in ('W1_E4', 'W1_E61_C', 'W1_E62_C', 'W1_E63_C', 'W1_E64_C',
               'W1_M1', 'W1_M1A', 'W1_M2', 'W1_M4', 'W1_M5', 'W1_M6',
               'W1_N1B', 'W1_N1C', 'W1_N1D', 'W1_N1E', 'W1_N1F',
               'W1_O1', 'W1_O3', 'W1_QA5D', 'W2_QL2B', 'W2_QL3', 'W2_QL4'):
    data[column] = data[column].replace(-1, numpy.nan)
# Create point-valued variables that can be summed into a religiosity score.
has_religion = {code: 1 for code in range(1, 13)}
has_religion[13] = 0  # 13 = no religious affiliation
data['M1'] = data['W1_M1'].map(has_religion)
data['M1A'] = data['W1_M1A'].map({1: 2, 2: 0, 3: 0})  # born again: yes=2pts, else 0
data['M4'] = data['W1_M4'] / 4  # women clergy: 0-1pt in 0.25 increments
attendance_points = {1: 2.5, 2: 2, 3: 1.5, 4: 1, 5: 0.5, 6: 0, 7: 0}
data['M5'] = data['W1_M5'].map(attendance_points)  # 0-2.5pts based on church attendance
data['M6'] = (2 - data['W1_M6']) * 2  # active church member: yes=2pts, no=0
data['QL2'] = data['W2_QL2B'] / 4  # gay priests: 0.25 increments
data['QL3'] = (5 - data['W2_QL3']) / 4  # homosexuality immoral, reversed: 0.25 increments
data['QL4'] = data['W2_QL4'] / 5  # anti-discrimination laws: 0.2 increments
# Subset holding just the religiosity components.
relig = data[['M1', 'M1A', 'M4', 'M5', 'M6', 'QL2', 'QL3', 'QL4']]
print ("Values for religiosity questions")
print(relig.head(n=10))
# RELIG: total religiosity points per respondent (axis=1 sums across columns).
data['RELIG'] = relig.sum(axis=1)
print ()
print ("Sum of Religiosity values for each respondent")
print (data['RELIG'].head(n=10))
# CNT: how many religiosity questions each respondent actually answered.
data['CNT'] = relig.count(axis=1)
print ()
print ("Number of religiosity questions with valid answers")
print (data['CNT'].head(n=10))
# RELIND: religiosity index = points earned / questions answered.
data['RELIND'] = data['RELIG'] / data['CNT']
print ()
print ("Religious Index: sum of values / number of questions")
print (data['RELIND'].head(n=10))
# Manage acceptance variables.
# Remap PPETHM ethnicity codes to match the W1_M2 church-race coding.
eth_map = {1: 2, 2: 1, 3: 5, 4: 3, 5: 6}
data['ETH'] = data['PPETHM'].map(eth_map)
# 'Would you date outside your race?' recoded to 1=Y, 0=N.
data['E4'] = data['W1_E4'].map({1: 1, 2: 0})
# Rescale the 1--4 agree/disagree items onto 0--1.
four_point = {1: 0, 2: 0.33, 3: 0.66, 4: 1}
data['E61'] = data['W1_E61_C'].map(four_point)  # concerned about biracial kids?
data['E62'] = data['W1_E62_C'].map(four_point)
data['E63'] = data['W1_E63_C'].map(four_point)
data['E64'] = data['W1_E64_C'].map(four_point)
data['QA5D'] = data['W1_QA5D'].map(four_point)
# Rescale the 0--100 race ratings onto 0.0--1.0.
for source, target in (('W1_N1B', 'N1B'), ('W1_N1C', 'N1C'), ('W1_N1D', 'N1D'),
                       ('W1_N1E', 'N1E'), ('W1_N1F', 'N1F')):
    data[target] = data[source] / 100
# Rescale the 1--5 items onto 0--1.
five_point = {1: 0, 2: 0.25, 3: 0.5, 4: 0.75, 5: 1.0}
data['O1'] = data['W1_O1'].map(five_point)
data['O3'] = data['W1_O3'].map(five_point)
# Subdivide respondents by ethnicity (ETH coding: 1=Black, 2=White, 3=Latino, 5=Other, 6=Mixed).
blk = data.loc[data['ETH'] == 1]
wht = data.loc[data['ETH'] == 2]
ltn = data.loc[data['ETH'] == 3]
oth = data.loc[data['ETH'] == 5]
mix = data.loc[data['ETH'] == 6]
def CHURAC (row):
    """Score whether a respondent attends a church of a different race.

    Returns 1 when the church race (W1_M2) differs from the respondent's
    own race (ETH), 0 when they match, and NaN when either is missing.
    The original had two redundant branches (both < and > returned 1) and
    fell off the end for NaN inputs, returning an implicit None; this is
    now explicit.
    """
    church, own = row['W1_M2'], row['ETH']
    if pandas.isnull(church) or pandas.isnull(own):
        return numpy.nan  # missing data stays missing in the resulting column
    return 1 if church != own else 0
# M2: 1 point when the respondent's church race differs from their own race.
data['M2'] = data.apply(CHURAC, axis=1)
# Group the acceptance questions for each race (omit the rating of one's own race).
base = ['E4', 'E61', 'E62', 'E63', 'E64', 'M2']
tail = ['O1', 'O3', 'QA5D']
baccept = data[base + ['N1B', 'N1C', 'N1D', 'N1F'] + tail]  # questions for blacks
waccept = data[base + ['N1B', 'N1C', 'N1E', 'N1F'] + tail]  # questions for whites
laccept = data[base + ['N1B', 'N1D', 'N1E', 'N1F'] + tail]  # questions for latinos
oaccept = data[base + ['N1C', 'N1D', 'N1E'] + tail]         # questions for other
maccept = data[base + ['N1B', 'N1C', 'N1D', 'N1E', 'N1F'] + tail]  # questions for mixed
# Sum each race's question set for every respondent (selection by race happens later).
for subset, sum_column in ((baccept, 'BACSUM'), (waccept, 'WACSUM'), (laccept, 'LACSUM'),
                           (oaccept, 'OACSUM'), (maccept, 'MACSUM')):
    data[sum_column] = subset.sum(axis=1)
def ACCEP (row):
    """Select the acceptance sum matching the respondent's own ethnicity.

    Returns 0.0 when the ethnicity code is missing or unrecognized.
    """
    sum_column = {1: 'BACSUM',   # black
                  2: 'WACSUM',   # white
                  3: 'LACSUM',   # latino
                  5: 'OACSUM',   # other
                  6: 'MACSUM'}.get(row['ETH'])
    if sum_column is None:
        return 0.0
    return row[sum_column]
# ACCEPT: each respondent's acceptance sum for their own ethnicity's question set.
data['ACCEPT'] = data.apply(ACCEP, axis=1)
print ('acceptance')
print (data['ACCEPT'].head(n=10))
# Count valid acceptance answers for all respondents, per race's question set.
for subset, count_column in ((baccept, 'BACCNT'), (waccept, 'WACCNT'), (laccept, 'LACCNT'),
                             (oaccept, 'OACCNT'), (maccept, 'MACCNT')):
    data[count_column] = subset.count(axis=1)
def ACCNT (row):
    """Select the valid-answer count matching the respondent's own ethnicity.

    Returns 0.0 when the ethnicity code is missing or unrecognized.
    """
    count_column = {1: 'BACCNT',   # black
                    2: 'WACCNT',   # white
                    3: 'LACCNT',   # latino
                    5: 'OACCNT',   # other
                    6: 'MACCNT'}.get(row['ETH'])
    if count_column is None:
        return 0.0
    return row[count_column]
# ACCNT: each respondent's valid-answer count for their own ethnicity's question set.
data['ACCNT'] = data.apply(ACCNT, axis=1)
print ('# of valid answers to acceptance questions')
print (data['ACCNT'].head(n=10))
# ACCIND: acceptance index = acceptance sum / valid question count.
data['ACCIND'] = data['ACCEPT'] / data['ACCNT']
print ('Acceptance index')
print (data['ACCIND'].head(n=10))
# Summary statistics for both indices.
print (data['RELIND'].describe())
print (data['ACCIND'].describe())
# Univariate histogram for religiosity:
ax_relig = seaborn.histplot(data["RELIND"].dropna(), kde=False)
plt.xlabel('Index of Religiosity')
plt.title('Index of Religiosity')
print (ax_relig)
# Univariate histogram for acceptance:
ax_accept = seaborn.histplot(data["ACCIND"].dropna(), kde=False)
plt.xlabel('Index of Racial Acceptance')
plt.title('Index of Racial Acceptance')
print (ax_accept)
# Basic scatterplot: quantitative vs quantitative.
seaborn.regplot(x="RELIND", y="ACCIND", fit_reg=False, data=data)
plt.xlabel('Religiosity')
plt.ylabel('Acceptance')
plt.title('Scatterplot for the Association Between Religiosity and Acceptance')
# Chi Square comparison: race vs willingness to date outside race.
# Contingency table of observed counts.
ct1 = pandas.crosstab(data['E4'], data['ETH'])
print (ct1)
# Column percentages.
colpct = ct1 / ct1.sum(axis=0)
print(colpct)
# Chi-square test across all five groups at once.
print ('chi-square value, p value, expected counts')
cs1 = scipy.stats.chi2_contingency(ct1)
print (cs1)
# Graph the proportion willing to date outside their race within each group.
seaborn.catplot(x="ETH", y="E4", data=data, kind="bar", ci=None)
plt.xlabel('Ethnicity')
plt.ylabel('Proportion Willing to Date Extraracially')
#Using Bonferroni Adjustment post hoc test to guard vs Type I error: 10 different comparisons, so significance level p<0.005
#Run individual pairwise comparisons: 1-2, 1-3, 1-5, 1-6, 2-3, 2-5, 2-6, 3-5, 3-6, 5-6
#(the original listed "1-5, 1-5" twice by mistake; the pairs below are the correct ten).
#Mapping each ETH code to itself keeps only the two groups of interest; every other
#ethnicity becomes NaN and is dropped by crosstab, which is exactly what the ten
#copy-pasted recode blocks did before.
eth_pairs = [(1, 2), (1, 3), (1, 5), (1, 6),
             (2, 3), (2, 5), (2, 6),
             (3, 5), (3, 6),
             (5, 6)]
for group_a, group_b in eth_pairs:
    # e.g. 'COMP1v2' — same derived-column names/side effects as the original script.
    comp_column = 'COMP%dv%d' % (group_a, group_b)
    data[comp_column] = data['ETH'].map({group_a: group_a, group_b: group_b})
    # contingency table of observed counts
    ct = pandas.crosstab(data['E4'], data[comp_column])
    print (ct)
    # column percentages
    colsum = ct.sum(axis=0)
    colpct = ct / colsum
    print(colpct)
    print ('chi-square value, p value, expected counts')
    cs = scipy.stats.chi2_contingency(ct)
    print (cs)