Stepwise logistic regression using R (II)

Consider a clinical data set (you can download it from here). The Diabetes data set consists of nine variables. The variable Class is the response variable, taking the values tested_positive and tested_negative; the other eight variables are the explanatory variables.

Stepwise logistic regression
#read in the data
diabetes<-read.csv("d:/diabetes.csv",header=TRUE)
#get explanatory variables: a column of 1s for the intercept plus the eight predictors
x<-c(rep(1,768),diabetes[,1],diabetes[,2],diabetes[,3],diabetes[,4],diabetes[,5],diabetes[,6],diabetes[,7],diabetes[,8])
x<-matrix(x,nrow=768)
#get response variable
y<-diabetes[,9]
y<-ifelse(y=="tested_positive",1,0)
y<-matrix(y,nrow=768)
#set explanatory variables' names
x_name<-c("INTERCEPT","PREGNANT","PLASMA","PRESSURE","TRICEPS","INSULIN","INDEX","PEDIGREE","AGE")
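Before going further, a quick sanity check that the data were read in as expected (this assumes the file layout described above):

#quick sanity check of the objects just created
dim(diabetes)         #should be 768 rows and 9 columns
table(diabetes[,9])   #counts of tested_negative and tested_positive
dim(x); dim(y)        #768 x 9 design matrix and 768 x 1 response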
    The logistic regression R code is as follows.
Stepwise logistic regression
pi_fun<-function(x,Beta){
  Beta<-matrix(as.vector(Beta),ncol=1)
  x<-matrix(as.vector(x),ncol=nrow(Beta))
  g_fun<-x%*%Beta
  exp(g_fun)/(1+exp(g_fun))
}
funs<-function(x,y,Beta){
  Beta<-matrix(c(Beta),ncol=1)
  x<-matrix(as.vector(x),ncol=nrow(Beta))  #reshape so x can also be passed as a plain vector
  pi_value<-pi_fun(x,Beta)
  U<-t(x)%*%(y-pi_value);
  uni_matrix<-matrix(rep(1,nrow(pi_value)),nrow=nrow(pi_value));
  H<-t(x)%*%diag(as.vector(pi_value*(uni_matrix-pi_value)))%*%x
  list(U=U,H=H)
}
Newtons<-function(fun,x,y,ep=1e-8,it_max=100){
  x<-matrix(as.vector(x),nrow=nrow(y))  #reshape so x can also be passed as a plain vector
  Beta<-matrix(rep(0,ncol(x)),nrow=ncol(x))
  Index<-0;
  k<-1
  while(k<=it_max){
    x1<-Beta;obj<-fun(x,y,Beta);
    Beta<-Beta+solve(obj$H,obj$U);
    objTemp<-fun(x,y,Beta)
    if(any(is.nan(objTemp$H))){
      Beta<-x1
      print("Warning: The maximum likelihood estimate does not exist.")
      print(paste("The LOGISTIC procedure continues in spite of the above warning.",
        "Results shown are based on the last maximum likelihood iteration. Validity of the model fit is questionable."));
    }
    norm<-sqrt(t(Beta-x1)%*%(Beta-x1))
    if(norm<ep){
      Index<-1;break
    }
    k<-k+1
  }
  obj<-fun(x,y,Beta);
  list(Beta=Beta,it=k,U=obj$U,H=obj$H)
}
CheckZero<-function(x){
  #replace zeros with a tiny positive number so that log() does not return -Inf
  for(i in 1:length(x)){
    if(x[i]==0)
      x[i]<-1e-300
  }
  x
}
ModelFitStat<-function(x,y,Beta){
  Beta<-matrix(as.vector(Beta),ncol=1)
  uni_matrix<-matrix(rep(1,nrow(y)),nrow=nrow(y));
  LOGL<- -2*(t(y)%*%log(pi_fun(x,Beta))+t(uni_matrix-y)%*%log(CheckZero(uni_matrix-pi_fun(x,Beta))))
  #print("-----------------")
  #print(LOGL)
  AIC<-LOGL+2*(ncol(x))
  SC<-LOGL+(ncol(x))*log(nrow(x))
  list(LOGL=LOGL,AIC=AIC,SC=SC)
}
GlobalNullTest<-function(x,y,Beta,BetaIntercept){
  Beta<-matrix(as.vector(Beta),ncol=1)
  pi_value<-pi_fun(x,Beta)
  df<-nrow(Beta)-1
  ##compute likelihood ratio
  MF<-ModelFitStat(x,y,Beta)
  n1<-sum(y[y>0])
  n<-nrow(y)
  LR<- -2*(n1*log(n1)+(n-n1)*log(n-n1)-n*log(n))-MF$LOGL
  LR_p_value<-pchisq(LR,df,lower.tail=FALSE)
  LR_Test<-list(LR=LR,DF=df,LR_p_value=LR_p_value)
  ##compute score
  BetaIntercept<-matrix(c(as.vector(BetaIntercept),rep(0,ncol(x)-1)),ncol=1)
  obj<-funs(x,y,BetaIntercept)
  Score<-t(obj$U)%*%solve(obj$H)%*%obj$U
  Score_p_value<-pchisq(Score,df,lower.tail=FALSE)
  Score_Test<-list(Score=Score,DF=df,Score_p_value=Score_p_value)
  ##compute Wald test
  obj<-funs(x,y,Beta)
  I_Diag<-diag((solve(obj$H)))
  Q<-matrix(c(rep(0,nrow(Beta)-1),as.vector(diag(rep(1,nrow(Beta)-1)))),nrow=nrow(Beta)-1)
  Wald<-t(Q%*%Beta)%*%solve(Q%*%diag(I_Diag)%*%t(Q))%*%(Q%*%Beta)
  Wald_p_value<-pchisq(Wald,df,lower.tail=FALSE)
  Wald_Test<-list(Wald=Wald,DF=df,Wald_p_value=Wald_p_value)
  list(LR_Test=LR_Test,Score_Test=Score_Test,Wald_Test=Wald_Test)
}
WhichEqual1<-function(x){
  a<-NULL
  for(i in 1:length(x)){
    if(x[i]==1){
      a<-c(a,i)
    }
  }
  a
}
CheckOut<-function(source,check){
  for(j in 1:length(source)){
    for(k in 1:length(check)){
      if(source[j]==check[k])
        source[j]<-0
    }
  }
  source[source>0]
}
NegativeCheck<-function(x){
  #return the position of the last positive entry, scanning from the end
  for(i in length(x):1){
    if(x[i]>0)
      break
  }
  i
}
CycleCheck<-function(x){
  #detect an entry/removal cycle: the variables removed since the last entry
  #are exactly the variables that entered just before
  NegativeFlg<-NegativeCheck(x)
  if(NegativeFlg==length(x)){
    return(FALSE)
  }
  NegVec<-x[(NegativeFlg+1):length(x)]
  PosVec<-x[(2*NegativeFlg-length(x)+1):NegativeFlg]
  NegVec<-sort(-1*NegVec)
  PosVec<-sort(PosVec)
  if(all((NegVec-PosVec)==0))
    return(TRUE)
  return(FALSE)
}
##stepwise
Stepwise<-function(x,y,checkin_pvalue=0.05,checkout_pvalue=0.05){
  ##indication of variables: which columns have entered the model
  indict<-rep(0,ncol(x))
  ##intercept entered
  indict[1]<-1
  Beta<-NULL
  print("Intercept Entered...")
  Result<-Newtons(funs,x[,1],y)
  Beta<-Result$Beta
  BetaIntercept<-Result$Beta
  uni_matrix<-matrix(rep(1,nrow(y)),nrow=nrow(y));
  LOGL<- -2*(t(y)%*%log(pi_fun(x[,1],Beta))+t(uni_matrix-y)%*%log(uni_matrix-pi_fun(x[,1],Beta)))
  print(paste("-2Log=",LOGL))
  indexVector<-WhichEqual1(indict)
  ##check the other variables
  VariableFlg<-NULL
  Terminate<-FALSE
  repeat{
    if(Terminate==TRUE){
      print("Model building terminates because the last effect entered is removed by the Wald statistic criterion.")
      break
    }
    ##score test for each variable not yet in the model
    pvalue<-rep(1,ncol(x))
    k<-2:length(indict)
    k<-CheckOut(k,indexVector)
    for(i in k){
      obj<-funs(c(x[,indexVector],x[,i]),y,c(as.vector(Beta),0))
      Score<-t(obj$U)%*%solve(obj$H,obj$U)
      Score_pvalue<-pchisq(Score,1,lower.tail=FALSE)
      pvalue[i]<-Score_pvalue
    }
    #print("Score pvalue for variable entry")
    #print(pvalue)
    pvalue_min<-min(pvalue)
    if(pvalue_min<checkin_pvalue){
      j<-which.min(pvalue)
      print(paste(x_name[j]," entered:"))
      ##set indication of variable
      indict[j]<-1
      VariableFlg<-c(VariableFlg,j)
      indexVector<-WhichEqual1(indict)
      print("indexVector -- test")
      print(indexVector)
      Result<-Newtons(funs,x[,indexVector],y)
      Beta<-NULL
      Beta<-Result$Beta
      ##compute model fit statistics
      print("Model Fit Statistics")
      MFStat<-ModelFitStat(x[,indexVector],y,Beta)
      print(MFStat)
      ##test global null hypothesis: Beta=0
      print("Testing Global Null Hypothesis: Beta=0")
      GNTest<-GlobalNullTest(x[,indexVector],y,Beta,BetaIntercept)
      print(GNTest)
      ##compute Wald tests in order to remove variables
      repeat{
        indexVector<-WhichEqual1(indict)
        obj<-funs(x[,indexVector],y,Beta)
        H_Diag<-sqrt(diag(solve(obj$H)))
        WaldChisq<-(as.vector(Beta)/H_Diag)^2
        WaldChisqPvalue<-pchisq(WaldChisq,1,lower.tail=FALSE)
        WaldChisqTest<-list(WaldChisq=WaldChisq,WaldChisqPvalue=WaldChisqPvalue)
        print("Wald chisq pvalue for variable removal")
        print(WaldChisqPvalue)
        ##check Wald to decide which variable is to be removed
        pvalue_max<-max(WaldChisqPvalue[2:length(WaldChisqPvalue)])##do not check the intercept
        if(pvalue_max>checkout_pvalue){
          n<-which.max(WaldChisqPvalue[2:length(WaldChisqPvalue)])##do not check the intercept
          print(paste(x_name[indexVector[n+1]]," is removed:"))
          ##set indication of variable
          indict[indexVector[n+1]]<-0
          m<-indexVector[n+1]
          VariableFlg<-c(VariableFlg,-m)  ##removed variables are recorded with a negative sign
          ##renew Beta
          indexVector<-WhichEqual1(indict)
          Result<-Newtons(funs,x[,indexVector],y)
          Beta<-NULL
          Beta<-Result$Beta
          if(CycleCheck(VariableFlg)==TRUE){
            Terminate<-TRUE
          }
        }
        else{
          print(paste("No (additional) effects met the",checkout_pvalue,"significance level for removal from the model."))
          break
        }
      }##repeat end
    }
    else{
      print(paste("No effects met the",checkin_pvalue,"significance level for entry into the model."))
      break
    }
  }##repeat end
  ##final estimates
  print("Beta")
  print(Beta)
  print("Analysis of Maximum Likelihood Estimates")
  obj<-funs(x[,indexVector],y,Beta)
  H_Diag<-sqrt(diag(solve(obj$H)))
  WaldChisq<-(as.vector(Beta)/H_Diag)^2
  WaldChisqPvalue<-pchisq(WaldChisq,1,lower.tail=FALSE)
  WaldChisqTest<-list(WaldChisq=WaldChisq,WaldChisqPvalue=WaldChisqPvalue)
  print(WaldChisqTest)
}

Finally, invoke the Stepwise function.

Stepwise logistic regression
Stepwise(x,y)
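For comparison, the result can be cross-checked with base R's glm and step functions. This is only a sketch: step selects by AIC rather than by the score/Wald significance criteria used above, so the selected model is not guaranteed to be identical.

#cross-check with base R (AIC-based stepwise selection), using the x, y and x_name objects built earlier
dat<-data.frame(Class01=as.vector(y),x[,-1])   #drop our explicit intercept column; glm adds its own
names(dat)<-c("Class01",x_name[-1])
full<-glm(Class01~.,data=dat,family=binomial)
both<-step(full,direction="both",trace=FALSE)
summary(both)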

Stepwise Logistic Regression Using R (I)

1. Logistic regression model

We assume that we have $p$ explanatory variables $x_1, x_2, \ldots, x_p$. The logistic regression model is

$$\pi(x)=P(y=1\mid x)=\frac{\exp(\beta_0+\beta_1 x_1+\cdots+\beta_p x_p)}{1+\exp(\beta_0+\beta_1 x_1+\cdots+\beta_p x_p)},$$

where $\pi(x)$ is the probability of $y=1$ given $x_1,\ldots,x_p$, and $\beta_0,\beta_1,\ldots,\beta_p$ are the coefficients to be estimated. The right-hand side is usually referred to as the logistic function, whose graph is the familiar S-shaped curve.

We can also define

$$1-\pi(x)=P(y=0\mid x)=\frac{1}{1+\exp(\beta_0+\beta_1 x_1+\cdots+\beta_p x_p)},$$

which is the probability of $y=0$, in other words, the probability that the event does not occur.

The odds of an event that occurs with probability $\pi(x)$ are defined as

$$\mathrm{odds}=\frac{\pi(x)}{1-\pi(x)}=\exp(\beta_0+\beta_1 x_1+\cdots+\beta_p x_p).$$

Taking the logarithm of both sides, we get a linear function, the logit:

$$\log\frac{\pi(x)}{1-\pi(x)}=\beta_0+\beta_1 x_1+\cdots+\beta_p x_p.$$
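As a quick numeric illustration of these formulas (the coefficients and the observation below are made up, not estimates from the diabetes data):

#made-up coefficients and one observation, just to illustrate the formulas
beta<-c(-5,0.1,0.03)              #beta0, beta1, beta2 (hypothetical values)
xobs<-c(1,3,120)                  #leading 1 for the intercept, then x1 and x2
eta<-sum(beta*xobs)               #linear predictor beta0+beta1*x1+beta2*x2
pi_x<-exp(eta)/(1+exp(eta))       #probability pi(x)
odds<-pi_x/(1-pi_x)               #equals exp(eta)
log(odds)                         #recovers eta: the logit is linear in x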

2. Maximum likelihood estimation

The next goal is to estimate the $p+1$ parameters $\beta_0,\beta_1,\ldots,\beta_p$. This can be achieved by maximum likelihood estimation. Since the probability that the event occurs for observation $i$ is $\pi(x_i)$, the probability of observing $y_i=1$ can be written as $\pi(x_i)^{y_i}$. Likewise, the probability of observing $y_i=0$ can be written as $(1-\pi(x_i))^{1-y_i}$. Thus, for $n$ independent observations we get the likelihood function

$$L(\beta)=\prod_{i=1}^{n}\pi(x_i)^{y_i}\big(1-\pi(x_i)\big)^{1-y_i}.$$

Replacing $\pi(x_i)$ with its expression in terms of the coefficients, it can be rewritten as

$$L(\beta)=\prod_{i=1}^{n}\frac{\exp\!\big(y_i(\beta_0+\beta_1 x_{i1}+\cdots+\beta_p x_{ip})\big)}{1+\exp(\beta_0+\beta_1 x_{i1}+\cdots+\beta_p x_{ip})}.$$

In order to maximize the likelihood function, we take the logarithm of both sides:

$$l(\beta)=\log L(\beta)=\sum_{i=1}^{n}\Big(y_i\log\pi(x_i)+(1-y_i)\log\big(1-\pi(x_i)\big)\Big).$$

To maximize $l(\beta)$, we set its first-order partial derivatives to zero:

$$\frac{\partial l(\beta)}{\partial\beta_j}=\sum_{i=1}^{n}\big(y_i-\pi(x_i)\big)x_{ij}=0,$$

for $j=0,1,2,\ldots,p$, where $x_{i0}=1$ corresponds to the intercept.
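These formulas translate directly into R. The sketch below assumes a design matrix x whose first column is all 1s (as built in Part II above) and a 0/1 response y:

#log-likelihood and score vector, written directly from the formulas above
loglik<-function(Beta,x,y){
  eta<-as.vector(x%*%Beta)
  sum(y*eta-log(1+exp(eta)))      #equals sum of y*log(pi)+(1-y)*log(1-pi)
}
score<-function(Beta,x,y){
  pi_x<-1/(1+exp(-as.vector(x%*%Beta)))
  t(x)%*%(y-pi_x)                 #the p+1 partial derivatives
}
#maximizing loglik numerically, e.g. optim(rep(0,ncol(x)),function(b) -loglik(b,x,y),method="BFGS"),
#gives the same estimates as the Newton-Raphson iteration in the next section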

3. Newton-Raphson iteration method

In order to solve this set of $p+1$ nonlinear equations $\partial l(\beta)/\partial\beta_j=0$, $j=0,1,\ldots,p$, we can use the Newton-Raphson iteration method. The matrix of second derivatives, called the Hessian, is

$$\frac{\partial^{2} l(\beta)}{\partial\beta_j\,\partial\beta_{j'}}=-\sum_{i=1}^{n}x_{ij}\,x_{ij'}\,\pi(x_i)\big(1-\pi(x_i)\big).$$

In matrix form, let $X$ denote the $n\times(p+1)$ design matrix (whose first column is all 1s), $y$ the response vector, and $\pi$ the vector of fitted probabilities. Then the score vector and the Hessian matrix are

$$U(\beta)=X^{T}(y-\pi),\qquad \frac{\partial^{2} l(\beta)}{\partial\beta\,\partial\beta^{T}}=-X^{T}WX,$$

where $W$ is an $n$ by $n$ diagonal matrix with elements $\pi(x_i)\big(1-\pi(x_i)\big)$. If we write

$$H=X^{T}WX,$$

we have the Newton-Raphson update formula

$$\beta^{(t+1)}=\beta^{(t)}+H^{-1}U(\beta^{(t)})=\beta^{(t)}+(X^{T}WX)^{-1}X^{T}(y-\pi^{(t)}),$$

which is iterated until the change in $\beta$ is negligible.

In fact, we do not need to compute $H^{-1}$ directly. Since $X^{T}WX$ is a positive definite matrix, the linear system $H\,\delta=U$ can be solved efficiently, for example via the Cholesky decomposition of $X^{T}WX$.

Let $X^{T}WX$ be denoted by $I(\beta)$; this matrix is called the observed information matrix. The variances and covariances of the estimated coefficients can be obtained from its inverse, $I(\hat\beta)^{-1}$. That is to say, the variance of $\hat\beta_j$ is the $j$-th diagonal element of $I(\hat\beta)^{-1}$, and the covariance of $\hat\beta_j$ and $\hat\beta_{j'}$, which can be denoted by $\widehat{\mathrm{Cov}}(\hat\beta_j,\hat\beta_{j'})$, is the corresponding off-diagonal element. So the estimated standard errors of the estimated coefficients are

$$\widehat{\mathrm{SE}}(\hat\beta_j)=\sqrt{\big[I(\hat\beta)^{-1}\big]_{jj}},\qquad j=0,1,2,\ldots,p.$$
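The whole iteration fits in a few lines of R. The sketch below is a compact version of the Newtons function from Part II, again assuming x contains the intercept column of 1s and y is the 0/1 response:

#Newton-Raphson (IRLS) for logistic regression, with standard errors from the
#inverse of the observed information matrix
newton_logistic<-function(x,y,tol=1e-8,it_max=25){
  Beta<-rep(0,ncol(x))
  for(it in 1:it_max){
    eta<-as.vector(x%*%Beta)
    p<-exp(eta)/(1+exp(eta))
    W<-p*(1-p)
    U<-t(x)%*%(y-p)              #score vector X'(y-pi)
    H<-t(x)%*%(x*W)              #information matrix X'WX
    step<-solve(H,U)             #solve the linear system H step = U
    Beta<-Beta+step
    if(sqrt(sum(step^2))<tol) break
  }
  se<-sqrt(diag(solve(H)))       #standard errors from I(beta)^{-1}
  list(Beta=as.vector(Beta),se=se,iterations=it)
}
#fit<-newton_logistic(x,y); cbind(estimate=fit$Beta,std_error=fit$se)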

4. Significance test

A Wald test is used to test the statistical significance of each coefficient in the model. The hypothesis tested is $H_0:\beta_j=0$, and for a single coefficient we can calculate

$$W_j=\left(\frac{\hat\beta_j}{\widehat{\mathrm{SE}}(\hat\beta_j)}\right)^{2},$$

which is distributed as chi-square with one degree of freedom. To test all $p$ slope coefficients at once, we can calculate

$$W=\hat\beta_{*}^{T}\,\big[\widehat{\mathrm{Var}}(\hat\beta_{*})\big]^{-1}\,\hat\beta_{*},$$

where the intercept $\hat\beta_0$ is not included in $\hat\beta_{*}$ and the relevant row and column are not included in the covariance matrix. This value is distributed as chi-square with $p$ degrees of freedom.
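As an illustration, the same per-coefficient Wald chi-square statistics can be obtained from base R's glm; the sketch below re-uses the x and y matrices built in Part II (x[,-1] drops our explicit intercept column, since glm adds its own):

#illustrative only: per-coefficient Wald chi-squares from a glm fit
fit<-glm(as.vector(y)~x[,-1],family=binomial)
est<-summary(fit)$coefficients                 #Estimate, Std. Error, z value, Pr(>|z|)
wald_chisq<-(est[,"Estimate"]/est[,"Std. Error"])^2
wald_pvalue<-pchisq(wald_chisq,df=1,lower.tail=FALSE)
cbind(wald_chisq,wald_pvalue)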

However, there are some problems with the use of the Wald statistic. Large coefficients and inflated standard errors lower the Wald statistic value, making it easy to fail to reject the null hypothesis when it is false (a Type II error). So the likelihood ratio test is more reliable for the small sample sizes often used in practice.

The likelihood-ratio test uses the ratio of the maximized value of the likelihood function for the unrestricted model ($L_1$) over the maximized value of the likelihood function for the restricted model in which the tested coefficients are set to zero ($L_0$). The test statistic

$$G=-2\log\frac{L_0}{L_1}=-2\big(\log L_0-\log L_1\big)$$

yields a chi-squared distribution, with degrees of freedom equal to the number of coefficients set to zero.
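Continuing the glm illustration above, the likelihood-ratio statistic for testing all eight slope coefficients at once is just the difference in deviances between the intercept-only model and the full model:

#illustrative only: likelihood-ratio test of the full model against the null model
fit0<-glm(as.vector(y)~1,family=binomial)        #restricted model, gives L0
fit1<-glm(as.vector(y)~x[,-1],family=binomial)   #unrestricted model, gives L1
G<-fit0$deviance-fit1$deviance                   #-2*(log L0 - log L1)
df<-fit0$df.residual-fit1$df.residual            #number of coefficients set to zero
pchisq(G,df,lower.tail=FALSE)
#equivalently: anova(fit0,fit1,test="Chisq")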

The score test is introduced to test the same kind of hypothesis:

$$H_0:\beta_j=0.$$

The score test statistic is

$$S=U(\tilde\beta)^{T}\,I(\tilde\beta)^{-1}\,U(\tilde\beta),$$

where the score vector $U$ and the information matrix $I$ are evaluated at the restricted estimate $\tilde\beta$ (the fit with $\beta_j=0$). It yields a chi-squared distribution with $df=1$ when a single coefficient is tested. Because it only requires fitting the smaller model, the score test is often used in forward selection to decide which variable to add to the model.
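A minimal sketch of the score test for adding one variable, written directly from the formula above; it mirrors what the Stepwise function in Part II does during forward selection and assumes the x, y and funs objects defined there:

#score test for adding column j of x to a model that currently contains the
#columns in 'current'; Beta_current are the estimates of the current model
score_test_add<-function(x,y,current,j,Beta_current){
  xx<-cbind(x[,current],x[,j])                      #candidate design matrix
  obj<-funs(xx,y,c(as.vector(Beta_current),0))      #score and information evaluated at beta_j=0
  S<-t(obj$U)%*%solve(obj$H,obj$U)
  c(score=S,p_value=pchisq(S,df=1,lower.tail=FALSE))
}
#example: score test for adding PLASMA (column 3) to the intercept-only model
#score_test_add(x,y,current=1,j=3,Beta_current=Newtons(funs,x[,1],y)$Beta)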

5. Variable selection: stepwise

A stepwise procedure for selection or deletion of variables from a model is based on a statistical algorithm that checks for the "importance" of variables. The "importance" of a variable is defined in terms of a measure of the statistical significance of its coefficient. The flow, as implemented in the Stepwise function above, is roughly:

1. Fit the intercept-only model.
2. For each variable not yet in the model, compute a score test; if the smallest p-value is below the entry threshold (0.05 by default), add that variable and refit.
3. For each variable in the model, compute a Wald chi-square test; while the largest p-value exceeds the removal threshold (0.05 by default), remove that variable and refit.
4. Repeat steps 2 and 3 until no variable qualifies for entry or an entry/removal cycle is detected.

Optimizing the Schaffer F6 function using a basic genetic algorithm

The Schaffer F6 function, together with its graph, is shown here:

$$f(x,y)=0.5+\frac{\sin^{2}\!\big(\sqrt{x^{2}+y^{2}}\big)-0.5}{\big(1+0.001\,(x^{2}+y^{2})\big)^{2}}$$

[3D surface plot of the Schaffer F6 function]

If we set y = 0, we get a 2D graph.

[2D plot of the Schaffer F6 function along y = 0]

These figures illustrate the fact that the Schaffer F6 function has many local optima.

The basic genetic algorithm (written for Python 2) is as follows; matplotlib is used to plot the function graphs.

import random
from math import sin, sqrt, pow
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np
from pylab import *

MAXIMIZE, MINIMIZE = 11, 22

def SchafferF6(x, y):
    temp1 = sin(sqrt(x * x + y * y))
    temp2 = 1 + 0.001 * (x * x + y * y)
    return 0.5 + (temp1 * temp1 - 0.5) / (temp2 * temp2)

#
#plot the 3D figure
#
fig = plt.figure()
ax = Axes3D(fig)
X = np.arange(-100, 100, 1)
Y = np.arange(-100, 100, 1)
X, Y = np.meshgrid(X, Y)

R = np.sin(np.sqrt(X * X + Y * Y))
Q = 1 + 0.001 * (X * X + Y * Y)
Z = 0.5 + (R * R - 0.5) / (Q * Q)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.jet)

plt.show()
#end

#
#plot the 2D figure
#

t = arange(-100, 100, 1)
s = 0.5 + (sin(t) * sin(t) - 0.5) / ((1 + 0.001 * (t * t)) ** 2)
plot(t, s, linewidth=1.0)

xlabel('x')
ylabel('y')
title('Schaffer F6 2D figure')
grid(True)
show()

class Individual:
    alleles = (0, 1)
    seperator = ' '
    optimization = MINIMIZE
    length = 20
    chromosome = None
    score = None
    def __init__(self, chromosome=None):
        self.chromosome = chromosome or self._makechromosome()
        self.score = None # set during evaluation

    def _makechromosome(self):
        "makes a chromosome from randomly selected alleles."
        return [random.choice(self.alleles) for gene in range(0, self.length)]

    def evaluate(self):
        gene_x = self.chromosome[0: int(self.length / 2) + 1]
        gene_y = self.chromosome[int(self.length / 2) + 1:]
        x = 0
        y = 0
        for i in range(0, len(gene_x)):
            x += gene_x[i] * pow(2, i - 2)
        for j in range(0, len(gene_y)):
            y += gene_y[j] * pow(2, j - 2)
        self.score = SchafferF6(x, y)   

    def crossover(self, other):
       return self._twopoint(other)

    def mutate(self, gene):
       self.chromosome[gene] = 1 - self.chromosome[gene]

    # sample crossover method
    def _twopoint(self, other):
        "creates offspring via two-point crossover between mates."
        left, right = self._pickpivots()
        def mate(p0, p1):
            chromosome = p0.chromosome[:]
            chromosome[left:right] = p1.chromosome[left:right]
            child = p0.__class__(chromosome)
            return child
        return mate(self, other), mate(other, self)  

    def _pickpivots(self):
        left = random.randrange(1, self.length - 2)
        right = random.randrange(left, self.length - 1)
        return left, right

    def __repr__(self):
        "returns string representation of self"
        return '<%s chromosome="%s" score=%s>' % \
               (self.__class__.__name__,
                self.seperator.join(map(str, self.chromosome)),
                self.score)

    def __cmp__(self, other):
        if self.optimization == MINIMIZE:
            return cmp(self.score, other.score)
        else: # MAXIMIZE
            return cmp(other.score, self.score)

    def copy(self):
        twin = self.__class__(self.chromosome[:])
        twin.score = self.score
        return twin

class Environment(object):
    def __init__(self, individual, population=None, size=100,
                 maxgenerations=100, crossover_rate=0.90,
                 mutation_rate=0.1, optimum=0):
        self.individual = individual
        self.size = size
        self.optimum = optimum
        self.population = population or self._makepopulation()
        for individual in self.population:
            individual.evaluate()
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        self.maxgenerations = maxgenerations
        self.generation = 0

        self.population.sort()
        self.best = self.population[0]
        self.report()

    def _makepopulation(self):
        return [self.individual() for i in range(self.size)]

    def run(self):
        # run until the generation limit is reached or the optimum score is found
        while self.generation < self.maxgenerations and \
              self.best.score > self.optimum:
            next_population = [self.best.copy()]
            while len(next_population) < self.size:
                mate1 = self._select()
                if random.random() < self.crossover_rate:
                    mate2 = self._select()
                    offspring = mate1.crossover(mate2)
                else:
                    offspring = [mate1.copy()]
                for individual in offspring:
                    for gene in range(individual.length):
                        if random.random() < self.mutation_rate:
                            individual.mutate(gene)
                    individual.evaluate()
                    next_population.append(individual)
            self.population = next_population[:self.size]

            self.population.sort()
            self.best = self.population[0]
            self.generation += 1
            self.report()
        self.report()

    def _select(self):
        "override this to use your preferred selection method"
        return self._tournament()

    #
    # sample selection method
    #
    def _tournament(self, size=8, choosebest=0.90):
        competitors = [random.choice(self.population)
                       for i in range(size)]
        competitors.sort()
        if random.random() < choosebest:
            return competitors[0]
        else:
            return random.choice(competitors[1:])

    def report(self):
        print "=" * 70
        print "generation: ", self.generation
        print "best:       ", self.best

if __name__ == "__main__":
    env = Environment(Individual, maxgenerations=1000)
    env.run()