Stepwise logistic regression using R(Ⅱ)

Consider a clinical data set. You can download it from Here. The Diabetes data set consists of nine variables. The variable Class is the response variable with a value of tested_positive and a value of tested_negative. The other eight variables are the explanatory variables.

Stepwise logistic regression
  1. #read into data
  2. diabetes<-read.csv(“d:/diabetes.csv”,head=TRUE)
  3. #get explanatory variables
  4. x<-c(rep(1,768),diabetes[2:769,1],diabetes[2:769,2],diabetes[2:769,3],diabetes[2:769,4],diabetes[2:769,5],diabetes[2:769,6],diabetes[2:769,7],diabetes[2:769,8])
  5. x<-matrix(x,nrow=768)
  6. #get response variable
  7. y<-diabetes[2:769,9]
  8. y<-ifelse(y==“tested_positive”,1,0)
  9. y<-matrix(y,nrow=768)
  10. #set explantory variable’s name
  11. x_name<-c(“INTERCEPT”,“PREGNANT”,“PLASMA”,“PRESSURE”,“TRICEPS”,“INSULIN”,“INDEX”,“PEDIGREE”,“AGE”)
    The logistic regression R code is as follows.
Stepwise logistic regression
  1. pi_fun<-function(x,Beta){
  2. Beta<-matrix(as.vector(Beta),ncol=1)
  3. x<-matrix(as.vector(x),ncol=nrow(Beta))
  4. g_fun<-x%*%Beta
  5. exp(g_fun)/(1+exp(g_fun))
  6. }
  7. funs<-function(x,y,Beta){
  8. Beta<-matrix(c(Beta),ncol=1)
  9. pi_value<- pi_fun(x,Beta)
  10. U<-t(x)%*%(ypi_value);
  11. uni_matrix<-matrix(rep(1,nrow(pi_value)),nrow= nrow(pi_value));
  12. H<-t(x)%*%diag(as.vector(pi_value*( uni_matrix pi_value)))%*%x
  13. list(U=U,H=H)
  14. }
  15. Newtons<-function(fun,x,y,ep=1e8,it_max=100){
  16. Beta=matrix(rep(0,ncol(x)),nrow=ncol(x))
  17. Index<-0;
  18. k<-1
  19. while(k<=it_max){
  20. x1<-Beta;obj<-fun(x,y,Beta);
  21. Beta<-Beta+solve(obj$H,obj$U);
  22. objTemp<-fun(x,y,Beta)
  23. if(any(is.nan(objTemp$H))){
  24. Beta<-x1
  25. print(“Warning:The maximum likelihood estimate does not exist.”)
  26. print(paste(“The LOGISTIC procedure continues in spite of the above warning.”,
  27. “Results shown are based on the last maximum likelihood iteration. Validity of the model fit is questionable.”));
  28. }
  29. norm<-sqrt(t((Betax1))%*%(Betax1))
  30. if(norm<ep){
  31. index<-1;break
  32. }
  33. k<-k+1
  34. }
  35. obj<-fun(x,y,Beta);
  36. list(Beta=Beta,it=k,U=obj$U,H=obj$H)
  37. }
  38. CheckZero<-function(x){
  39. for(i in 1:length(x)){
  40. if(x[i]==0)
  41. x[i]<-1e300
  42. }
  43. x
  44. }
  45. ModelFitStat<-function(x,y,Beta){
  46. Beta<-matrix(as.vector(Beta),ncol=1)
  47. uni_matrix<-matrix(rep(1,nrow(y)),nrow= nrow(y));
  48. LOGL<–2*(t(y)%*%log(pi_fun(x,Beta))+t(uni_matrixy)%*%log(CheckZero(uni_matrixpi_fun(x,Beta))))
  49. #print(“—————–“)
  50. #print(LOGL)
  51. AIC<-LOGL+2*(ncol(x))
  52. SC<-LOGL+(ncol(x))*log(nrow(x))
  53. list(LOGL=LOGL,AIC=AIC,SC=SC)
  54. }
  55. GlobalNullTest<-function(x,y,Beta,BetaIntercept){
  56. Beta<-matrix(as.vector(Beta),ncol=1)
  57. pi_value<- pi_fun(x,Beta)
  58. df<-nrow(Beta)1
  59. ##compute Likelihood ratio
  60. MF<-ModelFitStat(x,y,Beta)
  61. n1<-sum(y[y>0])
  62. n<-nrow(y)
  63. LR<–2*(n1*log(n1)+(nn1)*log(nn1)n*log(n))MF$LOGL
  64. LR_p_value<-pchisq(LR,df,lower.tail=FALSE)
  65. LR_Test<-list(LR=LR,DF=df,LR_p_value=LR_p_value)
  66. ##compute Score
  67. BetaIntercept<-matrix(c(as.vector(BetaIntercept),rep(0,ncol(x)1)),ncol=1)
  68. obj<-funs(x,y,BetaIntercept)
  69. Score<-t(obj$U)%*%solve(obj$H)%*%obj$U
  70. Score_p_value<-pchisq(Score,df,lower.tail=FALSE)
  71. Score_Test<-list(Score=Score,DF=df,Score_p_value=Score_p_value)
  72. ##compute Wald test
  73. obj<-funs(x,y,Beta)
  74. I_Diag<-diag((solve(obj$H)))
  75. Q<-matrix(c(rep(0,nrow(Beta)1),as.vector(diag(rep(1,nrow(Beta)1)))),nrow=nrow(Beta)1)
  76. Wald<-t(Q%*%Beta)%*%solve(Q%*%diag(I_Diag)%*%t(Q))%*%(Q%*%Beta)
  77. Wald_p_value<-pchisq(Wald,df,lower.tail=FALSE)
  78. Wald_Test<-list(Wald=Wald,DF=df,Wald_p_value=Wald_p_value)
  79. list(LR_Test=LR_Test,Score_Test=Score_Test,Wald_Test=Wald_Test)
  80. }
  81. WhichEqual1<-function(x){
  82. a<-NULL
  83. for(i in 1:length(x)){
  84. if(x[i]==1){
  85. a<-c(a,i)
  86. }
  87. }
  88. a
  89. }
  90. CheckOut<-function(source,check){
  91. for(j in 1:length(source)){
  92. for(k in 1:length(check)){
  93. if(source[j]==check[k])
  94. source[j]<-0
  95. }
  96. }
  97. source[source>0]
  98. }
  99. NegativeCheck<-function(x){
  100. for(i in length(x):1){
  101. if(x[i]>0)
  102. }
  103. i
  104. }
  105. CycleCheck<-function(x){
  106. NegativeFlg<-NegativeCheck(x)
  107. if(NegativeFlg==length(x)){
  108. return(FALSE)
  109. }
  110. NegVec<-x[(NegativeFlg+1):length(x)]
  111. PosVec<-x[(2*NegativeFlglength(x)+1):NegativeFlg]
  112. NegVec<-sort(1*NegVec)
  113. PosVec<-sort(PosVec)
  114. if(all((NegVecPosVec)==0))
  115. return(TRUE)
  116. return(FALSE)
  117. }
  118. ##stepwise
  119. Stepwise<-function(x,y,checkin_pvalue=0.05,checkout_pvalue=0.05){
  120. ##as matrix
  121. ##indication of variable
  122. indict<-rep(0,ncol(x)) ##which column enter
  123. ##intercept entered
  124. indict[1]<-1
  125. Beta<-NULL
  126. print(“Intercept Entered…”)
  127. Result<-Newtons(funs,x[,1],y)
  128. Beta<-Result$Beta
  129. BetaIntercept<-Result$Beta
  130. uni_matrix<-matrix(rep(1,nrow(y)),nrow= nrow(y));
  131. LOGL<–2*(t(y)%*%log(pi_fun(x[,1],Beta))+t(uni_matrixy)%*%log(uni_matrixpi_fun(x[,1],Beta)))
  132. print(paste(“-2Log=”,LOGL))
  133. indexVector<-WhichEqual1(indict)
  134. ##check other variable
  135. VariableFlg<-NULL
  136. Terminate<-FALSE
  137. if(Terminate==TRUE){
  138. print(“Model building terminates because the last effect entered is removed by the Wald statistic criterion. “)
  139. }
  140. pvalue<-rep(1,ncol(x))
  141. k<-2:length(indict)
  142. k<-CheckOut(k,indexVector)
  143. for(i in k){
  144. obj<-funs(c(x[,indexVector],x[,i]),y,c(as.vector(Beta),0))
  145. Score<-t(obj$U)%*%solve(obj$H,obj$U)
  146. Score_pvalue<-pchisq(Score,1,lower.tail=FALSE)
  147. pvalue[i]<-Score_pvalue
  148. }
  149. #print(“Score pvalue for variable enter”)
  150. #print(pvalue)
  151. pvalue_min<-min(pvalue)
  152. if(pvalue_min<checkin_pvalue){
  153. j<-which.min(pvalue)
  154. print(paste(x_name[j],” entered:”))
  155. ##set indication of variable
  156. indict[j]<-1
  157. VariableFlg<-c(VariableFlg,j)
  158. indexVector<-WhichEqual1(indict)
  159. print(“indexVector–test”)
  160. print(indexVector)
  161. Result<-Newtons(funs,x[,indexVector],y)
  162. Beta<-NULL
  163. Beta<-Result$Beta
  164. ##compute model fit statistics
  165. print(“Model Fit Statistics”)
  166. MFStat<-ModelFitStat(x[,indexVector],y,Beta)
  167. print(MFStat)
  168. ##test globel null hypothesis:Beta=0
  169. print(“Testing Global Null Hypothese:Beta=0”)
  170. GNTest<-GlobalNullTest(x[,indexVector],y,Beta,BetaIntercept)
  171. print(GNTest)
  172. ##compute Wald test in order to remove variable
  173. indexVector<-WhichEqual1(indict)
  174. obj<-funs(x[,indexVector],y,Beta)
  175. H_Diag<-sqrt(diag(solve(obj$H)))
  176. WaldChisq<-(as.vector(Beta)/H_Diag)^2
  177. WaldChisqPvalue<-pchisq(WaldChisq,1,lower.tail=FALSE)
  178. WaldChisqTest<-list(WaldChisq=WaldChisq,WaldChisqPvalue=WaldChisqPvalue)
  179. print(“Wald chisq pvalue for variable remove”)
  180. print(WaldChisqPvalue)
  181. ##check wald to decide to which variable to be removed
  182. pvalue_max<-max(WaldChisqPvalue[2:length(WaldChisqPvalue)])##not check for intercept
  183. if(pvalue_max>checkout_pvalue){
  184. n<-which.max(WaldChisqPvalue[2:length(WaldChisqPvalue)])##not check for intercept
  185. print(paste(x_name[indexVector[n+1]],” is removed:”))
  186. ##set indication of variable
  187. indict[indexVector[n+1]]<-0
  188. m<- indexVector[n+1]
  189. VariableFlg<-c(VariableFlg,m)
  190. ##renew Beta
  191. indexVector<-WhichEqual1(indict)
  192. Result<-Newtons(funs,x[,indexVector],y)
  193. Beta<-NULL
  194. Beta<-Result$Beta
  195. if(CycleCheck(VariableFlg)==TRUE){
  196. Terminate<-TRUE
  197. }
  198. }
  199. else {
  200. print(paste(“No (additional) effects met the” ,checkout_pvalue,“significance level for removal from the model.”))
  201. }
  202. }##repeat end
  203. }
  204. else {
  205. print(paste(“No effects met the” ,checkin_pvalue,” significance level for entry into the model”))
  206. }
  207. }##repeat end
  208. ##
  209. print(“Beta”)
  210. print(Beta)
  211. print(“Analysis of Maximum Likelihood Estimates”)
  212. obj<-funs(x[,indexVector],y,Beta)
  213. H_Diag<-sqrt(diag(solve(obj$H)))
  214. WaldChisq<-(as.vector(Beta)/H_Diag)^2
  215. WaldChisqPvalue<-pchisq(WaldChisq,1,lower.tail=FALSE)
  216. WaldChisqTest<-list(WaldChisq=WaldChisq,WaldChisqPvalue=WaldChisqPvalue)
  217. print(WaldChisqTest)
  218. }

Finally, invoke the Stepwise function.

Stepwise logistic regression
  1. Stepwise(x,y)
Advertisements

Stepwise Logistic Regression Using R(Ⅰ)

1. Logistic regression model

We assume that we have p explanatory variables. The logistic regression model is

,

where is the probability of and . The right-hand side is usually referred to as the logistic function, which can be pictured as follows.

We can also define

,

which means the probability of ,in other words, the probability of an event that not occurs.

The odds of an event that occurs with probability p is defined as

.

Using log function on both sides, we can get a linear function

.

2. Maximum likelihood estimation

The next goal is to estimate the p+1 parameters . It can be achieved by maximum likelihood estimation. Since the probability of an event that occurs is , the probability of that occurs can be written as . Likewise, the probability of that occurs can be written as . Thus, we can get a likelihood function

Replacing with , it can be rewritten as

In order to maximize the likelihood function, we can use log function on both sides of likelihood function.

To maximize , we set the first order partial derivatives of to zero.



For j=1,2,… p.

3. Newton-Raphson iteration method

In order to solve the set of p+1 nonlinear equations ,for j=1,2,…p. we can use the Newton-Raphson iteration method . The matrix of second derivatives, called the Hessian, is

In matrix form, let denotes Hessian matrix, and


Where is an n by n diagonal matrix with element . We can get

If

We have the Newton-Raphson update formula

In fact, we don’t need to compute directly. Since is a positive definite matrices, we can get it by Cholesky decomposition on .

Let is denoted as ,this matrix is called the observed information matrix. The variances and covariances of the estimated coefficients can be obtained from the inverse of , which represent as .that is to say, the variance of is the diagonal element of matrix , and the covariance of and , which can be denoted by , is the off-diagonal element. So, we can the estimated standard errors of the estimated coefficients as

, for j=0,1,2,…,p

4. Significance test

A Wald test is used to test the statistical significance of each coefficient in the model. The hypothesis of test is =0. We can calculate

Where is not included in and the relevant row and column is not included in . This value is distributed as chi-square with degrees of freedom.

However, there are some problems with the use of the Wald statistic. Large coefficients and inflated standard errors will lower the Wald statistic value, making it easy to fail to reject the null hypothesis (Type Ⅱ error). So the likelihood ratio test is more reliable for the small sample sizes often used in practice.

The likelihood-ratio test uses the ratio of the maximized value of the likelihood function for unrestricted (L1) over the maximized value of the likelihood function when (L0). The test statistic,


yields a chi-squared distribution.

Score test is be introduced to test the hypothesis:

:

The score test statistic is

which yields a chi-squared distribution with df=1. The score test is often used in forward selection to add a variable to the model.

5. Variable selection: stepwise

A stepwise procedure for selection or deletion of variables from a model is based on a statistical algorithm that checks for the “importance” of variables. The “importance” of a variable is defined in terms of a measure of the statistical significance of the coefficient for the variable. Here is the flow:

Optimization Schaffer F6 function using basic genetic algorithm

The Schaffer F6 function, as well as its graph, is pictured here.

clip_image002_thumb

schafferf6_thumb If you set y=0, you can get a 2D graph.

schafferf62d_thumb 

These figures illustrate the fact that the Schaffer F6 function has many local optimizations.

The basic genetic algorithm is as follows.The matplotlib is included for plot the function graph.

import random
from math import sin, sqrt, pow
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import matplotlib.pyplot as plt
import numpy as np
from pylab import *

MAXIMIZE, MINIMIZE = 11, 22

def SchafferF6(x, y):
    temp1 = sin(sqrt(x * x + y * y))
    temp2 = 1 + 0.001 * (x * x + y * y)
    return 0.5 + (temp1 * temp1 - 0.5) / (temp2 * temp2)

#
#plot the 3D figure
#
fig = plt.figure()
ax = Axes3D(fig)
X = np.arange(-100, 100, 1)
Y = np.arange(-100, 100, 1)
X, Y = np.meshgrid(X, Y)

R = np.sin(np.sqrt(X * X + Y * Y))
Q = 1 + 0.001 * (X * X + Y * Y)
Z = 0.5 + (R * R - 0.5) / (Q * Q)
ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.jet)

plt.show()
#end

#
#plot the 2D figure
#

t = arange(-100, 100, 1)
s = 0.5 + (sin(t) * sin(t) - 0.5) / (1 + 0.001 * (t * t))
plot(t, s, linewidth=1.0)

xlabel('x')
ylabel('y')
title('Schaffer F6 2D figure')
grid(True)
show()

class Individual:
    alleles = (0, 1)
    seperator = ' '
    optimization = MINIMIZE
    length = 20
    chromosome = None
    score = None
    def __init__(self, chromosome=None):
        self.chromosome = chromosome or self._makechromosome()
        self.score = None # set during evaluation

    def _makechromosome(self):
        "makes a chromosome from randomly selected alleles."
        return [random.choice(self.alleles) for gene \

                 in range(0, self.length)]

    def evaluate(self):
        gene_x = self.chromosome[0: int(self.length / 2) + 1]
        gene_y = self.chromosome[int(self.length / 2) + 1:]
        x = 0
        y = 0
        for i in range(0, len(gene_x)):
            x += gene_x[i] * pow(2, i - 2)
        for j in range(0, len(gene_y)):
            y += gene_y[j] * pow(2, j - 2)
        self.score = SchafferF6(x, y)   

    def crossover(self, other):
       return self._twopoint(other)

    def mutate(self, gene):
       self.chromosome[gene] = 1 - self.chromosome[gene]

    # sample crossover method
    def _twopoint(self, other):
        "creates offspring via two-point crossover between mates."
        left, right = self._pickpivots()
        def mate(p0, p1):
            chromosome = p0.chromosome[:]
            chromosome[left:right] = p1.chromosome[left:right]
            child = p0.__class__(chromosome)
            return child
        return mate(self, other), mate(other, self)  

    def _pickpivots(self):
        left = random.randrange(1, self.length - 2)
        right = random.randrange(left, self.length - 1)
        return left, right

    def __repr__(self):
        "returns string representation of self"
        return '<%s chromosome="%s" score=%s>' % \
               (self.__class__.__name__,
                self.seperator.join(map(str, self.chromosome)),\

                self.score)

    def __cmp__(self, other):
        if self.optimization == MINIMIZE:
            return cmp(self.score, other.score)
        else: # MAXIMIZE
            return cmp(other.score, self.score)

    def copy(self):
        twin = self.__class__(self.chromosome[:])
        twin.score = self.score
        return twin

class Environment(object):
    def __init__(self, individual, population=None, size=100,\

                 maxgenerations=100,crossover_rate=0.90,\

                 mutation_rate=0.1, optimum=0):
        self.individual = individual
        self.size = size
        self.optimum = optimum
        self.population = population or self._makepopulation()
        for individual in self.population:
            individual.evaluate()
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        self.maxgenerations = maxgenerations
        self.generation = 0

        self.population.sort()
        self.best = self.population[0]
        self.report()

    def _makepopulation(self):
        return [self.individual() for i in range(self.size)]

    def run(self):
        while self.generation < self.maxgenerations or \
              self.best.score <= self.optimum * 1.0:
            next_population = [self.best.copy()]
            while len(next_population) < self.size:
                mate1 = self._select()
                if random.random() < self.crossover_rate:
                    mate2 = self._select()
                    offspring = mate1.crossover(mate2)
                else:
                    offspring = [mate1.copy()]
                for individual in offspring:
                    for gene in range(individual.length):
                        if random.random() < self.mutation_rate:
                            individual.mutate(gene)
                    individual.evaluate()
                    next_population.append(individual)
            self.population = next_population[:self.size]

            self.population.sort()
            self.best = self.population[0]
            self.generation += 1
            self.report()
        self.report()

    def _select(self):
        "override this to use your preferred selection method"
        return self._tournament()

    #
    # sample selection method
    #
    def _tournament(self, size=8, choosebest=0.90):
        competitors = [random.choice(self.population) \

                       for i in range(size)]
        competitors.sort()
        if random.random() < choosebest:
            return competitors[0]
        else:
            return random.choice(competitors[1:])

    def report(self):
        print "=" * 70
        print "generation: ", self.generation
        print "best:       ", self.best

if __name__ == "__main__":
    env = Environment(Individual, maxgenerations=1000)
    env.run()

Plot correlation matrices in SAS

I have developed a simple SAS program to show representations of correlation matrices. A paper which introduces a set of techniques that depicts the value of a correlation can be found in Corrgrams (Corrgrams: Exploratory displays for correlation matrices,Friendly,2002).

The procedure can be divided into two steps. At first, I use the gplot procedure to show the center of circle. The next step I use the annotate system to plot the circles and sectors.

The result is like this.Color encodes the sign of correlation value.Red color is represented for positive value, and blue color is represented for negative value.

gplot

SAS program is as follows.

%macro     plotgraph(corrdata);
%let radius=8;

data corrmatrix(drop= _TYPE_ _NAME_);
    set &corrdata(firstobs=4);
run;

proc sql noprint;
select count(name) into :nvars
    from dictionary.columns
    where libname=upcase("work") and memname=upcase("corrmatrix");
select name into :name1-:name%left(&nvars)
       from dictionary.columns
    where libname=upcase("work") and memname=upcase("corrmatrix");
Quit;

data corrdata(keep=corr);
    set corrmatrix;
    %do i=1 %to &nvars;
        corr=&&name&i;
        output;
    %end;
run;

%annomac;
data annodata;
    set corrdata;
    retain xsys ysys ‘2’ hsys ‘3’;
    center_x=mod(_N_-1,&nvars);
        center_y=&nvars-ceil(_N_/&nvars);
        if _N_<=&nvars then do;
            function=’label’;
            color=’black’;
            x=center_x;y=center_y+0.8;
            style=’SIMPLEX’;
            size=3;
            rotate=0;
            text=symget(‘name’||left(_N_));
            output;
            function=’label’;
            color=’black’;
            x=-0.8;y=&nvars-_N_;
            style=’SIMPLEX’;
            size=3;
            rotate=0;
            text=symget(‘name’||left(_N_));
            output;
        end;       
        %circle(center_x,center_y,&radius,black);       
        if corr >0 then do;
            x=center_x;
            y=center_y;
            angle=0;
            rotate=corr*360;   
            size=&radius;
            color=’red’;
            style=’p3n45′;
            line=3;
            output;   
        end;
        else do;
            x=center_x;
            y=center_y;
            angle=0;
            rotate=abs(corr)*360;   
            size=&radius;
            color=’blue’;
            style=’p3n45′;
            line=3;   
            output;
        end;
run;

/*remove duplicates*/
proc sql noprint;
create table plotdata as
    select distinct center_x,center_y
    from annodata;
quit;

goption reset=all;
goption device=gif hsize=400 pt vsize=400 pt;
symbol1 v=point;
AXIS1  ORDER =(-1 TO &nvars BY 1) origin=(10 pct,10 pct)
        COLOR=white;/*disappear the axis*/
proc gplot data=plotdata;
plot center_y*center_x/ haxis=axis1 vaxis=axis1
                        nolegend
                        annotate=annodata;
run;
quit;
%mend;
%macro corrgraph(libname,dataset);
proc sql noprint;
    select count(name) into :nvars
    from dictionary.columns
    where libname=upcase("&libname") and memname=upcase("&dataset");
       select name into :varname separated by ‘ ‘
    from dictionary.columns
    where libname=upcase("&libname") and memname=upcase("&dataset");   
quit;
proc corr data=&dataset pearson spearman hoeffding
                           outp=Pearson_data outs=Spearman_data
                        outh=Hoeffding_data;
var &varname;
run;
/*can add other correlation graph*/
%plotgraph(Pearson_data);
/*%plotgraph(Spearman_data);
%plotgraph(Hoeffding_data);*/
%mend;

The example data is listed here.

data fitness;
   input Age Weight Runtime Oxygen @@;
   datalines;
57 73.37 12.63 39.407   54 79.38 11.17 46.080
52 76.32 9.63  45.441   50 70.87 8.92    .
51 67.25 11.08 45.118   54 91.63 12.88 39.203
51 73.71 10.47 45.790   57 59.08 9.93  50.545
49 76.32  .    48.673   48 61.24 11.5  47.920
52 82.78 10.5  47.467   44 73.03 10.13 50.541
45 87.66 14.03 37.388   45 66.45 11.12 44.754
47 79.15 10.6  47.273   54 83.12 10.33 51.855
49 81.42 8.95  40.836   51 77.91 10.00 46.672
48 91.63 10.25 46.774   49 73.37 10.08 50.388
44 89.47 11.37 44.609   40 75.07 10.07 45.313
44 85.84 8.65  54.297   42 68.15 8.17  59.571
38 89.02 9.22  49.874   47 77.45 11.63 44.811
40 75.98 11.95 45.681   43 81.19 10.85 49.091
44 81.42 13.08 39.442   38 81.87 8.63  60.055
;
run;

Finally,invoke the macro.

%corrgraph(work,fitness);

A Simple Genetic Programming in Python

You can download a free guide to genetic programming here.http://www.gp-field-guide.org.uk/

This example aims to reconstruct a simple mathematic function,which can been refined in def examplefun(x, y). I have given x * x + x + 2 * y + 1 to it.

Nodes in a tree can be classified to three kinds:function,variable and constant, so I have built three wrappers:funwrapper,variable and constant.

The fitness function adds up the difference between the result of the tree and the correct result of the target function. Roulette wheel selection is used as the selection method. A final result can be shown like this.

tree

The code is as follows.

'''
Created on 2009-10-30

@author: Administrator
'''
from random import random, randint, choice
from copy import deepcopy
from PIL import Image, ImageDraw

class funwrapper:
  def __init__(self, function, childcount, name):
    self.function = function
    self.childcount = childcount
    self.name = name

class variable:
  def __init__(self, var, value=0):
    self.var = var
    self.value = value
    self.name = str(var)
    self.type = "variable"  

  def evaluate(self):
    return self.varvalue

  def setvar(self, value):
    self.value = value

  def display(self, indent=0):
    print '%s%s' % (' '*indent, self.var)

class const:
  def __init__(self, value):
    self.value = value
    self.name = str(value)
    self.type = "constant"   

  def evaluate(self):
    return self.value

  def display(self, indent=0):
    print '%s%d' % (' '*indent, self.value) 

class node:
  def __init__(self, type, children, funwrap, var=None, const=None):
    self.type = type
    self.children = children
    self.funwrap = funwrap
    self.variable = var
    self.const = const
    self.depth = self.refreshdepth()
    self.value = 0
    self.fitness = 0

  def eval(self):
    if self.type == "variable":
      return self.variable.value
    elif self.type == "constant":
      return self.const.value
    else:
      for c in self.children:
        result = [c.eval() for c in self.children]
      return self.funwrap.function(result)  

  def getfitness(self, checkdata):#checkdata like {"x":1,"result":3"}
    diff = 0
    #set variable value
    for data in checkdata:
      self.setvariablevalue(data)
      diff += abs(self.eval() - data["result"])
    self.fitness = diff      

  def setvariablevalue(self, value):
    if self.type == "variable":
      if value.has_key(self.variable.var):
        self.variable.setvar(value[self.variable.var])
      else:
        print "There is no value for variable:", self.variable.var
        return
    if self.type == "constant":
      pass
    if self.children:#function node
      for child in self.children:
        child.setvariablevalue(value)            

  def refreshdepth(self):
    if self.type == "constant" or self.type == "variable":
      return 0
    else:
      depth = []
      for c in self.children:
        depth.append(c.refreshdepth())
      return max(depth) + 1

  def __cmp__(self, other):
        return cmp(self.fitness, other.fitness)  

  def display(self, indent=0):
    if self.type == "function":
      print ('  '*indent) + self.funwrap.name
    elif self.type == "variable":
      print ('  '*indent) + self.variable.name
    elif self.type == "constant":
      print ('  '*indent) + self.const.name
    if self.children:
      for c in self.children:
        c.display(indent + 1)
  ##for draw node
  def getwidth(self):
    if self.type == "variable" or self.type == "constant":
      return 1
    else:
      result = 0
      for i in range(0, len(self.children)):
        result += self.children[i].getwidth()
      return result
  def drawnode(self, draw, x, y):
    if self.type == "function":
      allwidth = 0
      for c in self.children:
        allwidth += c.getwidth()*100
      left = x - allwidth / 2
      #draw the function name
      draw.text((x - 10, y - 10), self.funwrap.name, (0, 0, 0))
      #draw the children
      for c in self.children:
        wide = c.getwidth()*100
        draw.line((x, y, left + wide / 2, y + 100), fill=(255, 0, 0))
        c.drawnode(draw, left + wide / 2, y + 100)
        left = left + wide
    elif self.type == "variable":
      draw.text((x - 5 , y), self.variable.name, (0, 0, 0))
    elif self.type == "constant":
      draw.text((x - 5 , y), self.const.name, (0, 0, 0))

  def drawtree(self, jpeg="tree.png"):
    w = self.getwidth()*100
    h = self.depth * 100 + 120

    img = Image.new('RGB', (w, h), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    self.drawnode(draw, w / 2, 20)
    img.save(jpeg, 'PNG')

class enviroment:
  def __init__(self, funwraplist, variablelist, constantlist, checkdata, \
               minimaxtype="min", population=None, size=10, maxdepth=10, \
               maxgen=100, crossrate=0.9, mutationrate=0.1, newbirthrate=0.6):
    self.funwraplist = funwraplist
    self.variablelist = variablelist
    self.constantlist = constantlist
    self.checkdata = checkdata
    self.minimaxtype = minimaxtype
    self.maxdepth = maxdepth
    self.population = population or self._makepopulation(size)
    self.size = size
    self.maxgen = maxgen
    self.crossrate = crossrate
    self.mutationrate = mutationrate
    self.newbirthrate = newbirthrate

    self.besttree = self.population[0]
    for i in range(0, self.size):
      self.population[i].depth=self.population[i].refreshdepth()
      self.population[i].getfitness(checkdata)
      if self.minimaxtype == "min":
        if self.population[i].fitness < self.besttree.fitness:
          self.besttree = self.population[i]
      elif self.minimaxtype == "max":
        if self.population[i].fitness > self.besttree.fitness:
          self.besttree = self.population[i]    

  def _makepopulation(self, popsize):
    return [self._maketree(0) for i in range(0, popsize)]     

  def _maketree(self, startdepth):
    if startdepth == 0:
      #make a new tree
      nodepattern = 0#function
    elif startdepth == self.maxdepth:
      nodepattern = 1#variable or constant
    else:
      nodepattern = randint(0, 1)
    if nodepattern == 0:
      childlist = []
      selectedfun = randint(0, len(self.funwraplist) - 1)
      for i in range(0, self.funwraplist[selectedfun].childcount):
        child = self._maketree(startdepth + 1)
        childlist.append(child)
      return node("function", childlist, self.funwraplist[selectedfun])
    else:
      if randint(0, 1) == 0:#variable
        selectedvariable = randint(0, len(self.variablelist) - 1)
        return node("variable", None, None, \
               variable(self.variablelist[selectedvariable]), None)
      else:
        selectedconstant = randint(0, len(self.constantlist) - 1)
        return node("constant", None, None, None,\
               const(self.constantlist[selectedconstant]))

  def mutate(self, tree, probchange=0.1, startdepth=0):
    if random() < probchange:
      return self._maketree(startdepth)
    else:
      result = deepcopy(tree)
      if result.type == "function":
        result.children = [self.mutate(c, probchange, startdepth + 1) \
                           for c in tree.children]
    return result

  def crossover(self, tree1, tree2, probswap=0.8, top=1):
    if random() < probswap and not top:
      return deepcopy(tree2)
    else:
      result = deepcopy(tree1)
      if tree1.type == "function" and tree2.type == "function":
        result.children = [self.crossover(c, choice(tree2.children), \
                           probswap, 0) for c in tree1.children]
    return result

  def envolve(self, maxgen=100, crossrate=0.9, mutationrate=0.1):
    for i in range(0, maxgen):
      print "generation no.", i
      child = []
      for j in range(0, int(self.size * self.newbirthrate / 2)):
        parent1, p1 = self.roulettewheelsel()
        parent2, p2 = self.roulettewheelsel()
        newchild = self.crossover(parent1, parent2)
        child.append(newchild)#generate new tree
        parent, p3 = self.roulettewheelsel()
        newchild = self.mutate(parent, mutationrate)
        child.append(newchild)
      #refresh all tree's fitness
      for j in range(0, int(self.size * self.newbirthrate)):
        replacedtree, replacedindex = self.roulettewheelsel(reverse=True)
        #replace bad tree with child
        self.population[replacedindex] = child[j]

      for k in range(0, self.size):
        self.population[k].getfitness(self.checkdata)
        self.population[k].depth=self.population[k].refreshdepth()
        if self.minimaxtype == "min":
          if self.population[k].fitness < self.besttree.fitness:
            self.besttree = self.population[k]
        elif self.minimaxtype == "max":
          if self.population[k].fitness > self.besttree.fitness:
            self.besttree = self.population[k]
      print "best tree's fitbess..",self.besttree.fitness
    self.besttree.display()
    self.besttree.drawtree()  

  def gettoptree(self, choosebest=0.9, reverse=False):
    if self.minimaxtype == "min":
      self.population.sort()
    elif self.minimaxtype == "max":
      self.population.sort(reverse=True)  

    if reverse == False:
      if random() < choosebest:
        i = randint(0, self.size * self.newbirthrate)
        return self.population[i], i
      else:
        i = randint(self.size * self.newbirthrate, self.size - 1)
        return self.population[i], i
    else:
      if random() < choosebest:
        i = self.size - randint(0, self.size * self.newbirthrate) - 1
        return self.population[i], i
      else:
        i = self.size - randint(self.size * self.newbirthrate,\
            self.size - 1)
        return self.population[i], i

  def roulettewheelsel(self, reverse=False):
    if reverse == False:
      allfitness = 0
      for i in range(0, self.size):
        allfitness += self.population[i].fitness
      randomnum = random()*(self.size - 1)
      check = 0
      for i in range(0, self.size):
        check += (1.0 - self.population[i].fitness / allfitness)
        if check >= randomnum:
          return self.population[i], i
    if reverse == True:
      allfitness = 0
      for i in range(0, self.size):
        allfitness += self.population[i].fitness
      randomnum = random()
      check = 0
      for i in range(0, self.size):
        check += self.population[i].fitness * 1.0 / allfitness
        if check >= randomnum:
          return self.population[i], i

  def listpopulation(self):
    for i in range(0, self.size):
      self.population[i].display()   

#############################################################

def add(ValuesList):
    """GP primitive: sum of all values (0 for an empty list)."""
    return sum(ValuesList)

def sub(ValuesList):
    """GP primitive: first value minus second value."""
    minuend = ValuesList[0]
    subtrahend = ValuesList[1]
    return minuend - subtrahend

def mul(ValuesList):
    """GP primitive: product of the first two values."""
    factor_a = ValuesList[0]
    factor_b = ValuesList[1]
    return factor_a * factor_b

def div(ValuesList):
    """GP primitive: protected division — returns 1 on a zero divisor."""
    numerator, denominator = ValuesList[0], ValuesList[1]
    return 1 if denominator == 0 else numerator / denominator

# Wrap the arithmetic primitives as named 2-argument GP operators.
# funwrapper(fn, arity, name) is defined earlier in this file.
addwrapper = funwrapper(add, 2, "Add")
subwrapper = funwrapper(sub, 2, "Sub")
mulwrapper = funwrapper(mul, 2, "Mul")
divwrapper = funwrapper(div, 2, "Div")

def examplefun(x, y):
  """Target function the GP should rediscover: x^2 + x + 2y + 1."""
  return x * x + x + 2 * y + 1

def constructcheckdata(count=10):
  """Build `count` training samples of examplefun.

  Each sample is a dict with keys 'x', 'y' (random ints in [0, 10])
  and 'result' (examplefun applied to them).
  """
  checkdata = []
  for _ in range(count):
    sample_x = randint(0, 10)
    sample_y = randint(0, 10)
    checkdata.append({
      'x': sample_x,
      'y': sample_y,
      'result': examplefun(sample_x, sample_y),
    })
  return checkdata

if __name__ == "__main__":
  # Demo: generate random (x, y, result) samples from examplefun, then
  # evolve programs over add/sub/mul with constants -3..3 to fit them.
  # NOTE(review): Python 2 print statement; `enviroment`/`envolve` are
  # the (misspelled) project class and method defined earlier.
  checkdata = constructcheckdata()
  print checkdata
  env = enviroment([addwrapper, subwrapper, mulwrapper], ["x", "y"],
                  [-3, -2, -1, 1, 2, 3], checkdata)
  env.envolve()

Solving the TSP (Traveling Salesman Problem) with a Genetic Algorithm

A random initial route looks like this:

TSPstart

After iteration, the result looks like this.

TSPresult

The method for generating routes is taken from here:

http://www.psychicorigami.com/2007/04/17/tackling-the-travelling-salesman-problem-part-one/

'''
Created on 2009-10-29
@author: Administrator
'''
import sys, random
from math import sqrt
from pickle import *

# Pillow/PIL is optional: plotting is skipped when it is unavailable.
PIL_SUPPORT = None
try:
   from PIL import Image, ImageDraw, ImageFont
   PIL_SUPPORT = True
# Fix: catch only ImportError — the bare `except:` also swallowed
# unrelated errors (even KeyboardInterrupt) raised during import.
except ImportError:
   PIL_SUPPORT = False 

def cartesian_matrix(coords):
   """Build a pairwise Euclidean distance table.

   coords: list of (x, y) positions.
   Returns a dict mapping every index pair (i, j) to the distance
   between city i and city j.
   """
   indexed = list(enumerate(coords))
   return {
      (i, j): sqrt((xa - xb) * (xa - xb) + (ya - yb) * (ya - yb))
      for i, (xa, ya) in indexed
      for j, (xb, yb) in indexed
   }

def tour_length(matrix, tour):
   """Total length of a closed tour.

   Sums matrix[a, b] over consecutive cities in `tour`, including the
   closing leg from the last city back to the first.
   """
   n = len(tour)
   return sum(matrix[tour[k], tour[(k + 1) % n]] for k in range(n))

def write_tour_to_img(coords, tour, img_file):
   """Draw the tour over the city positions and save it as a PNG.

   coords   -- list of (x, y) city positions
   tour     -- list of city indices in visiting order
   img_file -- output file path
   Requires PIL/Pillow (callers check PIL_SUPPORT first).
   NOTE(review): Python 2 code (print statement at the end).
   """
   padding = 20
   # Shift every city away from the image border.
   coords = [(x + padding, y + padding) for (x, y) in coords]
   # Canvas size = bounding box of all cities plus padding.
   maxx, maxy = 0, 0
   for x, y in coords:
      maxx = max(x, maxx)
      maxy = max(y, maxy)
   maxx += padding
   maxy += padding
   # White RGB canvas.
   img = Image.new("RGB", (int(maxx), int(maxy)),\
         color=(255, 255, 255))
   font = ImageFont.load_default()
   d = ImageDraw.Draw(img);
   num_cities = len(tour)
   # One black line per tour leg; (i + 1) % n closes the loop back to
   # the starting city.
   for i in range(num_cities):
      j = (i + 1) % num_cities
      city_i = tour[i]
      city_j = tour[j]
      x1, y1 = coords[city_i]
      x2, y2 = coords[city_j]
      d.line((int(x1), int(y1), int(x2), int(y2)), fill=(0, 0, 0))
      # Label each city with its position in the tour (not its index).
      d.text((int(x1) + 7, int(y1) - 5), str(i), \
        font=font, fill=(32, 32, 32)) 

   # Grey circle marker on every city.
   for x, y in coords:
      x, y = int(x), int(y)
      d.ellipse((x - 5, y - 5, x + 5, y + 5), outline=(0, 0, 0),\
                fill=(196, 196, 196))
   del d
   img.save(img_file, "PNG")
   print "The plot was saved into the %s file." % (img_file,)  

# Module-level state shared with eval_func: the pairwise distance
# matrix and the city coordinates, both populated by main_run().
cm = []
coords = [] 

def eval_func(chromosome):
   """Fitness of a chromosome: total length of the tour it encodes."""
   # `cm` is the module-level distance matrix set up by main_run();
   # reading a global needs no `global` declaration.
   distance_matrix = cm
   return tour_length(distance_matrix, chromosome)

def cities_random(cities, xmax=800, ymax=600):
   """Return `cities` random city positions as (x, y) float pairs.

   x is drawn uniformly from [0, xmax], y from [0, ymax].
   """
   coords = []
   # Fix: range instead of xrange — xrange was removed in Python 3 and
   # range iterates identically here.
   for _ in range(cities):
      x = random.randint(0, xmax)
      y = random.randint(0, ymax)
      coords.append((float(x), float(y)))
   return coords

#Individuals
class Individual:
    """One candidate TSP tour.

    `chromosome` is a permutation of city indices; `score` is the tour
    length assigned by evaluate() (lower is better).
    """
    # Class-level defaults, shadowed by instance attributes in __init__.
    score = 0
    length = 30
    seperator = ' '

    def __init__(self, chromosome=None, length=30):
        # Fix: set self.length BEFORE building a chromosome — the old
        # order made _makechromosome() read the stale class default (30)
        # and silently ignore a caller-supplied length.
        self.length = length
        self.chromosome = chromosome or self._makechromosome()
        self.score = 0  # set during evaluation

    def _makechromosome(self):
        "makes a chromosome from randomly selected alleles."
        # Draw without replacement => a random permutation of
        # 0..self.length-1.  (range, not xrange: Python 3 compatible.)
        chromosome = []
        lst = [i for i in range(self.length)]
        for i in range(self.length):
            choice = random.choice(lst)
            lst.remove(choice)
            chromosome.append(choice)
        return chromosome

    def evaluate(self, optimum=None):
        # Score = tour length from the module-level distance matrix.
        self.score = eval_func(self.chromosome)

    def crossover(self, other):
        """Ordered two-point crossover.

        Returns two new Individuals; each child keeps the [left, right]
        segment of one parent and fills the rest with the other
        parent's remaining cities, so both stay valid permutations.
        """
        left, right = self._pickpivots()
        p1 = Individual()
        p2 = Individual()
        c1 = [ c for c in self.chromosome \
               if c not in other.chromosome[left:right + 1]]
        p1.chromosome = c1[:left] + other.chromosome[left:right + 1]\
                         + c1[left:]
        c2 = [ c for c in other.chromosome \
               if c not in self.chromosome[left:right + 1]]
        p2.chromosome = c2[:left] + self.chromosome[left:right + 1] \
                        + c2[left:]
        return p1, p2

    def mutate(self):
        "swap two element"
        left, right = self._pickpivots()
        temp = self.chromosome[left]
        self.chromosome[left] = self.chromosome[right]
        self.chromosome[right] = temp   

    def _pickpivots(self):
        # Two cut points with left <= right, both inside the chromosome.
        left = random.randint(0, self.length - 2)
        right = random.randint(left, self.length - 1)
        return left, right    

    def __repr__(self):
        "returns string representation of self"
        return '<%s chromosome="%s" score=%s>' % \
               (self.__class__.__name__,
                self.seperator.join(map(str, self.chromosome)), self.score)  

    def copy(self):
        # Fix: propagate self.length so copies of non-default-length
        # individuals keep their size (it was previously reset to 30).
        twin = self.__class__(self.chromosome[:], self.length)
        twin.score = self.score
        return twin

    def __lt__(self, other):
        # Fix: Python 3 ordering hook so population.sort() works there
        # too; lower score (shorter tour) ranks first.
        return self.score < other.score

    def __cmp__(self, other):
        # Python 2 ordering hook; ignored by Python 3 (see __lt__).
        return cmp(self.score, other.score)

class Environment:
    """Genetic-algorithm driver for the TSP.

    Evolves a population of Individual tours for `maxgenerations`
    generations, tracking the shortest tour seen in minscore /
    minindividual.  NOTE(review): Python 2 code (print statements,
    sys.maxint).
    """
    # Class-level default, shadowed per instance in __init__.
    size = 0
    def __init__(self, population=None, size=100, maxgenerations=1000,\
                 newindividualrate=0.6,crossover_rate=0.90,\
                 mutation_rate=0.1):
        # NOTE(review): the `population` argument is accepted but
        # ignored; a fresh random population is always built.
        self.size = size
        self.population = self._makepopulation()
        self.maxgenerations = maxgenerations
        self.newindividualrate = newindividualrate
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate
        # Score everyone once so selection can read .score immediately.
        for individual in self.population:
            individual.evaluate()
        self.generation = 0
        self.minscore = sys.maxint  # Python 2 only; sys.maxsize in Python 3
        self.minindividual = None
        #self._printpopulation()
        if PIL_SUPPORT:
            # Plot the initial random tour (module-level coords is set
            # by main_run before Environment() is constructed).
            write_tour_to_img(coords, self.population[0].chromosome,\
                              "TSPstart.png")
        else:
            print "No PIL detected,can not plot the graph"

    def _makepopulation(self):
        # Fresh population of random tours (Individual default length).
        return [Individual() for i in range(0, self.size)]

    def run(self):
        """Main GA loop: evaluate, crossover, mutate, repeat."""
        for i in range(1, self.maxgenerations + 1):
            print "Generation no:" + str(i)
            # Re-evaluate everyone and update the best-so-far tour.
            for j in range(0, self.size):
                self.population[j].evaluate()
                curscore = self.population[j].score
                if curscore < self.minscore:
                    self.minscore = curscore
                    self.minindividual = self.population[j]
            print "Best individual:", self.minindividual
            if random.random() < self.crossover_rate:
                children = []
                newindividual = int(self.newindividualrate * self.size / 2)
                # NOTE(review): this loop reuses `i`, shadowing the
                # generation counter above.
                for i in range(0, newindividual):
                    selected1 = self._selectrank()
                    selected2 = self._selectrank()
                    parent1 = self.population[selected1]
                    parent2 = self.population[selected2]
                    child1, child2 = parent1.crossover(parent2)
                    child1.evaluate()
                    child2.evaluate()
                    children.append(child1)
                    children.append(child2)
                # Replace roulette-chosen individuals with the children;
                # higher score (longer tour) => more likely replaced.
                for i in range(0, newindividual):
                    #replce with child
                    totalscore = 0
                    for k in range(0, self.size):
                        totalscore += self.population[k].score
                    randscore = random.random()
                    addscore = 0
                    for j in range(0, self.size):
                        addscore += (self.population[j].score / totalscore)
                        if addscore >= randscore:
                            self.population[j] = children[i]
                            break
            if random.random() < self.mutation_rate:
                selected = self._select()
                self.population[selected].mutate()
        #end loop
        # Final sweep so minindividual reflects the last generation.
        for i in range(0, self.size):
                self.population[i].evaluate()
                curscore = self.population[i].score
                if curscore < self.minscore:
                    self.minscore = curscore
                    self.minindividual = self.population[i]
        print "..................Result........................."
        print self.minindividual
        #self._printpopulation()

    def _select(self):
        """Roulette selection favouring LOW scores (short tours).

        Returns an index into the population; the weights
        (1 - score/total) sum to size-1, hence the scaled threshold.
        NOTE(review): can return 0 by fallthrough if rounding keeps the
        running sum below the threshold.
        """
        totalscore = 0
        for i in range(0, self.size):
            totalscore += self.population[i].score
        randscore = random.random()*(self.size - 1)
        addscore = 0
        selected = 0
        for i in range(0, self.size):
            addscore += (1 - self.population[i].score / totalscore)
            if addscore >= randscore:
                selected = i
                break
        return selected

    def _selectrank(self, choosebest=0.9):
        """Rank selection: with probability `choosebest`, pick from the
        top newindividualrate fraction of the sorted population,
        otherwise from the remainder.
        NOTE(review): size * newindividualrate is a float; Python 3's
        randint rejects non-integer bounds (works under Python 2 when
        the product is integral).
        """
        self.population.sort()
        if random.random() < choosebest:
            return random.randint(0, self.size * self.newindividualrate)
        else:
            return random.randint(self.size * self.newindividualrate,\
                   self.size - 1)

    def _printpopulation(self):
        # Debug helper: dump every individual.
        for i in range(0, self.size):
            print "Individual ", i, self.population[i]      

def main_run():
    """Entry point: build 30 random cities, run the GA, and (when PIL
    is available) plot the best tour found to TSPresult.png.
    NOTE(review): Python 2 print statement below.
    """
    # Populate the module-level globals read by eval_func/Environment.
    global cm, coords
    #get cities's coords
    coords =cities_random(30)
    cm = cartesian_matrix(coords)
    ev = Environment()
    ev.run()
    if PIL_SUPPORT:
        write_tour_to_img(coords, ev.minindividual.chromosome, \
                          "TSPresult.png")
    else:
        print "No PIL detected,can not plot the graph"
# Run the TSP demo when executed as a script.
if __name__ == "__main__":
    main_run()