CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CLUSPAC: Computer Programs for Mixture-Model Clustering C C C C COPYRIGHT 1991 STANLEY LOUIS SCLOVE. C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CMS DSN = ISDT1DT CLUSPAC C C C C C C PROGRAM ISDT1DT OF ISOPAC IS C C FOR CLUSTERING UNIVARIATE DATA (CLUSTERING ON THE LINE). C C IT ALLOWS DIFFERENT VARIANCES IN CLUSTERS; THE C C OBSERVATION X IS ASSIGNED TO THAT CLUSTER FOR WHICH C C LOG VAR + (X-MEAN)**2/VAR IS MINIMAL. C C MANUAL MODE: NUMBER OF CLUSTERS AND INITIAL MEANS ARE C C INPUT; USE PROGRAM ISDT1DTA FOR AUTOMATIC SETTING OF C C NUMBER OF CLUSTERS AND INITIAL MEANS. C C C C C C PROGRAMMED BY C C DR. STANLEY L. SCLOVE 312/996-2681 C C DEPARTMENT OF C C INFORMATION & DECISION SCIENCES M/C 294 C C COLLEGE OF BUSINESS ADMINISTRATION C C UNIVERSITY OF ILLINOIS AT CHICAGO C C BOX 4348, CHICAGO, IL 60680 C C C C C C LAST UPDATE: VERSION 1.0 23-OCT-89 C C C C COPYRIGHT 1991 STANLEY LOUIS SCLOVE. C C C C C C C C RESTRICTIONS (CAN BE MODIFIED): C C N, SAMPLE SIZE, AT MOST 9999; C C K, NUMBER OF CLUSTERS, AT MOST 9; C C ITER, MAXIMUM NUMBER OF ITERATIONS, 20. C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C DIMENSION X(9999),F(9999),ICLUS(9999),IOTA(9999) DIMENSION D(29),C(29),SUM(29) DIMENSION TITLE(18) DIMENSION BDY1(29),BDY2(29),NG(29),XMEAN(29) DIMENSION ROOT1(29),ROOT2(29) DIMENSION A(29),B(29) DIMENSION SS(29),SSD(29) DIMENSION SD(29) DIMENSION VAR(29) DIMENSION ICLSOL(9999) DOUBLE PRECISION SUM,SS C C CONTROL CARDS: C C DATASET TITLE C N, IN FORMAT (2X,I4) C C C FOR OUTPUT. C DATA C K, NUMBER OF CLUSTERS, IN FORMAT (2X,I1) C K INITIAL MEANS C C READ(5,1050) TITLE C C WRITE PROGRAM INFORMATION. WRITE(6,1012) WRITE(6,1013) WRITE(6,1014) C WRITE(6,1060) TITLE C C READ SAMPLE SIZE, N. READ(5,1001) N XN = N WRITE(6,1051) N C C READ DATA AND COMPUTE STATISTICS OF WHOLE SAMPLE: TOTAL=0.0 SUMSQS=0.0 SSDEVS=0.0 C C DO 100 I = 1,N READ(5,*) X(I) TOTAL = TOTAL + X(I) SUMSQS = SUMSQS + X(I)*X(I) IF(I .EQ. 1) GO TO 498 GO TO 499 498 XMAX = X(1) XMIN = X(1) 499 CONTINUE IF(X(I) .LT. XMIN) XMIN=X(I) IF(X(I) .GT. XMAX) XMAX=X(I) 100 CONTINUE WRITE(6,1150) XMIN,XMAX XBAR = TOTAL/XN SSDEVS = SUMSQS- TOTAL*TOTAL/XN VARHAT = SSDEVS/XN WRITE(6,1112) XBAR WRITE(6,1113) VARHAT PI = 3.1415927 TEMP = 2.0*PI*SSDEVS/N XMN2LL = N*(1.0 + ALOG(TEMP)) STDDEV = SQRT(VARHAT) WRITE(6,1121) SSDEVS, XMN2LL, STDDEV C NUMBER OF PARAMETERS FOR ONLY 1 CLUSTER IS 2 (1 MEAN, 1 VARIANCE). NOPARM = 1 + 1 AIC = XMN2LL + 2.0*NOPARM WRITE(6,1205) AIC SCH = XMN2LL + ALOG(XN)*NOPARM WRITE(6,1206) SCH C KASHYAP'S CRITERION IS SCHWARZ'S PLUS THE LOG OF THE C DETERMINANT OF THE COVARIANCE MATRIX OF THE MLE'S. C IN GENERAL, THIS HAS TO BE ESTIMATED. XKASH = SCH - ALOG(2*VARHAT**3) WRITE(6,1111) XKASH C C WRITE(6,1009) C WRITE(6,1234) (X(I),I=1,N) C C READ K, NUMBER OF CLUSTERS. READ(5,1000) K WRITE(6,1015) K C C READ INITIAL MEANS DO 101 IG=1,K READ(5,*) XMEAN(IG) 101 CONTINUE C WRITE(6,1010) C WRITE(6,1011)( XMEAN(IG), IG=1,K ) C SET CONSTANTS. DO 105 INTEG=1,N IOTA(INTEG) = INTEG 105 CONTINUE DO 110 IG = 1,K VAR(IG) = 1.0 110 CONTINUE ITER = 1 601 CONTINUE IF(ITER .EQ. 1) GO TO 560 DO 565 I = 1,N ICLSOL(I) = ICLUS(I) 565 CONTINUE C COMMENCE DISTANCE COMPUTATIONS. C "DISTANCE" HERE MEANS NORMALIZED SQUARED DISTANCE PLUS C LOG OF VARIANCE. C FOR FIRST ITERATION, VARIANCES ARE SET EQUAL TO 1 C (SO "DISTANCE" IS JUST SQUARED DISTANCE FROM THE MEAN). 560 CONTINUE DO 104 I = 1,N DO 102 L = 1,K D(L) =(( XMEAN(L) - X(I) )**2)/VAR(L) + ALOG(VAR(L)) 102 CONTINUE F(I) = D(1) ICLUS(I) = 1 DO 104 L = 2,K IF( D(L) .LT. F(I) ) GO TO 103 GO TO 104 103 F(I) = D(L) ICLUS(I) = L 104 CONTINUE WRITE(6,1003) C WRITE(6,1004) (IOTA(I), I=1,N) WRITE(6,1005) (ICLUS(I), I=1,N) DO 202 IG = 1,K SUM(IG) = 0.0 SS(IG) = 0.0 C NG(IG) = 0 202 CONTINUE DO 400 I = 1,N IGROUP = ICLUS(I) SUM(IGROUP) = SUM(IGROUP) + X(I) SS(IGROUP) = SS(IGROUP) + X(I)*X(I) NG(IGROUP) = NG(IGROUP) + 1 400 CONTINUE WGSS = 0.0 DO 401 IG = 1,K IF (NG(IG) .EQ. 0) GO TO 410 GO TO 415 410 WRITE(6,1235) IG GO TO 1995 415 CONTINUE XMEAN(IG) = SUM(IG)/NG(IG) NTEMP=NG(IG) IF (NTEMP .EQ. 1) GO TO 135 SSD(IG) = SS(IG) - SUM(IG)**2/NG(IG) VAR(IG) = SSD(IG)/NG(IG) SD(IG) = SQRT(VAR(IG)) GO TO 140 135 SSD(IG)=0.0 VAR(IG) = 0.0 SD(IG)=0.0 140 CONTINUE WGSS = WGSS + SSD(IG) C 401 CONTINUE C TEMP = 0.0 DO 420 IG = 1,K TEMP = TEMP + NG(IG)*(1.0+ALOG(2*PI*VAR(IG))) 420 CONTINUE XMN2LL = TEMP C WGMS = WGSS/(N-K) WRITE(6,1021) WGSS, XMN2LL, WGMS STDERR = SQRT(WGMS) WRITE(6,1085) STDERR C KM1 = K-1 DO 500 IG=1,KM1 IGP1 = IG+1 C COMPUTE BOUNDARY BETWEEN G-TH AND G+1-ST CLASSES. A(IG)=1/VAR(IG)-1/VAR(IGP1) B(IG)=-2*(XMEAN(IG)/VAR(IG)-XMEAN(IGP1)/VAR(IGP1)) C(IG) = ALOG(VAR(IG)/VAR(IGP1)) D(IG) = B(IG)**2 - 4*A(IG)*C(IG) ROOT1(IG) = (-B(IG) - SQRT(D(IG)))/(2*A(IG)) ROOT2(IG) = (-B(IG) + SQRT(D(IG)))/(2*A(IG)) BDY1(IG) = ROOT1(IG) BDY2(IG) = ROOT2(IG) 500 CONTINUE WRITE(6,1036) ITER WRITE(6,1035) (BDY1(IG),IG=1,KM1) WRITE(6,1034) (BDY2(IG),IG=1,KM1) WRITE(6,1020) (XMEAN(IG),IG=1,K) IF(ITER .EQ. 1) GO TO 600 DO 555 I = 1,N IF(ICLUS(I) .EQ. ICLSOL(I)) GO TO 555 GO TO 600 555 CONTINUE GO TO 530 600 CONTINUE ITER = ITER + 1 IF(ITER.GE.21) GO TO 570 GO TO 601 570 WRITE(6,1160) 1995 STOP C C 530 CONTINUE C WRITE(6,1040) (SUM(IG),IG=1,K) WRITE(6,1045) (NG(IG),IG=1,K) WRITE(6,1080) (VAR(IG),IG=1,K) WRITE(6,1055) (SD(IG), IG=1,K) C VARHAT IS MLE OF VARIANCE. VARHAT = WGSS/N WRITE(6,1100) VARHAT C C WRITE(6,1095) WRITE(6,1090) (X(I),ICLUS(I), I=1,N) C C COMPUTE MODEL-SELECTION CRITERIA: NOPARM = K + K C NUMBER OF PARAMETERS IS K MEANS PLUS K VARIANCES. WRITE(6,1210) NOPARM AIC = XMN2LL + 2.0*NOPARM SCH = XMN2LL + ALOG(XN)*NOPARM C XKASH = 0.0 DO 111 IGRP=1,K TERM = -ALOG(2*VAR(IGRP)**3) XKASH = XKASH + TERM 111 CONTINUE XKASH = XKASH + SCH C WRITE(6,1205) AIC WRITE(6,1206) SCH WRITE(6,1111) XKASH C C 1000 FORMAT(2X,I1) 1001 FORMAT(2X,I4) 1003 FORMAT(1X,'CLUSTERING') 1004 FORMAT(1X,'POINT: '/, (1X,40I3)) 1005 FORMAT(1X,'CLUSTER: '/, (1X,40I3)) C1009 FORMAT(1X,'DATA'/) 1010 FORMAT(//1X,'INITIAL MEANS') 1011 FORMAT(1X, 9F13.2//) 1100 FORMAT(1X, 'M.L. ESTIMATE OF COMMON VARIANCE = ',F14.5/) 1111 FORMAT(///,1X,'KASHYAP CRITERION = ', E14.4///) 1112 FORMAT(/,1X,'MEAN = ', E14.4) 1113 FORMAT(1X, 'M.L. ESTIMATE OF VARIANCE = ',F14.5/) 1121 FORMAT(//1X,'SSDEVS = ',E14.4,' MINUS 2 LOG LIKELIHOOD = ', XE15.5, ' STDDEV = ',E14.4//) 1012 FORMAT('1','****************************************', A//,1X,'PROGRAM ISDT1DT.ISOPAC '/ B,1X,'FOR CLUSTERING UNIVARIATE DATA (DATA ON THE LINE)'/ C,1X,'USING ISODATA WITH DIFFERENT CLUSTER VARIANCES'/ D1X,'DEVELOPED AND PROGRAMMED BY DR. STANLEY L. SCLOVE' E/,1X,'VERSION 1.0 23-OCT-89 '//) 1013 FORMAT(1X,'PROGRAM ISOPAC:ISDT1DT') 1014 FORMAT(1X,'COPYRIGHT 1991 STANLEY LOUIS SCLOVE.'/) 1015 FORMAT('1',1X,'K = ',I1,' CLUSTERS') 1020 FORMAT(1X,'MEANS: ',9F13.2) 1021 FORMAT(//1X,'WGSS = ',E14.4,' MINUS 2 LOG LIKELIHOOD = ', XE15.5, ' WGMS = ',E14.4//) 1035 FORMAT(1X,'BOUNDARIES:', 9X, 9F13.2) 1034 FORMAT(7X,'BOUNDARIES:', 9X, 9F13.2) 1036 FORMAT(1X,'ITERATION ', I2) 1040 FORMAT(1X,'SUMS:',6X,9F13.2) 1045 FORMAT(1X,'NUMBERS:',3X,9(I10,3X)) 1050 FORMAT(18A4) 1051 FORMAT(1X,'N = ',I3/) 1055 FORMAT(1X,'STD.DEVS.: ',9F13.2) 1060 FORMAT(1X,18A4) 1080 FORMAT(1X,'VARIANCES: ',9F13.2) 1085 FORMAT(1X,'STD.ERROR=SQRT(WGMS) = ',F13.4//) 1090 FORMAT( (8(F6.1,I2)/)) 1095 FORMAT(1X//,1X,'DATA AND LABELS'//) C 1150 FORMAT(/1X,'MIN = ',E14.4,5X,'MAX = ',E14.4/) 1160 FORMAT(1X,'PROGRAM HAS NOT CONVERGED IN 20 ITERATIONS. STOP') 1205 FORMAT(1X,'AIC = ', E15.5//) 1206 FORMAT(1X,'SCHWARZ CRITERION = ', E15.5//) 1210 FORMAT(//1X,'NUMBER OF PARAMETERS = ',I4//) 1235 FORMAT(1X,'NO OBSERVATIONS IN GROUP ',I3,'. STOP') C1234 FORMAT(3X, F6.3) STOP END