C$PAGES=40 C$STATEMENTS=500000 C$T=9 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CLUSPAC: Computer Programs for Mixture-Model Clustering C C C C COPYRIGHT 1991 STANLEY LOUIS SCLOVE. C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CMS DSN = ISDT1DTA ISOPAC C C C C C C C C PROGRAM ISDT1DTA OF ISOPAC IS FOR C C CLUSTERING UNIVARIATE DATA (CLUSTERING ON THE LINE). C C IT ALLOWS DIFFERENT VARIANCES IN CLUSTERS; THE C C OBSERVATION X IS ASSIGNED TO THAT CLUSTER FOR WHICH C C LOG VAR + (X-MEAN)**2/VAR IS MINIMAL. C C AUTOMATIC MODE: VARYING NUMBER OF CLUSTERS; INITIAL MEANS C C ARE SET AUTOMATICALLY. C C C C PROGRAMMED BY C C DR. STANLEY L. SCLOVE 312/996-2681 C C PROFESSOR C C INFORMATION & DECISION SCIENCE DEPT. M/C 294 C C COLLEGE OF BUSINESS ADMINISTRATION C C UNIVERSITY OF ILLINOIS AT CHICAGO C C BOX 4348, CHICAGO, IL 60680-4348 C C C C C C VERSION 1.2 24-OCT-89 C C C C C C RESTRICTIONS (CAN BE MODIFIED): C C N, SAMPLE SIZE, AT MOST 9999; C C K, NUMBER OF CLUSTERS, AT MOST 29; C C ITER, MAXIMUM NUMBER OF ITERATIONS, 20. C C C C C C CONTROL CARDS: C C DATASET TITLE, IN FORMAT(18A4) C C N, IN FORMAT (2X,I4) C C FMT, IN FORMAT (18A4), E.G., (F4.1) C C DATA, IN FORMAT SPECIFIED BY FMT C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C DIMENSION X(9999),F(9999),ICLUS(9999),IOTA(9999) DIMENSION SUM(29) DIMENSION TITLE(18) DIMENSION A(29),B(29),C(29),D(29),NG(29),XMEAN(29) DIMENSION BDY1(29),BDY2(29),ROOT1(29),ROOT2(29) DIMENSION FMT(18) DIMENSION SS(29),SSD(29) DIMENSION SD(29) DIMENSION VAR(29) DIMENSION ICLSOL(9999) DIMENSION AICVEC(29),SCHVEC(29),XKSVEC(29) DOUBLE PRECISION SUM,SS,SSD,SD C C C READ DATASET TITLE. READ(5,1050) TITLE C READ SAMPLE SIZE, N. READ(5,1001) N XN = N C C C SET CONSTANTS: PI = 3.141593 DO 105 INTEG=1,N IOTA(INTEG) = INTEG 105 CONTINUE C C WRITE PROGRAM INFORMATION. WRITE(6,1120) WRITE(6,1060) TITLE C WRITE(6,1200) N C C C READ DATA AND COMPUTE STATISTICS FOR WHOLE SAMPLE: C SUM1 = 0.0 SS1 = 0.0 DO 100 I = 1,N READ(5,*) X(I) C SUM1 = SUM1 + X(I) SS1 = SS1 + X(I)*X(I) C IF(I .EQ. 1) GO TO 498 GO TO 499 498 XMAX = X(1) XMIN = X(1) 499 CONTINUE IF(X(I) .LT. XMIN) XMIN=X(I) IF(X(I) .GT. XMAX) XMAX=X(I) 100 CONTINUE WRITE(6,1150) XMIN,XMAX C XMEAN1 = SUM1/N SSD1 = SS1 - SUM1*SUM1/N VAR1 = SSD1/N SD1 = SQRT(VAR1) WRITE(6,3000) XMEAN1,VAR1,SD1 TEMP = 2.0*PI*SSD1/N XMN2LL = N*(1.0 + ALOG(TEMP)) NOPARM = 2 AIC = XMN2LL + 2.0*NOPARM SCH = XMN2LL + ALOG(XN)*NOPARM XKASH = SCH - ALOG(2*VAR1**3) AICVEC(1) = AIC SCHVEC(1) = SCH XKSVEC(1) = XKASH WRITE(6,1215) WRITE(6,1220) AIC,SCH,XKASH C C C PERFORM CLUSTERING FOR K = 2 TO 9 GROUPS. C DO 995 K = 2,9 WGSS = 0.0 WRITE(6,1015) K C C COMPUTE INITIAL MEANS, EQUALLY SPACED THROUGH RANGE OF DATA. DO 101 IG=1,K XMEAN(IG) = XMIN + IG*(XMAX-XMIN)/(K+1) 101 CONTINUE C C DO 110 IGRP = 1,K VAR(IGRP) = 1.0 110 CONTINUE C WRITE(6,1010) C WRITE(6,1110)( XMEAN(IG), IG=1,K ) ITER = 1 601 CONTINUE IF(ITER .EQ. 1) GO TO 560 DO 565 I = 1,N ICLSOL(I) = ICLUS(I) 565 CONTINUE C COMMENCE DISTANCE COMPUTATIONS. C "DISTANCE" HERE MEANS NORMALIZED SQUARED DISTANCE PLUS C FOR FIRST ITERATION, VARIANCES ARE SET EQUAL TO ONE C (SO "DISTANCE" IS JUST SQUARED DISTANCE FROM THE MEAN). 560 CONTINUE DO 104 I = 1,N DO 102 L = 1,K D(L) = ( XMEAN(L) - X(I) )**2/VAR(L) + ALOG(VAR(L)) 102 CONTINUE F(I) = D(1) ICLUS(I) = 1 DO 104 L = 2,K IF( D(L) .LT. F(I) ) GO TO 103 GO TO 104 103 F(I) = D(L) ICLUS(I) = L 104 CONTINUE C IF (N .GE. 31) GO TO 200 WRITE(6,1003) C WRITE(6,1004) (IOTA(I), I=1,N) WRITE(6,1005) (ICLUS(I), I=1,N) 200 CONTINUE C DO 202 IG = 1,K SUM(IG) = 0.0 SS(IG) = 0.0 C NG(IG) = 0 202 CONTINUE DO 400 I = 1,N IGROUP = ICLUS(I) SUM(IGROUP) = SUM(IGROUP) + X(I) SS(IGROUP) = SS(IGROUP) + X(I)*X(I) NG(IGROUP) = NG(IGROUP) + 1 400 CONTINUE WGSS = 0.0 DO 401 IG = 1,K IF (NG(IG) .EQ. 0) GO TO 410 GO TO 415 410 WRITE(6,3100) IG GO TO 1995 415 CONTINUE XMEAN(IG) = SUM(IG)/NG(IG) NTEMP=NG(IG) IF (NTEMP .EQ. 1) GO TO 135 SSD(IG) = SS(IG) - SUM(IG)**2/NG(IG) VAR(IG) = SSD(IG)/NG(IG) SD(IG) = SQRT(VAR(IG)) GO TO 140 135 SSD(IG)=0.0 C IF NG(IG)=1, VAR(IG) IS SET EQUAL TO POOLED VARIANCE ESTIMATE C TO GIVE COMPUTATION A WAY OF PROCEEDING: VAR(IG) = VARHAT SD(IG)=SQRT(VARHAT) 140 CONTINUE WGSS = WGSS + SSD(IG) C 401 CONTINUE C TEMP = 0.0 DO 420 IG = 1,K TEMP = TEMP + NG(IG)*( 1.0 + ALOG(2*PI*VAR(IG)) ) 420 CONTINUE XMN2LL = TEMP C WGMS = WGSS/(N-K) WRITE(6,1025) XMN2LL C WRITE POOLED STATISTICS: WRITE(6,1030) WGSS, WGMS STDERR = SQRT(WGMS) WRITE(6,1085) STDERR C KM1 = K-1 DO 500 IG=1,KM1 IGP1 = IG+1 C B(IG) IS BOUNDARY BETWEEN G-TH AND G+1-ST CLASSES. IF ( VAR(IG) .NE. VAR(IGP1) ) GO TO 495 BDY1(IG) = ( XMEAN(IG)+XMEAN(IGP1) )/2.0 BDY2(IG) = BDY1(IG) GO TO 500 495 CONTINUE A(IG)=1/VAR(IG)-1/VAR(IGP1) B(IG)=-2*(XMEAN(IG)/VAR(IG)-XMEAN(IGP1)/VAR(IGP1)) C(IG) = ALOG(VAR(IG)/VAR(IGP1)) D(IG) = B(IG)**2 - 4*A(IG)*C(IG) ROOT1(IG) = (-B(IG) - SQRT(D(IG)))/(2*A(IG)) ROOT2(IG) = (-B(IG) + SQRT(D(IG)))/(2*A(IG)) BDY1(IG) = ROOT1(IG) BDY2(IG) = ROOT2(IG) 500 CONTINUE WRITE(6,1036) ITER WRITE(6,1035) (BDY1(IG),IG=1,KM1) WRITE(6,1034) (BDY2(IG),IG=1,KM1) WRITE(6,1020) (XMEAN(IG),IG=1,K) IF(ITER .EQ. 1) GO TO 600 DO 555 I = 1,N IF(ICLUS(I) .EQ. ICLSOL(I)) GO TO 555 GO TO 600 555 CONTINUE GO TO 530 600 CONTINUE ITER = ITER + 1 IF(ITER.GE.21) GO TO 570 GO TO 601 570 WRITE(6,1160) 1995 GO TO 995 C C 530 CONTINUE C WRITE(6,1040) (SUM(IG),IG=1,K) WRITE(6,1045) (NG(IG),IG=1,K) WRITE(6,1080) (VAR(IG),IG=1,K) WRITE(6,1055) (SD(IG), IG=1,K) C VARHAT IS MLE OF VARIANCE. VARHAT = WGSS/N WRITE(6,1100) VARHAT C C WRITE(6,1095) WRITE(6,1090) (X(I),ICLUS(I), I=1,N) C C COMPUTE MODEL-SELECTION CRITERIA: C NOPARM = K + K WRITE(6,1210) NOPARM AIC = XMN2LL + 2.0*NOPARM SCH = XMN2LL + ALOG(XN)*NOPARM C XKASH = 0.0 DO 211 IGRP=1,K TERM = -ALOG(2*VAR(IGRP)**3) C C XKASH = XKASH + TERM 211 CONTINUE XKASH = XKASH + SCH AICVEC(K) = AIC SCHVEC(K) = SCH XKSVEC(K) = XKASH WRITE(6,3500) AIC WRITE(6,3600) SCH WRITE(6,3700) XKASH C C GO ON TO NEXT VALUE OF K: 995 CONTINUE C C WRITE VALUES OF MSC'S FOR VARIOUS K: WRITE(6,3400) DO 605 K = 1,9 WRITE(6,3300) K, AICVEC(K), SCHVEC(K), XKSVEC(K) 605 CONTINUE C C C 1001 FORMAT(2X,I4) 1003 FORMAT(1X,'CLUSTERING') 1004 FORMAT(1X,'POINT: '/, (1X,40I3)) 1005 FORMAT(1X,'CLUSTER: '/, (1X,40I3)) 1060 FORMAT(1X,18A4) 1010 FORMAT(//1X,'INITIAL MEANS') 1110 FORMAT(1X, 9F13.2//) 1120 FORMAT('1','****************************************', A//,1X,'PROGRAM ISDT1DTA ISOPAC'/ B,1X,'FOR CLUSTERING UNIVARIATE DATA (DATA ON THE LINE)'/ C1X,'COPYRIGHT 1991 STANLEY LOUIS SCLOVE.' D/,1X,'VERSION 1.2 24-OCT-89'//) 1015 FORMAT('1',1X,'K = ',I1,' CLUSTERS') 1020 FORMAT(1X,'MEANS: ',9F13.2) 1025 FORMAT(//1X,' MINUS 2 LOG LIKELIHOOD = ', F15.2) 1030 FORMAT(//1X,'POOLED STATISTICS'/' WGSS =',F15.2,' WGMS =',F15.2/) 1035 FORMAT(1X,'BOUNDARIES:', 7X, 9F13.2) 1034 FORMAT(1X,'BOUNDARIES:', 7X, 9F13.2) 1036 FORMAT(1X,'ITERATION ', I2) 1040 FORMAT(1X,'SUMS:',6X,9F13.2) 1045 FORMAT(1X,'NUMBERS:',3X,9(I10,3X)) 1050 FORMAT(18A4) 1055 FORMAT(1X,'STD.DEVS.: ',9F13.2) 1080 FORMAT(1X,'VARIANCES: ',9F13.2) 1085 FORMAT(1X,'STD.ERROR=SQRT(WGMS) = ',F13.6//) 1090 FORMAT( (8(F6.1,I2)/)) 1095 FORMAT(1X//,1X,'DATA AND LABELS'//) 1100 FORMAT(1X, 'M.L. ESTIMATE OF COMMON VARIANCE = ',F14.5/) C 1150 FORMAT(/1X,'MIN = ',E14.4,5X,'MAX = ',E14.4/) 1160 FORMAT(1X,'PROGRAM HAS NOT CONVERGED IN 20 ITERATIONS. STOP') 1200 FORMAT(1X,'N = ',I3/) 3500 FORMAT(1X,'AIC = ', E15.5/) 3600 FORMAT(1X,'SCHWARZ CRITERION = ', E15.5/) 3700 FORMAT(1X,'KASHYAP CRITERION = ', E15.5/) 1210 FORMAT(//1X,'NUMBER OF PARAMETERS = ',I4/) 1215 FORMAT(/1X,'VALUES OF MODEL-SELECTION CRITERIA FOR WHOLE SAMPLE:') 1220 FORMAT(/1X,'AKAIKE:',E15.5,' SCHWARZ: ',E15.5,' KASHYAP: ',E15.5/) 3000 FORMAT(//1X,'FOR WHOLE SAMPLE: MEAN IS',F15.3,' VARIANCE IS', XF15.3,' STANDARD DEVIATION IS ',F15.3/) 3100 FORMAT(1X,'NO OBSERVATIONS IN GROUP ',I3,'. STOP') 3300 FORMAT(1X,'K=',I3, 3F15.2) 3400 FORMAT(1X,'MODEL SELECTION CRITERIA'/ X' AIC SCHWARZ KASHYAP '/) STOP END C$DATA TRYPANOSOME LENGTHS (Lancaster, THE CHI-SQUARED DISTRIBUTION) N=0500 15 15 15 15 15 15 16 16 16 16 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 23 23 23 23 23 23 23 23 23 23 23 23 23 23 24 24 24 24 24 24 24 24 24 24 24 24 24 25 25 25 25 25 25 25 25 25 25 25 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31 32 32 32 32 32 32 32 32 32 32 32 32 32 33 33 33 33 33 33 33 34 34 34 35 35 /* #TRYPANOS DATA D #LENGTHS OF 500 ROUNDWORMS #See: #Lancaster, H. O., The Chi-Squared Distribution, Wiley, New York, 1969. #Frequency distribution: #value: 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 #freq.: 6 4 27 57 94 49 37 22 14 13 11 19 25 23 29 27 18 13 7 3 2 /*