CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CLUSPAC: Computer Programs for Mixture-Model Clustering C C C C COPYRIGHT (C) 1991, 1992 STANLEY L. SCLOVE. C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C$STATEMENTS=100000 C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CMS DSN = MIX1DT C C C C "MIX1DT" IS A PROGRAM FOR CLUSTERING UNIVARIATE DATA C C (DATA ON THE LINE) BY ITERATIVE MAXIMIZATION OF THE MIXTURE- C C MODEL LIKELIHOOD C C C C N K C C --- -- C C L = | | > P(C)*f(X(I)|C) C C | | -- C C I=1 C=1 C C C C C The K probability density functions f are Gaussian with C C different means. The "DT" in the program name means that the C C the variances are allowed to differ. (Use program MIX1CM for C C fitting a common variance for the K distributions.) C C C C REFERENCE: C C C C Wolfe, J. H. (1970). Pattern clustering by multivariate C C mixture analysis. Multivariate Behavioral Research 5, C C 329-350. C C C C MANUAL MODE: NUMBER OF CLUSTERS AND INITIAL MEANS ARE C C INPUT. (USE PROGRAM MIX1DTA FOR AUTOMATIC SETTING OF C C NUMBERS OF CLUSTERS AND INITIAL MEANS.) C C C C C C PROGRAMMED BY C C DR. STANLEY L. SCLOVE 312/996-2681 C C DEPARTMENT OF 312/996-2676 C C INFORMATION & DECISION SCIENCES (MC 294) C C UNIVERSITY OF ILLINOIS AT CHICAGO C C 601 S. MORGAN ST. C C CHICAGO, IL 60607-7124 C C C C C C VERSION 1.6 15-MAY-92 C C C C C C C C C C RESTRICTIONS (CAN BE MODIFIED): C C N, SAMPLE SIZE, AT MOST 999; C C K, NUMBER OF CLUSTERS, AT MOST 29; C C ITER, MAXIMUM NUMBER OF ITERATIONS, 99. C C C C C C C C CONTROL CARDS: C C C C (1) DATASET TITLE C C (2) N, IN FORMAT (2X,I4) C C (3) FMT, IN FORMAT (18A4), E.G., (1X,F4.1). C C ALLOW AT LEAST ONE BLANK IN FMT: IT WILL ALSO BE USED C C FOR OUTPUT, WHERE CC1 IS FOR CARRIAGE CONTROL. C C ALLOW A CC FOR THE DECIMAL POINT ON OUTPUT, C C WHETHER OR NOT THERE IS ONE ON INPUT. C C (4) DATA, IN FORMAT SPECIFIED BY FMT C C (5) K, NUMBER OF CLUSTERS, IN FORMAT (2X,I1) C C (6) K INITIAL VALUES OF PRIOR PROBS, MEANS AND VARIANCES C C IN FORMAT (5X,F3.2,2X,F8.2,2X,F8.2). C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C DIMENSION X(999),XMNDSQ(999),ICLUS(999),IOTA(999) DIMENSION DSQ(29),C(29),SUM(29) DIMENSION TITLE(18) DIMENSION B(29),NC(29),XMEAN(29),XMEANOL(29) DIMENSION FMT(18) DIMENSION SS(29),SSD(29) DIMENSION SD(29) DIMENSION VAR(29) DIMENSION ICLSOL(999) DIMENSION F(999,29) DIMENSION P(29),XNC(29) DIMENSION PP(29,999) DIMENSION XMXPR(999) DIMENSION DENOM(999) C DOUBLE PRECISION SUM,SS,F,P,PP,SUMSQS,TOTAL,SSD,XNC,VAR DOUBLE PRECISION TEMP DOUBLE PRECISION XMEAN,XMEANOL,XMN2LL,XMN2LLOL,WGSS,SSDEVS C READ (5,64000) TITLE C C WRITE PROGRAM INFORMATION. WRITE (6,56000) WRITE (6,32000) WRITE (6,34000) C WRITE (6,70000) TITLE C C READ SAMPLE SIZE, N. READ (5,12000) N XN = N WRITE (6,66000) N C C READ DATA FORMAT. READ (5,64000) FMT C C READ DATA AND C COMPUTE STATISTICS OF WHOLE SAMPLE: TOTAL=0.0 SUMSQS=0.0 SSDEVS=0.0 C DO 300 I = 1,N READ (5,FMT) X(I) TOTAL = TOTAL + X(I) SUMSQS = SUMSQS + X(I)*X(I) IF (I .EQ. 1) GO TO 100 GO TO 200 100 XMAX = X(1) XMIN = X(1) 200 CONTINUE IF (X(I) .LT. XMIN) XMIN=X(I) IF (X(I) .GT. XMAX) XMAX=X(I) 300 CONTINUE C WRITE DATA: WRITE (6,24000) IF ( N .GE. 31) GO TO 350 WRITE (6,FMT) (X(I),I=1,N) 350 CONTINUE C WRITE SUMMARY STATISTICS FOR WHOLE SAMPLE: WRITE (6,44000) WRITE (6,FMT) XMIN WRITE (6,46000) WRITE (6,FMT) XMAX XBAR = TOTAL/XN SSDEVS = SUMSQS- TOTAL*TOTAL/XN VARHAT = SSDEVS/XN WRITE (6,50000) XBAR WRITE (6,52000) VARHAT C PI = 3.141593 TEMP = 2.0*PI*SSDEVS/N XMN2LL = N*(1.0 + DLOG(TEMP)) STDDEV = SQRT(VARHAT) WRITE (6,54000) SSDEVS, XMN2LL, STDDEV NOPARM = 1 + 1 AIC = XMN2LL + 2.0*NOPARM WRITE (6,82000) AIC SCH = XMN2LL + ALOG(XN)*NOPARM XKASH = SCH - ALOG(2*VARHAT**3) WRITE (6,48000) XKASH C C C READ K, NUMBER OF CLUSTERS. READ (5,10000) K WRITE (6,36000) K C C READ INITIAL PRIOR PROBS, MEANS AND VARIANCES: DO 400 IC=1,K READ (5,14000) P(IC),XMEAN(IC),VAR(IC) 400 CONTINUE C C C C WRITE INITIAL PRIOR PROBS, MEANS AND VARIANCES: WRITE (6,26000) DO 500 IC = 1,K WRITE (6,28000) IC, P(IC), XMEAN(IC), VAR(IC) 500 CONTINUE C C SET CONSTANTS. DO 600 INTGER=1,N IOTA(INTGER) = INTGER 600 CONTINUE C ITER = 1 C IF THE INITIAL PRIOR PROBABILITIES ARE EQUAL, THEN THE C FIRST ITERATION IS EQUIVALENT TO MINIMUM DISTANCE CLUSTERING C TO INITIAL MEANS (I.E., "ISODATA"). C IN GENERAL, THE CLUSTERING IS BY MAXIMUM POSTERIOR C PROBABILITY CLUSTERING. C 700 CONTINUE C IF (ITER .EQ. 1) GO TO 900 C C STORE OLD CLUSTERING RESULTS FOR TESTING CONVERGENCE: XMN2LLOL = XMN2LL DO 800 I = 1,N ICLSOL(I) = ICLUS(I) 800 CONTINUE DO 850 IC = 1,K XMEANOL(IC) = XMEAN(IC) 850 CONTINUE C C COMMENCE DISTANCE COMPUTATIONS. 900 CONTINUE DO 1100 I = 1,N DO 1100 IC = 1,K DSQ(IC) = ( XMEAN(IC) - X(I) )**2 ZSQ = DSQ(IC)/VAR(IC) IF ( ZSQ .LE. 174.673 ) GO TO 1000 F(I,IC) = 0.0 GO TO 1100 1000 CONTINUE C C NOTE THAT A PROB. DENSITY FUNCTION OTHER THAN THE GAUSSIAN C COULD BE USED HERE: F(I,IC) =(EXP(-ZSQ/2.0))/(SQRT(2*PI*VAR(IC))) 1100 CONTINUE C XMNDSQ(I) = MIN SQ. DISTANCE FROM X(I) TO ANY MEAN C C C 1200 CONTINUE C C COMPUTE POSTERIOR PROBABILITIES OF GROUP MEMBERSHIP: DO 1300 I = 1,N DENOM(I) = 0.0 DO 1300 IH=1,K DENOM(I) = DENOM(I) + P(IH)*F(I,IH) 1300 CONTINUE DO 1400 I = 1,N DO 1400 IC=1,K C IF ( DENOM(I) .EQ. 0.0 ) DENOM(I)=0.0001 PP(IC,I)= P(IC)*F(I,IC)/DENOM(I) 1400 CONTINUE C C COMPUTE NEW LABELS BY MAX POSTERIOR PROBABILITY: DO 1600 I = 1,N XMXPR(I) = PP(1,I) ICLUS(I) = 1 DO 1600 IC = 2,K IF ( PP(IC,I) .GT. XMXPR(I) ) GO TO 1500 GO TO 1600 1500 XMXPR(I) = PP(IC,I) ICLUS(I) = IC 1600 CONTINUE C IF (N .GE. 31) GO TO 1650 C C WRITE NEW LABELS: WRITE (6,16000) WRITE (6,18000) (IOTA(I), I=1,N) WRITE (6,20000) (ICLUS(I), I=1,N) 1650 CONTINUE C C UPDATE CLUSTER PRIOR PROBABILITIES P(IC), MEANS XMEAN(IC) AND C VARIANCES VAR(IC): WGSS = 0.0 DO 1800 IC = 1,K XNC(IC) = 0.0 C XNC(IC) WILL BE THE SUM OVER ALL N OBSERVATIONS OF THEIR C POSTERIOR PROBABILITIES OF MEMBERSHIP IN CLUSTER IC. SUM(IC) = 0.0 SS(IC) = 0.0 DO 1700 I = 1,N XNC(IC) = XNC(IC) + PP(IC,I) SUM(IC) = SUM(IC) + PP(IC,I)*X(I) SS(IC) = SS(IC) + PP(IC,I)*X(I)*X(I) 1700 CONTINUE IF ( XNC(IC) .EQ. 0.0 ) GO TO 2700 XMEAN(IC) = SUM(IC)/XNC(IC) SSD(IC) = SS(IC) - SUM(IC)*SUM(IC)/XNC(IC) VAR(IC) = SSD(IC)/(XNC(IC)) C IF ( VAR(IC) .LE. 0.0 ) VAR(IC) = 0.0001 SD(IC) = SQRT(VAR(IC)) P(IC) = XNC(IC)/XN WGSS = WGSS + SSD(IC) 1800 CONTINUE C C COUNT NUMBERS IN CLUSTERS: DO 1900 IC = 1,K NC(IC) = 0 1900 CONTINUE DO 2000 I = 1,N IGROUP = ICLUS(I) NC(IGROUP) = NC(IGROUP) + 1 2000 CONTINUE C C C VARHAT = WGSS/XN C SUMLNF = 0.0 DO 2200 I=1,N SUMPXF = 0.0 DO 2100 IC=1,K SUMPXF = SUMPXF + P(IC)*F(I,IC) 2100 CONTINUE SUMLNF = SUMLNF + ALOG(SUMPXF) 2200 CONTINUE C XMN2LL = -2.0*SUMLNF C C WGMS = WGSS/(N-K) WRITE (6,40000) WGSS, XMN2LL, WGMS STDERR = SQRT(WGMS) WRITE (6,74000) STDERR C KM1 = K-1 DO 2300 IC=1,KM1 ICP1 = IC+1 C B(IC) IS BOUNDARY BETWEEN G-TH AND G+1-ST CLASSES. B(IC) = ( XMEAN(IC) + XMEAN(ICP1) )/2.0 BDYADJ = DLOG(P(ICP1)) - DLOG(P(IC)) BDYADJ = BDYADJ/(XMEAN(ICP1) - XMEAN(IC)) BDYADJ = VARHAT*BDYADJ B(IC) = B(IC) - BDYADJ 2300 CONTINUE WRITE (6,58000) ITER WRITE (6,42000) (B(IC),IC=1,KM1) WRITE (6,38000) (XMEAN(IC),IC=1,K) IF (ITER .EQ. 1) GO TO 2500 DO 2400 I = 1,N C If any case changes cluster, another iteration will be performed. IF (ICLUS(I) .EQ. ICLSOL(I)) GO TO 2400 GO TO 2500 2400 CONTINUE WRITE (6,79000) C Even if no case changes cluster, another iteration will be C performed if the parameter estimates have changed much. DO 2450 IC = 1,K TEST=(XMEAN(IC)-XMEANOL(IC))/XMEANOL(IC) TOLERANS = 0.01 IF (ABS(TEST) .LE. TOLERANS) GO TO 2450 GO TO 2500 2450 CONTINUE GO TO 2900 2500 CONTINUE ITER = ITER + 1 IF (ITER.GE.100) GO TO 2600 GO TO 700 2600 WRITE (6,80000) 2700 CONTINUE WRITE (6,88000) C C Next the results will be written: C 2900 CONTINUE C WRITE (6,60000) ( P(IC), IC = 1,K ) WRITE (6,62000) (NC(IC),IC=1,K) WRITE (6,72000) (VAR(IC),IC=1,K) WRITE (6,68000) (SD(IC), IC=1,K) C VARHAT IS MLE OF VARIANCE. VARHAT = WGSS/N WRITE (6,30000) VARHAT C C IF (N .GE. 31) GO TO 3050 WRITE (6,78000) DO 3000 I=1,N WRITE (6,76000) X(I),ICLUS(I),(PP(IC,I),IC=1,K) 3000 CONTINUE C 3050 CONTINUE C C COMPUTE MODEL-SELECTION CRITERIA: C NO. PARAMETERS = K MEANS + K VARIANCES + (K-1) PROBS. NOPARM = K + K + (K-1) C WRITE (6,86000) NOPARM AIC = XMN2LL + 2.0*NOPARM SCH = XMN2LL + ALOG(XN)*NOPARM C C SCHWARZ' CRITERION IS FIRST-DEGREE EXPANSION OF C LOG POSTERIOR PROBABILITY OF THE MODEL. C KASHYAP'S CRITERION IS SECOND-DEGREE EXPANSION OF SAME. XKASH = SCH - ALOG(2*VARHAT**3) C WRITE (6,82000) AIC WRITE (6,84000) SCH WRITE (6,48000) XKASH C C C STOP 10000 FORMAT(2X,I1) 12000 FORMAT(2X,I4) 14000 FORMAT(5X,F3.2,2X,F8.2,2X,F8.2) 16000 FORMAT(1X,'CLUSTERING') 18000 FORMAT(1X,'POINT: '/, (1X,40I3)) 20000 FORMAT(1X,'CLUSTER: '/, (1X,40I3)) 24000 FORMAT(1X,'DATA:'/) 26000 FORMAT(/1X,'INITIAL PRIOR PROBS, MEANS AND VARIANCES:') 28000 FORMAT(1X,I2,2X,3F11.2) 30000 FORMAT(1X, 'M.L. ESTIMATE OF COMMON VARIANCE = ',F14.5/) 32000 FORMAT(1X,'CMS DSN = MIX1DT CLUSPAC') 34000 FORMAT(1X,'COPYRIGHT (C) 1991, 1992 STANLEY L. SCLOVE'/) 36000 FORMAT('1',1X,'K = ',I1,' CLUSTERS') 38000 FORMAT(1X,'MEANS: ',9F13.3) 40000 FORMAT(/1X,'WGSS = ',F14.4,' MINUS 2 LOG LIKELIHOOD = ', XF14.4, ' WGMS = ',F14.4/) 42000 FORMAT(1X,'BOUNDARIES:', 8X, 9F13.3) 44000 FORMAT(1X,'MINIMUM OF SAMPLE: ') 46000 FORMAT(1X,'MAXIMUM OF SAMPLE: ') 48000 FORMAT( 1X,'KASHYAP CRITERION = ', F14.4) 50000 FORMAT(/,1X,'MEAN = ', F14.4) 52000 FORMAT(1X, 'M.L. ESTIMATE OF VARIANCE = ',F14.5/) 54000 FORMAT(/1X,'SSDEVS = ',F14.4,' MINUS 2 LOG LIKELIHOOD = ', XF14.4, ' STDDEV = ',F14.4/) 56000 FORMAT('1','.................................................', X/,1X,'PROGRAM MIX1DT CLUSPAC'/1X,'MIXTURE MODEL CLUSTERING'/ X,1X,'FOR UNIVARIATE DATA (DATA ON THE LINE)'/ X,1X,'WITH UNEQUAL CLASS VARIANCES '/ X1X,'DEVELOPED AND PROGRAMMED BY DR. STANLEY L. SCLOVE' X/,1X,'VERSION 1.6 15-MAY-92 '/) 58000 FORMAT(1X,'ITERATION ', I2) 60000 FORMAT(1X,'PROBS:',5X,9F13.3) 62000 FORMAT(1X,'NUMBERS:',3X,9(I10,3X)) 64000 FORMAT(18A4) 66000 FORMAT(1X,'N = ',I3/) 68000 FORMAT(1X,'STD.DEVS.: ',9F13.3) 70000 FORMAT(1X,18A4/) 72000 FORMAT(1X,'VARIANCES: ',9F13.3) 74000 FORMAT(1X,'STD.ERROR=SQRT(WGMS) = ',F13.4/) 76000 FORMAT(1X,F8.2,2X,I2,3X,(8F6.3/)) 78000 FORMAT(1X/,1X,'DATA, LABELS AND PROBS OF CLUSTER MEMBERSHIP'/) 79000 FORMAT(/,1X,'NO CASE CHANGED CLUSTERS IN THIS ITERATION.') 80000 FORMAT(1X,'PROGRAM HAS NOT CONVERGED IN 99 ITERATIONS. STOP') 82000 FORMAT(1X,'AIC = ', F14.4 ) 84000 FORMAT(1X,'SCHWARZ CRITERION = ', F14.4 ) 86000 FORMAT(/1X,'NUMBER OF PARAMETERS = ',I4 ) 88000 FORMAT(1X,'NO OBSERVATIONS IN GROUP ',I3,'. STOP') END