CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CLUSPAC: Computer Programs for Mixture-Model Clustering C C C C COPYRIGHT (C) 1991 STANLEY LOUIS SCLOVE C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C$STATEMENTS=100000 C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C C CMS DSN = MIX1CM CLUSPAC C C C C "MIX1CM CLUSPAC" IS A PROGRAM FOR CLUSTERING UNIVARIATE DATA C C (DATA ON THE LINE) BY ITERATIVE MAXIMIZATION OF THE MIXTURE- C C MODEL LIKELIHOOD C C C C N K C C --- -- C C L = | | > P(C)*F(X(I)|C) C C | | -- C C I=1 C=1 C C C C C C REFERENCE: C C C C WOLFE (1970). C C C C MANUAL MODE: NUMBER OF CLUSTERS AND INITIAL MEANS ARE C C INPUT. (USE PROGRAM MIX1CMA FOR AUTOMATIC SETTING OF C C NUMBERS OF CLUSTERS AND INITIAL MEANS.) C C C C C C PROGRAMMED BY C C DR. STANLEY L. SCLOVE 312/996-2681 C C DEPARTMENT OF C C INFORMATION & DECISION SCIENCES M/C 294 C C COLLEGE OF BUSINESS ADMINISTRATION C C UNIVERSITY OF ILLINOIS AT CHICAGO C C BOX 4348 C C CHICAGO, IL 60680-4348 C C C C C C VERSION 3.4 12-JAN-92 C C VERSION 3.3 31-MAR-91 C C C C C C C C C C RESTRICTIONS (CAN BE MODIFIED): C C N, SAMPLE SIZE, AT MOST 999; C C K, NUMBER OF CLUSTERS, AT MOST 29; C C ITER, MAXIMUM NUMBER OF ITERATIONS, 20. C C C C C C C C CONTROL CARDS: C C C C (1) DATASET TITLE C C (2) N, IN FORMAT (2X,I4) C C (3) FMT, IN FORMAT (18A4), E.G., (1X,F4.1). C C ALLOW AT LEAST ONE BLANK IN FMT: IT WILL ALSO BE USED C C FOR OUTPUT, WHERE CC1 IS FOR CARRIAGE CONTROL. C C ALLOW A CC FOR THE DECIMAL POINT ON OUTPUT, C C WHETHER OR NOT THERE IS ONE ON INPUT. C C (4) DATA, IN FORMAT SPECIFIED BY FMT C C (5) K, NUMBER OF CLUSTERS, IN FORMAT (2X,I1) C C (6) K INITIAL VALUES OF PRIOR PROBABILITIES AND MEANS, C C IN FORMAT (5X,F3.2,2X,F8.2). C C (7) INITIAL VALUE OF THE VARIANCE. (THE RESULTS DO NOT C C DEPEND UPON THIS VALUE IF THE INITIAL PRIOR PROBABILITIES C C ARE EQUAL.) C C C C C CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC C C C C DIMENSION X(999),XMNDSQ(999),ICLUS(999),IOTA(999) DIMENSION DSQ(29),C(29),SUM(29) DIMENSION TITLE(18) DIMENSION B(29),NC(29),XMEAN(29) DIMENSION FMT(18) DIMENSION SS(29),SSD(29) DIMENSION SD(29) DIMENSION VAR(29) DIMENSION ICLSOL(999) DIMENSION F(999,29) DIMENSION P(29),XNC(29) DIMENSION PP(29,999) DIMENSION XMXPR(999) DIMENSION DENOM(999) DOUBLE PRECISION SUM,SS,F,P,PP,VAR DOUBLE PRECISION TOTAL,SUMSQS,SSDEVS,WGSS,WGMS,SSD C READ (5,64000) TITLE C C WRITE PROGRAM INFORMATION. WRITE (6,56000) WRITE (6,32000) WRITE (6,34000) C WRITE (6,70000) TITLE C C READ SAMPLE SIZE, N. READ (5,12000) N XN = N WRITE (6,66000) N C C READ DATA FORMAT. READ (5,64000) FMT C C READ DATA AND C COMPUTE STATISTICS OF WHOLE SAMPLE: TOTAL=0.0 SUMSQS=0.0 SSDEVS=0.0 C DO 300 I = 1,N C THE NEXT STMT WAS FOR USE WITH THE WATFOR COMPILER: C READ (5,*) X(I) C READ (5,FMT) X(I) TOTAL = TOTAL + X(I) SUMSQS = SUMSQS + X(I)*X(I) IF (I .EQ. 1) GO TO 100 GO TO 200 100 XMAX = X(1) XMIN = X(1) 200 CONTINUE IF (X(I) .LT. XMIN) XMIN=X(I) IF (X(I) .GT. XMAX) XMAX=X(I) 300 CONTINUE C WRITE DATA: WRITE (6,24000) WRITE (6,*) (X(I),I=1,N) C WRITE SUMMARY STATISTICS FOR WHOLE SAMPLE: WRITE (6,44000) WRITE (6,*) XMIN WRITE (6,46000) WRITE (6,*) XMAX XBAR = TOTAL/XN SSDEVS = SUMSQS- TOTAL*TOTAL/XN VARHAT = SSDEVS/XN WRITE (6,50000) XBAR WRITE (6,52000) VARHAT PI = 3.141593 TEMP = 2.0*PI*SSDEVS/N XMN2LL = N*(1.0 + ALOG(TEMP)) C IF ( VARHAT.LE. 0.0) VARHAT = 0.0 C STDDEV = SQRT(VARHAT) WRITE (6,54000) SSDEVS, XMN2LL, STDDEV NOPARM = 1 + 1 AIC = XMN2LL + 2.0*NOPARM WRITE (6,82000) AIC SCH = XMN2LL + ALOG(XN)*NOPARM XKASH = SCH - ALOG(2*VARHAT**3) WRITE (6,48000) XKASH C C C READ K, NUMBER OF CLUSTERS. READ (5,10000) K WRITE (6,36000) K C C READ INITIAL PRIOR PROBS AND MEANS: DO 400 IC=1,K READ (5,14000) P(IC),XMEAN(IC) 400 CONTINUE C READ INITIAL VARIANCE: READ (5,22000) VARHAT C C C C WRITE INITIAL PRIOR PROBS, MEANS AND VARIANCE: WRITE (6,26000) DO 500 IC = 1,K WRITE (6,28000) IC, P(IC), XMEAN(IC) 500 CONTINUE WRITE (6,29000) VARHAT C C SET CONSTANTS. DO 600 INTGER=1,N IOTA(INTGER) = INTGER 600 CONTINUE C ITER = 1 C IF THE INITIAL PRIOR PROBABILITIES ARE EQUAL, THEN THE C FIRST ITERATION IS EQUIVALENT TO MINIMUM DISTANCE CLUSTERING C TO INITIAL MEANS (I.E., "ISODATA"). C IN GENERAL, THE CLUSTERING IS BY MAXIMUM POSTERIOR C PROBABILITY CLUSTERING. C 700 CONTINUE C IF (ITER .EQ. 1) GO TO 900 C STORE OLD CLUSTERING: DO 800 I = 1,N ICLSOL(I) = ICLUS(I) 800 CONTINUE C C COMMENCE DISTANCE COMPUTATIONS. 900 CONTINUE DO 1100 I = 1,N DO 1100 IC = 1,K DSQ(IC) = ( XMEAN(IC) - X(I) )**2 ZSQ = DSQ(IC)/VARHAT IF ( ZSQ .LE. 174.673 ) GO TO 1000 F(I,IC) = 0.0 1000 CONTINUE C NOTE THAT A PROB. DENSITY FUNCTION OTHER THAN THE GAUSSIAN C COULD BE USED HERE: F(I,IC) = EXP(-ZSQ/2.0) 1100 CONTINUE C XMNDSQ(I) = MIN SQ. DISTANCE FROM X(I) TO ANY MEAN C C C 1200 CONTINUE C C COMPUTE POSTERIOR PROBABILITIES OF GROUP MEMBERSHIP: DO 1300 I = 1,N DENOM(I) = 0.0 DO 1300 IH=1,K DENOM(I) = DENOM(I) + P(IH)*F(I,IH) 1300 CONTINUE DO 1400 I = 1,N DO 1400 IC=1,K C IF ( DENOM(I) .EQ. 0.0 ) DENOM(I)=0.0001 PP(IC,I)= P(IC)*F(I,IC)/DENOM(I) 1400 CONTINUE C C COMPUTE NEW LABELS BY MAX POSTERIOR PROBABILITY: DO 1600 I = 1,N XMXPR(I) = PP(1,I) ICLUS(I) = 1 DO 1600 IC = 2,K IF ( PP(IC,I) .GT. XMXPR(I) ) GO TO 1500 GO TO 1600 1500 XMXPR(I) = PP(IC,I) ICLUS(I) = IC 1600 CONTINUE C C C WRITING OF NEW LABELS SUPPRESSED: TOO MUCH VOLUME OF OUTPUT. C WRITE NEW LABELS: C WRITE (6,16000) C WRITE (6,18000) (IOTA(I), I=1,N) C WRITE (6,20000) (ICLUS(I), I=1,N) C C C UPDATE CLUSTER PRIOR PROBABILITIES P(IC), MEANS XMEAN(IC) AND C VARIANCE VARHAT: WGSS = 0.0 DO 1800 IC = 1,K XNC(IC) = 0.0 C XNC(IC) WILL BE THE SUM OVER ALL N OBSERVATIONS OF THEIR C POSTERIOR PROBABILITIES OF MEMBERSHIP IN CLUSTER IC. SUM(IC) = 0.0 SS(IC) = 0.0 DO 1700 I = 1,N XNC(IC) = XNC(IC) + PP(IC,I) SUM(IC) = SUM(IC) + PP(IC,I)*X(I) SS(IC) = SS(IC) + PP(IC,I)*X(I)*X(I) 1700 CONTINUE IF ( XNC(IC) .EQ. 0.0 ) GO TO 2700 XMEAN(IC) = SUM(IC)/XNC(IC) C SSD(IC) = SS(IC) - SUM(IC)*SUM(IC)/XNC(IC) C VAR(IC) = SSD(IC)/XNC(IC) C IF ( VAR(IC) .LE. 0.0 ) VAR(IC) = 0.0 C SD(IC) = SQRT(VAR(IC)) P(IC) = XNC(IC)/XN WGSS = WGSS + SSD(IC) 1800 CONTINUE C C COUNT NUMBERS IN CLUSTERS: DO 1900 IC = 1,K NC(IC) = 0 1900 CONTINUE DO 2000 I = 1,N IGROUP = ICLUS(I) NC(IGROUP) = NC(IGROUP) + 1 2000 CONTINUE C C C VARHAT = WGSS/XN C SUMLNF = 0.0 DO 2200 I=1,N SUMPXF = 0.0 DO 2100 IC=1,K SUMPXF = SUMPXF + P(IC)*F(I,IC) 2100 CONTINUE SUMLNF = SUMLNF + ALOG(SUMPXF) 2200 CONTINUE C XMN2LL = -2.0*SUMLNF C C WGMS = WGSS/(N-K) WRITE (6,40000) WGSS, XMN2LL, WGMS STDERR = SQRT(WGMS) WRITE (6,74000) STDERR C KM1 = K-1 DO 2300 IC=1,KM1 ICP1 = IC+1 C B(IC) IS BOUNDARY BETWEEN G-TH AND G+1-ST CLASSES. B(IC) = ( XMEAN(IC) + XMEAN(ICP1) )/2.0 BDYADJ = DLOG(P(ICP1)) - DLOG(P(IC)) BDYADJ = BDYADJ/(XMEAN(ICP1) - XMEAN(IC)) BDYADJ = VARHAT*BDYADJ B(IC) = B(IC) - BDYADJ 2300 CONTINUE WRITE (6,58000) ITER WRITE (6,42000) (B(IC),IC=1,KM1) WRITE (6,38000) (XMEAN(IC),IC=1,K) IF (ITER .EQ. 1) GO TO 2500 DO 2400 I = 1,N IF (ICLUS(I) .EQ. ICLSOL(I)) GO TO 2400 GO TO 2500 2400 CONTINUE GO TO 2900 2500 CONTINUE ITER = ITER + 1 IF (ITER.GE.21) GO TO 2600 GO TO 700 2600 WRITE (6,80000) 2700 CONTINUE WRITE (6,88000) 2800 STOP C C 2900 CONTINUE C WRITE (6,60000) (SUM(IC),IC=1,K) WRITE (6,62000) (NC(IC),IC=1,K) WRITE (6,72000) (VAR(IC),IC=1,K) WRITE (6,68000) (SD(IC), IC=1,K) C VARHAT IS MLE OF VARIANCE. VARHAT = WGSS/N WRITE (6,30000) VARHAT C C IF (N .GE. 26) GO TO 3050 WRITE (6,78000) DO 3000 I=1,N WRITE (6,76000) X(I),ICLUS(I),(PP(IC,I),IC=1,K) 3000 CONTINUE C 3050 CONTINUE C C COMPUTE MODEL-SELECTION CRITERIA: C NO. PARAMETERS = K MEANS + 1 VARIANCE + (K-1) PROBS. NOPARM = K + 1 NOPARM = NOPARM + K - 1 WRITE (6,86000) NOPARM AIC = XMN2LL + 2.0*NOPARM SCH = XMN2LL + ALOG(XN)*NOPARM C C SCHWARZ' CRITERION IS FIRST-DEGREE EXPANSION OF C LOG POSTERIOR PROBABILITY OF THE MODEL. C KASHYAP'S CRITERION IS SECOND-DEGREE EXPANSION OF SAME. XKASH = SCH - ALOG(2*VARHAT**3) C WRITE (6,82000) AIC WRITE (6,84000) SCH WRITE (6,48000) XKASH C C C STOP 10000 FORMAT(2X,I1) 12000 FORMAT(2X,I4) 14000 FORMAT(5X,F3.2,2X,F8.2) 16000 FORMAT(1X,'CLUSTERING') 18000 FORMAT(1X,'POINT: '/, (1X,40I3)) 20000 FORMAT(1X,'CLUSTER: '/, (1X,40I3)) 22000 FORMAT(5X,F8.2) 24000 FORMAT(1X,'DATA:'/) 26000 FORMAT(/1X,'INITIAL PRIOR PROBS, MEANS AND VARIANCE:') 28000 FORMAT(1X,I2,2X,2F11.2) 29000 FORMAT(1X, 'INITIAL VARIANCE = ',F14.5/) 30000 FORMAT(1X, 'M.L. ESTIMATE OF COMMON VARIANCE = ',F14.5/) 32000 FORMAT(1X,'CMS DSN = MIX1CM ISOPAC') 34000 FORMAT(1X,'COPYRIGHT 1991 STANLEY LOUIS SCLOVE.'/) 36000 FORMAT('1',1X,'K = ',I1,' CLUSTERS') 38000 FORMAT(1X,'MEANS: ',9F13.2) 40000 FORMAT(/1X,'WGSS = ',F14.4,' MINUS 2 LOG LIKELIHOOD = ', XF14.4, ' WGMS = ',F14.4/) 42000 FORMAT(1X,'BOUNDARIES:', 9X, 9F13.2) 44000 FORMAT(1X,'MINIMUM OF SAMPLE: ') 46000 FORMAT(1X,'MAXIMUM OF SAMPLE: ') 48000 FORMAT(/,1X,'KASHYAP CRITERION = ', F14.4) 50000 FORMAT(/,1X,'MEAN = ', F14.4) 52000 FORMAT(1X, 'M.L. ESTIMATE OF VARIANCE = ',F14.5/) 54000 FORMAT(/1X,'SSDEVS = ',F14.4,' MINUS 2 LOG LIKELIHOOD = ', XF14.4, ' STDDEV = ',F14.4/) 56000 FORMAT('1','****************************************', X/,1X,'PROGRAM MIX1CM ISOPAC '/ X,1X,'FOR CLUSTERING UNIVARIATE DATA (DATA ON THE LINE)'/ X1X,'DEVELOPED AND PROGRAMMED BY DR. STANLEY L. SCLOVE' X/,1X,'VERSION 3.4 12-JAN-92 '/) 58000 FORMAT(1X,'ITERATION ', I2) 60000 FORMAT(1X,'SUMS:',6X,9F13.2) 62000 FORMAT(1X,'NUMBERS:',3X,9(I10,3X)) 64000 FORMAT(18A4) 66000 FORMAT(1X,'N = ',I3/) 68000 FORMAT(1X,'STD.DEVS.: ',9F13.2) 70000 FORMAT(1X,18A4/) 72000 FORMAT(1X,'VARIANCES: ',9F13.2) 74000 FORMAT(1X,'STD.ERROR=SQRT(WGMS) = ',F13.4/) 76000 FORMAT(1X,F8.2,2X,I2,3X,(8F6.3/)) 78000 FORMAT(1X/,1X,'DATA, LABELS AND PROBS OF CLUSTER MEMBERSHIP'/) 80000 FORMAT(1X,'PROGRAM HAS NOT CONVERGED IN 20 ITERATIONS. STOP') 82000 FORMAT(1X,'AIC = ', F14.4/) 84000 FORMAT(1X,'SCHWARZ CRITERION = ', F14.4/) 86000 FORMAT(/1X,'NUMBER OF PARAMETERS = ',I4/) 88000 FORMAT(1X,'NO OBSERVATIONS IN GROUP ',I3,'. STOP') C WITH THE WATFOR COMPILER, USE THE FOLLOWING STMT AFTER END C INSTEAD OF //GO.SYSIN DD * . C $DATA END