#svl // kmeans.svl k-means Cluster // // 04-feb-2016 (yk) fix Plot button error // 29-jan-2014 (rk) replaced cattok with tok_cat for MOE 2013.08 // 09-sep-2010 (rk) modified to treat 0 as non-blank // 08-jun-2010 (rk) added the failure limit to stop the iteration // if fails to reduce the cost in the limited time // 07-jun-2010 (rk) added 'Print Cluster Center Numbers' option // 03-jun-2010 (rk) made iteration limit controllable with optional // parameters // 04-feb-2010 (rk) added option for diverse seeds beginning from farthest // traversal pair // 02-feb-2010 (rk) reuse seeds if already selected // 02-feb-2010 (rk) added microplate well numbers, 96, 384, 1536, or 3456 // to the selectors for k // 29-jan-2010 (rk) fail-safe to confirm the selection of the descriptors // 28-jan-2010 (rk) added option to select the global center as the // initial center // 26-jan-2010 (rk) added k-means# algorithm for selecting initial centers // 25-jan-2010 (rk) call kmeans_Plot3D once // 21-jan-2010 (rk) added 'Normalize Descriptors to Unit Variance' option // 21-jan-2010 (rk) added 3D Plot functionality // 21-jan-2010 (rk) added k-means++ algorithm for selecting initial centers // 19-jan-2010 (rk) terminate if the centers appear twice to avoid // oscillation // 01-dec-2006 (kt) added 'Keep selected entries to center' option // 30-nov-2006 (kt) make a few modifications // 05-jul-2006 (rk) replaced distance with sqr distance // 03-jul-2006 (rk) replaced loop with apt in reading descriptors // 03-jul-2006 (rk) added time counter for each process // 30-jun-2006 (rk) added process counter // 30-jun-2006 (kt) lessened memory usage using 'cat' to append data // on reading MDB // 29-jun-2006 (rk) corrected typos and added bubbleHelp's // 26-jan-2006 (kt) create // // COPYRIGHT (C) 2017 MOLSIS INC. ALL RIGHTS RESERVED. // // PERMISSION TO USE, COPY, MODIFY AND DISTRIBUTE THIS SOFTWARE IS HEREBY // GRANTED PROVIDED THAT: (1) UNMODIFIED OR FUNCTIONALLY EQUIVALENT CODE // DERIVED FROM THIS SOFTWARE MUST CONTAIN THIS NOTICE; (2) ALL CODE DERIVED // FROM THIS SOFTWARE MUST ACKNOWLEDGE THE AUTHOR(S) AND INSTITUTION(S); (3) // THE NAMES OF THE AUTHOR(S) AND INSTITUTION(S) NOT BE USED IN ADVERTISING // OR PUBLICITY PERTAINING TO THE DISTRIBUTION OF THE SOFTWARE WITHOUT // SPECIFIC, WRITTEN PRIOR PERMISSION; (4) ALL CODE DERIVED FROM THIS SOFTWARE // BE EXECUTED WITH THE MOLECULAR OPERATING ENVIRONMENT (MOE) LICENSED FROM // MOLSIS INC. // // MOLSIS INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS // SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, // AND IN NO EVENT SHALL MOLSIS INC. BE LIABLE FOR ANY // SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER // RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF // CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN // CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. // // References: // k-means: // J. B. MacQueen, Some Methods for classification and Analysis of Multivariate // Observations, Proceedings of 5-th Berkeley Symposium on Mathematical // Statistics and Probability, Berkeley, University of California Press, // 1:281-297 (1967). // // k-means++: // D. Arthur, and S. Vassilvitskii, k-means++: the Advantages of Careful // Seeding, http://www.stanford.edu/~darthur/kMeansPlusPlus.pdf. // // k-means#: // N. Ailon, R. Jaiswal, and C. Monteleoni, Streaming k-means approximation, // http://books.nips.cc/papers/files/nips22/NIPS2009_1085.pdf #set title 'Database k-means Cluster' #set class 'MOE:database' #set version 'RSI 2014.01.29' #set main 'KMeans' function KMeans, kmeans_Plot3D; function db_PrincipalComponents, QuaSAR_Descriptor_List; const WINDOW_NAME = 'k-means Cluster'; const KMEANS_DEFAULTS = [ mdb: '', // source field cluster: '$CLUSTER', // cluster field center: '$CENTER', // cluster center field k: 10, // number of clusters pca: 'PCA', // PCA field method: 'k-means#', // method for selecting seeds seed1: 'Farthest Traversal', // the first seed for the Diverse subset // selection keep_centers: 0, // Keep selected entries to centers plot_seeds: 0, // plot seeds in 3D PCA space fld: '', // first QuaSAR_Descriptor_List [] // descriptor fields desc_normalize: 1, // normalization of each field? print_centers: 0, // print center numbers? lmt: 100, // iteration limit fail_lmt: 10 // failure limit ]; const tags_for_seeds = [ 'mdb', 'cluster', 'center', 'k', 'pca', 'method', 'seed1', 'fld', 'desc_normalize' ]; local function KMeansGUI v if MOE_BATCH or WindowShow [WINDOW_NAME, 1] then return []; endif if isfalse v.mdb then // invoked in DBV v.mdb = task_getenv 'DB_VIEW'; endif if isfalse v.mdb then // invoked in MOE v.mdb = FilePrompt [ 'k-means Diverse Subset: Select a Database','open','*.mdb' ]; if v.mdb === [] then return; endif endif v.mdb = db_Filename v.mdb; local db = db_Open v.mdb; local nfields = db_NumericFields db; db_Close db; local wkey = WindowCreate [ windowName: WINDOW_NAME, title: tok_cat[WINDOW_NAME, ' ', (modenv[]).version], name: 'panel', text: ['Cluster', 'Plot', 'Cancel'], onTrigger: 'return', bubbleHelp: 'Cluster: Run clustering.\n' 'Plot: Plot clustered entries in 3D PCA space.\n' 'Cancel: Close this panel.', Text: [ title: 'Database:', name: 'mdb', sensitive: 0, len:40, extendH: 1, bubbleHelp: 'The database to operate upon.' ], Mbox:[columns:2, Text:[title:'Cluster Field:', name:'cluster', type:'char', bubbleHelp: 'The field name for the sequential number of clusters.' ], Text:[title:'Center Field:', name:'center', type:'char', bubbleHelp: 'The field name for the center flag of clusters\n' ' 1: the closest to the center;\n' ' 0: other.' ], Text:[title: 'No. of Clusters:', name: 'k', len:10, type:'int', shortcut:['96', '384', '1536', '3456'], // the number of wells bubbleHelp: 'The number of clusters, k, to generate.' ], Text:[title:'PCA Prefix:', name:'pca', type:'char'] ], Separator:[], Vbox:[ extendH: 1, Radio: [ title: 'Seed Entries:', name: 'method', text: [ 'First', 'Selected', 'Diverse', 'Random', 'k-means++', 'k-means#' ], onTrigger: 'return', extendH: 1, bubbleHelp: 'Method for selecting the entries as the first cluster ' 'center seeds:\n' ' First: The first k entries, where k stands for ' 'the number set in \'No. of Clusters\'\n' ' above.\n' ' Selected: Selected entries. ' '\'No. of Clusters\' will be replaced with \n' ' the numbers of selected entries.\n' ' Diverse: Select the first seed with First Seed ' 'option and then select the point\n' ' farthest from those picked so far;\n' ' Random: Randomly select k entries;\n' ' k-means++: Uniformly randomly select the first seed ' 'and then choose each\n' ' point at random with probability proportional to ' 'its squared distance \n' ' from the centers chosen already;\n' ' k-means#: Almost the same algorithm with k-means++ ' 'except for selecting\n' ' 3 * log k entries at once;' ], Radio: [ title: 'First Seed:', name: 'seed1', text: ['First Entry', 'Global Center', 'Farthest Traversal'], extendH: 1, bubbleHelp: 'First seed for the Diverse subset selection:\n' ' First Entry: The first entry in the database;\n' ' Global Center: The global center in the database;\n' ' Farthest Traversal: The farthest-first traversal in ' 'the database.' ], Checkbox:[ text: 'Keep Selected Entries to Centers', name:'keep_centers', onTrigger: 'return' ], Button : [ text : 'Plot Seeds in 3D Space...', font: 'mediumBold', name : 'plot_seeds', onTrigger: 'return', bubbleHelp: 'Plot the selected seeds in 3D Space in MOE window.' ] ], Separator:[], Vbox: [ title: 'Descriptors:', extendH: 1, extendV: 1, Listbox: [ name: 'fld', type: 'char', extendH:1, extendV:1, multiSelect:1, multiColumn:1, len: 8, text: nfields, onTrigger: 'return', bubbleHelp: 'Select scalar descriptors for calculating distances.' ] ], Checkbox : [ name: 'desc_normalize', text: 'Normalize Descriptors to Unit Variance' ], Checkbox : [ name: 'print_centers', text: 'Print Cluster Center Numbers' ], Hbox: [ Text : [ name: 'lmt', title: 'Iteration Limit:', type: 'real', min:0, shortcut: ['10','20','50','100','200','500','1000'], bubbleHelp: 'Iteration will be terminated when it\n' 'exceeds over the specified value.' ], Text : [ name: 'fail_lmt', title: 'Failure Limit:', type: 'real', min:0, shortcut: ['10','20','50','100','200','500','1000'], bubbleHelp: 'Iteration will be terminated when the cost\n' 'fails to be reduced over the specified times.' ] ] ]; WindowSetData [wkey, tagcat [[ mdb: v.mdb, // source field fld: first QuaSAR_Descriptor_List [], // descriptor fields lmt: KMEANS_DEFAULTS.lmt, // iteration limit lmt: KMEANS_DEFAULTS.fail_lmt // failure limit ], v]]; WindowSetAttr [wkey, [seed1: [sensitive: 0], keep_centers: [sensitive: 0]]]; WindowShow wkey; function PlotPCA val if istrue x_findmatch [ 'k-means 3D Plot', last untag (tr task_info [])(2) ] then // clear k-means 3D Plot panel and 3D PCA plot local taskkeylist = task_keylist []; task_kill taskkeylist(x_findmatch [ 'k-means 3D Plot', task_title taskkeylist ]); return; endif // if PCA's have not been calculated, calculate them local fields = first db_Fields val.mdb; local pca_fieldnames = tok_cat [val.pca, totok igen 3]; local m_pca_fields = m_findmatch [ pca_fieldnames, fields ]; local pca_fields = fields | m_pca_fields; if not (add m_pca_fields >= 3) then // no or less than 3 PCA // fields db_PrincipalComponents [ val.mdb, [yfield: [], wfield: [], xfields: val.fld, esel: 0], [autoscale: 1, prefix: val.pca, maxcomp: 3, minvar: 100, maxcond: 1e+006, write_pca: 1], '*cli*' ]; else // if no values exist in any cell, calculate them local ents = db_Entries val.mdb; local sents = keep [ents, min [length ents, 100]]; local data = apt db_ReadFields [val.mdb, sents, [pca_fieldnames]]; if orE app isfalse cat data then db_PrincipalComponents [ val.mdb, [yfield: [], wfield: [], xfields: val.fld, esel: 0], [autoscale: 1, prefix: val.pca, maxcomp: 3, minvar: 100, maxcond: 1e+006, write_pca: 1], '*cli*' ]; endif endif local [tkey, code] = task_fork [master: 'mutual']; if code === 'child' then local plot_opt = [ sel_ent: 0, xfield: tok_cat [val.pca, '1'], yfield: tok_cat [val.pca, '2'], zfield: tok_cat [val.pca, '3'], act: val.cluster, thres_val: 'Unique' ]; run ['kmeans3dplot.svl', [v.mdb, plot_opt]]; endif endfunction local mkey = Message[0,'']; local settings_for_selected_seeds = []; local old_keep_centers = 0; local seeds = [], cost; loop local [val, trig] = WindowWait wkey; if trig === 'panel' then if val.panel === 'Cluster' then if val.method === 'Selected' then if isnull dbv_SelectedEntries db then Warning 'Please select entries for initial centers!'; continue; endif endif if isfalse val.mdb then val.mdb = v.mdb; endif val.fld = first val.fld; if isnull val.fld then Warning 'Please select scalar descriptors or a bit ' 'fingerprint,\n\'FP:BIT_*\', for calculating ' 'distances!'; continue; endif mkey = Message[mkey,'k-means Cluster: Calculating...']; WindowSetAttr [wkey, [panel: [sensitive : [0, 1, 1]]]]; if eqL [settings_for_selected_seeds, val | m_findmatch [ tags_for_seeds, first untag val ]] then val = tagcat [ [seeds: seeds, k1: length seeds, keep_centers: 1], val ]; endif [seeds, cost] = KMeans val; WindowSetAttr [wkey, [panel: [sensitive : [1, 1, 1]]]]; mkey = Message[mkey,'']; elseif val.panel === 'Plot' then mkey = Message[mkey,'k-means Cluster: Plotting...']; val.fld = first val.fld; // (yk) 04-feb-2016 PlotPCA tagcat [[act: val.cluster], val]; if not WindowKey wkey then return; endif elseif val.panel === 'Cancel' then mkey = Message[mkey,'']; WindowDestroy wkey; return; endif elseif trig === 'method' then WindowSetAttr [wkey, [seed1: [ sensitive: val.method === 'Diverse' ]]]; if istrue x_findmatch [ val.method, ['First', 'Selected', 'Diverse', 'Random'] ] then WindowSetAttr [wkey, [keep_centers: [sensitive: 1]]]; WindowSetData [wkey, [keep_centers: old_keep_centers]]; else WindowSetAttr [wkey, [keep_centers: [sensitive: 0]]]; WindowSetData [wkey, [keep_centers: 0]]; endif elseif trig === 'keep_centers' then old_keep_centers = val.keep_centers; elseif trig === 'plot_seeds' then mkey = Message[mkey,'k-means Cluster: Calculating Seeds...']; val.plot_seeds = 1; val.fld = first val.fld; [seeds, cost] = KMeans val; if istrue seeds then mkey = Message[mkey,'k-means Cluster: Plotting Seeds...']; PlotPCA tagcat [[act: '(None)'], val]; settings_for_selected_seeds = val; if not WindowKey wkey then return; endif endif elseif trig === 'fld' then // if bit fields are selected, gray out 'Normalize' option local fbit= 0; // mask the bit fields local mbit = m_findmatch [ ['FP:BIT_Daylight', 'FP:BIT_MACCS'], first val.fld ]; // no. of bit fields local nbit = add mbit; if nbit == 1 then fbit = 1; WindowSetAttr [wkey, [desc_normalize: [sensitive:0]]]; elseif nbit > 1 then Warning 'Please select a single bit field!'; write 'Please use single bit field!\n'; local bitfld = first val.fld | mbit; write 'Bit fields: '; print bitfld; WindowSetAttr [wkey, [desc_normalize: [sensitive:1]]]; else WindowSetAttr [wkey, [desc_normalize: [sensitive:1]]]; endif if neL [val.fld, settings_for_selected_seeds] then settings_for_selected_seeds = val; endif endif endloop return; endfunction // normalize_descriptors will normalize a vector of data by dividing the data // by the standard deviation local function normalize_descriptors x local u = add x * invz length x; local s = add sqr x * invz dec length x - sqr u; return (x - [u]) * [invz sqrt s]; endfunction // kmeans_Classify classifies all data points to the nearest centers local function kmeans_Classify [data, cent, val] local nent = length data, idx = igen nent; local class = one idx; local i, d2, cost = 0; for i in idx loop if isnull join [i, cent] then if val.fbit then class(i) = x_min app add app bitcount bitxor [data[cent], data[i] ]; else class(i) = x_min app add sqr sub [data[cent], data[i]]; endif else class(i) = x_join [cent, i]; endif endloop return class; endfunction // kmeans_SelectCenters calculates culster center of each class local function kmeans_SelectCenters [data, class, val] local i, x_class, kidx, pos, cpos = [], newcent =[]; if val.k1 then newcent[igen val.k1] = val.keep_idx; endif for i = val.k1 + 1, val.k loop x_class = x_join[class, i]; if not isnull (kidx = join [val.keep_idx, x_class]) then newcent(i) = kidx; else pos = data[x_class]; if val.fbit then // binary centers cpos(i) = floor (add pos / length pos * 2); newcent(i) = x_class [ x_min app add app bitcount apt bitxor [cpos[i], pos] ]; else // Cartesian distance centers cpos(i) = add pos / length pos; newcent(i) = x_class[x_min app add sqr sub [[cpos(i)], pos]]; endif endif endloop return sort newcent; endfunction // SeedDiverse returns diverse seeds such that // 1. If val.use_global_center then select the center of the whole database // else use the first entry. // 2. Select the entry with the largest distance from the first center. // 3. Select the entry with the largest sum of distances from the previous // centers. // 4. Repeat 3 until the number of seeds get to val.k. local function SeedDiverse [data, val] local nent = length data, idx = igen nent; // 1. Select the first center. local i = 1; if val.seed1 === 'First Entry' then local seed = [1]; else // if val.seed1 === 'Global Center' // or val.seed1 === 'Farthest Traversal' then // select the global center seed = kmeans_SelectCenters [ data, one igen length data, // class, tagcat [ [k1: 0, k: 1, method: 'Random'], val ] ]; endif if val.seed1 === 'Farthest Traversal' then // select the farthest point from the global center if val.fbit then // binary farthest point from the global center seed = nest x_max app add app bitcount apt bitxor [ data, data[seed] ]; else // Cartesian farthest point from the global centerpair seed = x_max app add sqr sub [data, data[seed]]; endif endif if i >= val.k then return seed; endif // if val.cent already exist, add them to the seed if istrue val.cent then seed = uniq cat [seed, val.cent]; i = length seed; endif if i >= val.k then return seed; endif local j, d = [], idmax; loop // 2. Select the entry with the largest distance from the first // center(s). // 3. Select the entry with the largest sum of distances from the // previous centers. d = zero idx; if val.fbit then for j in diff [idx, seed] loop d(j) = add app add app bitcount bitxor [data[seed], data[j]]; endloop else for j in diff [idx, seed] loop d(j) = add app add sqr sub [data[seed], data[j]]; endloop endif idmax = x_max d; // 5. Add x_i to seed. if not indexof [idmax, seed] then seed = sort cat [seed, idmax]; i = inc i; endif // 4. Repeat step 3 until we have chosen k centers. until i >= val.k endloop return seed; endfunction // SeedKMeansPlusPlus returns universally randomly selected initial centers // according to the k-mean++ algorithm local function SeedKMeansPlusPlus [data, val] local i, j, k = val.k, nent = length data, idx = igen nent; local d2, sumd2 = [], seed = [], y, iy; // 1. Choose one point uniformly at random from (x_1, x_2, ..., x_n), // and add it to C. if istrue val.cent then seed = val.cent; i = length seed; else // randU generates numbers in [0, 1) * nent seed(1) = 1 + floor randU nent; i = 1; endif if i >= val.k then return seed; endif // 2. For each point x_i, set D(x_i) to be the distance between x_i and // the nearest point in C. while i < k loop for j = 1, nent loop if val.fbit then d2(j) = min app add app bitcount apt bitxor [ data[j], data[seed] ]; // d2(j) = min app bitcount bitxor [data[seed], data[j]]; else d2(j) = min app add app sqr apt sub [data[seed], data[j]]; endif if j == 1 then sumd2(j) = d2(j); else sumd2(j) = sumd2(j - 1) + d2(j); endif endloop // 3. Choose a real number y uniformly at random between 0 and // D(x_1)^2 + D(x_2)^2 + ... + D(x_n)^2. y = randU sumd2(nent) + 1; // 4. Find the unique integer i so that // D(x_1)^2 + D(x_2)^2 + ... D(x_i)^2 >= y // > D(x_1)^2 + D(x_2)^2 + ... + D(x_(i-1))^2. iy = first (idx | sumd2 >= y); // 5. Add x_i to C. if isfalse indexof [iy, seed] then seed = sort cat [seed, iy]; i = inc i; endif // 6. Repeat Steps 2-5 until we have chosen k centers. endloop return seed; endfunction // SeedKMeansSharp returns universally randomly selected initial centers // choosing 3 * log k at the same time up to k according to the k-mean# // algorithm local function SeedKMeansSharp [data, val] local i, j, k = val.k, nent = length data, idx = igen nent; local kadd, d2, sumd2 = [], seed = [], y, iy; // 1. Choose 3 * log k centers independently and uniformly at random // from X. if istrue val.cent then seed = val.cent; i = length seed; else // randU generates numbers in [0, 1) * nent seed(1) = 1 + floor randU nent; i = 1; endif if i >= val.k then return seed; endif // 2. Repeat (k - 1) times. while i < k loop for j = 1, nent loop if val.fbit then d2(j) = min app add app bitcount bitxor [data[j], data[seed]]; else d2(j) = min app add sqr sub [data[seed], data[j]]; endif if j == 1 then sumd2(j) = d2(j); else sumd2(j) = sumd2(j - 1) + d2(j); endif endloop // 3. Choose 3 * log k centers independently and with probability // D(x')^2 / Sigma_(x in X) D(x)^2 . // (here D(.) denotes the distances w.r.t. to the subset // of points chosen in the previous rounds) kadd = toint (3 * log k); y = randU (sumd2(nent) * one igen kadd); // 4. Find the unique integer i's so that // D(x_1)^2 + D(x_2)^2 + ... D(x_i)^2 >= y // > D(x_1)^2 + D(x_2)^2 + ... + D(x_(i-1))^2. iy = app first apt mget [[idx], ltE [y, tr sumd2]]; // 5. Add x_i's to C until the number up to k. iy = uniq diff [iy, seed]; if istrue iy then seed = sort cat [seed, sample [iy, min [k - i, length iy]]]; i = length seed; endif // 2. Repeat (k - 1) times. endloop return seed; endfunction // kmeans_Cost calculates the cost for the current clustering local function kmeans_Cost [data, class, val] if val.fbit then return add app add app bitcount bitxor [data[class], data]; else return add app add sqr sub [data[class], data]; endif endfunction global function KMeans val // [mdb, k, fld, lmt] val = tagcat [val, KMEANS_DEFAULTS]; if orE app isfalse [val.mdb, val.k, val.fld] then KMeansGUI val; exit[]; endif // if bit fields are selected, drop other descriptors local fbit= 0; // mask the bit fields local mbit = m_findmatch [['FP:BIT_Daylight', 'FP:BIT_MACCS'], val.fld]; // no. of bit fields local nbit = add mbit; if nbit > 1 then Warning 'Please select a single bit field!'; write 'Please use single bit field!\n'; val.fld = val.fld | mbit; write 'Bit fields: '; print val.fld; return []; elseif nbit == 1 then fbit = 1; val.fld = val.fld | mbit; else // if nbit < 1 then // do not change val.fld endif val.fbit = fbit; write ['KMeans ']; print val | not m_findmatch [ ['panel', 'lmt'], first untag val]; write '\n'; local msg_key = Message [0, '']; local msg = token swrite [ 'KMeans {} started: {}.', (modenv[]).version, asctime[] ]; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; write '\n'; local tstart = cpuclock []; if isnull val.k or val.k < 1 then return; endif if isnull val.lmt then val.lmt = 100; endif local db = db_Open [val.mdb, 'read-write']; if not MOE_BATCH then dbv_Open db_Filename db; endif if isnull val.fld then val.fld = db_NumericFields db; endif // clear entry selection local ent = db_Entries db; local nent = length ent; local idx = igen nent; local sstates = dbv_EntrySelected [db, ent]; dbv_EntrySetSelected [db, ent | sstates, 0]; val.k = min [val.k, length sstates, length ent]; local tord_k = totok floor inc log10 val.k; // visualize unvisible entries local vstates = dbv_EntryVisible [db, ent]; dbv_EntrySetVisible [db, ent | not vstates, 1]; local function db_CheckField [db, fldname, fldtype] if not (db_FieldType [db, fldname] === fldtype) then db_DeleteField [db, fldname]; elseif db_FieldType [db, fldname] === fldtype then return; endif db_CreateField [db, fldname, fldtype]; endfunction local function db_WriteClusters [db, ent, cluster, cent, val] if not MOE_BATCH then dbv_EntrySelection [db, m_join[igen length ent, cent]]; endif local i; for i = 1, length ent loop db_Write [ db, ent(i), tag [ [val.cluster, val.center], [cluster(i), orE m_join[i, cent]] ] ]; endloop endfunction local outfield = apt tagpeek [[val], ['pca','cluster','center']]; outfield(1) = app tok_cat tr [rep[outfield(1), 3], ['1','2','3']]; outfield = [cat outfield,['float','float','float','int','int']]; apt db_CheckField cat [db, outfield]; // ***** read descriptors ***** local trstart = cpuclock []; msg = token swrite ['Reading descriptors...']; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; local i, data = [], m_data1, m_data, nfld; if fbit then data = db_ReadColumn [db, val.fld]; else data = apt db_ReadFields [db, ent, [val.fld]]; endif if fbit then m_data1 = app isnull data; if orE m_data1 then write ['{}\n', 'Please fill the blank cells:']; local line = tok_cat rep ['-', 79]; write ['{}\n', line]; write ['Entry # | Field Name(s)\n']; write ['{}\n', line]; m_data = split [m_data1, nfld]; for i = 1, nent loop if isfalse m_data(i) then continue; endif write ['{n:7} | {}\n', i, val.fld]; endloop write line; Warning 'Please fill the blank cells\n' 'listed in the SVL Commands window!'; if not MOE_BATCH then msg_key = Message [msg_key, []]; endif return 0; endif else m_data1 = app isnull cat data; nfld = length val.fld; if orE m_data1 then write ['{}\n', 'Please fill the blank cells:']; line = tok_cat rep ['-', 79]; m_data = split [m_data1, nfld]; write line; write ['Entry # | Field Name(s)\n']; write line; for i = 1, nent loop if isfalse m_data(i) then continue; endif write ['{n:7} | {}\n', i, val.fld | m_data(i) ]; endloop write line; Warning 'Please fill the blank cells\n' 'listed in the SVL Commands window!'; if not MOE_BATCH then msg_key = Message [msg_key, []]; endif return 0; endif endif if not fbit then if val.desc_normalize then msg = token swrite ['Normalizing descriptors...']; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; data = normalize_descriptors data; endif endif msg = token swrite ['Reading done: {}', asctime[]]; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; write ['Reading elapsed: {n:20.3f} sec.\n\n', cpuclock[] - trstart]; // ***** minimize total intra-cluster variance ***** // assign the initial cluster centers local tsstart = cpuclock []; msg = token swrite ['Assigning the initial cluster centers...']; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; local keep_idx = []; // indeces of centers to be kept local lmt_idx = 0; local cent, cents = [], fail = 0; local newcent, class, cost, dcent; local min_cycle, min_dcent, min_cent, min_class, min_cost = REAL_MAX; if length val.seeds then keep_idx = val.seeds; elseif val.keep_centers and istrue sstates then keep_idx = idx | sstates; newcent = sort keep [uniq cat [keep_idx, newcent], val.k]; endif local k1 = length newcent; // number of centers to be kept if val.method === 'First' then newcent = cat [newcent, keep [idx, val.k - k1]]; elseif val.method === 'Selected' then newcent = idx | sstates; val.k = length newcent; elseif val.method === 'Diverse' then newcent = SeedDiverse [data, tagcat [[cent: newcent], val]]; elseif val.method === 'Random' then newcent = cat [newcent, sample [idx, val.k - k1]]; elseif val.method === 'k-means++' then newcent = SeedKMeansPlusPlus [ data, tagcat [[cent: newcent, fbit: fbit], val] ]; else // if val.method === 'k-means#' then // default initial seed selector newcent = SeedKMeansSharp [ data, tagcat [[cent: newcent, fbit: fbit], val] ]; endif msg = token swrite [ 'Assigning the initial cluster centers done: {}', asctime[] ]; write ['{}\n', msg]; msg_key = Message [msg_key, msg]; write [ 'Assigning the initial cluster centers elapsed: {n:20.3f} sec.\n\n', cpuclock[] - tsstart ]; if not MOE_BATCH then // clear entry selection sstates = dbv_EntrySelected [db, ent]; dbv_EntrySetSelected [db, ent | sstates, 0]; // visualize unvisible entries vstates = dbv_EntryVisible [db, ent]; dbv_EntrySetVisible [db, ent | not vstates, 1]; endif class = kmeans_Classify [data, newcent, tagcat [[fbit: fbit], val]]; db_WriteClusters [db, ent, class, newcent, val]; dcent = length newcent; cost = kmeans_Cost [data, class, val]; msg = token swrite ['{n:3}{} cycle:', 0, 'th']; write ['{} ', msg]; write [ tok_cat ['{n:', tord_k, '} changed centers;\tCost: {}\n'], dcent, cost ]; // write [' Initial\n Centers: {}\n', newcent]; if val.plot_seeds === 1 then // Plot Seeds in 3D PCA Space msg_key = Message [msg_key, '']; return [newcent, cost]; endif local function write_cycle [] local ord; if mod [lmt_idx, 10] == 1 then if lmt_idx == 11 then ord = 'th'; else ord = 'st'; endif elseif mod [lmt_idx, 10] == 2 then if lmt_idx == 12 then ord = 'th'; else ord = 'nd'; endif elseif mod [lmt_idx, 10] == 3 then if lmt_idx == 13 then ord = 'th'; else ord = 'rd'; endif else ord = 'th'; endif msg = token swrite [ '{n:3}{} cycle', lmt_idx, ord ]; write ['{}: ', msg]; if not MOE_BATCH then msg_key = Message [msg_key, msg]; endif endfunction local function write_stats [] write [ tok_cat ['{n:', tord_k, '} changed centers;\tCost: {}\tfail: {}\n'], dcent, cost, fail ]; if val.print_centers then write [' Centers: {}\n', newcent]; endif endfunction local tmstart = cpuclock []; msg = token swrite ['Minimizing total intra-cluster variance...']; write ['{}\n', msg]; if not MOE_BATCH then msg_key = Message [msg_key, msg]; endif loop lmt_idx = inc lmt_idx; if lmt_idx >= val.lmt then write ['***** Iteration limit, {}, reached! *****\n', val.lmt]; break; endif if fail >= val.fail_lmt then write ['***** Failure limit, {}, reached! *****\n', val.fail_lmt]; write ['***** Resume to the minimum cost clusters: *****\n']; lmt_idx = min_cycle; dcent = min_dcent; cost = min_cost; fail = 0; class = min_class; newcent = min_cent; write_cycle []; write_stats []; break; endif write_cycle []; // form clusters cent = newcent; cents = cat [cents, [cent]]; // lmt_idx = inc lmt_idx; // if lmt_idx > val.lmt then break; endif class = kmeans_Classify [data, cent, tagcat [ [k1: k1, keep_idx: keep_idx, fbit: fbit], val ]]; // compute new cluster centers newcent = kmeans_SelectCenters [ data, class, tagcat [ [k1: k1, keep_idx: keep_idx, fbit: fbit], val ] ]; if not MOE_BATCH then dbv_EntrySelection [db, m_join[igen length ent, newcent]]; endif // minimum difference in data centers from any previous centers dcent = min app length apt diff [cents, [newcent]]; cost = kmeans_Cost [data, class, val]; if cost < min_cost then min_cycle = lmt_idx; min_dcent = dcent; min_cost = cost; min_class = class; min_cent = newcent; fail = 0; else fail = inc fail; endif write_stats []; until not dcent endloop db_WriteClusters [db, ent, class, newcent, val]; // if not dcent then msg = token swrite [ 'Convergence has {}been reached: {}', select ['not ', '', dcent], asctime[] ]; // else // msg = token swrite [ // 'Convergence has been reached: {}', asctime[] // ]; // endif write ['{}\n', msg]; if not MOE_BATCH then msg_key = Message [msg_key, msg]; endif write [ 'Minimization elapsed: {n:20.3f} sec.\n\n', cpuclock[] - tmstart ]; write ['KMeans done: {}\n', asctime[]]; write ['KMeans elapsed: {n:20.3f} sec.\n\n', cpuclock[] - tstart]; if not MOE_BATCH then msg_key = Message [msg_key, []]; endif db_Close db; return [cent, cost]; endfunction