From 986cd230f9267d28c12ae27f9bb7abc89e7709ff Mon Sep 17 00:00:00 2001
From: Richard Townsend
Date: Sat, 19 Sep 2015 11:26:23 +0100
Subject: [PATCH] clustering: creates the package and implements DBSCAN

Verified against scikit-learn's implementation (gen_test.py)
---
 base/conversion.go | 69 ++++
 clustering/cluster_test.go | 82 ++++
 clustering/clustering.go | 107 +++++
 clustering/dbscan.csv | 750 +++++++++++++++++++++++++++++++++++
 clustering/dbscan.go | 188 +++++++++
 clustering/dbscan_labels.csv | 750 +++++++++++++++++++++++++++++++++++
 clustering/dbscan_test.go | 150 +++++++
 clustering/gen_test.py | 30 ++
 clustering/synthetic.csv | 5 +
 9 files changed, 2131 insertions(+)
 create mode 100644 base/conversion.go
 create mode 100644 clustering/cluster_test.go
 create mode 100644 clustering/clustering.go
 create mode 100644 clustering/dbscan.csv
 create mode 100644 clustering/dbscan.go
 create mode 100644 clustering/dbscan_labels.csv
 create mode 100644 clustering/dbscan_test.go
 create mode 100644 clustering/gen_test.py
 create mode 100644 clustering/synthetic.csv

diff --git a/base/conversion.go b/base/conversion.go
new file mode 100644
index 0000000..2c81eee
--- /dev/null
+++ b/base/conversion.go
@@ -0,0 +1,69 @@
+package base
+
+import (
+	"fmt"
+	"github.com/gonum/matrix/mat64"
+)
+
+// checkAllAttributesAreFloat returns an error as soon as it finds an element of attrs that is not a *FloatAttribute, and nil otherwise.
+func checkAllAttributesAreFloat(attrs []Attribute) error {
+	for _, a := range attrs {
+		if _, ok := a.(*FloatAttribute); !ok {
+			return fmt.Errorf("All []Attributes to this method must be FloatAttributes")
+		}
+	}
+	return nil
+}
+
+// ConvertRowToMat64 takes a list of Attributes, a FixedDataGrid
+// and a row number, and returns the float values of that row
+// in a mat64.Dense format.
+// Any error reported by checkAllAttributesAreFloat is returned unchanged,
+// together with a nil matrix.
+func ConvertRowToMat64(attrs []Attribute, f FixedDataGrid, r int) (*mat64.Dense, error) {
+	if err := checkAllAttributesAreFloat(attrs); err != nil {
+		return nil, err
+	}
+
+	// The result is a single row with one column per requested Attribute.
+	ret := mat64.NewDense(1, len(attrs), nil)
+
+	// Resolve the Attributes against f once, then convert each cell of
+	// row r with UnpackBytesToFloat, column by column.
+	attrSpecs := ResolveAttributes(f, attrs)
+	for i, a := range attrSpecs {
+		ret.Set(0, i, UnpackBytesToFloat(f.Get(a, r)))
+	}
+	return ret, nil
+}
+
+// ConvertAllRowsToMat64 takes a list of Attributes and returns a vector
+// of all rows in a mat64.Dense format: element i of the result is a
+// 1 x len(attrs) matrix built from row i of f.
+// Any error reported by checkAllAttributesAreFloat is returned unchanged,
+// together with a nil slice.
+func ConvertAllRowsToMat64(attrs []Attribute, f FixedDataGrid) ([]*mat64.Dense, error) {
+	if err := checkAllAttributesAreFloat(attrs); err != nil {
+		return nil, err
+	}
+
+	// The second value returned by f.Size() is used as the row count.
+	_, rows := f.Size()
+	ret := make([]*mat64.Dense, rows)
+
+	// Resolve the Attributes once up front rather than once per row.
+	attrSpecs := ResolveAttributes(f, attrs)
+	for i := 0; i < rows; i++ {
+		cur := mat64.NewDense(1, len(attrs), nil)
+		for j, a := range attrSpecs {
+			cur.Set(0, j, UnpackBytesToFloat(f.Get(a, i)))
+		}
+		ret[i] = cur
+	}
+	return ret, nil
+}
diff --git a/clustering/cluster_test.go b/clustering/cluster_test.go
new file mode 100644
index 0000000..8bb196f
--- /dev/null
+++ b/clustering/cluster_test.go
@@ -0,0 +1,82 @@
+package clustering
+
+import (
+	. "github.com/smartystreets/goconvey/convey"
+	"testing"
+)
+
+// TestClusterEquality exercises ClusterMap.Equals: identical maps and
+// re-labelled maps must compare equal, while maps with missing clusters,
+// missing points, split clusters or duplicated points must yield an error.
+func TestClusterEquality(t *testing.T) {
+
+	Convey("Should be able to determine if two cluster maps represent the same thing...", t, func() {
+
+		Convey("When everything's exactly the same...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[0] = []int{1, 2, 3}
+			m1[1] = []int{4, 5}
+
+			m2 := ClusterMap(make(map[int][]int))
+			m2[0] = []int{1, 2, 3}
+			m2[1] = []int{4, 5}
+
+			ret, err := m1.Equals(m2)
+			So(err, ShouldBeNil)
+			So(ret, ShouldBeTrue)
+		})
+
+		Convey("With re-labelled clusters...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[1] = []int{1, 2, 3}
+			m1[0] = []int{4, 5}
+
+			// Same partition as m1, but with the two cluster ids swapped.
+			m2 := ClusterMap(make(map[int][]int))
+			m2[0] = []int{1, 2, 3}
+			m2[1] = []int{4, 5}
+
+			ret, err := m1.Equals(m2)
+			So(err, ShouldBeNil)
+			So(ret, ShouldBeTrue)
+		})
+
+		Convey("With missing clusters...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[1] = []int{1, 2, 3}
+
+			m2 := ClusterMap(make(map[int][]int))
+			m2[1] = []int{1, 2, 3}
+			m2[0] = []int{4, 5}
+
+			_, err := m1.Equals(m2)
+			So(err, ShouldNotBeNil)
+		})
+
+		Convey("With missing points...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[1] = []int{1, 3}
+			m1[0] = []int{4, 5}
+
+			m2 := ClusterMap(make(map[int][]int))
+			m2[1] = []int{1, 2, 3}
+			m2[0] = []int{4, 5}
+
+			_, err := m1.Equals(m2)
+			So(err, ShouldNotBeNil)
+		})
+
+		Convey("With one cluster split across two...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[0] = []int{1, 2}
+			m1[1] = []int{}
+
+			m2 := ClusterMap(make(map[int][]int))
+			m2[0] = []int{1}
+			m2[1] = []int{2}
+
+			_, err := m1.Equals(m2)
+			So(err, ShouldNotBeNil)
+		})
+
+		Convey("With invalid maps...", func() {
+			m1 := ClusterMap(make(map[int][]int))
+			m1[0] = []int{1, 2, 3}
+			m1[1] = []int{4, 4, 5}
+
+			m2 := ClusterMap(make(map[int][]int))
+			m2[0] = []int{1, 2, 3}
+			m2[1] = []int{4, 5}
+
+			_, err := m1.Equals(m2)
+			So(err, ShouldNotBeNil)
+		})
+
+	})
+
+}
diff --git a/clustering/clustering.go b/clustering/clustering.go
new file mode 100644
index 0000000..a4a8176
--- /dev/null
+++ b/clustering/clustering.go
@@ -0,0 +1,107 @@
+// Package clustering implements clustering algorithms.
+package clustering
+
+import (
+	"fmt"
+	"github.com/sjwhitworth/golearn/base"
+	"github.com/sjwhitworth/golearn/metrics/pairwise"
+)
+
+// ClusterParameters takes a number of variables common to all clustering
+// algorithms.
+type ClusterParameters struct {
+	// Attributes represents the set of Attributes which
+	// can be used for clustering
+	Attributes []base.Attribute
+
+	// Metric is used to compute pairwise distance
+	Metric pairwise.PairwiseDistanceFunc
+}
+
+// ClusterMap contains the cluster identifier as a key, followed by a vector of point
+// indices that cluster contains.
+type ClusterMap map[int][]int
+
+// Invert returns an alternative form of cluster map where the key represents the point
+// index and the value represents the cluster index it's assigned to.
+// It returns an error (and a nil map) if any point index is listed more than
+// once, whether inside a single cluster or across several clusters.
+func (ref ClusterMap) Invert() (map[int]int, error) {
+	ret := make(map[int]int)
+	for c, points := range ref {
+		for _, p := range points {
+			if _, ok := ret[p]; ok {
+				return nil, fmt.Errorf("Not a valid cluster map (points appear in more than one cluster)")
+			}
+			ret[p] = c
+		}
+	}
+	return ret, nil
+}
+
+// Equals checks whether a bijection exists between two ClusterMaps (i.e. the clusters in one can
+// be re-labelled to become the clusters of another).
+//
+// It returns (true, nil) when such a re-labelling exists. In every other case it
+// returns false together with an error describing the first mismatch found: a
+// different number of clusters, a map which fails Invert, a point present in only
+// one of the maps, or a cluster whose members differ after re-labelling.
+func (ref ClusterMap) Equals(other ClusterMap) (bool, error) {
+	if len(ref) != len(other) {
+		return false, fmt.Errorf("ref and other do not contain the same number of clusters (%d and %d)", len(ref), len(other))
+	}
+
+	refInv, err := ref.Invert()
+	if err != nil {
+		return false, fmt.Errorf("ref: %s", err)
+	}
+
+	otherInv, err := other.Invert()
+	if err != nil {
+		return false, fmt.Errorf("other: %s", err)
+	}
+
+	// clusterIDMap records, for each cluster id used by other, the cluster
+	// id in ref which its points belong to.
+	clusterIDMap := make(map[int]int)
+
+	// Range through each point index; c1 is the point's cluster in ref.
+	for p, c1 := range refInv {
+		c2, ok := otherInv[p] // c2 is the point's cluster in the other map
+		if !ok {
+			return false, fmt.Errorf("failed to find reference point %d in src", p)
+		}
+		c3, seen := clusterIDMap[c2] // what's our correspondence with c2?
+		if !seen {
+			clusterIDMap[c2] = c1
+			continue
+		}
+		if c1 != c3 {
+			// c2 has already been matched with a different cluster of ref.
+			// %v (not %s) is required here: the operand is a map of ints.
+			return false, fmt.Errorf("ref point %d (cluster %d) is assigned to a different cluster (%d) in ref %v", p, c2, c1, clusterIDMap)
+		}
+	}
+
+	// sameMembers reports whether a1 and a2 hold exactly the same points.
+	// Invert has already rejected duplicated points, so equal lengths plus
+	// "every element of a2 occurs in a1" is sufficient.
+	sameMembers := func(a1, a2 []int) bool {
+		if len(a1) != len(a2) {
+			return false
+		}
+		cnt := make(map[int]bool, len(a1))
+		for _, a := range a1 {
+			cnt[a] = true
+		}
+		for _, a := range a2 {
+			if !cnt[a] {
+				return false
+			}
+		}
+		return true
+	}
+
+	// Check that after transformation, each key contains the same points
+	for cOld, points := range other {
+		cNew, ok := clusterIDMap[cOld]
+		if !ok {
+			// No point of ref lives in this cluster. That is only
+			// acceptable for an empty cluster; looking the id up without
+			// checking ok would silently compare against ref[0].
+			if len(points) != 0 {
+				return false, fmt.Errorf("cluster %d contains points which are missing from ref (%v)", cOld, points)
+			}
+			continue
+		}
+		if !sameMembers(ref[cNew], points) {
+			return false, fmt.Errorf("Re-labelled cluster %d => %d doesn't contain the same points (%v, %v)", cOld, cNew, ref[cNew], points)
+		}
+	}
+
+	return true, nil
+
+}
diff --git a/clustering/dbscan.csv b/clustering/dbscan.csv
new file mode 100644
index 0000000..013c56e
--- /dev/null
+++ b/clustering/dbscan.csv
@@ -0,0 +1,750 @@
+0.494260967249,1.45106696541
+-1.42808099324,-0.83706376669
+0.338559182384,1.03875870939
+0.119001013781,-1.05397553336
+1.12242460445,1.77493654436
+-1.26156989707,0.271881354299
+-1.30154774626,-0.762062025148
+0.585698651521,-0.339104628157
+1.08247212014,0.886855396912
+1.01416667809,1.34114022391
+-1.21578195893,-0.601021238858
+-1.25021782593,-1.05761650335
+-1.05160415572,-0.780084156141
+1.15263449272,-0.648539905918
+-0.783299140581,-1.2248966985
+0.202587147419,1.61104848936
+-1.43020789851,-1.82380067733
+-0.916300845616,-0.480830396598
+-0.506013825832,-0.295715454174
+0.436426179395,-1.06597144351
+0.468034167368,-0.974110220304
+0.522354793098,-0.641695891625
+0.94533367495,-0.543880951202
+0.94661473578,-0.939854758443
+-1.38551398913,-0.73950655252
+-1.15374916281,-0.250507932367
+0.493572698047,-0.949825244593
+0.884913340754,1.66591701207 +0.249587300835,1.57229126004 +1.02800263162,-0.340081504198 +0.478275464063,1.19798226443 +-1.19268844384,-0.510240121174 +-1.85804701232,-1.33021784213 +0.528139618545,1.32892750576 +-0.918024481532,-0.652157357893 +0.756316701741,0.920633635328 +0.855048505014,-0.481028310004 +0.492824086051,1.78274421923 +0.380510951332,1.24884772379 +-0.166999182256,-0.0916528008137 +0.862512958934,-0.29122649879 +-1.28326220483,-0.63402691263 +-1.46013480318,-0.722834729597 +-1.48000289758,-1.09948040102 +-2.19020872323,-0.630588973627 +-1.07505211635,-0.474050249508 +0.541969904427,1.03090707759 +0.824488329821,-0.264039880782 +0.456263169078,2.05788223562 +-1.58709404439,-0.54480731903 +1.32708272612,-0.345071514843 +0.68614239282,-0.490086592009 +-1.60725507262,0.070747440379 +-1.53337705952,-0.570087546452 +1.0491125845,-0.574435960384 +0.731933094085,-0.608068176075 +-1.13848133348,-0.0659881431468 +1.36805202458,1.65962813336 +0.222462580182,-0.65053906069 +-1.18662195919,-0.78239641499 +0.357717455186,-0.584924154569 +0.588086269107,-0.230283609581 +0.78242146637,-0.380417760077 +1.2682093931,-0.857019912656 +0.549567992097,-0.773931305337 +0.981410379535,1.01828533931 +0.707839055866,-0.233211620345 +0.0165651739637,-0.923844177798 +0.158530593126,1.68427935414 +0.498933328512,1.18944226235 +0.394392460137,1.10697668799 +0.52298152277,-0.915281143053 +0.363168115217,1.90748256868 +0.346568780252,1.26411862836 +0.966039504954,-0.4318119363 +-1.14222916165,-0.398461611165 +-0.134479180583,2.11039748445 +-1.18845711973,0.191151161919 +0.235515043844,1.71737552151 +0.648790787207,-0.936837517765 +-1.58852748366,-0.819181976895 +-1.04572997888,-0.29002720873 +0.467505726335,0.450459334368 +0.0198833944692,1.48714816824 +0.189992256516,1.10986299053 +1.48201717596,1.82713555691 +-1.30489683944,-1.15150866165 +0.757809431355,-0.47686276961 +-1.54387743826,-0.684212390528 +0.53240786142,-0.776648241672 +0.85665850455,-1.34594223446 
+0.403144558116,1.57028295161 +-1.3011171994,-0.790729653327 +0.972620490761,1.21000471162 +-1.00025584409,-0.628924362444 +1.22425496262,0.501610912038 +-1.15175818324,0.22764659828 +-1.31816425788,-0.630999410835 +0.402531346597,1.15248839326 +0.0906743459729,1.61848052292 +0.598794476009,-0.744251645998 +-1.37198702139,-0.980218172223 +0.520218965558,-0.919220905523 +0.631969327359,1.19544068432 +0.728113832873,-0.518758002884 +0.262658464722,0.0128713235313 +1.01826270251,-0.800567265699 +1.0896513853,-0.503675186289 +1.53624088423,0.894604885123 +0.511997776458,0.678078694437 +1.68745105198,1.27830755696 +-1.84237360674,-0.904437839063 +-1.19200811061,-0.463511666939 +-1.29275263692,0.287881967384 +-2.03126575898,-0.895274949124 +0.529118462695,0.654914838633 +0.468283787666,0.755733587995 +-1.638498618,-0.154707320244 +0.605617236401,1.70403704905 +-0.767697521224,-1.01384394922 +0.421112557426,-0.813005680016 +-1.1727392859,-0.0801023370369 +0.763176137366,1.82318913399 +-0.0334381403655,1.44539596918 +-1.60758525806,-0.62956732394 +0.72250888945,-0.367506703588 +-1.48527973153,-0.62861576205 +0.978478897202,1.05374904006 +0.451784483015,1.13661154122 +1.27710347995,-0.491509617737 +0.7166105877,1.15073382716 +0.705050630765,-1.01884736371 +0.535813899767,-1.31595906212 +0.279302786611,-1.16319317603 +0.29795190705,1.14196446938 +-1.5319923175,-1.74146843932 +0.485447620689,-0.597755525309 +0.407314491616,-0.790408883348 +0.381481488856,1.90489980312 +-1.60594123991,-0.76522411796 +1.23408760826,1.97619040399 +0.909343480925,-0.618337223907 +0.495887533633,0.855925046745 +0.793312516951,0.879279610882 +0.346669837831,-0.395258378353 +0.463120268974,-0.842105995666 +-0.422275985459,-0.190344559422 +0.938840781419,-0.223971270792 +-1.58434365981,-0.845357036129 +1.52307352239,0.741157517894 +0.473203974657,-0.605056119142 +-1.33430726419,-0.787153064395 +-1.30774613959,-0.537830906671 +0.44437726176,-0.570907450386 +0.302728842099,-1.4022293954 
+0.498614426707,-0.661820178158 +1.02546663264,2.17903746819 +-0.888963724459,-0.894519799863 +-0.0094375858741,2.06614833436 +-1.259326547,-1.33666248485 +0.334806319729,0.635350614538 +-2.0514671874,-0.491853069487 +0.511781097662,0.772058829646 +0.635381289585,-1.23415961512 +0.840452136147,-0.925641488461 +-0.46307453491,-1.26531794688 +-1.37224990492,-0.0477233997811 +0.128494145161,-0.146277558271 +0.629212436152,-0.545489790799 +-1.28799441742,-0.218570654523 +0.638847594716,1.0198939832 +-1.90824567176,-1.24854294321 +0.983925587407,-0.980132673476 +0.751915912284,-0.434247990685 +0.246162045698,-0.972003120401 +-1.42184967713,-1.00645441438 +-1.36258687372,-0.465192195174 +0.729107773809,-1.12124670875 +1.28828508776,-1.18972269812 +0.936218595433,0.844436650383 +-1.41967242002,-1.33553338128 +0.451293435185,-0.337043043077 +0.889211776584,0.683688380936 +0.946264899744,0.846407250351 +0.516908027375,-1.13002059107 +0.663113490975,0.662420359006 +0.985803048039,1.26228271875 +-1.4124239618,-0.947706065026 +0.642179325842,1.36969227279 +-1.32320503558,-0.518361624408 +0.389031988291,1.16716527963 +-0.806854584638,-0.613264833433 +-0.73049432945,-0.484378149065 +0.493548378749,-0.761716569457 +0.118175433165,-0.443557808199 +1.00315780403,1.4310943891 +0.778850340762,2.09349071844 +-0.745033802864,-0.756441323796 +-0.93389892072,-0.103482424997 +0.68196176411,-0.273220993773 +-1.16459401764,-0.315541399223 +0.740399605464,-0.0945591684424 +0.856407754419,0.252753351451 +0.803410992909,-1.32952562448 +0.429896355505,-0.758228537429 +0.595823625156,1.74945400458 +1.02085295004,-0.440804557414 +0.30307695482,1.45762223084 +1.18958904168,-0.581519032443 +0.96915905519,-0.511234999414 +0.697140552761,1.46349275366 +0.637227696862,-0.764858659877 +1.35045914484,-0.667938023256 +0.250651256786,-1.19493208012 +1.28347766291,1.37097619103 +-0.128975958296,1.09716295281 +-1.7517528787,-0.262053681114 +-1.3635857203,-1.06031600728 +-0.904113999203,0.191818430248 
+0.165426717861,-0.866647109384 +0.232203921427,-0.682948158472 +0.350368147923,-0.295280019807 +1.5427482888,-0.592939512519 +-1.13795423209,-0.133498274187 +0.674237889386,-0.632143914378 +0.334556478351,-1.20237442694 +0.528396459186,1.34497258643 +0.268370506258,0.734359941775 +0.309361881005,-0.728426362716 +0.917435744228,1.30854004814 +0.428789300542,1.41209652083 +0.199130767118,1.65759766562 +-1.17444696491,-0.950375612201 +0.597006581866,1.19119789824 +-1.45170622969,-0.891168308477 +-1.41986354849,-0.273475605125 +-1.57409699552,-0.422236366569 +1.04184264467,-0.362737479132 +-2.14219480292,-0.482272076783 +-1.50691533211,-0.200973148817 +0.0543420665276,1.33168891813 +-1.13144663461,-0.651825483298 +1.07155174333,-0.692136570485 +0.583387651839,-0.491450887858 +-1.14297733022,-0.697948095468 +0.0998245638451,0.10950372489 +0.220588982913,-0.851548705937 +-1.13730048755,-0.564448259501 +0.905073179513,1.12779984735 +0.72504167988,1.28738215218 +-1.06955320593,-0.467663188307 +-0.880265370005,-1.02614239598 +-1.44264764226,-0.96145282057 +1.01333072504,1.24675601661 +-1.0093984377,-1.05143861237 +0.507657052315,1.36804853004 +1.26502785776,-0.711979714262 +1.31608042094,1.5734222567 +0.334632982453,-0.84147974129 +0.802031438762,0.228215838939 +1.38250775401,-0.644251339858 +0.919614961822,-1.22049235391 +0.929729151417,-0.208693463261 +-1.53633104344,-0.511275317046 +-0.665051865958,-0.739115745001 +-0.335795516652,1.56140541417 +1.23901518412,1.87882199622 +-1.35543673912,-0.601849685925 +-1.15154941392,-0.269135444753 +0.608439338548,1.46684269694 +1.06006794863,1.13065360895 +0.942890187819,-0.742929110414 +-1.15672050041,-0.436145800526 +1.62198216506,0.050201317777 +0.854125246175,-0.514807506009 +-1.14337683511,-0.490935142717 +-1.51048251847,-0.0345004965754 +0.880530249926,-0.869888336327 +-1.36540418059,-0.756111150943 +0.601814512111,-1.21412505961 +-0.0621652593321,1.12108597614 +0.74067770872,-0.576648130759 +-0.183577853633,-0.125433577503 
+0.417995488425,1.21449387096 +-1.1856447963,-0.984315517908 +1.07887574968,-0.840413058707 +0.090657698723,-1.25434772582 +0.0261662265887,1.22429234588 +1.13673243898,-0.444139145222 +1.23361139042,-1.09421718393 +0.351468885092,1.51690258534 +0.255831769187,1.27677830087 +0.798195414423,-0.18283188485 +1.31845143924,1.69400632284 +0.938052607202,-0.419433668128 +0.388310366276,1.31945848095 +1.00904356759,-0.374533562373 +-1.08675207316,-0.230719819714 +0.956791915728,1.33752493245 +0.964894172999,1.3091321864 +0.630607763963,1.39287553367 +-1.41288695181,-0.864681477113 +0.261119656155,-1.02691248837 +-0.882375409513,-0.666629249983 +0.989911346176,-0.744391801077 +0.867329484559,-0.768003291115 +1.10613565156,1.4303998032 +0.77134497925,-0.692113237484 +0.343526184216,-0.991545218203 +0.758591550569,1.54398289162 +0.707946435833,1.45422137588 +0.709604992056,-1.40060170714 +-1.62485869339,-0.127799648835 +-1.66703749341,0.0158250976471 +-1.80730926772,-0.301662933271 +-1.45291560869,-0.535118179264 +-1.4701829607,-0.667609031391 +0.826731842161,1.41567303436 +-1.83590114306,-1.10954151061 +-1.6332275232,-0.563497927722 +-0.7388346936,-0.798186938046 +-1.82702823377,0.13893299319 +1.08739214482,0.826583726311 +0.196057452318,2.06336452546 +-0.962783057941,-0.109325188026 +-1.19668293625,-1.1087752111 +-0.920351459366,-0.706719513233 +1.1741662534,1.0387978517 +0.489318601459,-0.795493247886 +-0.0285631715351,1.48253801626 +-1.55996778776,-0.562017909444 +0.0907181454452,-0.814517495862 +1.04873107616,-0.452078258313 +0.641663493277,1.45460629445 +0.396805058072,1.10427025972 +1.00336963075,-0.459191567668 +0.907351763777,1.46562217387 +0.904912861981,-1.62473397987 +-1.30060206226,-0.639040245494 +0.22255248672,1.32737094419 +0.41209455966,-0.958675990971 +0.941556677173,1.35441829013 +1.28361991963,-1.24163477985 +-0.376722258575,1.54300064517 +0.930527863539,-0.784505897599 +1.05101554226,-0.405406154061 +1.22185277774,2.04479129366 
+-1.10897541444,-0.568930353083 +0.637361305672,1.47374301327 +-0.735046904585,-0.332733398991 +0.914105951171,1.81364038611 +0.815815323504,-0.428342552091 +0.655466878695,-0.869548902941 +-1.1045597651,-0.600408464946 +-0.915703222184,-0.742626383383 +-1.3571704177,-0.68125832152 +0.69160775897,-0.893583583689 +0.978900301359,1.75109237406 +0.53683021324,-1.41620152234 +1.09237619762,1.72716832141 +0.866591909179,-0.581572078316 +-1.80307744469,-0.65461097373 +-0.127231346916,-0.409038899099 +0.541525702451,-0.201173106705 +0.68589072527,1.53390864901 +-0.502670916098,-0.757868411152 +0.417479823257,0.872860696972 +-2.0289141946,-0.993678879688 +0.245343426191,1.77834730722 +0.316274690117,2.05030729845 +1.23151797851,1.52230461678 +0.488799329286,1.01622700328 +0.736124228521,-0.560102473907 +0.0380991755979,1.54458039477 +0.348282296735,0.0373035505291 +0.791153859839,1.36235109152 +-1.89637476785,-0.983716547448 +0.529079350094,1.21622740397 +-1.2345838948,-0.786033236307 +0.206511679327,-0.620187190429 +-1.25908731883,-0.301031125224 +-1.09843278784,0.0369549195008 +-1.10406146313,-1.35048039511 +0.983155368445,1.41480769807 +-1.7328692309,-1.08216857053 +-0.917910107541,-0.0889436794991 +0.312585483993,1.0818337627 +-0.0811644021867,-0.707691032276 +-1.20266214326,-0.217504289139 +0.454419137278,2.2457941917 +0.471831725992,-0.493824106953 +1.29161652352,-0.520992830994 +-1.25588057463,-0.721197168795 +-1.20377898567,-1.33173379489 +1.11899200093,-0.713538916105 +0.339906689497,-0.72413604985 +0.615417018996,-0.858079193557 +-1.01823258109,-0.78714664658 +0.816099854449,-0.871668345031 +-1.7212991458,-0.777848794878 +0.843019145714,-0.498712137992 +1.4021067635,1.45886382804 +0.878294256485,-1.02266917785 +-0.88512932828,-0.853503063368 +0.430259456368,-0.453270444086 +-1.77952949337,-0.141961490527 +0.849914524615,1.24032152147 +-1.32980886649,-0.481002489736 +0.624470649758,1.26531866728 +-1.06157593269,-1.13833962673 +-1.3992137138,-0.965470741462 
+0.896181657602,0.695919911938 +-1.418340371,-0.224255463115 +0.0738188763056,-0.0563312160229 +1.01170961883,0.241023782153 +-1.5363281273,0.0159593515193 +0.82770781377,0.709297571031 +0.545029125045,0.868146825735 +0.94527049937,-0.689257336931 +-1.19201851393,-0.0979642908923 +0.356642444398,-0.521177720048 +1.25677847275,-0.948042349321 +0.960112654402,-1.1046969869 +0.467333609641,-0.297755148203 +-1.09928800088,-0.782568121394 +0.499876498504,1.34378633999 +-0.0980920351721,1.38052928695 +-0.233897355292,1.40492904943 +0.951304495882,1.12558216168 +-1.57107850167,-0.657989767628 +0.284198318557,1.14751633136 +1.14780923861,-0.398627857264 +-1.63748393741,-0.707992965283 +0.396760739464,1.1549469915 +-0.856392511462,-0.729638141622 +0.743336814006,-0.0447286202516 +0.213902305912,1.02275520522 +0.866879045866,1.22042656018 +-0.88179618297,-1.43514524119 +0.334722303045,0.736465317357 +-1.71828945714,-0.333062709029 +-0.918042667376,-0.843035843758 +0.929243026125,1.35726190001 +-0.431851673719,-1.10093484648 +0.703743675795,1.87295209701 +0.98717412056,-0.391248211672 +0.446786417845,-0.232663277488 +0.833397671467,-1.01523684003 +-1.31380292373,-0.106348966316 +-1.98210412488,-0.520364529607 +0.882630413465,-0.204652953696 +0.57473870386,1.15343094618 +-1.64296177795,-0.545851844001 +0.812520126446,1.57046768 +-0.221156389297,0.90920018435 +-1.31918421048,-1.02294749184 +0.756117389326,1.26888096925 +-1.00145716326,-1.06765844508 +-1.16012367924,-1.17473398971 +0.140325452005,-0.427986994764 +0.5813642278,-0.83696135172 +-0.31645030278,-1.51218920885 +0.82452917064,0.93172792002 +-0.750534982503,-0.836888860558 +0.968658108542,-0.448623907721 +1.2006923499,-0.475696442665 +-1.26717115594,-0.665599874339 +-1.82087781658,-0.868101472932 +-1.16838236627,-1.54147890288 +-0.981140298879,-1.28505380627 +0.141023068843,1.12746333408 +0.754032847532,0.960404487137 +0.202135095167,1.18555519975 +0.849908773169,-0.847682954547 +0.744968023152,-0.228079376425 
+-1.91222754219,-0.796509854232 +0.775623691917,-0.695029747499 +-0.767188336951,-0.677911431003 +0.712466108841,1.55417287552 +1.21349899534,1.6388133243 +-1.0869979326,-0.648693092282 +0.699067612971,-1.40916870622 +-1.53255598882,-0.261494722161 +1.38939876357,1.88316296941 +0.596690144163,1.72643881439 +0.804964907977,-0.170902873462 +0.40613498617,1.1198979641 +-1.20807438507,-0.788501079273 +0.728500901715,1.68709745134 +0.316645956769,-0.510754409208 +-0.823618040446,-0.884384414857 +1.01442400059,1.24817740818 +0.688659017161,-0.58639380357 +0.370731358867,-0.986204337596 +-1.02050291971,-0.913802249095 +1.07231521798,1.81215231098 +0.293755472217,0.389904123007 +0.384580005797,1.95282853017 +0.731079718128,-0.600671861978 +-1.27084815866,-0.599802102819 +-1.5506697485,-0.37391302332 +0.819305570722,1.43691036146 +0.758463908179,-0.257726277971 +1.00739359449,1.43935814903 +0.296387422059,1.74172031876 +-1.56792541994,-0.625734935299 +-1.58294937352,-0.212561302929 +-1.48429016855,-0.214074430447 +-1.57271416628,-0.983949703014 +0.535738594277,1.01076484292 +-1.47375056852,-0.955937874772 +0.568475265758,1.64956338847 +-0.862162831203,-0.884179051907 +0.544925120741,1.6193204064 +0.480499087021,2.02664864155 +0.122038139573,0.119143611341 +1.08322686266,1.50007405277 +-1.26363865114,-1.24824215223 +-1.09515150213,-0.580737374373 +0.745663888861,-0.797265870367 +-0.704911858139,-0.435654296496 +-1.08345708839,-0.683728002502 +-0.159115840147,1.35521476836 +-0.834099861805,-0.571377281807 +0.803301570929,1.04060299172 +0.882227724909,-1.04635993234 +-1.42356222195,-1.11563240162 +0.598075641758,1.34363133224 +1.00649041199,1.53362494993 +-1.74840606346,-0.757167172502 +0.665860879827,1.23423673133 +-2.27447426719,-1.08752048002 +-1.48420811929,-0.38750074543 +0.710494890905,-0.0301573663517 +0.2452388989,-1.06063486305 +-1.30030123852,-0.741203235798 +0.722560907798,-1.0887138629 +0.845890473528,-0.765476650879 +-0.987808045599,-0.300980235798 
+0.798685296365,-0.0203804886503 +-1.27497724309,-0.358325506229 +-1.63280019769,-0.540251128077 +-1.64088545492,-0.242763376194 +-1.27425159086,-0.0278791499227 +0.710148737729,-1.00918956356 +0.383105045026,0.834657242727 +0.838530657897,1.26189497731 +1.37406167258,0.78263972991 +-1.95053657495,-1.14561235999 +0.582792508127,-0.690694356981 +0.913359445094,-0.619101104092 +0.785580964186,1.58201082262 +0.811102347249,-0.425468506696 +0.702937953149,1.18281732336 +0.532290267793,-0.895553891619 +-1.10621476202,0.0215080636159 +0.175894375098,1.6356268775 +-0.768255477238,-0.558603363551 +-1.04310626044,-1.16049754679 +1.07207441854,1.56204556067 +1.22409867855,1.22724262076 +-1.59405917837,-1.18378628891 +1.47675154752,-0.673616523166 +-1.07509128453,-0.542721555114 +0.930232971573,-0.66398380143 +0.857366630575,-0.359918879193 +0.251247597619,1.17090459393 +1.16780804184,-0.28530226059 +-1.6384800666,-1.12997990407 +-1.3215530792,-1.28470419469 +-1.09352620351,0.435204632862 +0.944537238213,-0.802446108456 +0.514125239901,1.36791043858 +0.621920898963,1.97375976496 +0.209206751878,-1.11104286684 +0.0993365991821,1.11511466724 +0.461961063861,-0.637500819694 +0.257071325333,-0.87287754892 +0.708547376969,1.39741781887 +0.475251941501,-1.31719059033 +0.610276462303,1.04890351278 +-1.52841458815,-0.454742945322 +-1.54735702492,-0.751499117222 +-0.503751481339,-0.655854045695 +-1.43543163548,-0.356638792552 +0.472644264313,1.31387413897 +0.20594576925,1.13671047633 +0.615102823628,1.3826749988 +0.634447199559,-0.507752603381 +-1.14039960054,-0.69475439419 +-1.32947946028,-0.197603144118 +-1.20830036549,0.0116873739331 +-1.73345855335,-0.132096369529 +0.357039754872,-1.10590660431 +0.077295305839,-1.13367252031 +0.487312167153,-1.06555350588 +0.353693691332,0.747606784869 +0.72201625169,-0.295881357905 +-1.92967429146,-0.682263484159 +-0.908678506749,-0.121343269386 +0.448752295827,0.926195545738 +0.418381709077,1.28661802591 +-1.61004687385,-0.988293010227 
+1.56677839696,1.67458369438 +0.727890188144,-0.431864801265 +-1.37981436331,-0.709415449061 +0.686461132989,-0.0251400314444 +-1.60911292662,-1.23835579119 +0.165348852234,1.60960575054 +1.00429011937,-0.690085482535 +1.30821133172,-0.946724989662 +1.10257026468,1.23706132126 +1.15666930562,0.963710509452 +-1.67698616873,-0.808446550696 +0.576897088351,0.99280475248 +0.94555298466,1.70764263243 +0.568459449543,-0.902358290454 +0.273923190749,1.4295727757 +0.652270914927,2.0019813952 +0.60267889291,1.04240532752 +0.698937416681,-1.18542553401 +0.262702790697,0.705979429324 +-0.0487035963106,-0.47844505956 +0.278008906473,0.998459028372 +0.244071635336,0.754522731174 +-1.47458283881,-1.04780342988 +-1.41308398904,-1.16512495168 +0.376605532983,1.67410945603 +0.438984358586,1.46896397357 +-0.791357172838,-0.556204685175 +0.513404458949,0.148861596075 +0.55598980494,-0.734532332917 +-0.00629072685579,1.4577693075 +0.451340090014,0.847671272714 +0.587452946206,2.18519727161 +1.32815664066,1.46277983499 +1.15868201425,-0.705507183387 +-2.19229513315,-0.302285759118 +-1.49261826396,-0.821633387241 +-1.47130624425,-0.63019322747 +1.34514654196,-0.468598808267 +-1.58948534345,-1.07365303551 +0.279766265285,1.17244610684 +0.87120798495,1.44312846654 +0.620040716673,-1.14333094168 +0.807830356825,-0.835873745532 +0.894954831928,0.677891197264 +-0.602527884597,-0.69024906938 +0.658008384741,-0.400988830169 +0.914769130628,1.57740991958 +-1.48930440184,-1.37182156567 +0.273422769288,-0.11466674929 +0.472531390995,-0.0835618146858 +-1.12231828159,-0.713669220151 +0.241729907099,1.62624331977 +0.370860921385,0.99072512802 +0.980701101307,0.273732023633 +-0.863388034868,-1.63431795709 +0.734659138483,-1.29170380626 +1.3680516506,0.926799443937 +0.149907608089,1.41098618733 +0.0734364428587,1.47804655719 +0.415481089773,-1.08580981443 +1.87043802694,-0.564764189615 +1.33224467911,-0.791483035077 +0.463346223426,1.54846334002 +-1.77220711583,-0.866301255997 
+-0.566063485459,-0.721836722342 +-1.36471735933,-0.972719760591 +-1.36476695392,-0.99379395024 +-1.28469298601,-0.318880090688 +1.51904736282,1.29062985586 +0.999041358538,1.43050704451 +0.187630466783,-0.83321811689 +-0.987675226269,-0.848285797745 +0.989408735718,-0.0650375905356 +0.915630353951,1.24498035147 +-1.43557093403,-0.974714006397 +-1.45387959068,-0.481585100397 +-0.278976401689,-0.795716253642 +0.400148808814,0.441913646796 +0.757992930737,-0.758884707769 +-1.92697167247,-0.984413224925 +-0.946132960141,-0.951642902807 +-0.374479291272,-1.13624183449 +0.40711034652,-0.252446172482 +-1.09764344526,-0.517312637143 +0.669488945211,-0.522164749169 +0.850151323586,1.34147908285 +1.25907388444,-0.386367441721 +0.347464827015,-0.533007300891 +-1.27564074212,-0.433436163023 +0.673862073887,1.42476903402 +0.849062018952,1.08302511354 +0.144638692788,1.95227594178 +-1.42722323034,-0.811204501399 +-1.64416438467,-0.543384736216 +0.463117573507,-0.274247994439 +0.943724941954,1.44571155091 +0.870463827056,1.22602060889 +1.18495335344,-0.234761356364 +0.733969567923,-0.67288952967 +-1.79850796029,-0.934840788244 +0.888166348926,-0.884107436609 +1.27188343182,-0.959291102507 +0.841840207037,-0.974022163633 +0.945701225098,1.6275104008 +0.660977855757,-0.970868100576 +-0.965730793499,-0.409989983699 +1.03969981683,-0.365715737398 +-1.16752822897,-0.477847154481 +-1.01284867036,-0.328504389699 +-1.54030317305,-0.13349028812 +0.624306174222,-0.148540727917 +0.667301055225,1.49923194215 +0.778588762664,-0.384883563802 +0.298516528096,1.32728546211 +0.624466159376,-1.19599990114 +0.810461715314,-0.542544742288 +0.469536296015,1.49442431098 +0.133949872378,0.783045014519 +1.68141770912,-0.716638871392 +-1.19132044269,-1.03412936975 +0.0382702789919,0.234517417471 +0.478943470729,1.37763420757 +1.59302092264,-0.720684680801 +0.361166988246,0.985493265361 +0.768892000974,0.97473143902 +0.962170710559,1.33000155872 +-1.1943589351,-0.485554696436 
+0.306037432346,0.536315500264 +1.01845907675,-0.550623430195 +-1.26470285038,-0.339699106261 +0.633088431313,-1.00774511415 +0.800859848663,0.881746319329 +-1.90816425794,-0.664624198594 +0.403731959752,1.16592477989 +0.63308271635,0.852550261268 +0.184258505898,-0.476223501919 +0.368550995255,0.768785338289 +0.674247371136,-0.335924019381 +-1.79245127675,-1.4045223247 +-1.87656067,-0.476116371583 +0.493448265324,-0.820792271427 +-1.60183922321,-0.868539405266 +0.505927093362,1.21392675643 +-1.64046095603,-0.469972586457 +-0.0571387622957,-0.909261054305 +-1.1693940706,0.0395969246032 +0.263229511513,-0.926499490699 diff --git a/clustering/dbscan.go b/clustering/dbscan.go new file mode 100644 index 0000000..a40ee79 --- /dev/null +++ b/clustering/dbscan.go @@ -0,0 +1,188 @@ +package clustering + +import ( + "github.com/gonum/matrix/mat64" + "github.com/sjwhitworth/golearn/base" + "github.com/sjwhitworth/golearn/metrics/pairwise" + "math/big" +) + +// DBSCANParameters describes the parameters of the density-based +// clustering algorithm DBSCAN +type DBSCANParameters struct { + ClusterParameters + + // Eps represents the "reachability", or the maximum + // distance any point can be before being considered for + // inclusion. + Eps float64 + + // MinCount represents how many points need to be + // in a cluster before it is considered one. 
+ MinCount int +} + +func regionQuery(p int, ret *big.Int, dist *mat64.Dense, eps float64) *big.Int { + rows, _ := dist.Dims() + // Return any points within the Eps neighbourhood + for i := 0; i < rows; i++ { + if dist.At(p, i) <= eps { + ret = ret.SetBit(ret, i, 1) // Mark as neighbour + } + } + return ret +} + +func computePairwiseDistances(inst base.FixedDataGrid, attrs []base.Attribute, metric pairwise.PairwiseDistanceFunc) (*mat64.Dense, error) { + // Compute pair-wise distances + // First convert everything to floats + mats, err := base.ConvertAllRowsToMat64(attrs, inst) + if err != nil { + return nil, err + } + + // Next, do an n^2 computation of all pairwise distances + _, rows := inst.Size() + dist := mat64.NewDense(rows, rows, nil) + for i := 0; i < rows; i++ { + for j := i + 1; j < rows; j++ { + d := metric.Distance(mats[i], mats[j]) + dist.Set(i, j, d) + dist.Set(j, i, d) + } + } + return dist, nil +} + +// DBSCAN clusters inst using the parameters allowed in and produces a ClusterId->[RowId] map +func DBSCAN(inst base.FixedDataGrid, params DBSCANParameters) (ClusterMap, error) { + + // Compute the distances between each possible point + dist, err := computePairwiseDistances(inst, params.Attributes, params.Metric) + if err != nil { + return nil, err + } + + _, rows := inst.Size() + + clusterMap := make(map[int][]int) + visited := big.NewInt(0) + clustered := big.NewInt(0) + // expandCluster adds P to a cluster C, visiting any neighbours + expandCluster := func(p int, neighbours *big.Int, c int) { + if clustered.Bit(p) == 1 { + panic("Shouldn't happen!") + } + // Add this point to cluster C + if _, ok := clusterMap[c]; !ok { + clusterMap[c] = make([]int, 0) + } + clusterMap[c] = append(clusterMap[c], p) + clustered.SetBit(clustered, p, 1) + visited.SetBit(visited, p, 1) + + for i := 0; i < rows; i++ { + reset := false + if neighbours.Bit(i) == 0 { + // Not a neighbour, so skip + continue + } + if visited.Bit(i) == 0 { + // not yet visited + visited = 
visited.SetBit(visited, i, 1) // Mark as visited + newNeighbours := big.NewInt(0) + newNeighbours = regionQuery(i, newNeighbours, dist, params.Eps) + if BitCount(newNeighbours) >= params.MinCount { + neighbours = neighbours.Or(neighbours, newNeighbours) + reset = true + } + } else { + continue + } + if clustered.Bit(i) == 0 { + clusterMap[c] = append(clusterMap[c], i) + clustered = clustered.SetBit(clustered, i, 1) + } + if reset { + i = 0 + } + } + } + + c := 0 + for i := 0; i < rows; i++ { + if visited.Bit(i) == 1 { + continue // Already visited here + } + visited.SetBit(visited, i, 1) + neighbours := big.NewInt(0) + neighbours = regionQuery(i, neighbours, dist, params.Eps) + if BitCount(neighbours) < params.MinCount { + // Noise, cluster 0 + clustered = clustered.Or(clustered, neighbours) + continue + } + c = c + 1 // Increment cluster count + expandCluster(i, neighbours, c) + } + + // Remove anything from the map which doesn't make + // minimum points + rmKeys := make([]int, 0) + for id := range clusterMap { + if len(clusterMap[id]) < params.MinCount { + rmKeys = append(rmKeys, id) + } + } + for _, r := range rmKeys { + delete(clusterMap, r) + } + + return ClusterMap(clusterMap), nil +} + +// How many bits? +func BitCount(n *big.Int) int { + var count int = 0 + for _, b := range n.Bytes() { + count += int(bitCounts[b]) + } + return count +} + +// The bit counts for each byte value (0 - 255). 
// bitCounts maps each byte value (0 - 255) to the number of bits set in it.
//
// The table is built once at package initialisation from the recurrence
// popcount(i) = popcount(i >> 1) + (i & 1), which yields the same 256 values
// as a hand-written literal table.
var bitCounts = func() []int8 {
	table := make([]int8, 256)
	for i := 1; i < len(table); i++ {
		table[i] = table[i>>1] + int8(i&1)
	}
	return table
}()
+2 +1 +0 +2 +2 +2 +0 +1 +0 +1 +1 +2 +1 +1 +2 +2 +1 +0 +0 +0 +0 +0 +2 +0 +0 +0 +0 +2 +2 +0 +0 +0 +2 +1 +2 +0 +1 +1 +2 +2 +1 +2 +1 +0 +2 +1 +2 +1 +2 +1 +1 +2 +0 +2 +0 +2 +1 +1 +0 +0 +0 +1 +2 +1 +2 +1 +0 +1 +1 +2 +0 +2 +0 +2 +2 +2 +2 +1 +2 +1 +2 +0 +2 +0 +1 +0 +0 +0 +2 +0 +0 +2 +1 +0 +2 +1 +1 +0 +0 +1 +1 +1 +0 +1 +0 +1 +2 +1 +0 +1 +0 +2 +0 +2 +0 +0 +2 +0 +1 +-1 +0 +2 +2 +1 +0 +1 +1 +1 +1 +0 +2 +2 +2 +2 +0 +2 +1 +0 +2 +0 +1 +2 +2 +0 +2 +0 +0 +2 +-1 +2 +1 +1 +1 +0 +0 +1 +2 +0 +2 +2 +0 +2 +0 +0 +1 +1 +-1 +2 +0 +1 +1 +0 +0 +0 +0 +2 +2 +2 +1 +1 +0 +1 +0 +2 +2 +0 +1 +0 +2 +2 +1 +2 +0 +2 +1 +0 +2 +1 +1 +0 +2 +2 +2 +1 +0 +0 +2 +1 +2 +2 +0 +0 +0 +0 +2 +0 +2 +0 +2 +2 +1 +2 +0 +0 +1 +0 +0 +2 +0 +2 +1 +0 +2 +2 +0 +2 +0 +0 +1 +1 +0 +1 +1 +0 +1 +0 +0 +0 +0 +1 +2 +2 +2 +0 +1 +1 +2 +1 +2 +1 +0 +2 +0 +0 +2 +2 +0 +1 +0 +1 +1 +2 +1 +0 +0 +0 +1 +2 +2 +1 +2 +1 +1 +2 +1 +2 +0 +0 +0 +0 +2 +2 +2 +1 +0 +0 +0 +0 +1 +1 +1 +2 +1 +0 +0 +2 +2 +0 +2 +1 +0 +1 +0 +2 +1 +1 +2 +2 +0 +2 +2 +1 +2 +2 +2 +1 +2 +1 +2 +2 +0 +0 +2 +2 +0 +1 +1 +2 +2 +2 +2 +1 +-1 +0 +0 +1 +0 +2 +2 +1 +1 +2 +0 +1 +2 +0 +1 +1 +0 +2 +2 +-1 +-1 +1 +2 +2 +2 +1 +-1 +1 +2 +0 +0 +0 +0 +0 +2 +2 +1 +0 +1 +2 +0 +0 +0 +2 +1 +0 +0 +-1 +1 +0 +1 +2 +1 +1 +0 +2 +2 +2 +0 +0 +1 +2 +2 +1 +1 +0 +1 +1 +1 +2 +1 +0 +1 +0 +0 +0 +1 +2 +1 +2 +1 +1 +2 +2 +1 +0 +1 +2 +1 +2 +2 +2 +0 +2 +1 +0 +1 +2 +0 +2 +2 +1 +2 +1 +0 +0 +1 +0 +2 +0 +1 +0 +1 diff --git a/clustering/dbscan_test.go b/clustering/dbscan_test.go new file mode 100644 index 0000000..182fca3 --- /dev/null +++ b/clustering/dbscan_test.go @@ -0,0 +1,150 @@ +package clustering + +import ( + "bufio" + "github.com/gonum/matrix/mat64" + "github.com/sjwhitworth/golearn/base" + "github.com/sjwhitworth/golearn/metrics/pairwise" + . 
"github.com/smartystreets/goconvey/convey" + "math" + "math/big" + "os" + "strconv" + "testing" +) + +func TestDBSCANDistanceQuery(t *testing.T) { + + Convey("Should be able to determine which points are in range...", t, func() { + + // Read in the synthetic test data + inst, err := base.ParseCSVToInstances("synthetic.csv", false) + So(err, ShouldBeNil) + + // Create a neighbours vector + neighbours := big.NewInt(0) + + // Compute pairwise distances + dist, err := computePairwiseDistances(inst, inst.AllAttributes(), pairwise.NewEuclidean()) + So(dist.At(0, 0), ShouldAlmostEqual, 0) + So(dist.At(0, 1), ShouldAlmostEqual, 1) + So(dist.At(1, 0), ShouldAlmostEqual, 1) + So(dist.At(0, 2), ShouldAlmostEqual, math.Sqrt(5)) + So(dist.At(2, 0), ShouldAlmostEqual, math.Sqrt(5)) + So(err, ShouldBeNil) + + // Do the region query + neighbours = regionQuery(0, neighbours, dist, 1) + So(neighbours.Bit(0), ShouldEqual, 1) + So(neighbours.Bit(1), ShouldEqual, 1) + So(neighbours.Bit(2), ShouldEqual, 0) + So(neighbours.Bit(3), ShouldEqual, 0) + So(neighbours.Bit(4), ShouldEqual, 0) + + }) + +} + +func TestDBSCANSynthetic(t *testing.T) { + Convey("Synthetic DBSCAN test should work...", t, func() { + + inst, err := base.ParseCSVToInstances("synthetic.csv", false) + So(err, ShouldBeNil) + + p := DBSCANParameters{ + ClusterParameters{ + inst.AllAttributes(), + pairwise.NewEuclidean(), + }, + 1, + 1, + } + + m, err := DBSCAN(inst, p) + So(err, ShouldBeNil) + + So(len(m), ShouldEqual, 2) + So(m[1], ShouldContain, 0) + So(m[1], ShouldContain, 1) + So(m[1], ShouldContain, 2) + So(m[1], ShouldContain, 3) + + }) +} + +func TestDBSCANDistanceMetric(t *testing.T) { + + Convey("Check the distance function is sane...", t, func() { + + d1 := mat64.NewDense(1, 2, nil) + d2 := mat64.NewDense(1, 2, nil) + + d1.Set(0, 0, 0.494260967249) + d1.Set(0, 1, 1.45106696541) + d2.Set(0, 0, -1.42808099324) + d2.Set(0, 1, -0.83706376669) + + e := pairwise.NewEuclidean() + So(e.Distance(d1, d2), ShouldAlmostEqual, 
2.9882, 0.001) + + }) + +} + +func TestDBSCAN(t *testing.T) { + + Convey("Loading some data and labels...", t, func() { + + inst, err := base.ParseCSVToInstances("dbscan.csv", false) + So(err, ShouldBeNil) + + file, err := os.Open("dbscan_labels.csv") + defer file.Close() + So(err, ShouldBeNil) + + clusterMap := ClusterMap(make(map[int][]int)) + + scanner := bufio.NewScanner(file) + line := -1 + for scanner.Scan() { + line = line + 1 + v, err := strconv.ParseInt(scanner.Text(), 10, 64) + if err != nil { + panic(err) + } + v = v + 1 // -1 are noise in scikit-learn's DBSCAN + c := int(v) + if c == 0 { + continue + } + if _, ok := clusterMap[c]; !ok { + clusterMap[c] = make([]int, 0) + } + clusterMap[c] = append(clusterMap[c], line) + } + + Convey("Our DBSCAN implementation should match...", func() { + p := DBSCANParameters{ + ClusterParameters{ + inst.AllAttributes(), + pairwise.NewEuclidean(), + }, + 0.3, + 10, + } + m, err := DBSCAN(inst, p) + Convey("There should be nothing in the result that's smaller than MinPts", func() { + + for id := range m { + So(len(m[id]), ShouldBeGreaterThanOrEqualTo, 10) + } + + }) + So(err, ShouldBeNil) + eq, err := clusterMap.Equals(m) + So(err, ShouldBeNil) + So(eq, ShouldBeTrue) + }) + }) + +} diff --git a/clustering/gen_test.py b/clustering/gen_test.py new file mode 100644 index 0000000..dca3752 --- /dev/null +++ b/clustering/gen_test.py @@ -0,0 +1,30 @@ +# +# Generate sample data for the DBSCAN test +# +# Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#example-cluster-plot-dbscan-py +# + +import numpy as np + +from sklearn.cluster import DBSCAN +from sklearn import metrics +from sklearn.datasets.samples_generator import make_blobs +from sklearn.preprocessing import StandardScaler + +centers = [[1, 1], [-1, -1], [1, -1]] +X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, + random_state=0) + +X = StandardScaler().fit_transform(X) +X = X.astype(np.float64) +db = 
#
# Generate sample data for the DBSCAN test
#
# Lifted from http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#example-cluster-plot-dbscan-py
#
# Writes the standardised sample points to dbscan.csv (one "x,y" row per
# sample) and scikit-learn's cluster label for each sample to
# dbscan_labels.csv (one label per row, -1 meaning noise).
#

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
# make_blobs is exported by sklearn.datasets; the private samples_generator
# submodule is no longer importable in current scikit-learn releases.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

X = StandardScaler().fit_transform(X)
X = X.astype(np.float64)
db = DBSCAN(eps=0.3, min_samples=10, metric='l2', algorithm='brute').fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Iterate over the samples themselves rather than a hard-coded row count so
# the two files always stay in step with n_samples above.
with open('dbscan.csv', 'w') as fscanout, \
        open('dbscan_labels.csv', 'w') as fscanlabout:
    for row, label in zip(X, labels):
        fscanout.write(",".join(str(x) for x in row) + "\n")
        fscanlabout.write(str(label) + "\n")