1
0
mirror of https://github.com/sjwhitworth/golearn.git synced 2025-04-28 13:48:56 +08:00
golearn/base/csv.go
Richard Townsend 527c6476e1 Optimised version of KNN for Euclidean distances
This patch also:
   * Completes removal of the edf/ package
   * Corrects an erroneous print statement
   * Introduces two new CSV functions
      * ParseCSVToInstancesTemplated makes sure that
        reading a second CSV file maintains strict Attribute
        compatibility with an existing DenseInstances
      * ParseCSVToInstancesWithAttributeGroups gives more control
        over where Attributes end up in memory, important for
        gaining predictable control over the KNN optimisation
      * Decouples BinaryAttributeGroup from FixedAttributeGroup for
        better casting support
2014-09-30 23:10:22 +01:00

431 lines
9.1 KiB
Go

package base
import (
"encoding/csv"
"fmt"
"io"
"os"
"regexp"
"strings"
)
// ParseCSVGetRows counts the CSV records stored in the file at filepath.
//
// Every record produced by the csv.Reader is counted, so a header line (if
// the file has one) is included in the total. If the file cannot be opened
// or a record cannot be parsed, the count is reported as 0 together with
// the underlying error.
func ParseCSVGetRows(filepath string) (int, error) {
	f, err := os.Open(filepath)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	records := csv.NewReader(f)
	total := 0
	for {
		if _, readErr := records.Read(); readErr != nil {
			// io.EOF marks the normal end of the file; anything else is
			// a genuine failure and discards the partial count.
			if readErr == io.EOF {
				return total, nil
			}
			return 0, readErr
		}
		total++
	}
}
// ParseCSVGetAttributes builds the ordered list of Attributes for the CSV
// file at filepath.
//
// The Attribute types come from ParseCSVSniffAttributeTypes and the names
// from ParseCSVSniffAttributeNames; the i-th name is assigned to the i-th
// Attribute. Both helpers panic if the file cannot be read.
func ParseCSVGetAttributes(filepath string, hasHeaders bool) []Attribute {
	sniffed := ParseCSVSniffAttributeTypes(filepath, hasHeaders)
	labels := ParseCSVSniffAttributeNames(filepath, hasHeaders)
	for idx := range sniffed {
		sniffed[idx].SetName(labels[idx])
	}
	return sniffed
}
// ParseCSVSniffAttributeNames reads the first record of the CSV file at
// filepath and turns it into one name per column.
//
// When hasHeaders is true the names are the first record's fields with
// surrounding whitespace removed. Otherwise each column is named after its
// zero-based position ("0", "1", ...).
//
// The function panics if the file cannot be opened or its first record
// cannot be read.
func ParseCSVSniffAttributeNames(filepath string, hasHeaders bool) []string {
	f, err := os.Open(filepath)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	names, err := csv.NewReader(f).Read()
	if err != nil {
		panic(err)
	}

	// The first record is rewritten in place: either cleaned up, or
	// replaced by positional placeholders.
	for pos, raw := range names {
		if hasHeaders {
			names[pos] = strings.TrimSpace(raw)
		} else {
			names[pos] = fmt.Sprintf("%d", pos)
		}
	}
	return names
}
// ParseCSVSniffAttributeTypes returns a slice of appropriately-typed Attributes,
// one per column of the CSV file at filepath.
//
// The type of a given attribute is determined by looking at the first data row
// of the CSV only: if hasHeaders is true the first record is skipped, and the
// record after it is inspected. A field which (after stripping surrounding
// spaces) looks like a decimal number, optionally signed and optionally with
// an exponent, yields a FloatAttribute; anything else yields a
// CategoricalAttribute. The returned Attributes are unnamed.
//
// The function panics if the file cannot be opened or the required records
// cannot be read.
func ParseCSVSniffAttributeTypes(filepath string, hasHeaders bool) []Attribute {
	// Open file
	file, err := os.Open(filepath)
	if err != nil {
		panic(err)
	}
	defer file.Close()

	// Create the CSV reader
	reader := csv.NewReader(file)
	if hasHeaders {
		// Skip the headers
		if _, err := reader.Read(); err != nil {
			panic(err)
		}
	}

	// Read the first data row of the file
	columns, err := reader.Read()
	if err != nil {
		panic(err)
	}

	// Compile the numeric pattern once rather than once per column.
	numeric := regexp.MustCompile("^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$")

	attrs := make([]Attribute, 0, len(columns))
	for _, entry := range columns {
		// Match the Attribute type with the regular expression
		entry = strings.Trim(entry, " ")
		if numeric.MatchString(entry) {
			attrs = append(attrs, NewFloatAttribute(""))
		} else {
			attrs = append(attrs, new(CategoricalAttribute))
		}
	}
	return attrs
}
// ParseCSVBuildInstances fills an UpdatableDataGrid in place with the
// contents of the CSV file at filepath.
//
// The attribute specs are resolved from u.AllAttributes(), and the i-th CSV
// column is written through the i-th resolved spec, so the column order of
// the file is expected to match that order. If hasHeaders is true the first
// record is skipped. Data rows are written starting at row 0.
//
// Surrounding spaces are stripped from every field before conversion, the
// same way the other CSV loaders in this file and the type sniffing do, so
// that a value such as " 1.5" is converted like "1.5".
//
// NOTE(review): no rows are allocated here; presumably u must already be
// large enough to hold every data row — confirm against callers.
//
// The function panics if the file cannot be opened or a record cannot be
// parsed.
func ParseCSVBuildInstances(filepath string, hasHeaders bool, u UpdatableDataGrid) {
	// Read the input
	file, err := os.Open(filepath)
	if err != nil {
		panic(err)
	}
	defer file.Close()
	reader := csv.NewReader(file)

	rowCounter := 0
	specs := ResolveAttributes(u, u.AllAttributes())

	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			panic(err)
		}
		// The header can only ever be the very first record; clearing the
		// flag makes sure exactly one record is skipped.
		if rowCounter == 0 && hasHeaders {
			hasHeaders = false
			continue
		}
		for i, v := range record {
			v = strings.Trim(v, " ")
			u.Set(specs[i], rowCounter, specs[i].attr.GetSysValFromString(v))
		}
		rowCounter++
	}
}
// ParseCSVToInstances loads the CSV file at filepath into a freshly
// allocated DenseInstances.
//
// Column types and names are obtained from ParseCSVGetAttributes, one
// Attribute per column. If hasHeaders is true the first record is treated
// as a header and is not loaded as data. Fields have surrounding spaces
// stripped before conversion. The Attribute of the last column is
// registered as the class Attribute.
//
// Errors from counting rows, opening the file, or parsing a record are
// returned; ParseCSVGetAttributes panics on its own read failures.
func ParseCSVToInstances(filepath string, hasHeaders bool) (instances *DenseInstances, err error) {
	// Work out how many data rows need to be allocated.
	total, err := ParseCSVGetRows(filepath)
	if err != nil {
		return nil, err
	}
	if hasHeaders {
		total--
	}

	// One Attribute (and one spec) per CSV column.
	attrs := ParseCSVGetAttributes(filepath, hasHeaders)
	specs := make([]AttributeSpec, 0, len(attrs))

	instances = NewDenseInstances()
	for _, attr := range attrs {
		specs = append(specs, instances.AddAttribute(attr))
	}
	instances.Extend(total)

	// Second pass over the file: copy the values in.
	f, err := os.Open(filepath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	records := csv.NewReader(f)
	headerPending := hasHeaders
	row := 0
	for {
		fields, readErr := records.Read()
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			return nil, readErr
		}
		if headerPending {
			// Only the very first record can be the header.
			headerPending = false
			continue
		}
		for col, raw := range fields {
			cell := strings.Trim(raw, " ")
			instances.Set(specs[col], row, attrs[col].GetSysValFromString(cell))
		}
		row++
	}

	// The final column is the class Attribute.
	classAttr := attrs[len(attrs)-1]
	instances.AddClassAttribute(classAttr)
	return instances, nil
}
// ParseCSVToTemplatedInstances reads the CSV file given by filepath and returns
// the read Instances, using another, already-read DenseInstances as a template.
//
// The Attributes sniffed from the file are replaced by the template's own
// Attributes wherever they are Equal or share a name. The returned
// DenseInstances gets an AttributeGroup for each of the template's groups
// (created with size 0 for a *BinaryAttributeGroup and 8 otherwise), the
// template's Attributes added to the same-named groups, and the template's
// class Attributes.
//
// The i-th CSV column is written through the spec of the i-th template
// Attribute, so the file must have exactly as many columns as the template
// has Attributes; a mismatch is reported as an error. Column order is
// presumably expected to match template.AllAttributes() as well — this is
// not checked here.
//
// Errors from counting rows, creating AttributeGroups, opening the file or
// parsing a record are returned. Failures to look up or add a template
// Attribute panic.
func ParseCSVToTemplatedInstances(filepath string, hasHeaders bool, template *DenseInstances) (instances *DenseInstances, err error) {
	// Read the number of rows in the file
	rowCount, err := ParseCSVGetRows(filepath)
	if err != nil {
		return nil, err
	}
	if hasHeaders {
		rowCount--
	}

	// Read the row headers
	attrs := ParseCSVGetAttributes(filepath, hasHeaders)
	templateAttrs := template.AllAttributes()

	// specs is indexed both by template Attribute position (when filled)
	// and by CSV column (when read), so the two counts have to agree.
	if len(attrs) != len(templateAttrs) {
		return nil, fmt.Errorf("csv file %q has %d columns but the template has %d attributes", filepath, len(attrs), len(templateAttrs))
	}

	// Prefer the template's Attribute objects over the freshly sniffed ones.
	for i, a := range attrs {
		for _, b := range templateAttrs {
			if a.Equals(b) || a.GetName() == b.GetName() {
				attrs[i] = b
			}
		}
	}

	specs := make([]AttributeSpec, len(attrs))

	// Allocate the Instances to return
	instances = NewDenseInstances()

	// Recreate the template's AttributeGroups under the same names.
	for name, agTemplate := range template.AllAttributeGroups() {
		size := 8
		if _, ok := agTemplate.(*BinaryAttributeGroup); ok {
			size = 0
		}
		if err := instances.CreateAttributeGroup(name, size); err != nil {
			return nil, err
		}
	}

	// Add each template Attribute to the group it lives in within the template.
	for i, a := range templateAttrs {
		s, err := template.GetAttribute(a)
		if err != nil {
			panic(err)
		}
		ag, ok := template.agRevMap[s.pond]
		if !ok {
			panic(fmt.Errorf("template has no attribute group name for pond %v", s.pond))
		}
		spec, err := instances.AddAttributeToAttributeGroup(a, ag)
		if err != nil {
			panic(err)
		}
		specs[i] = spec
	}

	instances.Extend(rowCount)

	// Read the input
	file, err := os.Open(filepath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	reader := csv.NewReader(file)

	rowCounter := 0
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}
		// Only the very first record can be the header.
		if rowCounter == 0 && hasHeaders {
			hasHeaders = false
			continue
		}
		for i, v := range record {
			v = strings.Trim(v, " ")
			instances.Set(specs[i], rowCounter, attrs[i].GetSysValFromString(v))
		}
		rowCounter++
	}

	for _, a := range template.AllClassAttributes() {
		instances.AddClassAttribute(a)
	}

	return instances, nil
}
// ParseCSVToInstancesWithAttributeGroups reads the CSV file given by filepath
// and returns the read DenseInstances, placing selected Attributes into
// explicitly named AttributeGroups.
//
// Parameters:
//   - attrGroups maps an Attribute name to the name of the AttributeGroup it
//     should be added to.
//   - classAttrGroups does the same, and every Attribute named in it is also
//     registered as a class Attribute.
//   - attrOverrides maps a zero-based column index to an Attribute which
//     replaces the one sniffed from the file for that column.
//   - hasHeaders states whether the first record is a header; if so it
//     supplies the Attribute names and is not loaded as data.
//
// Attributes which appear in neither map are added with AddAttribute. Each
// requested group is created with size 0 if the last Attribute assigned to it
// is a *BinaryAttribute and 8 otherwise; a group with no matching Attribute
// keeps 0 (from attrGroups) or 8 (from classAttrGroups).
//
// Data rows are stored starting at row 0 and the header, if any, is not
// counted towards the allocated rows, matching the other CSV loaders in
// this file.
//
// Errors from counting rows, opening the file or parsing a record are
// returned. Failures to create a group, add an Attribute or register a class
// Attribute panic.
func ParseCSVToInstancesWithAttributeGroups(filepath string, attrGroups, classAttrGroups map[string]string, attrOverrides map[int]Attribute, hasHeaders bool) (instances *DenseInstances, err error) {
	// Read row count
	rowCount, err := ParseCSVGetRows(filepath)
	if err != nil {
		return nil, err
	}
	if hasHeaders {
		// The header record is not a data row.
		rowCount--
	}

	// Read the row headers
	attrs := ParseCSVGetAttributes(filepath, hasHeaders)
	for i := range attrs {
		if a, ok := attrOverrides[i]; ok {
			attrs[i] = a
		}
	}

	specs := make([]AttributeSpec, len(attrs))

	// Allocate the Instances to return
	instances = NewDenseInstances()

	// Collect every AttributeGroup which needs creating, along with a
	// combined Attribute-name -> group-name lookup.
	agsToCreate := make(map[string]int)
	combinedAgs := make(map[string]string)
	for name, group := range attrGroups {
		agsToCreate[group] = 0
		combinedAgs[name] = group
	}
	for name, group := range classAttrGroups {
		agsToCreate[group] = 8
		combinedAgs[name] = group
	}

	// Decide the sizes
	for _, a := range attrs {
		if ag, ok := combinedAgs[a.GetName()]; ok {
			if _, ok := a.(*BinaryAttribute); ok {
				agsToCreate[ag] = 0
			} else {
				agsToCreate[ag] = 8
			}
		}
	}

	// Create them
	for name, size := range agsToCreate {
		if err = instances.CreateAttributeGroup(name, size); err != nil {
			panic(err)
		}
	}

	// Add the Attributes to them
	for i, a := range attrs {
		var spec AttributeSpec
		if ag, ok := combinedAgs[a.GetName()]; ok {
			spec, err = instances.AddAttributeToAttributeGroup(a, ag)
			if err != nil {
				panic(err)
			}
		} else {
			spec = instances.AddAttribute(a)
		}
		specs[i] = spec
		if _, ok := classAttrGroups[a.GetName()]; ok {
			if err = instances.AddClassAttribute(a); err != nil {
				panic(err)
			}
		}
	}

	// Allocate
	instances.Extend(rowCount)

	// Read the input
	file, err := os.Open(filepath)
	if err != nil {
		return nil, err
	}
	defer file.Close()
	reader := csv.NewReader(file)

	rowCounter := 0
	for {
		record, err := reader.Read()
		if err == io.EOF {
			break
		} else if err != nil {
			return nil, err
		}
		// Skip the header record, if there is one, without consuming a
		// row index; only the very first record can be the header.
		if rowCounter == 0 && hasHeaders {
			hasHeaders = false
			continue
		}
		for i, v := range record {
			v = strings.Trim(v, " ")
			instances.Set(specs[i], rowCounter, attrs[i].GetSysValFromString(v))
		}
		rowCounter++
	}

	// Add class Attributes
	// NOTE(review): these were already registered while the Attributes were
	// being added above, so this pass looks redundant — confirm that
	// AddClassAttribute is idempotent before removing either one.
	for _, a := range instances.AllAttributes() {
		if _, ok := classAttrGroups[a.GetName()]; ok {
			if err = instances.AddClassAttribute(a); err != nil {
				panic(err)
			}
		}
	}

	return instances, nil
}