mirror of
https://github.com/sjwhitworth/golearn.git
synced 2025-04-28 13:48:56 +08:00

This patch also: * Completes removal of the edf/ package * Corrects an erroneous print statement * Introduces two new CSV functions * ParseCSVToInstancesTemplated makes sure that reading a second CSV file maintains strict Attribute compatibility with an existing DenseInstances * ParseCSVToInstancesWithAttributeGroups gives more control over where Attributes end up in memory, important for gaining predictable control over the KNN optimisation * Decouples BinaryAttributeGroup from FixedAttributeGroup for better casting support
431 lines
9.1 KiB
Go
431 lines
9.1 KiB
Go
package base
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
)
|
|
|
|
// ParseCSVGetRows returns the number of rows in a given file.
|
|
func ParseCSVGetRows(filepath string) (int, error) {
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
defer file.Close()
|
|
|
|
reader := csv.NewReader(file)
|
|
counter := 0
|
|
for {
|
|
_, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
return 0, err
|
|
}
|
|
counter++
|
|
}
|
|
return counter, nil
|
|
}
|
|
|
|
// ParseCSVGetAttributes returns an ordered slice of appropriate-ly typed
|
|
// and named Attributes.
|
|
func ParseCSVGetAttributes(filepath string, hasHeaders bool) []Attribute {
|
|
attrs := ParseCSVSniffAttributeTypes(filepath, hasHeaders)
|
|
names := ParseCSVSniffAttributeNames(filepath, hasHeaders)
|
|
for i, attr := range attrs {
|
|
attr.SetName(names[i])
|
|
}
|
|
return attrs
|
|
}
|
|
|
|
// ParseCSVSniffAttributeNames returns a slice containing the top row
|
|
// of a given CSV file, or placeholders if hasHeaders is false.
|
|
func ParseCSVSniffAttributeNames(filepath string, hasHeaders bool) []string {
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer file.Close()
|
|
|
|
reader := csv.NewReader(file)
|
|
headers, err := reader.Read()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
if hasHeaders {
|
|
for i, h := range headers {
|
|
headers[i] = strings.TrimSpace(h)
|
|
}
|
|
return headers
|
|
}
|
|
|
|
for i := range headers {
|
|
headers[i] = fmt.Sprintf("%d", i)
|
|
}
|
|
return headers
|
|
|
|
}
|
|
|
|
// ParseCSVSniffAttributeTypes returns a slice of appropriately-typed Attributes.
|
|
//
|
|
// The type of a given attribute is determined by looking at the first data row
|
|
// of the CSV.
|
|
func ParseCSVSniffAttributeTypes(filepath string, hasHeaders bool) []Attribute {
|
|
var attrs []Attribute
|
|
// Open file
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer file.Close()
|
|
// Create the CSV reader
|
|
reader := csv.NewReader(file)
|
|
if hasHeaders {
|
|
// Skip the headers
|
|
_, err := reader.Read()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
// Read the first line of the file
|
|
columns, err := reader.Read()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
for _, entry := range columns {
|
|
// Match the Attribute type with regular expressions
|
|
entry = strings.Trim(entry, " ")
|
|
matched, err := regexp.MatchString("^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$", entry)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if matched {
|
|
attrs = append(attrs, NewFloatAttribute(""))
|
|
} else {
|
|
attrs = append(attrs, new(CategoricalAttribute))
|
|
}
|
|
}
|
|
|
|
return attrs
|
|
}
|
|
|
|
// ParseCSVBuildInstances updates an [[#UpdatableDataGrid]] from a filepath in place
|
|
func ParseCSVBuildInstances(filepath string, hasHeaders bool, u UpdatableDataGrid) {
|
|
|
|
// Read the input
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
defer file.Close()
|
|
reader := csv.NewReader(file)
|
|
|
|
rowCounter := 0
|
|
|
|
specs := ResolveAttributes(u, u.AllAttributes())
|
|
|
|
for {
|
|
record, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
panic(err)
|
|
}
|
|
if rowCounter == 0 {
|
|
if hasHeaders {
|
|
hasHeaders = false
|
|
continue
|
|
}
|
|
}
|
|
for i, v := range record {
|
|
u.Set(specs[i], rowCounter, specs[i].attr.GetSysValFromString(v))
|
|
}
|
|
rowCounter++
|
|
}
|
|
|
|
}
|
|
|
|
// ParseCSVToInstances reads the CSV file given by filepath and returns
|
|
// the read Instances.
|
|
func ParseCSVToInstances(filepath string, hasHeaders bool) (instances *DenseInstances, err error) {
|
|
|
|
// Read the number of rows in the file
|
|
rowCount, err := ParseCSVGetRows(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if hasHeaders {
|
|
rowCount--
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributes(filepath, hasHeaders)
|
|
specs := make([]AttributeSpec, len(attrs))
|
|
// Allocate the Instances to return
|
|
instances = NewDenseInstances()
|
|
for i, a := range attrs {
|
|
spec := instances.AddAttribute(a)
|
|
specs[i] = spec
|
|
}
|
|
instances.Extend(rowCount)
|
|
|
|
// Read the input
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer file.Close()
|
|
reader := csv.NewReader(file)
|
|
|
|
rowCounter := 0
|
|
|
|
for {
|
|
record, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
if rowCounter == 0 {
|
|
if hasHeaders {
|
|
hasHeaders = false
|
|
continue
|
|
}
|
|
}
|
|
for i, v := range record {
|
|
v = strings.Trim(v, " ")
|
|
instances.Set(specs[i], rowCounter, attrs[i].GetSysValFromString(v))
|
|
}
|
|
rowCounter++
|
|
}
|
|
|
|
instances.AddClassAttribute(attrs[len(attrs)-1])
|
|
|
|
return instances, nil
|
|
}
|
|
|
|
// ParseCSVToInstancesTemplated reads the CSV file given by filepath and returns
|
|
// the read Instances, using another already read DenseInstances as a template.
|
|
func ParseCSVToTemplatedInstances(filepath string, hasHeaders bool, template *DenseInstances) (instances *DenseInstances, err error) {
|
|
|
|
// Read the number of rows in the file
|
|
rowCount, err := ParseCSVGetRows(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if hasHeaders {
|
|
rowCount--
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributes(filepath, hasHeaders)
|
|
templateAttrs := template.AllAttributes()
|
|
for i, a := range attrs {
|
|
for _, b := range templateAttrs {
|
|
if a.Equals(b) {
|
|
attrs[i] = b
|
|
} else if a.GetName() == b.GetName() {
|
|
attrs[i] = b
|
|
}
|
|
}
|
|
}
|
|
|
|
specs := make([]AttributeSpec, len(attrs))
|
|
// Allocate the Instances to return
|
|
instances = NewDenseInstances()
|
|
|
|
templateAgs := template.AllAttributeGroups()
|
|
for ag := range templateAgs {
|
|
agTemplate := templateAgs[ag]
|
|
if _, ok := agTemplate.(*BinaryAttributeGroup); ok {
|
|
instances.CreateAttributeGroup(ag, 0)
|
|
} else {
|
|
instances.CreateAttributeGroup(ag, 8)
|
|
}
|
|
}
|
|
|
|
for i, a := range templateAttrs {
|
|
s, err := template.GetAttribute(a)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if ag, ok := template.agRevMap[s.pond]; !ok {
|
|
panic(ag)
|
|
} else {
|
|
spec, err := instances.AddAttributeToAttributeGroup(a, ag)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
specs[i] = spec
|
|
}
|
|
}
|
|
|
|
instances.Extend(rowCount)
|
|
|
|
// Read the input
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer file.Close()
|
|
reader := csv.NewReader(file)
|
|
|
|
rowCounter := 0
|
|
|
|
for {
|
|
record, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
if rowCounter == 0 {
|
|
if hasHeaders {
|
|
hasHeaders = false
|
|
continue
|
|
}
|
|
}
|
|
for i, v := range record {
|
|
v = strings.Trim(v, " ")
|
|
instances.Set(specs[i], rowCounter, attrs[i].GetSysValFromString(v))
|
|
}
|
|
rowCounter++
|
|
}
|
|
|
|
for _, a := range template.AllClassAttributes() {
|
|
instances.AddClassAttribute(a)
|
|
}
|
|
|
|
return instances, nil
|
|
}
|
|
|
|
// ParseCSVToInstancesWithAttributeGroups reads the CSV file given by filepath,
|
|
// and returns the read DenseInstances, but also makes sure to group any Attributes
|
|
// specified in the first argument and also any class Attributes specified in the second
|
|
func ParseCSVToInstancesWithAttributeGroups(filepath string, attrGroups, classAttrGroups map[string]string, attrOverrides map[int]Attribute, hasHeaders bool) (instances *DenseInstances, err error) {
|
|
|
|
// Read row count
|
|
rowCount, err := ParseCSVGetRows(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributes(filepath, hasHeaders)
|
|
for i := range attrs {
|
|
if a, ok := attrOverrides[i]; ok {
|
|
attrs[i] = a
|
|
}
|
|
}
|
|
|
|
specs := make([]AttributeSpec, len(attrs))
|
|
// Allocate the Instances to return
|
|
instances = NewDenseInstances()
|
|
|
|
//
|
|
// Create all AttributeGroups
|
|
agsToCreate := make(map[string]int)
|
|
combinedAgs := make(map[string]string)
|
|
for a := range attrGroups {
|
|
agsToCreate[attrGroups[a]] = 0
|
|
combinedAgs[a] = attrGroups[a]
|
|
}
|
|
for a := range classAttrGroups {
|
|
agsToCreate[classAttrGroups[a]] = 8
|
|
combinedAgs[a] = classAttrGroups[a]
|
|
}
|
|
|
|
// Decide the sizes
|
|
for _, a := range attrs {
|
|
if ag, ok := combinedAgs[a.GetName()]; ok {
|
|
if _, ok := a.(*BinaryAttribute); ok {
|
|
agsToCreate[ag] = 0
|
|
} else {
|
|
agsToCreate[ag] = 8
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create them
|
|
for i := range agsToCreate {
|
|
size := agsToCreate[i]
|
|
err = instances.CreateAttributeGroup(i, size)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
|
|
// Add the Attributes to them
|
|
for i, a := range attrs {
|
|
var spec AttributeSpec
|
|
if ag, ok := combinedAgs[a.GetName()]; ok {
|
|
spec, err = instances.AddAttributeToAttributeGroup(a, ag)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
specs[i] = spec
|
|
} else {
|
|
spec = instances.AddAttribute(a)
|
|
}
|
|
specs[i] = spec
|
|
if _, ok := classAttrGroups[a.GetName()]; ok {
|
|
err = instances.AddClassAttribute(a)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
}
|
|
// Allocate
|
|
instances.Extend(rowCount)
|
|
|
|
// Read the input
|
|
file, err := os.Open(filepath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer file.Close()
|
|
reader := csv.NewReader(file)
|
|
|
|
rowCounter := 0
|
|
|
|
for {
|
|
record, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
return nil, err
|
|
}
|
|
if rowCounter == 0 {
|
|
// Skip header row
|
|
rowCounter++
|
|
continue
|
|
}
|
|
for i, v := range record {
|
|
v = strings.Trim(v, " ")
|
|
instances.Set(specs[i], rowCounter, attrs[i].GetSysValFromString(v))
|
|
}
|
|
rowCounter++
|
|
}
|
|
|
|
// Add class Attributes
|
|
for _, a := range instances.AllAttributes() {
|
|
name := a.GetName() // classAttrGroups
|
|
if _, ok := classAttrGroups[name]; ok {
|
|
err = instances.AddClassAttribute(a)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
}
|
|
return instances, nil
|
|
|
|
}
|