mirror of
https://github.com/sjwhitworth/golearn.git
synced 2025-04-26 13:49:14 +08:00

Each method now contains a -FromReader counterpart, so that the helper methods can be used even when the data is not in a physical file. The original methods make use of those -FromReader methods. The reader is reset (Seek(0, 0)) before every method-specific read, to ensure it is read from the start. Test cases are not yet touched, and I'm not sure they should be.
369 lines
8.6 KiB
Go
369 lines
8.6 KiB
Go
package base
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/csv"
|
|
"fmt"
|
|
"io"
|
|
"regexp"
|
|
"runtime"
|
|
"strings"
|
|
)
|
|
|
|
// ParseCSVGetRowsFromReader returns the number of CSV records in r.
//
// The reader is rewound to its start before counting. Every record is
// counted, including a header row if there is one. If the reader cannot be
// rewound or a record cannot be parsed, the count is 0 and the underlying
// error is returned.
func ParseCSVGetRowsFromReader(r io.ReadSeeker) (int, error) {
	// Always count from the beginning; a failed rewind would otherwise
	// silently produce a partial count.
	if _, err := r.Seek(0, io.SeekStart); err != nil {
		return 0, err
	}

	reader := csv.NewReader(r)
	counter := 0
	for {
		_, err := reader.Read()
		if err == io.EOF {
			return counter, nil
		}
		if err != nil {
			return 0, err
		}
		counter++
	}
}
|
|
|
|
// ParseCSVEstimateFilePrecisionFromReader estimates the largest number of
// digits that occur after a decimal point in the reader.
//
// The reader is rewound to its start first. Only a sample from the top of the
// input is inspected: scanning stops once six lines have been examined. Empty
// lines and lines starting with '@' or '%' are skipped and do not count
// towards that sample.
//
// It returns 0 when no decimal number is found. A non-nil error is returned
// if the reader cannot be rewound or reading from it fails.
func ParseCSVEstimateFilePrecisionFromReader(r io.ReadSeeker) (int, error) {
	// Number of non-skipped lines that are examined.
	const sampleLines = 6

	// The decimal point is escaped on purpose: an unescaped '.' matches any
	// byte, so "1,2.5" would be consumed as "1,2" followed by "5" and the
	// fractional part would never be seen.
	rexp := regexp.MustCompile(`[0-9]+(\.[0-9]+)?`)

	if _, err := r.Seek(0, io.SeekStart); err != nil {
		return 0, err
	}

	maxL := 0
	lineCount := 0
	scanner := bufio.NewScanner(r)
	for lineCount < sampleLines && scanner.Scan() {
		line := scanner.Text()
		if len(line) == 0 || line[0] == '@' || line[0] == '%' {
			continue
		}
		for _, m := range rexp.FindAllString(line, -1) {
			dot := strings.IndexByte(m, '.')
			if dot < 0 {
				// Integer: no fractional digits.
				continue
			}
			if l := len(m) - dot - 1; l > maxL {
				maxL = l
			}
		}
		lineCount++
	}

	// A line that exceeds the scanner's buffer merely ends the sample early;
	// the estimate gathered so far is still usable. Any other error is a
	// genuine read failure and is reported.
	if err := scanner.Err(); err != nil && err != bufio.ErrTooLong {
		return 0, err
	}
	return maxL, nil
}
|
|
|
|
// ParseCSVGetAttributesFromReader returns an ordered slice of appropriate-ly typed
|
|
// and named Attributes.
|
|
func ParseCSVGetAttributesFromReader(r io.ReadSeeker, hasHeaders bool) []Attribute {
|
|
attrs := ParseCSVSniffAttributeTypesFromReader(r, hasHeaders)
|
|
names := ParseCSVSniffAttributeNamesFromReader(r, hasHeaders)
|
|
for i, attr := range attrs {
|
|
attr.SetName(names[i])
|
|
}
|
|
return attrs
|
|
}
|
|
|
|
// ParseCSVSniffAttributeNamesFromReader returns the column names for the CSV
// contents of r.
//
// The reader is rewound and its first record is read. When hasHeaders is true
// the fields of that record are returned with surrounding whitespace removed;
// otherwise the record only determines the number of columns and the names
// are the placeholders "0", "1", "2", ...
//
// It panics if the reader cannot be rewound or the first record cannot be
// read, which includes an empty input.
func ParseCSVSniffAttributeNamesFromReader(r io.ReadSeeker, hasHeaders bool) []string {
	// A failed rewind would make us read names from the wrong position, so it
	// is reported the same way as a failed read.
	if _, err := r.Seek(0, io.SeekStart); err != nil {
		panic(err)
	}

	headers, err := csv.NewReader(r).Read()
	if err != nil {
		panic(err)
	}

	for i, h := range headers {
		if hasHeaders {
			headers[i] = strings.TrimSpace(h)
		} else {
			headers[i] = fmt.Sprintf("%d", i)
		}
	}
	return headers
}
|
|
|
|
// ParseCSVSniffAttributeTypesFromReader returns a slice of appropriately-typed Attributes.
|
|
//
|
|
// The type of a given attribute is determined by looking at the first data row
|
|
// of the CSV.
|
|
func ParseCSVSniffAttributeTypesFromReader(r io.ReadSeeker, hasHeaders bool) []Attribute {
|
|
var attrs []Attribute
|
|
|
|
// Create the CSV reader
|
|
r.Seek(0, 0)
|
|
reader := csv.NewReader(r)
|
|
if hasHeaders {
|
|
// Skip the headers
|
|
_, err := reader.Read()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
// Read the first line of the file
|
|
columns, err := reader.Read()
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
for _, entry := range columns {
|
|
// Match the Attribute type with regular expressions
|
|
entry = strings.Trim(entry, " ")
|
|
matched, err := regexp.MatchString("^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$", entry)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
if matched {
|
|
attrs = append(attrs, NewFloatAttribute(""))
|
|
} else {
|
|
attrs = append(attrs, new(CategoricalAttribute))
|
|
}
|
|
}
|
|
|
|
// Estimate file precision
|
|
maxP, err := ParseCSVEstimateFilePrecisionFromReader(r)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
for _, a := range attrs {
|
|
if f, ok := a.(*FloatAttribute); ok {
|
|
f.Precision = maxP
|
|
}
|
|
}
|
|
|
|
return attrs
|
|
}
|
|
|
|
// ParseCSVBuildInstancesFromReader updates an [[#UpdatableDataGrid]] from a io.Reader
|
|
func ParseCSVBuildInstancesFromReader(r io.ReadSeeker, attrs []Attribute, hasHeader bool, u UpdatableDataGrid) (err error) {
|
|
var rowCounter int
|
|
|
|
defer func() {
|
|
if r := recover(); r != nil {
|
|
if _, ok := r.(runtime.Error); ok {
|
|
panic(err)
|
|
}
|
|
err = fmt.Errorf("error at line %d (error %s)", rowCounter, r.(error))
|
|
}
|
|
}()
|
|
|
|
specs := ResolveAttributes(u, attrs)
|
|
|
|
r.Seek(0, 0)
|
|
reader := csv.NewReader(r)
|
|
|
|
for {
|
|
record, err := reader.Read()
|
|
if err == io.EOF {
|
|
break
|
|
} else if err != nil {
|
|
return err
|
|
}
|
|
if rowCounter == 0 {
|
|
if hasHeader {
|
|
hasHeader = false
|
|
continue
|
|
}
|
|
}
|
|
for i, v := range record {
|
|
u.Set(specs[i], rowCounter, specs[i].attr.GetSysValFromString(strings.TrimSpace(v)))
|
|
}
|
|
rowCounter++
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ParseCSVToInstancesFromReader reads the reader containing CSV and returns
|
|
// the read Instances.
|
|
func ParseCSVToInstancesFromReader(r io.ReadSeeker, hasHeaders bool) (instances *DenseInstances, err error) {
|
|
// Read the number of rows in the file
|
|
rowCount, err := ParseCSVGetRowsFromReader(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if hasHeaders {
|
|
rowCount--
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributesFromReader(r, hasHeaders)
|
|
specs := make([]AttributeSpec, len(attrs))
|
|
// Allocate the Instances to return
|
|
instances = NewDenseInstances()
|
|
for i, a := range attrs {
|
|
spec := instances.AddAttribute(a)
|
|
specs[i] = spec
|
|
}
|
|
instances.Extend(rowCount)
|
|
|
|
err = ParseCSVBuildInstancesFromReader(r, attrs, hasHeaders, instances)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
instances.AddClassAttribute(attrs[len(attrs)-1])
|
|
|
|
return instances, nil
|
|
}
|
|
|
|
// ParseUtilsMatchAttrs tries to match the set of Attributes read from one file with
|
|
// those read from another, and writes the matching Attributes back to the original set.
|
|
func ParseMatchAttributes(attrs, templateAttrs []Attribute) {
|
|
for i, a := range attrs {
|
|
for _, b := range templateAttrs {
|
|
if a.Equals(b) {
|
|
attrs[i] = b
|
|
} else if a.GetName() == b.GetName() {
|
|
attrs[i] = b
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ParseCSVToTemplatedInstancesFromReader reads the reader containing CSV and returns
|
|
// the read Instances, using another already read DenseInstances as a template.
|
|
func ParseCSVToTemplatedInstancesFromReader(r io.ReadSeeker, hasHeaders bool, template *DenseInstances) (instances *DenseInstances, err error) {
|
|
// Read the number of rows in the file
|
|
rowCount, err := ParseCSVGetRowsFromReader(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if hasHeaders {
|
|
rowCount--
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributesFromReader(r, hasHeaders)
|
|
templateAttrs := template.AllAttributes()
|
|
ParseMatchAttributes(attrs, templateAttrs)
|
|
|
|
// Allocate the Instances to return
|
|
instances = CopyDenseInstances(template, templateAttrs)
|
|
instances.Extend(rowCount)
|
|
|
|
err = ParseCSVBuildInstancesFromReader(r, attrs, hasHeaders, instances)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, a := range template.AllClassAttributes() {
|
|
err = instances.AddClassAttribute(a)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
return instances, nil
|
|
}
|
|
|
|
// ParseCSVToInstancesWithAttributeGroupsFromReader reads the CSV file given by filepath,
|
|
// and returns the read DenseInstances, but also makes sure to group any Attributes
|
|
// specified in the first argument and also any class Attributes specified in the second
|
|
func ParseCSVToInstancesWithAttributeGroupsFromReader(r io.ReadSeeker, attrGroups, classAttrGroups map[string]string, attrOverrides map[int]Attribute, hasHeaders bool) (instances *DenseInstances, err error) {
|
|
// Read row count
|
|
rowCount, err := ParseCSVGetRowsFromReader(r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Read the row headers
|
|
attrs := ParseCSVGetAttributesFromReader(r, hasHeaders)
|
|
for i := range attrs {
|
|
if a, ok := attrOverrides[i]; ok {
|
|
attrs[i] = a
|
|
}
|
|
}
|
|
|
|
specs := make([]AttributeSpec, len(attrs))
|
|
// Allocate the Instances to return
|
|
instances = NewDenseInstances()
|
|
|
|
//
|
|
// Create all AttributeGroups
|
|
agsToCreate := make(map[string]int)
|
|
combinedAgs := make(map[string]string)
|
|
for a := range attrGroups {
|
|
agsToCreate[attrGroups[a]] = 0
|
|
combinedAgs[a] = attrGroups[a]
|
|
}
|
|
for a := range classAttrGroups {
|
|
agsToCreate[classAttrGroups[a]] = 8
|
|
combinedAgs[a] = classAttrGroups[a]
|
|
}
|
|
|
|
// Decide the sizes
|
|
for _, a := range attrs {
|
|
if ag, ok := combinedAgs[a.GetName()]; ok {
|
|
if _, ok := a.(*BinaryAttribute); ok {
|
|
agsToCreate[ag] = 0
|
|
} else {
|
|
agsToCreate[ag] = 8
|
|
}
|
|
}
|
|
}
|
|
|
|
// Create them
|
|
for i := range agsToCreate {
|
|
size := agsToCreate[i]
|
|
err = instances.CreateAttributeGroup(i, size)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
|
|
// Add the Attributes to them
|
|
for i, a := range attrs {
|
|
var spec AttributeSpec
|
|
if ag, ok := combinedAgs[a.GetName()]; ok {
|
|
spec, err = instances.AddAttributeToAttributeGroup(a, ag)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
specs[i] = spec
|
|
} else {
|
|
spec = instances.AddAttribute(a)
|
|
}
|
|
specs[i] = spec
|
|
if _, ok := classAttrGroups[a.GetName()]; ok {
|
|
err = instances.AddClassAttribute(a)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
}
|
|
}
|
|
// Allocate
|
|
instances.Extend(rowCount)
|
|
|
|
err = ParseCSVBuildInstancesFromReader(r, attrs, hasHeaders, instances)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return instances, nil
|
|
|
|
}
|